In [1]:
# custom imports
import os
import sys

# import module 
import requests 
import pandas as pd 
from bs4 import BeautifulSoup 
import matplotlib.pyplot as plt 
from collections import defaultdict
import numpy as np

In [2]:
# natural language processing: n-gram ranking
import re
import unicodedata
import nltk
from nltk.corpus import stopwords

In [3]:
# pandas display settings: None removes the column and row limits and leaves
# the output width for pandas to determine
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", None)

In [4]:
# Read the custom stop-word list (one entry per line) and store it in lowercase.
# NOTE: this rebinds the name `stopwords`, replacing the `nltk.corpus.stopwords`
# object imported earlier; clean_text() reads this module-level name.
stopwords_path = os.path.abspath(os.path.join(os.path.abspath("cumulative_vocab_match_prefix_suffix.ipynb"), "../../../../Read_Files", "stopwords_cleaned.txt"))
with open(stopwords_path) as file:
    # A set keeps the `word not in stopwords` test in clean_text() O(1) per word
    # instead of scanning a list; blank lines carry no stop word and are skipped.
    stopwords = {line.strip().lower() for line in file if line.strip()}

In [5]:
def extract_words_from_text(text):
    """Split ``text`` on the literal space character and return the pieces.

    Because the separator is exactly ' ' (not arbitrary whitespace), runs of
    consecutive spaces yield empty-string entries and an empty input yields
    a list containing one empty string.
    """
    return text.split(' ')

In [6]:
def extract_vocabulary(csv_path=None):
    """Load the vocabulary phrases from the 'Specifications' column, lowercased.

    Parameters:
    csv_path - Optional path to the vocabulary CSV. When omitted, the file
               Read_Files/fashion_vocabulary.csv is resolved relative to the
               current working directory (three directory levels up).

    Returns:
    List of lowercase strings, one per non-missing 'Specifications' cell, in
    file order.
    """
    if csv_path is None:
        # The notebook name only acts as a path anchor; one of the ".." segments
        # cancels it out again.
        curr_path = os.path.abspath("cumulative_vocab_match_prefix_suffix.ipynb")
        csv_path = os.path.abspath(os.path.join(curr_path, "../../../..", "Read_Files/fashion_vocabulary.csv"))
    df = pd.read_csv(csv_path)

    # Missing cells are read as float NaN, which has no .lower(); drop them and
    # coerce anything else (e.g. numeric cells) to str before lowercasing.
    specifications = df['Specifications'].dropna()
    vocabulary_list = [str(v).lower() for v in specifications]

    return vocabulary_list

In [7]:
def extract_nouns(vocabulary_list):
    """Return the last word of every vocabulary phrase.

    Parameters:
    vocabulary_list - Iterable of phrase strings

    Returns:
    List with the final whitespace-separated word of each phrase, in input
    order. Phrases that contain no word at all (empty or whitespace-only)
    are skipped.
    """
    nouns = []
    for phrase in vocabulary_list:
        # split() without a separator ignores leading/trailing/repeated
        # whitespace, so "red dress " yields "dress" instead of "" - an empty
        # noun would match every word in a later startswith() test.
        tokens = phrase.split()
        if tokens:
            nouns.append(tokens[-1])
    return nouns

In [8]:
# getting page content
def getPageContent(link, timeout=30):
    """Download ``link`` and parse the response body with the lxml parser.

    Parameters:
    link    - URL of the page to fetch
    timeout - Seconds passed to requests.get as its timeout; without one a
              stalled server would block the call indefinitely

    Returns:
    BeautifulSoup object built from the response text.
    """
    html_response = requests.get(link, timeout=timeout)
    html_text = html_response.text
    soup = BeautifulSoup(html_text, "lxml")
    return soup

In [9]:
# function to cleanup text
def clean_text(text):
    """
    Normalise raw text into a list of lowercase tokens without stop words.

    The text is NFKD-normalised, reduced to ASCII (characters that cannot be
    encoded are dropped) and lowercased. Every character that is neither a
    word character nor whitespace is then removed, and the result is split
    on whitespace. No stemming or lemmatisation is applied.

    Parameters:
    text - Text to be worked with

    Returns:
    List of tokens that are not contained in the module-level `stopwords`
    collection.
    """
    # encoding / case normalisation
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    lowered = ascii_bytes.decode('utf-8', 'ignore').lower()

    # strip punctuation, then tokenise on whitespace
    tokens = re.sub(r'[^\w\s]', '', lowered).split()

    # stop-word filtering
    kept = []
    for token in tokens:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [10]:
# various sources of text
def _terminated_texts(elements):
    """Return the stripped, non-empty text of each element, each ending in '. '."""
    pieces = []
    for element in elements:
        text = element.text.strip()
        if not text:
            continue
        # Always finish with a space: a text that already ends in '.' used to be
        # glued directly onto the next one ("end.Next"), which fuses two words
        # into one as soon as punctuation is stripped.
        pieces.append(text + ' ' if text.endswith('.') else text + '. ')
    return pieces


def getContentFromTags(link):
    """Collect the text of paragraph, heading and span tags of a page.

    Parameters:
    link - URL of the page to fetch (via getPageContent)

    Returns:
    One lowercase string containing, in this order, the texts of all <p>
    tags, all <h1>-<h6> tags and all <span> tags; every non-empty text is
    terminated with '. ' so neighbouring texts stay separated.
    """
    soup = getPageContent(link)

    # NOTE(review): text of a <span> nested inside a <p> presumably shows up in
    # both groups - confirm that this double counting is intended.
    tag_groups = (
        "p",
        ["h1", "h2", "h3", "h4", "h5", "h6"],
        "span",
    )

    pieces = []
    for tags in tag_groups:
        pieces.extend(_terminated_texts(soup.find_all(tags)))

    return "".join(pieces).lower()

In [11]:
def getTextFromAllBlogs():
    """Fetch every blog listed in alokeveer_analytics_data.csv and merge the texts.

    The CSV is resolved relative to the current working directory (one level
    up) and its 'Blog Link' column supplies the URLs.

    Returns:
    Single string with the extracted text of all blogs, in file order,
    separated by a space.
    """
    curr_path = os.path.abspath("cumulative_vocab_match_prefix_suffix.ipynb")
    df_path = os.path.abspath(os.path.join(curr_path, "../../", "alokeveer_analytics_data.csv"))
    df = pd.read_csv(df_path)

    # Rows without a link are read as NaN; str(NaN) would be requested as the
    # URL "nan", so they are dropped up front.
    links = df['Blog Link'].dropna()
    blog_texts = [getContentFromTags(str(link)) for link in links]

    # Joining with a space keeps the last word of one blog from fusing with the
    # first word of the next, and avoids repeated string concatenation.
    return " ".join(blog_texts)

In [12]:
def getWordsToProcess():
    """Prepare the two inputs of the prefix/suffix analysis.

    Returns:
    Tuple (words, nouns): `words` is the cleaned token list of the merged
    text of all blogs, `nouns` holds the last word of every vocabulary
    phrase.
    """
    # vocabulary side
    nouns = extract_nouns(extract_vocabulary())

    # blog text side
    words = clean_text(getTextFromAllBlogs())

    return words, nouns

In [13]:
def sort_dict_by_value(d, reverse = True):
    """Return a new dict with the items of ``d`` ordered by value.

    Parameters:
    d       - Mapping whose values are mutually comparable
    reverse - True (default) for descending order, False for ascending

    Items with equal values keep their original relative order.
    """
    ordered_pairs = list(d.items())
    ordered_pairs.sort(key = lambda pair: pair[1], reverse = reverse)
    return dict(ordered_pairs)

In [14]:
def StoreResultInDict():
    """Count the words directly before and after every vocabulary match.

    A word of the cleaned blog text is a match when it starts with one of
    the vocabulary nouns. Each matching position is counted exactly once,
    even if several nouns (or duplicates of the same noun) match it.

    Returns:
    Tuple (prefix_dict, suffix_dict) of defaultdicts mapping a matched word
    to {neighbouring word: number of occurrences}, for the preceding and the
    following word respectively.
    """
    words, nouns = getWordsToProcess()
    prefix_dict = defaultdict(dict)
    suffix_dict = defaultdict(dict)

    # De-duplicate the nouns and drop empty ones: looping over the raw list
    # counted an occurrence once per matching noun (inflating frequencies), and
    # an empty noun is a prefix of every word. str.startswith accepts a tuple
    # and returns False for an empty one.
    noun_prefixes = tuple({noun for noun in nouns if noun})

    last_index = len(words) - 1
    for i, word in enumerate(words):
        if not word.startswith(noun_prefixes):
            continue

        if i > 0:
            before = prefix_dict[word]
            before[words[i - 1]] = before.get(words[i - 1], 0) + 1

        if i < last_index:
            after = suffix_dict[word]
            after[words[i + 1]] = after.get(words[i + 1], 0) + 1

    return prefix_dict, suffix_dict

In [15]:
def getDataFrames():
    """Render the top three prefixes and suffixes of every keyword as a table.

    Returns:
    String produced by DataFrame.to_string() with one row per keyword
    (sorted alphabetically) and the columns Prefix3, Prefix2, Prefix1,
    Keyword, Suffix1, Suffix2, Suffix3. The most frequent prefix is placed
    in Prefix1 and the most frequent suffix in Suffix1; each cell reads
    "(word, count)" and unused cells hold "-".
    """
    prefix_dict, suffix_dict = StoreResultInDict()

    columns = ['Prefix3', 'Prefix2', 'Prefix1', 'Keyword', 'Suffix1', 'Suffix2', 'Suffix3']
    keyword_slot = columns.index('Keyword')
    top_n = 3  # number of prefix / suffix columns on each side of the keyword

    all_keywords = sorted(set(prefix_dict.keys()).union(set(suffix_dict.keys())))

    # Collect plain rows and build the frame once; appending through
    # df.loc[len(df.index)] re-allocates the frame for every keyword.
    rows = []
    for keyword in all_keywords:
        current_row = ["-"] * len(columns)
        current_row[keyword_slot] = keyword

        # prefixes fill the cells leftwards from the keyword
        top_prefixes = list(sort_dict_by_value(prefix_dict.get(keyword, {})).items())[:top_n]
        for rank, (k, v) in enumerate(top_prefixes):
            current_row[keyword_slot - 1 - rank] = f"({k}, {v})"

        # suffixes fill the cells rightwards from the keyword
        top_suffixes = list(sort_dict_by_value(suffix_dict.get(keyword, {})).items())[:top_n]
        for rank, (k, v) in enumerate(top_suffixes):
            current_row[keyword_slot + 1 + rank] = f"({k}, {v})"

        rows.append(current_row)

    df = pd.DataFrame(rows, columns=columns)

    return df.to_string()

In [16]:
# print summary
# set output path
curr_path = os.path.abspath("cumulative_vocab_match_prefix_suffix.ipynb")
print(curr_path)
output_path = os.path.abspath(os.path.join(curr_path, "../../", "Outputs/cumulative_vocab_match_prefix_suffix.txt"))

# Build the report before opening the file, so a failure while fetching or
# processing the blogs does not truncate an existing output file.
cur_df = getDataFrames()

# write to file
# Printing with file=f replaces the global sys.stdout swap, which left stdout
# pointing at a closed file whenever an exception interrupted the block.
with open(output_path, "w") as f:
    print("Vocab match top 3 prefix suffix on my all blogs combined", file=f)
    print(cur_df, file=f)

e:\Projects\Curience-Work\Analytics\Alokeveer\vocab\cumulative_vocab_match_prefix_suffix.ipynb
