In [1]:
# custom imports
import os
import sys

# import module 
import requests 
import pandas as pd 
from bs4 import BeautifulSoup 
import matplotlib.pyplot as plt 

In [2]:
# natural language processing: n-gram ranking
import re
import unicodedata
import nltk
from nltk.corpus import stopwords

In [3]:
# Build the stop-word list: NLTK's English list followed by any project-specific extras.
# NOTE: this rebinds the name `stopwords`, shadowing the `stopwords` module that
# was imported from nltk.corpus; the fully qualified path is used on purpose so
# the cell still works when it is re-run.
additional_stopwords = list()
stopwords = [*nltk.corpus.stopwords.words('english'), *additional_stopwords]

In [4]:
# function to clean up text and split it into sentences
def extract_sentences_from_text(text):
    """Split ``text`` on '.' and return the non-empty, whitespace-trimmed pieces.

    Args:
        text: The string to split.

    Returns:
        A list of fragments in their original order. Fragments that are empty
        or contain only whitespace are dropped.
    """
    # Trim first, then filter: checking emptiness before trimming would let
    # whitespace-only fragments (e.g. the " " after a trailing ". ") through
    # as empty strings.
    trimmed = (fragment.strip() for fragment in text.split('.'))
    return [sentence for sentence in trimmed if sentence]

In [5]:
def extract_vocabulary():
    """Load the vocabulary terms from ``fashion_vocabulary.csv``, lower-cased.

    The CSV path is built from the notebook file name resolved against the
    current working directory, so the result depends on where the kernel was
    started.

    Returns:
        A list with one lower-cased string per non-missing entry of the
        'Specifications' column, in file order.
    """
    curr_path = os.path.abspath("cumulative_vocab_match_sentences.ipynb")
    df_path = os.path.abspath(os.path.join(curr_path, "../../../..", "Read_Files/fashion_vocabulary.csv"))
    df = pd.read_csv(df_path)

    # Blank cells are read as NaN (a float), which has no .lower(); skip them
    # and coerce the rest to str so a stray numeric cell cannot crash the load.
    specifications = df['Specifications'].dropna()
    vocabulary_list = [str(v).lower() for v in specifications]

    return vocabulary_list

In [6]:
def edit_distance(words, vocabulary):
    """Print a word-vs-vocabulary distance table and the set of exact matches.

    The "distance" is a position-by-position mismatch count plus the
    difference in length (it is not a Levenshtein distance). Every extracted
    word is compared with every vocabulary entry; the resulting table is
    printed sorted by distance, followed by the set of extracted words that
    have a distance of 0 to some vocabulary entry.

    Args:
        words: Iterable of extracted words.
        vocabulary: Iterable of vocabulary entries.

    Returns:
        None. The results are only printed.
    """
    def mismatch_count(left, right):
        # Differing positions over the shared prefix length, plus one for
        # every position that exists in only one of the two strings.
        differing = sum(a != b for a, b in zip(left, right))
        return differing + abs(len(left) - len(right))

    rows = [
        [vocab, extracted_word, mismatch_count(vocab, extracted_word)]
        for extracted_word in words
        for vocab in vocabulary
    ]

    table = pd.DataFrame(rows, columns=["Vocaubulary", "Extracted word", "Edit distance"])
    table = table.sort_values(by="Edit distance")

    print(table)

    # Distance 0 means the extracted word equals a vocabulary entry exactly.
    exact_rows = table["Edit distance"] == 0
    all_matching_words = set(table.loc[exact_rows, "Extracted word"])

    print(all_matching_words)


In [7]:
# getting page content
def getPageContent(link, timeout=30):
    """Download ``link`` and return its HTML parsed with the lxml parser.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block forever on an unresponsive
            host and stall the whole notebook run.

    Returns:
        A ``BeautifulSoup`` object built from the response body.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    html_response = requests.get(link, timeout=timeout)
    html_text = html_response.text
    soup = BeautifulSoup(html_text, "lxml")
    return soup

In [8]:
# gather text from paragraphs, headings and spans of one page
def getContentFromTags(link):
    """Return the lower-cased text of a page's <p>, <h1>-<h6> and <span> tags.

    Each element's text is stripped; a non-empty text that does not already
    end with '.' gets '. ' appended so it can later be split into sentences.
    The pieces are concatenated in the order paragraphs, headings, spans.

    Args:
        link: URL of the page to read.

    Returns:
        One lower-cased string containing all collected text.
    """
    soup = getPageContent(link)

    # Same selectors, in the same order, as the three sources being merged.
    tag_groups = ("p", ["h1", "h2", "h3", "h4", "h5", "h6"], "span")

    pieces = []
    for tags in tag_groups:
        for element in soup.find_all(tags):
            text = element.text.strip()
            if text and not text.endswith('.'):
                text += '. '
            pieces.append(text)

    return "".join(pieces).lower()

In [9]:
def getTextFromAllBlogs():
    """Fetch every blog listed in ``alokeveer_analytics_data.csv`` and merge the text.

    The CSV path is built from the notebook file name resolved against the
    current working directory. Each entry of the 'Blog Link' column is passed
    to ``getContentFromTags`` and the results are concatenated in file order.

    Returns:
        A single string with the text of all blogs.
    """
    curr_path = os.path.abspath("cumulative_vocab_match_sentences.ipynb")
    df_path = os.path.abspath(os.path.join(curr_path, "../../", "alokeveer_analytics_data.csv"))
    df = pd.read_csv(df_path)

    # Rows without a link are read as NaN; str(NaN) would yield the bogus URL
    # "nan" and make the request fail, so skip them.
    links = df['Blog Link'].dropna()

    return "".join(getContentFromTags(str(link)) for link in links)

In [10]:
def _has_nearby_pair(tokens, first, last, max_gap):
    """Return True if a token starting with ``last`` has a token starting with
    ``first`` at most ``max_gap`` positions before or after it."""
    for pos, token in enumerate(tokens):
        if not token.startswith(last):
            continue
        window_start = max(0, pos - max_gap)
        window_end = min(len(tokens), pos + max_gap + 1)
        for neighbour in range(window_start, window_end):
            if neighbour != pos and tokens[neighbour].startswith(first):
                return True
    return False


def FinalSentences(max_gap=3):
    """Collect blog sentences that mention a two-word vocabulary entry.

    A sentence is selected when some token starts with the entry's second
    word and another token within ``max_gap`` positions (on either side)
    starts with the entry's first word. Every occurrence of the second word
    is examined, not only the first one, so a sentence is not missed just
    because the word also appears earlier without its partner.

    Vocabulary entries that do not consist of exactly two space-separated
    words are ignored: single words are skipped (as before) and entries with
    more than two words are not handled.

    Args:
        max_gap: Maximum token distance allowed between the two words.
            Defaults to 3.

    Returns:
        A set of the matching sentences.
    """
    vocabulary = extract_vocabulary()
    sentences = extract_sentences_from_text(getTextFromAllBlogs())

    # Only two-word entries can produce a match; a set removes duplicates.
    word_pairs = set()
    for entry in vocabulary:
        parts = entry.split(' ')
        if len(parts) == 2:
            word_pairs.add((parts[0], parts[1]))

    set_of_sentences = set()
    for sentence in sentences:
        # Tokenise once per sentence rather than once per vocabulary entry.
        tokens = sentence.split(' ')
        if any(_has_nearby_pair(tokens, first, last, max_gap) for first, last in word_pairs):
            set_of_sentences.add(sentence)

    return set_of_sentences

In [11]:
# print summary
# set output path (resolved against the current working directory)
curr_path = os.path.abspath("cumulative_vocab_match_sentences.ipynb")
print(curr_path)
output_path = os.path.abspath(os.path.join(curr_path, "../../", "Outputs/cumulative_vocab_match_sentences.txt"))

# Collect the sentences before opening the output file, so a failure while
# scraping does not leave a truncated report behind.
set_of_sentences = FinalSentences()

# write to file
# print(..., file=f) is used instead of swapping sys.stdout: if anything raised
# while stdout was redirected, later cells would print into a closed file.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Explicit UTF-8 so scraped text with non-ASCII characters does not depend on
# the platform's default encoding.
with open(output_path, "w", encoding="utf-8") as f:
    print("Vocab match (Useful sentences) on all my blogs merged", file=f)
    print(file=f)

    # Sets have no defined order; sorting makes the report reproducible.
    for ind, final_sentence in enumerate(sorted(set_of_sentences), start=1):
        print(ind, ". ", final_sentence, file=f)

e:\Projects\Curience-Work\Analytics\Alokeveer\vocab\cumulative_vocab_match_sentences.ipynb
