In [1]:
# standard imports
import os
import sys
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

# natural language processing
import re
import nltk
import unicodedata
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
# pandas display settings: remove the column, row and width limits so that
# printed DataFrames are never truncated or wrapped by a fixed width
for display_option in ("max_columns", "max_rows", "width"):
    pd.set_option("display." + display_option, None)

In [2]:
# store all stopwords
# extra stopwords appended to NLTK's English list (none at the moment)
additional_stopwords = []
# NOTE(review): this rebinds the name `stopwords`, shadowing the `stopwords`
# module imported from nltk.corpus at the top of the notebook; that is why the
# corpus is reached through `nltk.corpus.stopwords` here. After this line
# `stopwords` is the combined word list used by the cleaning functions and the
# TF-IDF vectorizer.
stopwords = nltk.corpus.stopwords.words('english') + additional_stopwords

In [3]:
# number of ngrams to consider from each blog
# (read as a global by get_top_keywords, which keeps this many of the
# highest-scoring terms per blog)
terms_to_consider = 20

In [4]:
# get all links
# absolute path built from the notebook's file name, resolved against the
# current working directory
# NOTE(review): assumes the kernel's working directory is the folder that
# contains this notebook — confirm
curr_path = os.path.abspath("blogwise_tfidf_our_list.ipynb")
# "../.." is applied to the notebook *file* path, so the first ".." only strips
# the file name; the CSV is therefore looked up one directory above the
# notebook's folder
df_path = os.path.abspath(os.path.join(curr_path, "../..", "pallav_analytics_data.csv"))
df = pd.read_csv(df_path)
# number of rows read from the CSV
count = df.shape[0]

In [5]:
# read and store all keywords in lowercase
# "../../../.." is applied to the notebook *file* path, so Read_Files is looked
# up three directories above the notebook's folder
keywords_path = os.path.abspath(os.path.join(curr_path, "../../../..", "Read_Files", "fashion_vocabulary_keywords_list.txt"))
with open(keywords_path) as file:
    keywords = [line.strip().lower() for line in file]

# Keep the original reversed order, but drop blank lines: an empty keyword is a
# substring of every sentence, so a single blank line in the vocabulary file
# would make get_useful_sentences treat every sentence as "useful".
keywords = [keyword for keyword in reversed(keywords) if keyword]

In [6]:
# function to get all page content from html response
def get_page_text(html_response):
    """Return the stripped text of every paragraph, heading and span on a page.

    Args:
        html_response: object whose ``.text`` attribute holds the page's HTML
            (in this notebook, the result of ``requests.get``).

    Returns:
        list of str: paragraph texts first, then ``h1``-``h6`` texts, then
        span texts. Text of a span nested inside a paragraph or heading
        appears twice, once for each matching element.
    """
    soup = BeautifulSoup(html_response.text, "lxml")

    def stripped_texts(tag_names):
        # text content of each matching element, without surrounding whitespace
        return [node.text.strip() for node in soup.find_all(tag_names)]

    paragraphs = stripped_texts("p")
    headings = stripped_texts(["h1", "h2", "h3", "h4", "h5", "h6"])
    spans = stripped_texts("span")

    return paragraphs + headings + spans

In [7]:
def clean_text(text):
    """Normalise a piece of text and drop stopwords.

    The text is NFKD-normalised, stripped of every non-ASCII character,
    lowercased, and reduced to letters and whitespace. The remaining words
    that are not in the module-level ``stopwords`` list are joined with
    single spaces.

    Args:
        text (str): raw text, typically one sentence.

    Returns:
        str: the cleaned text; an empty string when no word survives.
    """
    # decompose accented characters, then discard whatever is not ASCII
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_text = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore').lower()

    # remove digits and punctuation, then split on whitespace
    tokens = re.sub(r'[^a-zA-Z\s]+', '', ascii_text).split()

    kept_tokens = []
    for token in tokens:
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [8]:
def clean_text_lemm(text):
    """Normalise a piece of text, lemmatize each word and drop stopwords.

    Same cleaning as ``clean_text`` (NFKD normalisation, non-ASCII characters
    removed, lowercased, only letters and whitespace kept), but every word is
    replaced by its WordNet lemma and the stopword test is applied to the
    lemma rather than to the surface form.

    Args:
        text (str): raw text, typically one sentence.

    Returns:
        str: the surviving lemmas joined with single spaces; an empty string
        when nothing survives.
    """
    # lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()

    # text cleaning
    text = (unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore').lower())
    words = re.sub(r'[^a-zA-Z\s]+', '', text).split()

    # lemmatize each word exactly once; the lemma is used both for the
    # stopword test and for the output
    lemmas = (wnl.lemmatize(word) for word in words)
    return " ".join(lemma for lemma in lemmas if lemma not in stopwords)

In [9]:
def get_useful_sentences(clean_sentences, keyword_list=None):
    """Keep the sentences that contain at least one keyword.

    Matching is a plain substring test, so a keyword also matches inside a
    longer word.

    Args:
        clean_sentences: iterable of cleaned sentence strings.
        keyword_list: keywords to look for. Defaults to the module-level
            ``keywords`` list when omitted.

    Returns:
        list of str: the matching sentences, in their original order, each
        appearing once per occurrence in ``clean_sentences``.
    """
    if keyword_list is None:
        keyword_list = keywords

    # An empty keyword is a substring of every string and would mark every
    # sentence as useful, so empty entries are ignored.
    active_keywords = [keyword for keyword in keyword_list if keyword]

    return [
        sentence
        for sentence in clean_sentences
        if any(keyword in sentence for keyword in active_keywords)
    ]

In [10]:
def get_top_keywords(corpus, n, no_of_blogs):
    """Return the highest-scoring TF-IDF n-grams for each document.

    Args:
        corpus: list of document strings, one per blog.
        n (int): n-gram size; only n-grams of exactly this length are scored.
        no_of_blogs (int): number of leading documents of ``corpus`` to
            produce a keyword list for.

    Returns:
        list of list of str: for each of the first ``no_of_blogs`` documents,
        up to ``terms_to_consider`` (module-level setting) feature names in
        descending score order. No score threshold is applied, so a document
        with few non-zero features is padded with zero-score features in
        whatever order the sort leaves them.
    """
    # function to get tfidf scoring
    vectorizer = TfidfVectorizer(stop_words=stopwords, ngram_range=(n, n))
    tfIdf = vectorizer.fit_transform(corpus)

    # get feature names
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on old releases that lack it
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # store data
    top_keywords = []
    for i in range(no_of_blogs):
        # one row per vocabulary feature, holding this document's score for it
        df_tfidf_scores = pd.DataFrame(tfIdf[i].T.toarray(), columns=["score"])
        df_tfidf_scores["keyword"] = feature_names
        df_ranked = df_tfidf_scores.sort_values("score", ascending=False)
        top_keywords.append(df_ranked["keyword"].tolist()[:terms_to_consider])

    return top_keywords

In [11]:
# function to find the occurrences of each word
def fill_occurence_dict(word_lists):
    """Map every word to the indices of the lists it appears in.

    Args:
        word_lists: list of word lists (one list per blog in this notebook).

    Returns:
        dict: word -> list of positions in ``word_lists`` whose list contains
        the word. Each position is recorded once, in increasing order, and
        keys follow the order in which words are first encountered.
    """
    occurrences = {}

    for list_index, word_list in enumerate(word_lists):
        for word in word_list:
            seen_in = occurrences.setdefault(word, [])
            # a word repeated within the same list is only counted once
            if list_index not in seen_in:
                seen_in.append(list_index)

    return occurrences

In [12]:
# function to print the ngrams that occur in more than one blog
def print_top_occurences(dictionary):
    """Print a table of the n-grams that occur in more than one blog.

    Args:
        dictionary: mapping of n-gram -> list of blog indices, as produced by
            ``fill_occurence_dict``.

    Prints a DataFrame with columns ``ngram`` and ``occurence`` (the number of
    blogs), sorted by ``occurence`` in descending order, followed by a blank
    line. N-grams with the same count keep the order they have in
    ``dictionary``.
    """
    # Collect all rows first and build the frame once; growing a DataFrame
    # row by row with .loc re-allocates on every insert.
    rows = [(key, len(value)) for key, value in dictionary.items() if len(value) > 1]

    # list.sort is stable, also with reverse=True, so ties are printed in a
    # deterministic order
    rows.sort(key=lambda row: row[1], reverse=True)

    df_top_nouns = pd.DataFrame(rows, columns=["ngram", "occurence"])
    print(df_top_nouns)
    print()

In [13]:
# get document corpus
# Four parallel corpora are built, one document per successfully fetched blog:
# all sentences / keyword-bearing ("useful") sentences, each with and without
# lemmatization. successful_blog_list holds the matching links, index-aligned.
successful_blog_list = []
document_corpus = []
document_corpus_useful = []
document_corpus_lemm = []
document_corpus_useful_lemm = []

# seconds to wait for a blog before giving up; without a timeout requests.get
# can block indefinitely on an unresponsive server
request_timeout_seconds = 30

# iterate through all blogs
for blog_link in df["Blog Link"]:
    # getting page response; an unreachable, malformed or timed-out link is
    # skipped like a non-200 response instead of aborting the whole crawl
    try:
        html_response = requests.get(blog_link, timeout=request_timeout_seconds)
    except requests.RequestException as error:
        print(f"skipping {blog_link}: {error}")
        continue
    if html_response.status_code != 200:
        continue

    # get page content
    all_text = get_page_text(html_response)
    # separate all sentences (naive split on full stops)
    all_sentences = " ".join(all_text).split('.')
    # filter all sentences
    clean_sentences = [clean_text(sentence) for sentence in all_sentences]
    clean_sentences_lemm = [clean_text_lemm(sentence) for sentence in all_sentences]
    # get useful sentences
    useful_sentences = get_useful_sentences(clean_sentences)
    useful_sentences_lemm = get_useful_sentences(clean_sentences_lemm)

    # store data
    successful_blog_list.append(blog_link)
    document_corpus.append(" ".join(clean_sentences))
    document_corpus_useful.append(" ".join(useful_sentences))
    document_corpus_lemm.append(" ".join(clean_sentences_lemm))
    document_corpus_useful_lemm.append(" ".join(useful_sentences_lemm))

In [14]:
# get top keywords
# every corpus variant is scored for 1-, 2- and 3-grams over the same set of
# successfully fetched blogs
blog_count = len(successful_blog_list)
ngram_sizes = (1, 2, 3)

top_keywords_useful_lemm_1, top_keywords_useful_lemm_2, top_keywords_useful_lemm_3 = [
    get_top_keywords(document_corpus_useful_lemm, size, blog_count) for size in ngram_sizes
]

top_keywords_useful_1, top_keywords_useful_2, top_keywords_useful_3 = [
    get_top_keywords(document_corpus_useful, size, blog_count) for size in ngram_sizes
]

top_keywords_lemm_1, top_keywords_lemm_2, top_keywords_lemm_3 = [
    get_top_keywords(document_corpus_lemm, size, blog_count) for size in ngram_sizes
]

top_keywords_1, top_keywords_2, top_keywords_3 = [
    get_top_keywords(document_corpus, size, blog_count) for size in ngram_sizes
]

In [15]:
# get all summaries
# for every corpus variant and n-gram size, map each top keyword to the blogs
# whose top-keyword list contains it
occurence_count_useful_lemm_1, occurence_count_useful_lemm_2, occurence_count_useful_lemm_3 = [
    fill_occurence_dict(keyword_lists)
    for keyword_lists in (top_keywords_useful_lemm_1, top_keywords_useful_lemm_2, top_keywords_useful_lemm_3)
]

occurence_count_useful_1, occurence_count_useful_2, occurence_count_useful_3 = [
    fill_occurence_dict(keyword_lists)
    for keyword_lists in (top_keywords_useful_1, top_keywords_useful_2, top_keywords_useful_3)
]

occurence_count_lemm_1, occurence_count_lemm_2, occurence_count_lemm_3 = [
    fill_occurence_dict(keyword_lists)
    for keyword_lists in (top_keywords_lemm_1, top_keywords_lemm_2, top_keywords_lemm_3)
]

occurence_count_1, occurence_count_2, occurence_count_3 = [
    fill_occurence_dict(keyword_lists)
    for keyword_lists in (top_keywords_1, top_keywords_2, top_keywords_3)
]

In [22]:
# print output (lemm + useful)
# set output path
output_path = os.path.abspath(os.path.join(curr_path, "../..", "Outputs/blogwise_tfidf_our_list_useful_lemm.txt"))

# write to file
# print(..., file=f) is used instead of rebinding sys.stdout: if anything in
# the loop raises, the notebook's own output is not left pointing at a closed
# file
with open(output_path, "w") as f:
    print("BLOGWISE TOP KEYWORDS USING TF-IDF (LEMMATIZE + USEFUL):", file=f)
    print(file=f)

    for i, blog_link in enumerate(successful_blog_list):
        print(blog_link, file=f)

        # one table per blog: top 1-, 2- and 3-grams side by side
        df_output = pd.DataFrame()
        df_output["1-gram"] = top_keywords_useful_lemm_1[i]
        df_output["2-gram"] = top_keywords_useful_lemm_2[i]
        df_output["3-gram"] = top_keywords_useful_lemm_3[i]
        print(df_output, file=f)
        print(file=f)

In [23]:
# print output (useful)
# set output path
output_path = os.path.abspath(os.path.join(curr_path, "../..", "Outputs/blogwise_tfidf_our_list_useful.txt"))

# write to file
# print(..., file=f) is used instead of rebinding sys.stdout: if anything in
# the loop raises, the notebook's own output is not left pointing at a closed
# file
with open(output_path, "w") as f:
    print("BLOGWISE TOP KEYWORDS USING TF-IDF (USEFUL):", file=f)
    print(file=f)

    for i, blog_link in enumerate(successful_blog_list):
        print(blog_link, file=f)

        # one table per blog: top 1-, 2- and 3-grams side by side
        df_output = pd.DataFrame()
        df_output["1-gram"] = top_keywords_useful_1[i]
        df_output["2-gram"] = top_keywords_useful_2[i]
        df_output["3-gram"] = top_keywords_useful_3[i]
        print(df_output, file=f)
        print(file=f)

In [24]:
# print output (lemm)
# set output path
output_path = os.path.abspath(os.path.join(curr_path, "../..", "Outputs/blogwise_tfidf_our_list_lemm.txt"))

# write to file
# print(..., file=f) is used instead of rebinding sys.stdout: if anything in
# the loop raises, the notebook's own output is not left pointing at a closed
# file
with open(output_path, "w") as f:
    print("BLOGWISE TOP KEYWORDS USING TF-IDF (LEMMATIZE):", file=f)
    print(file=f)

    for i, blog_link in enumerate(successful_blog_list):
        print(blog_link, file=f)

        # one table per blog: top 1-, 2- and 3-grams side by side
        df_output = pd.DataFrame()
        df_output["1-gram"] = top_keywords_lemm_1[i]
        df_output["2-gram"] = top_keywords_lemm_2[i]
        df_output["3-gram"] = top_keywords_lemm_3[i]
        print(df_output, file=f)
        print(file=f)

In [25]:
# print output (none)
# set output path
output_path = os.path.abspath(os.path.join(curr_path, "../..", "Outputs/blogwise_tfidf_our_list.txt"))

# write to file
# print(..., file=f) is used instead of rebinding sys.stdout: if anything in
# the loop raises, the notebook's own output is not left pointing at a closed
# file
with open(output_path, "w") as f:
    print("BLOGWISE TOP KEYWORDS USING TF-IDF:", file=f)
    print(file=f)

    for i, blog_link in enumerate(successful_blog_list):
        print(blog_link, file=f)

        # one table per blog: top 1-, 2- and 3-grams side by side
        df_output = pd.DataFrame()
        df_output["1-gram"] = top_keywords_1[i]
        df_output["2-gram"] = top_keywords_2[i]
        df_output["3-gram"] = top_keywords_3[i]
        print(df_output, file=f)
        print(file=f)

In [26]:
# print summary
# set output path
output_path = os.path.abspath(os.path.join(curr_path, "../..", "Outputs/blogwise_tfidf_our_list_summary.txt"))

# one section per corpus variant: (heading, occurrence dicts for 1-, 2-, 3-grams)
summary_sections = [
    (
        "KEYWORDS OCCURING IN MULTIPLE BLOGS (LEMMATIZE + USEFUL):",
        (occurence_count_useful_lemm_1, occurence_count_useful_lemm_2, occurence_count_useful_lemm_3),
    ),
    (
        "KEYWORDS OCCURING IN MULTIPLE BLOGS (USEFUL):",
        (occurence_count_useful_1, occurence_count_useful_2, occurence_count_useful_3),
    ),
    (
        "KEYWORDS OCCURING IN MULTIPLE BLOGS (LEMMATIZE):",
        (occurence_count_lemm_1, occurence_count_lemm_2, occurence_count_lemm_3),
    ),
    (
        "KEYWORDS OCCURING IN MULTIPLE BLOGS:",
        (occurence_count_1, occurence_count_2, occurence_count_3),
    ),
]

# write to file
# print_top_occurences writes to sys.stdout, so stdout has to be redirected
# here; the finally clause guarantees it is restored even if a section raises
original_stdout = sys.stdout
with open(output_path, "w") as f:
    sys.stdout = f
    try:
        for section_index, (heading, occurence_dicts) in enumerate(summary_sections):
            # three blank lines separate consecutive sections
            if section_index > 0:
                print()
                print()
                print()

            print(heading)
            print()
            # print the ngrams shared by several blogs, per n-gram size
            for ngram_size, occurence_dict in enumerate(occurence_dicts, start=1):
                print(f"For {ngram_size}-grams:")
                print_top_occurences(occurence_dict)
    finally:
        # reset stdout
        sys.stdout = original_stdout