In [15]:
# standard imports
import os
import re
import sys
import requests
import unicodedata
import pandas as pd
from bs4 import BeautifulSoup

# natural language processing
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# pandas display settings: lift the column and row caps and unset the
# fixed display width so printed frames are not truncated
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", None)

In [17]:
# number of top-scoring n-grams kept per blog for each n-gram size;
# get_top_keywords truncates or pads every blog's keyword list to this length
terms_to_consider = 20

In [18]:
# read and store all stopwords in lowercase
# curr_path is the notebook file name resolved against the current working
# directory; the other data paths in this notebook are built relative to it
curr_path = os.path.abspath("blogwise_tfidf.ipynb")
stopwords_path = os.path.abspath(
    os.path.join(curr_path, "../../../..", "Read_Files", "stopwords_cleaned.txt")
)

# one stopword per line: surrounding whitespace is stripped and case folded
stopwords = []
with open(stopwords_path) as stopwords_file:
    for raw_line in stopwords_file:
        stopwords.append(raw_line.strip().lower())

In [19]:
# get all links
# the CSV is expected to provide the "Blog Link" column iterated further down
csv_path = os.path.abspath(
    os.path.join(curr_path, "../..", "pallav_analytics_data.csv")
)
df = pd.read_csv(csv_path)

# one row per blog
no_of_blogs = len(df.index)

In [20]:
# function to collect the text content of a fetched page
def get_page_text(html_response):
    """Return the paragraph, heading and span text of a page as one string.

    Args:
        html_response: object exposing the page markup as ``.text``
            (for example a ``requests`` response).

    Returns:
        The stripped text of every ``<p>`` element, followed by every
        ``<h1>``-``<h6>`` element, followed by every ``<span>`` element,
        all joined with single spaces.
    """
    soup = BeautifulSoup(html_response.text, "lxml")

    # tag groups are queried in this order: paragraphs, headings, spans
    tag_groups = ("p", ["h1", "h2", "h3", "h4", "h5", "h6"], "span")

    # NOTE(review): text of a <span> nested inside a <p> or heading is
    # collected once per matching ancestor, i.e. twice -- confirm intended
    text_chunks = []
    for tags in tag_groups:
        text_chunks.extend(element.text.strip() for element in soup.find_all(tags))

    return " ".join(text_chunks)

In [21]:
# function to lemmatize and clean page text
def clean_page_text(text):
    """Normalise raw page text into a space-separated string of lemmas.

    Steps, in order:
      1. NFKD-normalise, drop every non-ASCII character and lowercase.
      2. Turn "/" and "-" into spaces so joined words are split apart.
      3. Remove all remaining characters that are neither word characters
         nor whitespace, then split on whitespace.
      4. Lemmatize each word with WordNet and drop lemmas that appear in
         the module-level ``stopwords`` collection.

    Args:
        text: raw text of one page.

    Returns:
        The surviving lemmas joined with single spaces ("" if none remain).
    """
    # lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()

    # set membership is O(1); the module-level stopwords is a list
    stopword_set = set(stopwords)

    # text cleaning
    ascii_text = (
        unicodedata.normalize("NFKD", text)
        .encode("ascii", "ignore")
        .decode("utf-8", "ignore")
        .lower()
    )
    ascii_text = ascii_text.replace("/", " ").replace("-", " ")
    words = re.sub(r"[^\w\s]", "", ascii_text).split()

    # lemmatize every word exactly once, then filter on the lemma
    lemmas = (wnl.lemmatize(word) for word in words)
    return " ".join(lemma for lemma in lemmas if lemma not in stopword_set)

In [22]:
# function to store top scoring n-grams from each blog in corpus
def get_top_keywords(corpus, n):
    """Return the highest-scoring TF-IDF n-grams of every document.

    Args:
        corpus: list of cleaned document strings, one per blog.
        n: n-gram size; only n-grams of exactly this length are scored.

    Returns:
        One list per document, in corpus order. Each list holds that
        document's n-grams sorted by descending TF-IDF score (ties keep
        the vectorizer's feature order), truncated to the module-level
        ``terms_to_consider`` and right-padded with "" to that length.
        N-grams with a zero score, i.e. n-grams that do not occur in the
        document, are never reported.
    """
    # function to get tfidf scoring
    vectorizer = TfidfVectorizer(stop_words=stopwords, ngram_range=(n, n))
    tfidf = vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on old releases that lack it
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # store data
    top_keywords = []
    # iterate over the rows of the matrix itself so the result always
    # matches the corpus that was passed in, not a global list
    for row_index in range(tfidf.shape[0]):
        scores = pd.Series(tfidf[row_index].toarray().ravel(), index=feature_names)

        # keep only n-grams present in this document; mergesort is stable,
        # so equal scores come out in a deterministic order
        ranked = scores[scores > 0].sort_values(ascending=False, kind="mergesort")

        # ensure a fixed list length so the per-blog columns line up
        keyword_list = ranked.index[:terms_to_consider].tolist()
        keyword_list.extend([""] * (terms_to_consider - len(keyword_list)))
        top_keywords.append(keyword_list)

    return top_keywords

In [23]:
# function to record in which blogs each keyword occurs
def fill_occurence_dict(word_lists):
    """Map every keyword to the indices of the blogs that list it.

    Args:
        word_lists: one keyword list per blog; "" entries are treated as
            padding and ignored.

    Returns:
        dict mapping keyword -> list of blog indices (positions in
        ``word_lists``) in increasing order, each index at most once.
        Keys appear in order of first occurrence.
    """
    dictionary = {}

    for blog_index, word_list in enumerate(word_lists):
        for word in word_list:
            # "" only pads short keyword lists; it is not a real keyword
            if word == "":
                continue

            blog_indices = dictionary.setdefault(word, [])

            # blogs are visited in increasing order, so a repeat inside the
            # same blog can only match the most recently appended index
            if not blog_indices or blog_indices[-1] != blog_index:
                blog_indices.append(blog_index)

    return dictionary

In [24]:
# function to print keywords that occur in more than one blog
def print_top_keywords(dictionary):
    """Print a table of keywords shared by at least two blogs.

    Args:
        dictionary: mapping keyword -> list of blog indices, as returned
            by fill_occurence_dict.

    Prints the table (columns "keyword" and "occurence", most widespread
    keyword first, ties in dictionary order) followed by a blank line.
    Returns nothing.
    """
    # keep only keywords seen in more than one blog
    shared = [
        (keyword, len(blog_indices))
        for keyword, blog_indices in dictionary.items()
        if len(blog_indices) > 1
    ]

    # sort based on frequency; list.sort is stable, also with reverse=True
    shared.sort(key=lambda item: item[1], reverse=True)

    # build the frame once instead of growing it row by row
    df_top_nouns = pd.DataFrame(shared, columns=["keyword", "occurence"])

    print(df_top_nouns.to_string())
    print()

In [25]:
# get document corpus
# successful_blog_list[k] is the link whose cleaned text is document_corpus[k]
successful_blog_list = []
document_corpus = []

# seconds to wait on a server before giving up on a blog; without a timeout
# a single unresponsive host would block the notebook indefinitely
request_timeout = 30

# iterate through all blogs
for blog_link in df["Blog Link"]:
    # getting page response; unreachable or malformed links are skipped the
    # same way as non-200 responses instead of aborting the whole run
    try:
        html_response = requests.get(blog_link, timeout=request_timeout)
    except requests.RequestException as error:
        print(f"skipping {blog_link}: {error}", file=sys.stderr)
        continue
    if html_response.status_code != 200:
        continue

    # get page content
    all_text = get_page_text(html_response)

    # lemmatize and clean sentence
    clean_text = clean_page_text(all_text)

    # store data (both lists are appended together so they stay aligned)
    successful_blog_list.append(blog_link)
    document_corpus.append(clean_text)

In [26]:
# get top keywords for 1-, 2- and 3-grams (one list of keyword lists each)
top_keywords_1, top_keywords_2, top_keywords_3 = [
    get_top_keywords(document_corpus, ngram_size) for ngram_size in (1, 2, 3)
]

In [27]:
# build one keyword -> blog-indices dictionary per n-gram size
occurence_count_1, occurence_count_2, occurence_count_3 = map(
    fill_occurence_dict, (top_keywords_1, top_keywords_2, top_keywords_3)
)

In [28]:
# print all output
# set output path
output_path = os.path.abspath(os.path.join(curr_path, "../..", "Outputs/blogwise_tfidf.txt"))

# make sure the target folder exists so open() does not fail on a fresh checkout
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# write to file: print_top_keywords prints to stdout, so stdout is pointed at
# the report file for the duration of the block
original_stdout = sys.stdout
with open(output_path, "w") as f:
    sys.stdout = f
    try:
        print("KEYWORDS OCCURING IN MULTIPLE BLOGS:")
        print()
        # print summary
        print("1-gram:")
        print_top_keywords(occurence_count_1)
        print("2-gram:")
        print_top_keywords(occurence_count_2)
        print("3-gram:")
        print_top_keywords(occurence_count_3)

        print()
        print()
        print()

        print("INDIVIDUAL BLOG ANALYSIS:")
        print()

        # print blog wise result
        for i in range(len(successful_blog_list)):
            print(successful_blog_list[i])

            # create df for output: one column per n-gram size
            df_output = pd.DataFrame()
            df_output["1-gram"] = top_keywords_1[i]
            df_output["2-gram"] = top_keywords_2[i]
            df_output["3-gram"] = top_keywords_3[i]

            # remove rows in which all three cells are "" padding
            df_output = df_output.loc[df_output["1-gram"] + df_output["2-gram"] + df_output["3-gram"] != ""]

            print(df_output.to_string())
            print()
    finally:
        # reset stdout even if something above raised; otherwise every later
        # print in the kernel would target the closed report file
        sys.stdout = original_stdout