In [1]:
# standard imports
import os
import sys
import requests
from bs4 import BeautifulSoup
import pandas as pd

# natural language processing: n-gram ranking
import re
import nltk
import unicodedata
from nltk.corpus import stopwords

In [None]:
# pandas display settings: set the column, row and width display options to None
for option_name in ("display.max_columns", "display.max_rows", "display.width"):
    pd.set_option(option_name, None)

In [2]:
# store all stopwords
# extra words to drop on top of the NLTK english list
additional_stopwords = []

# the corpus is addressed through `nltk.corpus` on purpose: the assignment at
# the bottom rebinds the imported name `stopwords` to a plain list, so the
# bare name no longer refers to the corpus reader once this cell has run
try:
    english_stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    # the stopwords corpus is not installed on this machine: fetch it once, then retry
    nltk.download('stopwords')
    english_stopwords = nltk.corpus.stopwords.words('english')

stopwords = english_stopwords + additional_stopwords

In [3]:
# important variables
# absolute path of the notebook file name, resolved against the current working directory
curr_path = os.path.abspath("nouns_from_ngrams_intern_list.ipynb")

# csv holding the blog links (read through its "Website URL" column further down)
csv_relative_path = os.path.join(curr_path, "../../../..", "Read_Files/fashion_intern_forecasting_website_list.csv")
csv_path = os.path.abspath(csv_relative_path)
df = pd.read_csv(csv_path)
no_of_blogs = len(df.index)

# how many of the most frequent ngrams are kept for each blog
terms_to_consider = 15

# containers for the results; the _i suffix stands for i-gram data
successful_url_list = []
noun_lists_1, noun_lists_2, noun_lists_3 = [], [], []
occurence_count_1, occurence_count_2, occurence_count_3 = {}, {}, {}

In [4]:
# function to collect the text of a fetched page
def get_page_text(html_response):
    """Return the stripped text of every paragraph, heading and span of a page.

    Parameters
    ----------
    html_response : object exposing the page markup through its ``text`` attribute.

    Returns
    -------
    list of str
        Paragraph texts first, then heading texts (h1-h6), then span texts.
    """
    parsed_page = BeautifulSoup(html_response.text, "lxml")

    # NOTE(review): a <span> nested inside a <p> is presumably collected twice
    # (once through its paragraph, once on its own) - confirm this is intended
    heading_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
    collected = []
    for selector in ("p", heading_tags, "span"):
        collected.extend(node.text.strip() for node in parsed_page.find_all(selector))

    return collected

In [5]:
# function to clean up text and split it into words
def extract_words_from_text(text):
    """Return the words of ``text`` that are not stopwords.

    The text is NFKD-normalised and every character that cannot be encoded as
    ASCII is dropped; then every character that is neither a word character nor
    whitespace is removed before splitting on whitespace. A word is discarded
    when its lowercase form is found in the module-level ``stopwords``; the
    words that remain keep their original casing.
    """
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')
    without_punctuation = re.sub(r'[^\w\s]', '', ascii_text)

    kept_words = []
    for token in without_punctuation.split():
        if token.lower() in stopwords:
            continue
        kept_words.append(token)
    return kept_words

In [6]:
# function to find the most frequent ngrams of a word list
def find_ngrams(words, n):
    """Return the most frequent ``n``-grams of ``words`` as space-joined strings.

    The n-grams are counted with ``value_counts`` and only the first
    ``terms_to_consider`` entries (a module-level setting) of that ranking are
    returned, most frequent first.
    """
    frequencies = pd.Series(nltk.ngrams(words, n)).value_counts()
    top_terms = frequencies[:terms_to_consider]
    return [" ".join(term) for term in top_terms.index]

In [7]:
# function to return ngrams having capital letters
def find_nouns(ngrams):
    """Return the n-grams that contain at least one uppercase letter.

    Capitalisation is used as a cheap proxy for proper nouns. The check looks
    for an actual uppercase character instead of relying on
    ``not ngram.islower()``: ``str.islower`` is also False for strings without
    any cased character, which let purely numeric terms such as "2021" through.

    Parameters
    ----------
    ngrams : iterable of str

    Returns
    -------
    list of str
        The matching n-grams, in their original order.
    """
    return [ngram for ngram in ngrams if any(char.isupper() for char in ngram)]

In [8]:
# function to find in which lists each noun occurs
def fill_occurence_dict(noun_lists):
    """Map every noun to the positions of the lists it appears in.

    Parameters
    ----------
    noun_lists : sequence of lists of nouns.

    Returns
    -------
    dict
        ``{noun: [position, ...]}`` where each position is the index of a list
        in ``noun_lists`` containing the noun; a position is recorded once even
        if the noun is repeated inside that list.
    """
    occurrences = {}
    for position, nouns in enumerate(noun_lists):
        for term in nouns:
            seen_in = occurrences.setdefault(term, [])
            if position not in seen_in:
                seen_in.append(position)
    return occurrences

In [14]:
# function to print frequently occurring nouns
def print_top_nouns(dictionary):
    """Print a table of the n-grams that occur in more than one blog.

    Parameters
    ----------
    dictionary : dict
        Maps an n-gram to the collection of blogs it was found in; only the
        size of that collection is used.

    The table has the columns "ngram" and "occurence", is ordered by
    decreasing occurrence and is followed by one blank line.
    """
    # build all rows first and create the frame once instead of growing it row by row
    records = [
        (ngram, len(blogs))
        for ngram, blogs in dictionary.items()
        if len(blogs) > 1
    ]
    df_top_nouns = (
        pd.DataFrame(records, columns=["ngram", "occurence"])
        # stable sort: n-grams with the same count keep the dictionary order
        .sort_values("occurence", ascending=False, kind="mergesort")
        .reset_index(drop=True)
    )
    # to_string() renders every row, whatever the pandas display options are,
    # so a long table is never abbreviated in the report
    print(df_top_nouns.to_string())
    print()

In [15]:
# function to print the top results blog by blog
def print_blog_nouns(noun_lists):
    """Print each successfully fetched url followed by its nouns.

    The urls come from the module-level ``successful_url_list``;
    ``noun_lists[i]`` holds the nouns printed under the i-th url, one per line.
    A blank line closes every blog.
    """
    for position, url in enumerate(successful_url_list):
        print(url)
        for term in noun_lists[position]:
            print(term)
        print()

In [11]:
# iterate through all blogs
# seconds to wait for a blog before giving up on it
request_timeout = 30

# start from empty result lists so that re-running this cell does not store every blog twice
successful_url_list.clear()
noun_lists_1.clear()
noun_lists_2.clear()
noun_lists_3.clear()

for blog_link in df["Website URL"]:
    # getting page response; a blog that cannot be fetched is skipped instead of
    # aborting the whole run (unreachable host, timeout, malformed url, ...)
    try:
        html_response = requests.get(blog_link, timeout=request_timeout)
    except requests.exceptions.RequestException as error:
        print("skipping {}: {}".format(blog_link, error), file=sys.stderr)
        continue
    if html_response.status_code != 200:
        print("skipping {}: HTTP status {}".format(blog_link, html_response.status_code), file=sys.stderr)
        continue

    # get page content
    all_text = get_page_text(html_response)
    # get words list
    words = extract_words_from_text(" ".join(all_text))
    # get top ngrams for each blog
    ngrams_1 = find_ngrams(words, 1)
    ngrams_2 = find_ngrams(words, 2)
    ngrams_3 = find_ngrams(words, 3)
    # get nouns from ngrams
    nouns_1 = find_nouns(ngrams_1)
    nouns_2 = find_nouns(ngrams_2)
    nouns_3 = find_nouns(ngrams_3)

    # store data; the four lists stay aligned: index i always refers to the same blog
    successful_url_list.append(blog_link)
    noun_lists_1.append(nouns_1)
    noun_lists_2.append(nouns_2)
    noun_lists_3.append(nouns_3)

In [12]:
# build the occurrence dictionary of every ngram size (1-grams, 2-grams, 3-grams)
occurence_count_1, occurence_count_2, occurence_count_3 = [
    fill_occurence_dict(noun_lists)
    for noun_lists in (noun_lists_1, noun_lists_2, noun_lists_3)
]

In [16]:
# print all output
# set output path
output_path = os.path.abspath(os.path.join(curr_path, "../..", "Outputs/nouns_from_ngrams_intern_list.txt"))

# write to file: the print helpers write to sys.stdout, so stdout is pointed at
# the report file while they run
original_stdout = sys.stdout
with open(output_path, "w", encoding="utf-8") as f:
    sys.stdout = f
    try:
        print("NGRAMS OCCURING IN MULTIPLE BLOGS:")
        print()
        # print most frequent nouns
        print("For 1-grams:")
        print_top_nouns(occurence_count_1)
        print("For 2-grams:")
        print_top_nouns(occurence_count_2)
        print("For 3-grams:")
        print_top_nouns(occurence_count_3)

        print()
        print()
        print()

        print("INDIVIDUAL BLOG ANALYSIS:")
        print()
        # print most frequent nouns per blog
        print("For 1-grams:")
        print()
        print_blog_nouns(noun_lists_1)
        print()
        print()
        print("For 2-grams:")
        print()
        print_blog_nouns(noun_lists_2)
        print()
        print()
        print("For 3-grams:")
        print()
        print_blog_nouns(noun_lists_3)
    finally:
        # reset stdout even when one of the helpers raises; otherwise every
        # later cell would keep printing into the already closed report file
        sys.stdout = original_stdout