In [1]:
# standard imports
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt

# natural language processing
import re
import nltk
import unicodedata
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# get all links
curr_path = os.path.abspath("testing_tfidf_vogue.ipynb")
df_path = os.path.abspath(os.path.join(curr_path, "../../..", "Read_Files/fashion_intern_forecasting_website_list.csv"))
df = pd.read_csv(df_path)
count = df.shape[0]

In [3]:
# store all stopwords
additional_stopwords = []
stopwords = nltk.corpus.stopwords.words('english') + additional_stopwords

In [4]:
# function to cleanup text
def clean_text(text):
    """
    Function to clean up the passed text.\n
    All words are lemmatized afte encoding and basic regex parsing is performed.\n
    \n
    Parameters:
    text - Text to be worked with
    """

    # lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()

    # text cleaning
    text = (unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore').lower())
    words = re.sub(r'[^\w\s]', '', text).split()

    # word list
    return " ".join([wnl.lemmatize(word) for word in words])

In [5]:
# get document corpus
document_corpus = []
successful_blog_list = []
for i in range(count):
    # blog link
    blog_link = df["Website URL"][i]

    # getting page content
    html_response = requests.get(blog_link)
    if html_response.status_code != 200:
        continue

    # get soup object
    html_text = html_response.text
    soup = BeautifulSoup(html_text, "lxml")

    # get all text
    all_text = [element.text.strip() for element in soup.find_all(["p", "span", "h1", "h2", "h3", "h4", "h5", "h6"])]

    document_corpus.append(clean_text(text=" ".join(all_text)))
    successful_blog_list.append(blog_link)

In [6]:
# function to get tfidf scoring
def get_scores(corpus=document_corpus, n=1, m=1):
    vectorizer = TfidfVectorizer(stop_words=stopwords, ngram_range=(n, m))
    tfIdf = vectorizer.fit_transform(corpus)
    for blog in successful_blog_list:
        if(blog == "https://www.vogue.co.uk/fashion/gallery/spring-summer-2022-fashion-trends"):
            df = pd.DataFrame(tfIdf[0].T.todense(), index=vectorizer.get_feature_names(), columns=["Score"])
            df = df.sort_values("Score", ascending=False)
    print(df.head(25))

In [7]:
# get scores 1-grams
print("1-grams:")
get_scores(n=1, m=1)

1-grams:
                 Score
courtesy      0.342385
revolve       0.231665
perfect       0.208976
dress         0.198445
eloquii       0.185332
youll         0.139954
bra           0.139317
look          0.126283
fringe        0.112400
2022          0.108243
cutout        0.106854
party         0.104488
virgile       0.092666
effortlessly  0.092666
sleeve        0.092666
volume        0.092666
im            0.092666
gorgeous      0.092666
intermix      0.092666
em            0.092666
black         0.084300
fashion       0.082982
trend         0.082982
top           0.082982
stunning      0.079208


In [8]:
# get scores 2-grams
print("2-grams:")
get_scores(n=2, m=2)

2-grams:
                              Score
dress courtesy             0.142483
seeing everywhere          0.094989
dress eloquii              0.094989
victor virgile             0.094989
effortlessly cool          0.094989
youll seeing               0.094989
dress made                 0.094989
bra top                    0.081193
lowrise jean               0.081193
crop top                   0.071405
trend youll                0.057609
fashion trend              0.047820
sorbet colored             0.047494
summer bra                 0.047494
sign may                   0.047494
ryder fringe               0.047494
check shoe                 0.047494
metallic dress             0.047494
cosmopolitan participates  0.047494
im still                   0.047494
quick cantmiss             0.047494
50 zara                    0.047494
partyready time            0.047494
im sure                    0.047494
west 213                   0.047494


In [9]:
# get scores 3-grams
print("3-grams:")
get_scores(n=3, m=3)

3-grams:
                            Score
youll seeing everywhere  0.095447
trend youll seeing       0.095447
fashion trend youll      0.071749
looking perfect party    0.047723
versace sequin mini      0.047723
already even though      0.047723
knit dress eloquii       0.047723
fe noel 298              0.047723
also find resurgence     0.047723
courtesy lioness 59      0.047723
courtesy laquan smith    0.047723
courtesy grlfrnd 225     0.047723
courtesy rhode 395       0.047723
courtesy fe noel         0.047723
courtesy eloquii 68      0.047723
rejoice 2022 brings      0.047723
courtesy central park    0.047723
sorbet colored mini      0.047723
courtesy anine bing      0.047723
faux leather jacket      0.047723
courtesy zara 50         0.047723
nassir zadeh 11          0.047723
virgile volume better    0.047723
volume better puff       0.047723
sleeve cutout gown       0.047723


In [10]:
# get mixed scores
print("Mixed:")
get_scores(n=1, m=3)

Mixed:
                            Score
courtesy                 0.201254
revolve                  0.136173
perfect                  0.122836
dress                    0.116646
eloquii                  0.108938
youll                    0.082265
bra                      0.081891
dress courtesy           0.081704
look                     0.074229
fringe                   0.066069
2022                     0.063625
cutout                   0.062809
party                    0.061418
virgile                  0.054469
intermix                 0.054469
youll seeing             0.054469
dress made               0.054469
victor virgile           0.054469
gorgeous                 0.054469
em                       0.054469
effortlessly             0.054469
effortlessly cool        0.054469
volume                   0.054469
im                       0.054469
youll seeing everywhere  0.054469
