In [11]:
# standard imports
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt

# natural language processing
import re
import nltk
import unicodedata
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# load the table of blog links
# The CSV location is derived from this notebook's absolute path (resolved
# against the current working directory): the first ".." cancels the file name,
# the remaining two climb two directories, and the file sits in Read_Files there.
curr_path = os.path.abspath("testing_tfidf_glamour_magazine.ipynb")
df_path = os.path.abspath(
    os.path.join(
        curr_path,
        "../../..",
        "Read_Files/fashion_intern_forecasting_website_list.csv",
    )
)

# read the links and remember how many rows there are to iterate over
df = pd.read_csv(df_path)
count = len(df)

In [13]:
# build the stop-word list that get_scores hands to the vectorizer
# NOTE: this rebinds the name `stopwords`, which the import cell bound to the
# nltk.corpus.stopwords corpus reader; from here on `stopwords` is a plain list
# (the corpus reader is still reachable as nltk.corpus.stopwords).
additional_stopwords = []  # extra words to ignore on top of NLTK's English list
stopwords = [*nltk.corpus.stopwords.words('english'), *additional_stopwords]

In [14]:
# function to cleanup text
def clean_text(text):
    """
    Normalize raw text and return it as one space-separated string of lemmas.

    The text is NFKD-normalized, reduced to ASCII (characters without an ASCII
    form are dropped), lower-cased, stripped of every character that is neither
    a word character nor whitespace, split on whitespace, and each resulting
    token is passed through NLTK's WordNetLemmatizer.

    Parameters:
    text - Text to be worked with

    Returns:
    The lemmatized tokens joined by single spaces.
    """
    # unicode -> plain lower-case ascii
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    lowered = ascii_bytes.decode('utf-8', 'ignore').lower()

    # drop punctuation, then tokenize on whitespace
    tokens = re.sub(r'[^\w\s]', '', lowered).split()

    # lemmatize token by token and glue the result back together
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return " ".join(map(lemmatizer.lemmatize, tokens))

In [15]:
# get document corpus
# Download every listed page, pull the text out of its paragraph, span and
# heading tags, and store one cleaned document per page. Pages that cannot be
# fetched are skipped, and both lists are appended to together, so
# document_corpus[k] is always the text of successful_blog_list[k].
document_corpus = []
successful_blog_list = []
for i in range(count):
    # blog link
    blog_link = df["Website URL"][i]

    # getting page content
    # requests applies no timeout unless one is given, so without it a single
    # unresponsive server would stall the whole cell
    try:
        html_response = requests.get(blog_link, timeout=30)
    except requests.exceptions.RequestException as error:
        # connection failure, timeout, malformed URL, ...: report it and move
        # on instead of aborting the crawl and losing the pages already fetched
        print(f"Skipping {blog_link}: {error}")
        continue
    if html_response.status_code != 200:
        continue

    # get soup object
    html_text = html_response.text
    soup = BeautifulSoup(html_text, "lxml")

    # get all text
    all_text = [element.text.strip() for element in soup.find_all(["p", "span", "h1", "h2", "h3", "h4", "h5", "h6"])]

    document_corpus.append(clean_text(text=" ".join(all_text)))
    successful_blog_list.append(blog_link)

In [16]:
# function to get tfidf scoring
def get_scores(
    corpus=None,
    n=1,
    m=1,
    target_url="https://www.glamourmagazine.co.uk/gallery/spring-summer-2022-fashion-trends",
    top_k=25,
):
    """
    Print the highest TF-IDF scored terms of one scraped page.

    A TfidfVectorizer (using the notebook's `stopwords` list) is fitted on the
    whole corpus, then the row belonging to `target_url` is shown, sorted by
    descending score.

    Parameters:
    corpus - List of cleaned documents. Defaults to the current global
             `document_corpus`, looked up at call time so that re-running the
             scraping cell is picked up. It is assumed to be aligned with
             `successful_blog_list` (document k belongs to link k).
    n - Lower bound of the n-gram range
    m - Upper bound of the n-gram range
    target_url - Link of the page whose scores are printed
    top_k - Number of top-scoring terms to print

    Returns:
    None. The table is printed; if `target_url` is not among the successfully
    scraped links, a short notice is printed instead.
    """
    if corpus is None:
        corpus = document_corpus

    # the page may have been skipped while scraping
    if target_url not in successful_blog_list:
        print(f"{target_url} was not scraped successfully; no scores to show.")
        return

    # row of the TF-IDF matrix that belongs to the requested page
    row = successful_blog_list.index(target_url)

    vectorizer = TfidfVectorizer(stop_words=stopwords, ngram_range=(n, m))
    tfidf = vectorizer.fit_transform(corpus)

    # one column of scores, indexed by the vocabulary term
    scores = pd.DataFrame(
        tfidf[row].T.todense(),
        index=vectorizer.get_feature_names_out(),
        columns=["Score"],
    )
    scores = scores.sort_values("Score", ascending=False)
    print(scores.head(top_k))

In [17]:
# TF-IDF scores built from single words only (n-gram range 1 to 1);
# the label is printed first so the table below it is identifiable
print("1-grams:")
get_scores(n=1, m=1)

1-grams:
                 Score
courtesy      0.342646
revolve       0.231841
perfect       0.209135
dress         0.198596
eloquii       0.185473
youll         0.140060
bra           0.139423
look          0.126379
fringe        0.112486
2022          0.108325
cutout        0.106935
party         0.093449
virgile       0.092737
em            0.092737
gorgeous      0.092737
volume        0.092737
im            0.092737
intermix      0.092737
sleeve        0.092737
effortlessly  0.092737
black         0.084364
fashion       0.083045
top           0.083045
trend         0.083045
stunning      0.079268


In [18]:
# TF-IDF scores built from two-word sequences only (n-gram range 2 to 2);
# the label is printed first so the table below it is identifiable
print("2-grams:")
get_scores(n=2, m=2)

2-grams:
                      Score
dress courtesy     0.142446
youll seeing       0.094964
victor virgile     0.094964
dress eloquii      0.094964
dress made         0.094964
seeing everywhere  0.094964
effortlessly cool  0.094964
lowrise jean       0.081172
bra top            0.081172
crop top           0.071386
trend youll        0.057594
fashion trend      0.047808
top courtesy       0.047482
cute cardigan      0.047482
princess polly     0.047482
cutout pair        0.047482
better puff        0.047482
better stunning    0.047482
cut estrop         0.047482
top ha             0.047482
910 revolve        0.047482
cutout moment      0.047482
wear plain         0.047482
mini perfect       0.047482
cutout knit        0.047482


In [19]:
# TF-IDF scores built from three-word sequences only (n-gram range 3 to 3);
# the label is printed first so the table below it is identifiable
print("3-grams:")
get_scores(n=3, m=3)

3-grams:
                                  Score
trend youll seeing             0.095447
youll seeing everywhere        0.095447
fashion trend youll            0.071749
225 revolve love               0.047723
linen bralette courtesy        0.047723
gettyjeremy moeller im         0.047723
chicest trend sure             0.047723
reign 2022 hint                0.047723
estrop statementmaking cutout  0.047723
course youll also              0.047723
metallic dress perfect         0.047723
resurgence y2k fashion         0.047723
298 intermix gorgeous          0.047723
link page recommend            0.047723
function versace sequin        0.047723
whether youre searching        0.047723
sorbet colored mini            0.047723
275 intermix puffsleeve        0.047723
lioness 59 princess            0.047723
vaquera 14 fringe              0.047723
vanades frill crop             0.047723
bougie event cut               0.047723
throw oversized buttondown     0.047723
check shoe bag                 

In [20]:
# TF-IDF scores with 1-, 2- and 3-grams sharing one vocabulary (n-gram range 1 to 3);
# the label is printed first so the table below it is identifiable
print("Mixed:")
get_scores(n=1, m=3)

Mixed:
                            Score
courtesy                 0.201290
revolve                  0.136197
perfect                  0.122858
dress                    0.116667
eloquii                  0.108958
youll                    0.082279
bra                      0.081905
dress courtesy           0.081718
look                     0.074242
fringe                   0.066081
2022                     0.063636
cutout                   0.062820
party                    0.054897
sleeve                   0.054479
intermix                 0.054479
seeing everywhere        0.054479
victor virgile           0.054479
dress eloquii            0.054479
virgile                  0.054479
effortlessly             0.054479
effortlessly cool        0.054479
youll seeing everywhere  0.054479
youll seeing             0.054479
volume                   0.054479
gorgeous                 0.054479
