In [1]:
# standard imports
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt

# natural language processing
import re
import nltk
import unicodedata
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# get all links
# The notebook file name only anchors the relative path: the first ".." cancels it,
# so the CSV is looked up in <working dir>/../../Read_Files.
curr_path = os.path.abspath("testing_tfidf_vogue.ipynb")
df_path = os.path.abspath(
    os.path.join(
        curr_path,
        "../../..",
        "Read_Files/fashion_intern_forecasting_website_list.csv",
    )
)
df = pd.read_csv(df_path)

# number of rows (one per listed website)
count = len(df.index)

In [3]:
# read and store all keywords in lowercase
# The notebook file name only anchors the relative path: the first ".." cancels it,
# so the file is looked up in <working dir>/../../Read_Files.
keywords_path = os.path.abspath(os.path.join(os.path.abspath("testing_ngrams_vogue.ipynb"), "../../..", "Read_Files", "fashion_vocabulary_keywords_list.txt"))
with open(keywords_path) as file:
    stripped_lines = (line.strip() for line in file)
    # Skip blank lines: an empty keyword is a substring of every string, so it
    # would make any `keyword in sentence` test succeed for every sentence.
    keywords = [line.lower() for line in stripped_lines if line]
keywords.reverse()

In [4]:
# store all stopwords: NLTK's English list followed by any extra words added here
# NOTE: this rebinds the name `stopwords` (imported from nltk.corpus above) to a plain list.
additional_stopwords = []
stopwords = [*nltk.corpus.stopwords.words('english'), *additional_stopwords]

In [5]:
# function to cleanup text
def clean_sentence(text):
    """
    Normalise a piece of text before it is split into words.

    The text is NFKD-normalised, reduced to ASCII (characters without an
    ASCII form are dropped), lower-cased, and stripped of every character
    that is neither a word character nor whitespace.

    Parameters:
    text - Text to be worked with

    Returns:
    The cleaned text as a str
    """

    # decompose accented characters, then drop whatever cannot be encoded as ASCII
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    lowered = ascii_bytes.decode('utf-8', 'ignore').lower()

    # punctuation is deleted outright; no space is inserted in its place
    return re.sub(r'[^\w\s]', '', lowered)

In [6]:
# function to lemmatize text and drop stopwords
def lemmatize_sentence(text):
    """
    Lemmatize every word of the passed text and drop stopwords.

    The text is split on whitespace, each word is lemmatized with NLTK's
    WordNetLemmatizer, and any lemma found in the module-level `stopwords`
    list is discarded.

    Parameters:
    text - Text to be worked with

    Returns:
    The remaining lemmas joined by single spaces (an empty string if none remain)
    """

    # lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()

    # a set gives constant-time membership tests instead of scanning the list per word
    stopword_set = set(stopwords)

    # lemmatize each word exactly once (lazily), then filter on the lemma
    lemmas = (wnl.lemmatize(word) for word in text.split())

    # word list
    return " ".join(lemma for lemma in lemmas if lemma not in stopword_set)

In [7]:
# function to get tfidf scoring
def get_scores(corpus, n=1, m=1,
               target_url="https://www.vogue.co.uk/fashion/gallery/spring-summer-2022-fashion-trends",
               top_k=25):
    """
    Print the highest TF-IDF scored n-grams of one document of the corpus.

    A TfidfVectorizer is fitted on the whole corpus; the row belonging to
    `target_url` is then turned into a one-column table ("Score"), sorted in
    descending order and its first `top_k` rows are printed.

    The document row is located by looking `target_url` up in the module-level
    `successful_blog_list`, which is expected to be aligned with `corpus`
    (same order, same length).

    Parameters:
    corpus - List of documents (strings) to fit the vectorizer on
    n - Lower bound of the n-gram range
    m - Upper bound of the n-gram range
    target_url - Link of the document whose scores are printed
    top_k - Number of rows to print

    Raises:
    ValueError - If `target_url` is not present in `successful_blog_list`
    """
    # Locate the document first so a missing link fails with a clear message
    # (previously this surfaced as an UnboundLocalError at the print call).
    matches = [i for i, link in enumerate(successful_blog_list) if link == target_url]
    if not matches:
        raise ValueError("target_url not found in successful_blog_list: {}".format(target_url))
    # if the link is listed more than once, the last occurrence is used
    row = matches[-1]

    vectorizer = TfidfVectorizer(stop_words=stopwords, ngram_range=(n, m))
    tfidf = vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on older releases that do not have get_feature_names_out() yet.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # one row per n-gram, single "Score" column
    scores = pd.DataFrame(tfidf[row].T.toarray(), index=feature_names, columns=["Score"])
    scores = scores.sort_values("Score", ascending=False)
    print(scores.head(top_k))
    print()

### Get Corpus

In [8]:
# get document corpus
# requests waits indefinitely when no timeout is given, so set one explicitly (seconds)
REQUEST_TIMEOUT = 30
# tags whose text is collected from every page
TEXT_TAGS = ["p", "span", "h1", "h2", "h3", "h4", "h5", "h6"]

# The three lists are kept aligned: entry k of each belongs to the same page.
document_corpus = []
document_corpus_useful = []
successful_blog_list = []
for blog_link in df["Website URL"]:
    # getting page content; an unreachable site is reported and skipped
    # instead of aborting the whole run
    try:
        html_response = requests.get(blog_link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        print("Skipping", blog_link, "-", error)
        continue
    if html_response.status_code != 200:
        continue

    # get soup object
    soup = BeautifulSoup(html_response.text, "lxml")

    # get all text
    # NOTE(review): find_all also returns tags nested inside other matched tags
    # (e.g. a <span> inside a <p>), so their text may be collected twice - confirm intended.
    all_text = [element.text.strip() for element in soup.find_all(TEXT_TAGS)]

    # split into rough sentences on '.', then clean and lemmatize each one
    all_sentences = " ".join(all_text).split('.')
    clean_sentences = [lemmatize_sentence(clean_sentence(sentence)) for sentence in all_sentences]

    # keep the sentences that contain at least one keyword (plain substring match)
    useful_sentences = [
        sentence for sentence in clean_sentences
        if any(keyword in sentence for keyword in keywords)
    ]

    # record the page only once everything succeeded, so the lists stay aligned
    successful_blog_list.append(blog_link)
    document_corpus.append(" ".join(clean_sentences))
    document_corpus_useful.append(" ".join(useful_sentences))

In [9]:
# TF-IDF scores of 2-grams, computed over the full document corpus
print("2-grams:")
get_scores(document_corpus, n=2, m=2)

2-grams:
                      Score
alexander mcqueen  0.235508
bottega veneta     0.150978
miu miu            0.145027
louis vuitton      0.118024
dolce gabbana      0.117754
supriya lele       0.103271
christian dior     0.100652
mcqueen alexander  0.098128
tom ford           0.088518
richard quinn      0.083877
giorgio armani     0.078503
philosophy di      0.078503
lorenzo serafini   0.078503
alberta ferretti   0.078503
veneta bottega     0.078503
molly goddard      0.078503
di lorenzo         0.078503
gucci gucci        0.078503
emilia wickstead   0.073765
max mara           0.067101
saint sernin       0.067101
springsummer 2022  0.059513
fashion trend      0.059281
ludovic de         0.059012
de saint           0.059012



In [10]:
# TF-IDF scores of 2-grams, computed over the keyword-filtered ("useful") corpus
print("2-grams:")
get_scores(document_corpus_useful, n=2, m=2)

2-grams:
                      Score
alexander mcqueen  0.238815
miu miu            0.147064
bottega veneta     0.136087
louis vuitton      0.119681
supriya lele       0.104721
christian dior     0.102065
mcqueen alexander  0.099506
dolce gabbana      0.099506
tom ford           0.089761
richard quinn      0.085054
lorenzo serafini   0.079605
gucci gucci        0.079605
philosophy di      0.079605
giorgio armani     0.079605
di lorenzo         0.079605
alberta ferretti   0.079605
veneta bottega     0.079605
molly goddard      0.079605
emilia wickstead   0.074801
saint sernin       0.068043
max mara           0.068043
springsummer 2022  0.060349
ludovic de         0.059840
rejina pyo         0.059840
de saint           0.059840

