In [1]:
# standard imports
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt

# natural language processing
import re
import nltk
import unicodedata
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# get all links
curr_path = os.path.abspath("testing_tfidf_vogue.ipynb")
df_path = os.path.abspath(os.path.join(curr_path, "../../..", "Read_Files/fashion_intern_forecasting_website_list.csv"))
df = pd.read_csv(df_path)
count = df.shape[0]

In [3]:
# read and store all specifications in lowercase
specifications_path = os.path.abspath(os.path.join(os.path.abspath("testing_vocab_matching_vogue.ipynb"), "../../..", "Read_Files", "fashion_vocabulary_specifications_list.txt"))
with open(specifications_path) as file:
    specifications = [line.strip().lower() for line in file]
specifications.reverse()

In [4]:
# store all stopwords
additional_stopwords = []
stopwords = nltk.corpus.stopwords.words('english') + additional_stopwords

### Getting Page Content

In [5]:
# getting page content
html_response = requests.get("https://www.vogue.co.uk/fashion/gallery/spring-summer-2022-fashion-trends")
html_text = html_response.text
soup = BeautifulSoup(html_text, "lxml")

In [6]:
# various sources of text
para_text = [element.text.strip() for element in soup.find_all("p")]
header_text = [element.text.strip() for element in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])]
span_text = [element.text.strip() for element in soup.find_all("span")]
all_text = para_text + header_text + span_text

In [7]:
useful_sentences = []
all_sentences = " ".join(all_text).split('.')
for sentence in all_sentences:
    for specification in specifications:
        if specification in sentence:
            useful_sentences.append(sentence)
            break
useful_sentences

[' One can only imagine how Roland would react to the acres of flesh on show this past September, as dictated by the dominant spring/summer 2022 fashion trend',
 ' To put it in terms that Cardi B would appreciate, when it comes to the new season, fashion is feeling all “macaroni in a pot”',
 ' Short, sheer and second-skin takes on sexy are back',
 ' But, after a hot-vax summer during which young people threw their insecurities to the wind, embracing a daring vein of body positivity that had previously seemed like well-merchandised marketing spiel, designers found new ways to interpret sex',
 ' Young guns Nensi Dojaka and Supriya Lele have made this hedonistic approach their calling card, and their sheer, body-wrapping designs made headlines in London',
 ' Ludovic de Saint Sernin, the young designer with a small but devoted following, showed a collection made up of tiny strips of leather braided together',
 ' It was sponsored by Pornhub; a collaboration with the porn site is due in earl

In [8]:
useful_sentences_score = []
useful_sentences_dictionary = dict()
for sentence in useful_sentences:
    score = 0
    for specification in specifications:
        if specification in sentence:
            score = score + len(specification)
    useful_sentences_dictionary[sentence] = score
useful_sentences_dictionary = sorted(useful_sentences_dictionary.items(), key=lambda x:x[1], reverse=True)
useful_sentences_dictionary

[('” Think 18th-century pannier dresses with their hems chopped to reveal red satin superhero-style boots; cape-tuxedo hybrids paired with indigo jeans; gold-braid embellished waistcoats with short cargo skirts and floor-sweeping Morticia Addams capes',
  52),
 (' Our favourites: JW Anderson’s mini dresses, Louis Vuitton’s going-out tops (paired with deep-blue denim) and Gucci’s feathered coat',
  35),
 (' Crop tops and butterflies, Grecian draping and low-rise pants, body chains and skinny, completely useless scarves – it will be tough to resist the siren song of the 2000s come spring',
  32),
 (' Here’s one for the minimalists: a shock of Colgate white, worn head-to-toe, felt particularly fresh for spring/summer 2022',
  25),
 (' Take Valentino, where chocolate brown, raspberry and cerulean blue mingled to delicious effect',
  24),
 (' For as Tom Ford, quoting Diana Vreeland, succinctly put it in his show notes: “I know it’s a lot but is it enough?” Want the confidence-inducing cut t