In [37]:
# custom imports
import os
import sys

# import module 
import requests 
import pandas as pd 
from bs4 import BeautifulSoup 
import matplotlib.pyplot as plt 
from collections import defaultdict
import numpy as np

In [38]:
# natural language processing: n-gram ranking
import re
import unicodedata
import nltk
from nltk.corpus import stopwords

In [39]:
# pandas display settings
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = None

In [40]:
# read and store all keywords in lowercase
stopwords_path = os.path.abspath(os.path.join(os.path.abspath("testing_vocab_match.ipynb"), "../../../Read_Files", "stopwords_cleaned.txt"))
with open(stopwords_path) as file:
    stopwords = [line.strip().lower() for line in file]

In [41]:
def extract_words_from_text(text):

    words = [w for w in text.split(' ')]
    return words

In [42]:
def extract_vocabulary():
    curr_path = os.path.abspath("testing_vocab_match_glamour_magazine.ipynb")
    df_path = os.path.abspath(os.path.join(curr_path, "../../..", "Read_Files/fashion_vocabulary.csv"))
    df = pd.read_csv(df_path)

    vocabulary_list = [v.lower() for v in df['Specifications']]

    return vocabulary_list

In [43]:
def extract_nouns(vocabulary_list):
    nouns = []
    for v in vocabulary_list:
        last_word = [w for w in v.split(' ')][-1]
        nouns.append(last_word)
    return nouns

In [44]:
# getting page content
html_response = requests.get("https://www.vogue.co.uk/fashion/gallery/spring-summer-2022-fashion-trends")
html_text = html_response.text
soup = BeautifulSoup(html_text, "lxml")

In [45]:
# function to cleanup text
def clean_text(text):
    """
    Function to clean up the passed text.\n
    All words are lemmatized afte encoding and basic regex parsing is performed.\n
    \n
    Parameters:
    text - Text to be worked with
    """

    # text cleaning
    text = (unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore').lower())
    words = re.sub(r'[^\w\s]', '', text).split()

    # word list
    return [word for word in words if word not in stopwords]

In [46]:
# various sources of text
para_texts = [element.text.strip() for element in soup.find_all("p")]
for p in range(len(para_texts)):
    if para_texts[p] == "":
        continue
    if para_texts[p][-1] != '.':
        para_texts[p] += '. '

header_texts = [element.text.strip() for element in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])]
for p in range(len(header_texts)):
    if header_texts[p] == "":
        continue
    if header_texts[p][-1] != '.':
        header_texts[p] += '. '

span_texts = [element.text.strip() for element in soup.find_all("span")]
for p in range(len(span_texts)):
    if span_texts[p] == "":
        continue
    if span_texts[p][-1] != '.':
        span_texts[p] += '. '

all_texts = para_texts + header_texts + span_texts
all_texts_string = ""
for v in all_texts:
    all_texts_string += v

all_texts_string = all_texts_string.lower()

print(all_texts_string)

follow us. by ellie pithers. “is not the most erotic portion of the body where the garment gapes?” posited french theorist roland barthes. one can only imagine how roland would react to the acres of flesh on show this past september, as dictated by the dominant spring/summer 2022 fashion trend. to put it in terms that cardi b would appreciate, when it comes to the new season, fashion is feeling all “macaroni in a pot”. short, sheer and second-skin takes on sexy are back.it’s been a while. sure, we’ve had underwear-as-outerwear renditions in recent seasons, with the odd pair of stripper heels making an unsolicited appearance. but, after a hot-vax summer during which young people threw their insecurities to the wind, embracing a daring vein of body positivity that had previously seemed like well-merchandised marketing spiel, designers found new ways to interpret sex.frequently, the results were see-through. young guns nensi dojaka and supriya lele have made this hedonistic approach their

In [47]:
vocabulary = extract_vocabulary()
print(vocabulary)
nouns = extract_nouns(vocabulary)
print(nouns)
# words = extract_words_from_text(all_texts_string)
# print(words)
words = clean_text(all_texts_string)
print(words)

['three quarter length sleeve', 'unlined bottom', 'lined top', 'fit and flare', 'deep v neck', 'off the shoulder', 'peter pan collar', 'recycled cotton blends', 'organic cotton blends', 'recycled polyester blends', 'paper bag waist', 'contrast faux fur', 'black and white', 'blue and white', 'red and white', 'all over print', 'pop art print', 'roll up sleeve', 'non removable padding', 'soft & lightweight', '3 piece set', '2 piece set', 'track shorts', 'biker shorts', 'women tops', 'biker shorts', 'wide leg', 'flare leg', 'straight leg', 'cargo pants', 'track shorts', 'a line', 'mom fit', 'boyfriend fit', 'basic tops', 'pea coat', 'zip up', 'half placket', 'wide strap', 'push up', 'trench coat', 'faux fur', 'round neck', 'scoop neck', 'stand collar', 'spaghetti strap', 'one shoulder', 'v neck', 'high neck', 'baseball collar', 'square neck', 'asymmetrical neck', 'keyhole neckline', 'tie neck', 'cold shoulder', 'cowl neck', 'funnel neck', 'shawl collar', 'boat neck', 'sailor collar', 'mand

In [48]:
def sort_dict_by_value(d, reverse = False):
  return dict(sorted(d.items(), key = lambda x: x[1], reverse = reverse))

In [49]:
prefix_dict = defaultdict(dict)
suffix_dict = defaultdict(dict)

for noun in nouns:
    for i in range(len(words)):
        if words[i].startswith(noun) == True:
            if i-1 >= 0:
                if words[i-1] in prefix_dict[words[i]].keys():
                    prefix_dict[words[i]][words[i-1]] += 1
                else:
                    prefix_dict[words[i]][words[i-1]] = 1
            if i+1 < len(words):
                if words[i+1] in suffix_dict[words[i]].keys():
                    suffix_dict[words[i]][words[i+1]] += 1
                else:
                    suffix_dict[words[i]][words[i+1]] = 1

for k, v in prefix_dict.items():
    print(k)
    print(sort_dict_by_value(v, True))
    print()

for k, v in suffix_dict.items():
    print(k)
    print(sort_dict_by_value(v, True))
    print()

tops
{'goingout': 4, 'pastcrop': 4}

shoulders
{'puff': 4}

waistcoats
{'embellished': 6}

white
{'headtotoe': 8, 'colgate': 4, 'crisp': 4, 'updates': 4}

set
{'staple': 2}

pants
{'cotton': 3, 'lowrise': 3, 'cargo': 3}

lines
{'libertine': 1, 'armani': 1, '74183': 1}

fit
{'oversized': 4}

fits
{'sinuous': 4}

coat
{'feathered': 3}

updates
{'valentino': 3}

strap
{'single': 2}

leather
{'strips': 1, 'funky': 1}

lace
{'transparent': 3}

cottons
{'white': 2}

cotton
{'pair': 2}

mesh
{'sparkling': 3}

hemlines
{'micro': 4, 'aboutjust': 2}

hems
{'mullet': 2, 'dresses': 2}

outre
{'torch': 1}

backits
{'sexy': 2}

sequins
{'town': 2}

thighslits
{'embraced': 1}

bustiers
{'latex': 1}

lowrise
{'draping': 1, 'etro': 1}

stripes
{'selfassured': 3, 'fringed': 3}

brown
{'chocolate': 5}

browne
{'thom': 10}

blue
{'cerulean': 8}

pink
{'enjoyment': 6}

red
{'reveal': 2, 'guccis': 2}

reduced
{'pho': 2}

flying
{'flutes': 2}

comfortable
{'invest': 1}

comfortdriven
{'premise': 1}

skirts
{

In [50]:
all_keywords = list(set(prefix_dict.keys()).union(set(suffix_dict.keys())))
all_keywords.sort() 
print(all_keywords)
df = pd.DataFrame(columns=['Prefix3', 'Prefix2', 'Prefix1', 'Keyword', 'Suffix1', 'Suffix2', 'Suffix3'])

for keyword in all_keywords:
    current_row = ["-", "-", "-", keyword, "-", "-", "-"]
    j = 2
    for (k, v) in sort_dict_by_value(prefix_dict[keyword]).items():
        if j == -1:
            break
        current_row[j] = f"({k}, {v})"
        j -= 1    

    j = 4
    for (k, v) in sort_dict_by_value(suffix_dict[keyword]).items():
        if j == 7:
            break
        current_row[j] = f"({k}, {v})"
        j += 1     

    df.loc[len(df.index)] = current_row

['allure', 'backits', 'baggy', 'biker', 'blazer', 'blue', 'braided', 'brown', 'browne', 'bustiers', 'butterfly', 'caped', 'capes', 'capesbut', 'capetuxedo', 'card', 'cardi', 'cargo', 'carolina', 'carrying', 'cary', 'chainmail', 'chains', 'champagne', 'coat', 'comfortable', 'comfortdriven', 'corsetry', 'cotton', 'cottons', 'crop', 'cropped', 'denim', 'dress', 'dresses', 'dressmaking', 'elegant', 'fall', 'feathered', 'figure', 'fit', 'fits', 'flying', 'fringe', 'fringed', 'goldbraid', 'hemlines', 'hems', 'lace', 'lapelfree', 'lapels', 'leather', 'lines', 'loosening', 'lowrise', 'maximilian', 'maximum', 'mermaids', 'mesh', 'mini', 'minimalists', 'minis', 'no21', 'note', 'notes', 'outre', 'oversized', 'pants', 'pink', 'pleated', 'red', 'reduced', 'satin', 'sequins', 'set', 'sexy', 'sheer', 'short', 'shoulders', 'skinny', 'skirt', 'skirts', 'spring', 'springs', 'springsummer', 'straightforward', 'strap', 'stripes', 'summer', 'thighslits', 'tops', 'updates', 'waistcoats', 'warm', 'white', 'w

In [51]:
# print summary
# set output path
curr_path = os.path.abspath("testing_vocab_match.ipynb")
print(curr_path)
output_path = os.path.abspath(os.path.join(curr_path, "../", "Outputs/Glamour_Magazine_vocab_match_prefix_suffix.txt"))

# write to file
original_stdout = sys.stdout
with open(output_path, "w") as f:
    sys.stdout = f

    print('Fashion vocabulary with top 3 prefices and suffices')

    print(df.to_string())

    # print('WORDS with top 3 prefices and seuffices')

    # for(k, v) in prefix_dict.items():
    #     print("Word:", k)
    #     print()
    #     print("Top 3 prefices:")
    #     j = 1
    #     for (k1, v1) in sort_dict_by_value(v).items():
    #         if j > 3:
    #             break
    #         else:
    #             print(k1, v1)
    #     print()
    #     print("Top 3 suffices:")
    #     j = 1
    #     for (k1, v1) in sort_dict_by_value(suffix_dict[k]).items():
    #         if j > 3:
    #             break
    #         else:
    #             print(k1, v1)
    #     print()

    # reset stdout
    sys.stdout = original_stdout

e:\Projects\Curience-Work\Analytics\Alokeveer\testing_vocab_match.ipynb


In [52]:
pd.__version__

'1.3.5'