In [14]:
# standard library imports
import os
import sys

# third-party libraries
import requests 
import pandas as pd 
from bs4 import BeautifulSoup 
import matplotlib.pyplot as plt

In [15]:
# natural language processing: n-gram ranking
import re
import unicodedata
import nltk
from nltk.corpus import stopwords

In [16]:
# English stop words shipped with NLTK, extended with any project-specific extras.
# NOTE: this rebinds the name `stopwords` (imported above from nltk.corpus) to a
# plain list, which is why the corpus is reached through its fully-qualified name.
additional_stopwords = []
stopwords = [*nltk.corpus.stopwords.words('english'), *additional_stopwords]

In [17]:
# function to cleanup text and find words in it
def extract_words_from_text(text):
    """
    Clean up the passed text and return its lemmatized, non-stop-word tokens.

    The text is Unicode-normalized (NFKD), reduced to ASCII, lower-cased and
    stripped of every character that is neither a word character nor
    whitespace. It is then split on whitespace, tokens present in the
    module-level ``stopwords`` list are discarded, and the remaining tokens are
    lemmatized with NLTK's WordNet lemmatizer.

    Parameters:
    text - Text to be worked with

    Returns:
    List of lemmatized words in order of appearance (duplicates are kept)
    """

    # lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()

    # a set gives constant-time membership tests; the stop-word list would
    # otherwise be scanned once per token
    stopword_set = set(stopwords)

    # text cleaning: NFKD splits accented characters into base + combining mark,
    # and the ASCII round-trip then drops everything that is not plain ASCII
    ascii_text = (
        unicodedata.normalize('NFKD', text)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        .lower()
    )
    # NOTE(review): punctuation is removed before the stop-word check, so a
    # contraction such as "don't" is compared as "dont" - confirm the stop-word
    # list covers the forms that matter.
    words = re.sub(r'[^\w\s]', '', ascii_text).split()

    # word list
    return [wnl.lemmatize(word) for word in words if word not in stopword_set]

In [18]:
def extract_vocabulary(csv_path=None):
    """
    Load the fashion vocabulary from a CSV file.

    Parameters:
    csv_path - Optional path to the vocabulary CSV. When omitted, the file
               'Read_Files/fashion_vocabulary.csv' located two directories
               above the current working directory is read.

    Returns:
    List with the non-missing values of the 'Specifications' column, in file order
    """
    if csv_path is None:
        # Default location: <cwd>/../../Read_Files/fashion_vocabulary.csv.
        # Resolving from the working directory directly avoids having to embed
        # a notebook filename in the path only to cancel it out with "..".
        csv_path = os.path.abspath(
            os.path.join(os.getcwd(), "..", "..", "Read_Files", "fashion_vocabulary.csv")
        )

    df = pd.read_csv(csv_path)

    # Blank cells are read as NaN (a float); drop them so that consumers which
    # treat every entry as a string (e.g. by calling len() on it) do not fail.
    return df['Specifications'].dropna().tolist()

In [23]:
def _positional_mismatch(first, second):
    """Count the positions at which two strings differ; positions beyond the end of the shorter string count as differences."""
    differing = sum(1 for a, b in zip(first, second) if a != b)
    return differing + abs(len(first) - len(second))


def edit_distance(words, vocabulary):
    """
    Compare every extracted word with every vocabulary entry and print the results.

    Despite the name, the score is NOT a Levenshtein edit distance: characters
    are compared position by position, and each position that differs or that
    exists in only one of the two strings adds one to the score. A score of 0
    therefore means the two strings are identical.

    Two things are printed: the full comparison table sorted by score, and the
    set of extracted words that have a score of 0 against some vocabulary entry.

    Parameters:
    words - Iterable of extracted words (strings)
    vocabulary - Iterable of vocabulary entries (strings)

    Returns:
    None - results are printed only
    """
    # One row per (vocabulary entry, extracted word) pair, words in the outer loop.
    rows = [
        [vocab, extracted_word, _positional_mismatch(vocab, extracted_word)]
        for extracted_word in words
        for vocab in vocabulary
    ]

    df = pd.DataFrame(rows, columns=["Vocaubulary", "Extracted word", "Edit distance"])
    df.sort_values(by="Edit distance", inplace=True)

    print(df)

    # Vectorized filter: iterating row by row with iterrows() is needlessly slow
    # on a table that holds len(words) * len(vocabulary) rows.
    exact_matches = df.loc[df["Edit distance"] == 0, "Extracted word"]
    all_matching_words = set(exact_matches)

    print(all_matching_words)


In [20]:
# getting page content
PAGE_URL = "https://www.glamourmagazine.co.uk/gallery/spring-summer-2022-fashion-trends"
REQUEST_TIMEOUT_SECONDS = 30

# requests applies no timeout by default, so without one a stalled server
# would leave this cell hanging indefinitely
html_response = requests.get(PAGE_URL, timeout=REQUEST_TIMEOUT_SECONDS)
# fail loudly on an HTTP error instead of parsing an error page as the article
html_response.raise_for_status()
html_text = html_response.text
soup = BeautifulSoup(html_text, "lxml")

In [21]:
# Gather the stripped text of paragraphs, headings and spans from the parsed page.
heading_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
para_text = [paragraph.get_text().strip() for paragraph in soup.find_all("p")]
header_text = [heading.get_text().strip() for heading in soup.find_all(heading_tags)]
span_text = [span.get_text().strip() for span in soup.find_all("span")]
# paragraphs first, then headings, then spans
all_text = [*para_text, *header_text, *span_text]

In [24]:
# Join the scraped fragments into one string, tokenize it, load the vocabulary
# and print how the extracted words compare with it.
scraped_text = " ".join(all_text)
words = extract_words_from_text(scraped_text)
vocabulary = extract_vocabulary()
edit_distance(words, vocabulary)

                        Vocaubulary Extracted word  Edit distance
213926                       spring         spring              0
335098                       summer         summer              0
29265                        button         button              0
167139                      pleated        pleated              0
163180                        plain          plain              0
...                             ...            ...            ...
507507  three quarter length sleeve        fashion             27
124722  three quarter length sleeve      dominated             27
232206  three quarter length sleeve         length             27
479622  three quarter length sleeve            hit             27
0       three quarter length sleeve        product             27

[547560 rows x 3 columns]
{'black', 'long', 'top', 'skirt', 'sheer', 'summer', 'cropped', 'midi', 'pleated', 'lace', 'gold', 'sexy', 'silver', 'yes', 'spring', 'straight', 'metallic', 'beige', 'mini', 'croch