In [1]:
# standard imports
import os
import sys
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

# natural language processing
import re
import nltk
import unicodedata
from nltk.corpus import stopwords
from rake_nltk import Rake
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# pandas display settings: set the column, row and width options to None
# (pandas treats None as "no fixed limit" for these display options)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", None)

In [3]:
# Read the project's custom stopword list and store every entry in lowercase.
# The file lives in a Read_Files directory three levels above the current
# working directory (the same location the original path resolved to).
# NOTE: this rebinds the name `stopwords`, shadowing the `nltk.corpus.stopwords`
# import from the first cell.
stopwords_path = os.path.abspath(
    os.path.join("..", "..", "..", "Read_Files", "stopwords_cleaned.txt")
)
# An explicit encoding keeps the result independent of the platform default.
with open(stopwords_path, encoding="utf-8") as file:
    # Skip blank lines so the list never contains an empty-string "stopword".
    stopwords = [line.strip().lower() for line in file if line.strip()]

In [4]:
# # store all NLTK stopwords
# additional_stopwords = []
# stopwords = nltk.corpus.stopwords.words('english') + additional_stopwords

In [5]:
def clean_text(text):
    """Normalise raw text into lowercase ASCII words separated by single spaces.

    Steps, in order:
      1. NFKD-normalise and drop every character that has no ASCII form
         (accents are stripped, e.g. "é" becomes "e").
      2. Lowercase the result.
      3. Delete every character that is neither a letter nor whitespace
         (digits and punctuation are removed, not replaced by a space, so
         "under-wear" becomes "underwear").
      4. Collapse all runs of whitespace to a single space.

    Args:
        text: the input string.

    Returns:
        The cleaned string; empty if no letters remain.
    """
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    lowered = ascii_bytes.decode('utf-8', 'ignore').lower()
    letters_only = re.sub(r'[^a-zA-Z\s]+', '', lowered)
    # split() with no argument also discards leading/trailing whitespace
    return " ".join(letters_only.split())

In [6]:
# function to get all page content from html response
def get_page_text(html_response):
    """Collect the visible text of paragraphs, headings and spans from a page.

    Args:
        html_response: a response object whose ``.text`` attribute holds the
            page's HTML.

    Returns:
        A list of stripped strings: all ``<p>`` texts first, then all
        ``<h1>``-``<h6>`` texts, then all ``<span>`` texts.
    """
    soup = BeautifulSoup(html_response.text, "lxml")

    # tag selectors, in the order their text should appear in the result
    tag_groups = (
        "p",
        ["h1", "h2", "h3", "h4", "h5", "h6"],
        "span",
    )

    collected = []
    for tags in tag_groups:
        collected.extend(node.text.strip() for node in soup.find_all(tags))

    return collected

In [7]:
# page whose text is fetched and mined for keyword phrases in the cells below
url = "https://www.vogue.co.uk/fashion/gallery/spring-summer-2022-fashion-trends"

In [8]:
# getting page response; the timeout keeps a stalled connection from hanging the kernel
html_response = requests.get(url, timeout=30)
# Fail loudly on anything other than 200: otherwise all_text / final_text would be
# left undefined (or stale from an earlier run) and a later cell would break.
if html_response.status_code != 200:
    raise RuntimeError(
        f"Request to {url} failed with status code {html_response.status_code}"
    )
# get page content as one string
all_text = " ".join(get_page_text(html_response))
# normalise the text (lowercase ASCII letters only, single-spaced)
final_text = clean_text(all_text)

In [9]:
# Build the RAKE extractor, restricted to two-word phrases and without repeats.
# By default it uses the English stopwords from NLTK and all punctuation characters.
r = Rake(
    min_length=2,
    max_length=2,
    include_repeated_phrases=False,
)
# Run the extraction on the cleaned page text.
r.extract_keywords_from_text(final_text)
# Keyword phrases with their scores, ranked highest to lowest.
keywords = r.get_ranked_phrases_with_scores()
for scored_phrase in keywords:
    print(scored_phrase)

(4.0, 'yumi nu')
(4.0, 'youre interested')
(4.0, 'young designers')
(4.0, 'young designer')
(4.0, 'worn low')
(4.0, 'wiped completely')
(4.0, 'wind embracing')
(4.0, 'wild buckle')
(4.0, 'whole thing')
(4.0, 'vivienne westwood')
(4.0, 'valentino updates')
(4.0, 'unsolicited appearance')
(4.0, 'underwearasouterwear renditions')
(4.0, 'tonal shades')
(4.0, 'tiny strips')
(4.0, 'tiktok generation')
(4.0, 'tangerine tangoed')
(4.0, 'sweet release')
(4.0, 'sure weve')
(4.0, 'supriya lele')
(4.0, 'super short')
(4.0, 'structured blazer')
(4.0, 'strange anymore')
(4.0, 'spring bringing')
(4.0, 'slightest excuse')
(4.0, 'size wasnt')
(4.0, 'siren song')
(4.0, 'single strap')
(4.0, 'similarly comfortdriven')
(4.0, 'show notes')
(4.0, 'sheer tights')
(4.0, 'sexy comes')
(4.0, 'seen since')
(4.0, 'secondskin takes')
(4.0, 'say hello')
(4.0, 'saint laurent')
(4.0, 'revolving doors')
(4.0, 'relied upon')
(4.0, 'recent seasons')
(4.0, 'proud imagine')
(4.0, 'pradas collection')
(4.0, 'prabal gurung'