In [7]:
import re
import os
import nltk
import requests
import numpy as np
import unicodedata
import pandas as pd
import networkx as nx
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# URL of the article that the cells below download and summarise.
blog_link = "https://www.vogue.co.uk/fashion/gallery/spring-summer-2022-fashion-trends"

In [9]:
# Download the page. The explicit timeout keeps the cell from hanging forever
# when the server never answers (requests has no timeout by default).
html_response = requests.get(blog_link, timeout=30)
text = ""
if html_response.status_code == 200:
    # getting page content
    html_text = html_response.text
    soup = BeautifulSoup(html_text, "lxml")

    # various sources of text, concatenated in this order: paragraphs,
    # headings, then spans
    # NOTE(review): a <span> nested inside a <p> or heading is presumably
    # collected twice (once via its parent, once on its own) - confirm whether
    # that duplication is intended.
    para_text = [element.text.strip() for element in soup.find_all("p")]
    header_text = [element.text.strip() for element in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])]
    span_text = [element.text.strip() for element in soup.find_all("span")]
    all_text = para_text + header_text + span_text

    text = " ".join(all_text)
else:
    # Make the failure visible: `text` stays empty here, and the later cells
    # would otherwise run on no input without any hint as to why.
    print("Request to", blog_link, "returned status", html_response.status_code, "- text left empty")

In [10]:
# Split the scraped text into sentences. sent_tokenize already returns a flat
# list of sentence strings, so no wrapping and re-flattening is needed.
sentences = sent_tokenize(text)

In [11]:
# Load the GloVe file into a dict mapping each word to its float32 vector.
word_embeddings = {}
# The data directory is located relative to this notebook's assumed location;
# abspath resolves "test.ipynb" against the current working directory.
curr_path = os.path.abspath("test.ipynb")
file_path = os.path.abspath(os.path.join(curr_path, "../../../..", "Read_Files/glove_embeddings", "glove.6B.100d.txt"))
# `with` guarantees the file is closed even if a line fails to parse.
with open(file_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        # first token is the word, the remaining tokens are its coefficients
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [13]:
# Read the stop-word file: one entry per line, trimmed and lower-cased.
stopwords_path = os.path.abspath(
    os.path.join(curr_path, "../../../..", "Read_Files", "stopwords_cleaned.txt")
)
with open(stopwords_path) as file:
    stopwords = []
    for raw_line in file:
        stopwords.append(raw_line.strip().lower())

In [14]:
# function to normalise a sentence and strip stop words from it
def clean_sentence(text):
    """Return `text` lower-cased, ASCII-only, without punctuation or stop words.

    Steps, in order: NFKD-normalise and drop every non-ASCII character,
    lower-case, turn "/" and "-" into spaces, delete any remaining character
    that is neither a word character nor whitespace, split on whitespace and
    discard tokens found in the module-level `stopwords` collection.

    Returns the surviving tokens joined by single spaces (possibly "").
    """
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    lowered = ascii_bytes.decode('utf-8', 'ignore').lower()

    # slashes and hyphens separate words, so they become spaces rather than
    # being deleted with the rest of the punctuation
    for separator in ("/", "-"):
        lowered = lowered.replace(separator, " ")

    tokens = re.sub(r'[^\w\s]', '', lowered).split()
    kept_tokens = filter(lambda token: token not in stopwords, tokens)
    return " ".join(kept_tokens)

In [15]:
# Cleaned counterpart of every sentence, kept in the same order as `sentences`.
clean_sentences = list(map(clean_sentence, sentences))

In [16]:
# One vector per cleaned sentence: the sum of its word vectors divided by
# (token count + 0.001). Words missing from `word_embeddings` contribute a
# zero vector; an empty sentence maps to the zero vector.
sentence_vectors = []
for cleaned in clean_sentences:
    if cleaned:
        tokens = cleaned.split()
        word_vectors = [word_embeddings.get(token, np.zeros((100,))) for token in tokens]
        vector = sum(word_vectors) / (len(tokens) + 0.001)
    else:
        vector = np.zeros((100,))
    sentence_vectors.append(vector)

In [17]:
# similarity matrix: square, one row and one column per sentence, all zeros
num_sentences = len(sentences)
sim_mat = np.zeros((num_sentences, num_sentences))

In [18]:
# Fill the similarity matrix with one vectorised call instead of calling
# cosine_similarity separately for every (i, j) pair, which costs n**2
# library calls on 1x100 inputs.
if len(sentences) > 0:  # np.vstack rejects an empty list
    sim_mat[:, :] = cosine_similarity(np.vstack(sentence_vectors))
    # Self-similarity is excluded, matching the original `i != j` guard.
    np.fill_diagonal(sim_mat, 0.0)

In [19]:
# Interpret the similarity matrix as a graph's adjacency matrix and score its
# nodes with PageRank; `scores` is later looked up by sentence position.
# NOTE(review): pagerank presumably expects non-negative edge weights, while
# cosine similarities can be negative - confirm this is acceptable here.
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [20]:
# Pair every sentence with its PageRank score and order the pairs from the
# highest score to the lowest (ties fall back to comparing the sentence text).
ranked_sentences = [(scores[position], sentence) for position, sentence in enumerate(sentences)]
ranked_sentences.sort(reverse=True)

In [23]:
# Extract top 10 sentences as the summary. Slicing (instead of indexing 0..9)
# avoids an IndexError when the page yields fewer than 10 sentences.
for _score, sentence in ranked_sentences[:10]:
    print(sentence)

Crop tops and butterflies, Grecian draping and low-rise pants, body chains and skinny, completely useless scarves – it will be tough to resist the siren song of the 2000s come spring.
If you’re interested in the key dress shape for spring, then just know that fashion’s got a taste for flesh.
Nowhere was this frenetic energy more evident than at Louis Vuitton, where Nicolas Ghesquière imagined “the figure of a vampire who travels through the ages, adapting to dress codes of the era.” Think 18th-century pannier dresses with their hems chopped to reveal red satin superhero-style boots; cape-tuxedo hybrids paired with indigo jeans; gold-braid embellished waistcoats with short cargo skirts and floor-sweeping Morticia Addams capes.
Short, sheer and second-skin takes on sexy are back.
But, after a hot-vax summer during which young people threw their insecurities to the wind, embracing a daring vein of body positivity that had previously seemed like well-merchandised marketing spiel, designers