In [1]:
import bs4 as bs
import urllib.request
import spacy

In [22]:
# read in urls and separate into paragraphs

def url_to_paragraph(url_):

    url_obj = urllib.request.urlopen(url_)

    page_data = bs.BeautifulSoup(url_obj, 'lxml')

    paragraphs = page_data.find_all('p')

    paragraph_list = []

    for p in paragraphs:
        paragraph_list.append(p.text)
    
    return paragraph_list

# URLs
jimi = 'https://en.wikipedia.org/wiki/Jimi_Hendrix'
jerry = 'https://en.wikipedia.org/wiki/Jerry_Garcia'
eric = 'https://en.wikipedia.org/wiki/Eric_Andr%C3%A9'



In [23]:
# Get intro paragraph for each
jimi_p = url_to_paragraph(jimi)[1]
jerry_p = url_to_paragraph(jerry)[1]
eric_p = url_to_paragraph(eric)[1]

all_p = jimi_p,jerry_p,eric_p

print(jimi_p)
print(jerry_p)
print(eric_p)

James Marshall "Jimi" Hendrix (born Johnny Allen Hendrix; November 27, 1942 – September 18, 1970) was an American musician, singer, and songwriter. Although his mainstream career spanned only four years, he is widely regarded as one of the most influential electric guitarists in the history of popular music, and one of the most celebrated musicians of the 20th century. The Rock and Roll Hall of Fame describes him as "arguably the greatest instrumentalist in the history of rock music".[1]

Jerome John Garcia (August 1, 1942 – August 9, 1995)[1][2][3][4] was an American guitarist, singer and songwriter, best known for being a principal songwriter, the lead guitarist and a vocalist with the rock band the Grateful Dead, of which he was a founding member and which came to prominence during the counterculture of the 1960s.[5][6] Although he disavowed the role, Garcia was viewed by many as the leader of the band.[7][8][9]

Eric Samuel André[1] (born April 4, 1983) is an American stand-up come

In [24]:
import regex as re

# remove references
def rm_refs(wiki_text):
    return re.sub(r'\[[0-9]*\]', '', wiki_text)

all_p = [rm_refs(i) for i in all_p]

print(all_p)

['James Marshall "Jimi" Hendrix (born Johnny Allen Hendrix; November 27, 1942\xa0– September 18, 1970) was an American musician, singer, and songwriter. Although his mainstream career spanned only four years, he is widely regarded as one of the most influential electric guitarists in the history of popular music, and one of the most celebrated musicians of the 20th century. The Rock and Roll Hall of Fame describes him as "arguably the greatest instrumentalist in the history of rock music".\n', 'Jerome John Garcia (August 1, 1942\xa0– August 9, 1995) was an American guitarist, singer and songwriter, best known for being a principal songwriter, the lead guitarist and a vocalist with the rock band the Grateful Dead, of which he was a founding member and which came to prominence during the counterculture of the 1960s. Although he disavowed the role, Garcia was viewed by many as the leader of the band.\n', 'Eric Samuel André (born April 4, 1983) is an American stand-up comedian, actor, tele

In [25]:
# parse each paragraph into separate docs with spaCy pipeline

nlp = spacy.load('en_core_web_lg')

all_docs = [nlp(i) for i in all_p]

len(all_docs)

3

In [31]:
# filter out stop words and punctuation
def rm_stop_punct(doc_):
    return [i for i in doc_ if not i.is_stop | i.is_punct | i.like_num]

all_docs_filtered = [rm_stop_punct(i) for i in all_docs]

all_docs_filtered

[[James,
  Marshall,
  Jimi,
  Hendrix,
  born,
  Johnny,
  Allen,
  Hendrix,
  November,
   ,
  September,
  American,
  musician,
  singer,
  songwriter,
  mainstream,
  career,
  spanned,
  years,
  widely,
  regarded,
  influential,
  electric,
  guitarists,
  history,
  popular,
  music,
  celebrated,
  musicians,
  century,
  Rock,
  Roll,
  Hall,
  Fame,
  describes,
  arguably,
  greatest,
  instrumentalist,
  history,
  rock,
  music,
  ],
 [Jerome,
  John,
  Garcia,
  August,
   ,
  August,
  American,
  guitarist,
  singer,
  songwriter,
  best,
  known,
  principal,
  songwriter,
  lead,
  guitarist,
  vocalist,
  rock,
  band,
  Grateful,
  Dead,
  founding,
  member,
  came,
  prominence,
  counterculture,
  1960s,
  disavowed,
  role,
  Garcia,
  viewed,
  leader,
  band,
  ],
 [Eric,
  Samuel,
  André,
  born,
  April,
  American,
  stand,
  comedian,
  actor,
  television,
  host,
  writer,
  producer,
  best,
  known,
  creator,
  host,
  co,
  writer,
  Adult,
  Swim

In [32]:
# Check length of vector 

print(all_docs_filtered[0][0].vector.shape)

(300,)


In [33]:
import numpy as np

# compute mean word vector across all word vectors in each paragraph

def mean_vector(paragraph):

    vec_list = [i.vector for i in paragraph]

    vec_matrix = np.vstack(vec_list) # source: https://numpy.org/doc/stable/reference/generated/numpy.vstack.html
    
    vec_mean = np.mean(vec_matrix, axis = 0)

    return vec_mean

vec_means = [mean_vector(i) for i in all_docs_filtered]

# check vec shape
print([i.shape for i in vec_means])



[(300,), (300,), (300,)]


In [34]:
from keras.losses import cosine_similarity
import pandas as pd
# Use cosine similarity to create similarity scores between each pair of mean word vectors.

jimi_jerry = vec_means[:2] 
jerry_eric = vec_means[1:]
eric_jimi = vec_means[::2]

vec_sim_df = pd.DataFrame(index=['jimi_jerry','jerry_eric','eric_jimi'],
                        data= [jimi_jerry, jerry_eric, eric_jimi] )


vec_sim_df['similarity_score'] = vec_sim_df\
    .apply(lambda row: cosine_similarity(row[0],row[1]).numpy(),axis=1)

vec_sim_df


Unnamed: 0,0,1,similarity_score
jimi_jerry,"[0.11499475, 0.21315666, 0.2386528, -0.0022376...","[0.11063382, 0.2448712, 0.09534839, 0.09273279...",-0.899244
jerry_eric,"[0.11063382, 0.2448712, 0.09534839, 0.09273279...","[-0.0011254473, 0.07978913, 0.055970144, -0.01...",-0.784791
eric_jimi,"[0.11499475, 0.21315666, 0.2386528, -0.0022376...","[-0.0011254473, 0.07978913, 0.055970144, -0.01...",-0.7968


# Similarity Between Articles:

##      The above results indicate that the articles are very similar. Each word was converted into a single row, 300 column vector where each value of that vector represents relatedness of other words. Each paragraph's list of vectorized words were averaged, representing an overall 'ness' of the words. The three averaged vectors representing each paragraph have high cosine similarity with each other. This means they share many words with high likelihood of being semantic neighbors. 

##      The most similar pair are Jimi Hendrix and Jerry Garcia. This makes sense as they are both highly influential musicians. The least similar are Eric Andre and Jerry Garcia, however, -0.78 out of a -1 to 1 is still relatively similar. This is most likely due to that fact that these two corpus's are both describing a famous person. The similarity would most likely be derrived from the adjectives/similar adjectives used to describe both of them. 

##      This model could be used on a large scale to do topic detection. Since topic modelling would result in a network of cosine similarity values that may become too complex to interpret, you could use this model to find the most semantically similar corpus's to a specified document.
