In [5]:
# # Downloading the small model containing tensors.
# python -m spacy download en_core_web_sm

# # Downloading over 1 million word vectors.
# python -m spacy download en_core_web_lg

import spacy

nlp = spacy.load('en_core_web_lg')
  
print("Enter two space-separated words")
words = input()
  
tokens = nlp(words)
  
for token in tokens:
    # Printing the following attributes of each token.
    # text: the word string, has_vector: if it contains
    # a vector representation in the model, 
    # vector_norm: the algebraic norm of the vector,
    # is_oov: if the word is out of vocabulary.
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)
  
token1, token2 = tokens[0], tokens[1]
  
print("Similarity:", token1.similarity(token2))

Enter two space-separated words


KeyboardInterrupt: Interrupted by user

In [30]:
from bs4 import BeautifulSoup
import re
import spacy
from collections import defaultdict

nlp = spacy.load('en_core_web_lg')


def html_to_words(html, return_set=True):
    """
    Returns a list of words from the html.
    """
    if return_set:
        return set(html.replace('\n', ' ').replace('\t', ' ').split(' '))
    else:
        return html.replace('\n', ' ').replace('\t', ' ').split(' ')

def similarity_heatmap_data(corpus, target):
    """
    Returns a list of lists of similarity scores for the words.

    corpus: a list of strings representing words
    target: the target word to compare to
    """
    sim_dict = dict()
    tokens = nlp(corpus)
    target_token = nlp(target)[0]
    
    # Create a dictionary of word to similarity scores.
    for token in tokens:
        if token.is_oov:
            continue
        sim_dict[token.text] = target_token.similarity(token)

    return sim_dict

In [34]:
delete_bracketed_text = '\[.*?\]'
delete_bracketed_text_regex = re.compile(delete_bracketed_text)

delete_numbers = '\d+'
delete_numbers_regex = re.compile(delete_numbers)

delete_punctuation = "[^\w\s]"
delete_punctuation_regex = re.compile(delete_punctuation)

html = open('Wikipedia Pages/China.html', 'r').read()

soup = BeautifulSoup(html, 'html.parser')

for script in soup.findAll('script', 'style'):
    script.extract()

text = soup.get_text()

# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)

# remove square brackets, numbers, punctuation
text = delete_bracketed_text_regex.sub('', text)
text = delete_numbers_regex.sub('', text)
text = delete_punctuation_regex.sub('', text)

words = html_to_words(text, False)
print(len(words))
heatmap_dict = similarity_heatmap_data(text, 'philosophy')

34820


In [35]:
heatmap_dict

{'China': 0.1239728331565857,
 'From': 0.17862920463085175,
 'Wikipedia': 0.26231348514556885,
 'the': 0.283076673746109,
 'free': 0.18994608521461487,
 'encyclopedia': 0.3135102093219757,
 'Jump': 0.11194156110286713,
 'to': 0.24434086680412292,
 'navigation': 0.11677315086126328,
 'search': 0.17031343281269073,
 'Country': 0.25938162207603455,
 'in': 0.27078551054000854,
 'East': 0.1387408822774887,
 'Asia': 0.12688718736171722,
 'PRC': -0.11744797974824905,
 'redirects': 0.05415421351790428,
 'here': 0.2601236402988434,
 'For': 0.16341349482536316,
 'other': 0.2695390284061432,
 'uses': 0.303772509098053,
 'see': 0.20731185376644135,
 'disambiguation': 0.0831671804189682,
 'and': 0.27559128403663635,
 'Peoples': 0.2934878170490265,
 'Republic': 0.26197904348373413,
 'of': 0.2886853516101837,
 'Pinyin': 0.04679359868168831,
 'Flag': 0.094709612429142,
 'National': 0.2656806707382202,
 'Emblem': 0.14069858193397522,
 'Anthem': 0.11970020830631256,
 'controlled': 0.1442524492740631,
 '