In [1]:
import re
import time

import numpy as np
import pandas as pd
import requests

In [2]:
# !pip3 install progressbar2
from progressbar import progressbar

## Scraping

### Scraping Functions

In [3]:
# function to scrape reddit page (takes a reddit .json url)
# returns posts 

def scraper_bike(url):
    headers = {'User-Agent' : 'override this bad boy!'}
    posts = []
    after = {}

    for page in progressbar(range(40)):
        params = {'after': after}
        pagepull = requests.get(url=url, params=params, headers=headers)
        page_dict = pagepull.json()
        posts.extend(page_dict['data']['children'])
        after = page_dict['data']['after']
        time.sleep(.2)
        
    return posts

In [4]:
# function to convert posts to DataFrame - won't allow duplicate posts since unique id 'name' is set as index
# Extract: name (as index) and subreddit, selftext, title (as columns)

def posts_to_df(post_list):
    i = 0
    post_dict = {}
    
    for post in post_list:
        ind = post_list[i]['data']
        post_dict[ind['name']] = [ind['subreddit'], ind['title'], ind['selftext']]
        i += 1

    df_name = pd.DataFrame(post_dict)
    df_name = df_name.T
    df_name.columns = ['subreddit', 'title', 'selftext']
    
    return df_name

In [5]:
# takes scraper function and url - outputs dataframe

def scrape_to_df(scrape_func, url):
    
    return posts_to_df(scrape_func(url))

In [6]:
#### If you want to scrape repeatedly over time and add to a csv
# scrape, import csv, concat, drop duplicate, and output to csv
# takes in scraper function, url, csv filename to import, csv filename to output
# Outputs - Concatenated DataFrame as csv

def scrape_add(scrape_func, url, import_file, export_file):
    scrape_df = posts_to_df(scrape_func(url))
    imported_df = pd.read_csv(import_file, index_col = 'Unnamed: 0')
    concat_df = pd.concat([imported_df, scrape_df])
    concat_df = concat_df[~concat_df.index.duplicated(keep='first')]
    concat_df.to_csv(export_file)

### Run Scrape

In [7]:
# You can also put in any 2 subreddits in as the URL and get results for those

nfltest = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/nfl.json')
nbatest = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/nba.json')

100% (40 of 40) |########################| Elapsed Time: 0:00:24 Time:  0:00:24
100% (40 of 40) |########################| Elapsed Time: 0:00:26 Time:  0:00:26


In [8]:
politics_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/politics.json')
conservative_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/conservative.json')

100% (40 of 40) |########################| Elapsed Time: 0:00:27 Time:  0:00:27
100% (40 of 40) |########################| Elapsed Time: 0:00:25 Time:  0:00:25


In [9]:
nbatest.shape

(774, 3)

In [10]:
nfltest.shape

(892, 3)

##### These scrape_add functions add to already built csvs

In [None]:
# scrape_add(scraper_bike, 'https://www.reddit.com/r/CollegeBasketball/new.json', 'NCAA_Posts_Update2.csv', 'NCAA_Posts_Update3.csv')
# scrape_add(scraper_bike, 'https://www.reddit.com/r/AskScience/new.json', 'AskSci_Posts_Update2.csv', 'AskSci_Posts_Update3.csv')
# scrape_add(scraper_bike, 'https://www.reddit.com/r/nba/new.json', 'NBA_Posts_Update2.csv', 'NBA_Posts_Update3.csv')
# scrape_add(scraper_bike, 'https://www.reddit.com/r/nfl/new.json', 'NFL_Posts_Update2.csv', 'NFL_Posts_Update3.csv')

### Data Cleaning / Preprocessing

In [11]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

pd.set_option('max_colwidth', 300)

In [12]:
nfltest.head()

Unnamed: 0,subreddit,title,selftext
t3_dva38v,nfl,Official /r/NFL Sidebar contest,"Hey there folks!\n\nHappy Tuesday to everyone! Did you see any good games this week? Did anything stand out and you think it should be highlighted with a nice write up on our side bar?\n\nIf so, give us a hand and write something up! Maybe include a picture. Keep it polite (no fan base attacks p..."
t3_dvfdhm,nfl,Official Week 10 R/NFL Power Rankings,"Make sure you're sitting down, and check for yourself if you don't believe me, but we're already at the double-digit mark for gameweeks in the football season. That's right folks, it's already time for the **Week 10 Official r/nfl Power Rankings**! It's amazing how fast time flies; three months ..."
t3_dvhdmh,nfl,"[Schefter] NFL clubs were informed today that a private workout will be held for Colin Kaepernick on Saturday in Atlanta. Session will include on-field work and an interview. All clubs are invited to attend, and video of both the workout and interview will be made available to clubs.",
t3_dvi5xb,nfl,[Schefter] NFL has flexed the Week 12 Packers-49ers game to Sunday Night Football on NBC.,
t3_dvrvvy,nfl,"[Falcons] YEAH, YOUNGHOE! 👉 @YoungHoeKoo has been named NFC Special Teams Player of the Week!",


In [13]:
# drop column

nfltest = nfltest.drop(columns='selftext')
nbatest = nbatest.drop(columns='selftext')

In [14]:
# merge subreddit data

train = pd.concat([nfltest, nbatest])

In [15]:
train

Unnamed: 0,subreddit,title
t3_dva38v,nfl,Official /r/NFL Sidebar contest
t3_dvfdhm,nfl,Official Week 10 R/NFL Power Rankings
t3_dvhdmh,nfl,"[Schefter] NFL clubs were informed today that a private workout will be held for Colin Kaepernick on Saturday in Atlanta. Session will include on-field work and an interview. All clubs are invited to attend, and video of both the workout and interview will be made available to clubs."
t3_dvi5xb,nfl,[Schefter] NFL has flexed the Week 12 Packers-49ers game to Sunday Night Football on NBC.
t3_dvrvvy,nfl,"[Falcons] YEAH, YOUNGHOE! 👉 @YoungHoeKoo has been named NFC Special Teams Player of the Week!"
t3_dvhocr,nfl,Rapaport: Cam Newton would accept trade to Chicago Bears
t3_dvnxdw,nfl,"[Josh Houtz] Andy Dalton says he was benched because they ""had to think about the draft"""
t3_dvjqtm,nfl,"[Colin K] I’m just getting word from my representatives that the NFL league office reached out to them about a workout in Atlanta on Saturday. I’ve been in shape and ready for this for 3 years, can’t wait to see the head coaches and GMs on Saturday."
t3_dvc1wv,nfl,Seahawks drop in CBS power rankings after defeating 8-0 Niners
t3_dvrgg3,nfl,The 2019 ESPN Playoff Machine is live


##### Tokenize (grab only word characters)

In [16]:
word_tokenizer = RegexpTokenizer(r'\w+')

In [17]:
train['title'] = train['title'].map(lambda x: word_tokenizer.tokenize(x.lower()))

In [18]:
# rejoin list of tokenized words into single string for each row

train['title'] = train['title'].map(lambda x: ' '.join(x))

In [19]:
train['title'][0:5]

t3_dva38v                                                                                                                                                                                                                                                            official r nfl sidebar contest
t3_dvfdhm                                                                                                                                                                                                                                                     official week 10 r nfl power rankings
t3_dvhdmh    schefter nfl clubs were informed today that a private workout will be held for colin kaepernick on saturday in atlanta session will include on field work and an interview all clubs are invited to attend and video of both the workout and interview will be made available to clubs
t3_dvi5xb                                                                                                                   

In [20]:
train['title'][0:5]

t3_dva38v                                                                                                                                                                                                                                                            official r nfl sidebar contest
t3_dvfdhm                                                                                                                                                                                                                                                     official week 10 r nfl power rankings
t3_dvhdmh    schefter nfl clubs were informed today that a private workout will be held for colin kaepernick on saturday in atlanta session will include on field work and an interview all clubs are invited to attend and video of both the workout and interview will be made available to clubs
t3_dvi5xb                                                                                                                   

### Train test split and converting series to list of strings then to array

In [21]:
X = train[['title']]
y = train['subreddit']

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.25,
                                                    random_state=42,
                                                    stratify=y)

In [23]:
# baseline is

y.value_counts(normalize=True)

nfl    0.535414
nba    0.464586
Name: subreddit, dtype: float64

In [24]:
# create our training data list - this is a list of strings, with each string being a post title

clean_train_data = []

for traindata in X_train['title']:
    clean_train_data.append(traindata)

In [25]:
len(clean_train_data)

1249

In [26]:
# create test data list

clean_test_data = []

for testdata in X_test['title']:
    clean_test_data.append(testdata)

In [27]:
len(clean_test_data)

417

In [28]:
clean_train_data

['charania houston fears guard eric gordon needs surgery on his knee that would sideline him at least one month league sources tell theathleticnba stadium gordon gathering more information today for final decision but leaning toward clean up procedure per sources',
 'in the 1940 nfl championship the bears compiled 73 points against the redskins 0 points for the biggest blow out in nfl history it was also the last game that an nfl player bears player dick plasman played without a helmet although helmets were not required in the nfl until 1943',
 'chargers travel to colorado to prepare for mnf game against chiefs in mexico city',
 'the dtysfunctionalality of the hornets is multi layered and is unlikely to be fixed for several decades',
 'game thread utah jazz 6 3 golden state warriors 2 8 november 12 2019',
 'highlight dak s hail mary gets picked vikings win',
 'grove baker mayfield after the bye week 73 111 65 7 705 yards 4 td 1 int not amazing but reports of his demise might ve been a 

### Count Vectorizer

In [29]:
# instantiate our CountVectorizer. This counts the number of appearances of all the words in our training data and
# eliminates common english stop words. 5000 max features works well for our purposes (tested various numbers). Our
# data is already preprocessed and tokenized manually earlier. ngram_range is 1,3, although all or nearly all our
# features are single words

vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english',
                             max_features=5000,
                             ngram_range=(1, 3))

In [31]:
# fit our training data and test data lists to our count_vectorizer

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

In [32]:
# convert to array

train_data_features = train_data_features.toarray()

In [33]:
# check shapes

train_data_features.shape, test_data_features.shape

((1249, 5000), (417, 5000))

In [35]:
train_data_features[:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [36]:
# I wanted check that the features corpus was as expected - removed print statement for readability

vocab = vectorizer.get_feature_names()

In [37]:
vocab[0:200]

['000',
 '09',
 '10',
 '10 2019',
 '10 32',
 '10 blazers',
 '10 blazers vs',
 '10 chapter',
 '10 game',
 '10 games',
 '10 games seahawks',
 '10 games season',
 '10 hawks',
 '10 hawks vs',
 '10 mnf',
 '10 nfl',
 '10 points',
 '10 suns',
 '10 suns vs',
 '10 teams',
 '10 weeks',
 '10 year',
 '10 years',
 '100',
 '100 possessions',
 '100 yard',
 '100 yards',
 '100 years',
 '1000',
 '101',
 '102',
 '104',
 '104 year',
 '104 year old',
 '107',
 '108',
 '108 87',
 '109',
 '10th',
 '11',
 '11 10',
 '11 10 blazers',
 '11 10 hawks',
 '11 10 suns',
 '11 11',
 '11 11 mavs',
 '11 12',
 '11 12 heat',
 '11 2019',
 '11 asts',
 '11 mavs',
 '11 mavs vs',
 '11 rebounds',
 '11 rebounds assists',
 '11 rebs',
 '11 shooting',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '119',
 '12',
 '12 2019',
 '12 heat',
 '12 heat vs',
 '12 points',
 '120',
 '121',
 '122',
 '123',
 '125',
 '12th',
 '13',
 '13 2019',
 '13 carries',
 '13 points',
 '131',
 '136',
 '138',
 '14',
 '15',
 '15 fga',
 '15 shoo

## MODELING

### Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression

In [39]:
# fit logistic regression model

lr = LogisticRegression(penalty='l2')

In [40]:
# shape check

train_data_features.shape, y_train.shape

((1249, 5000), (1249,))

In [41]:
lr.fit(train_data_features, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [42]:
lr.score(train_data_features, y_train)

0.9991993594875901

In [43]:
lr.score(test_data_features, y_test)

0.9400479616306955

### Feature comparison

Creates a dataframe that matches features to coefficients

In [44]:
coef_list = lr.coef_.tolist()

In [45]:
coef_list = coef_list[0]

In [46]:
coef_df = pd.DataFrame({'features': vectorizer.get_feature_names(),
                        'coefs': coef_list})

In [47]:
coef_df.sort_values(by = ['coefs'])

Unnamed: 0,features,coefs
1871,nba,-2.443858
1522,knicks,-1.408794
3288,raptors,-1.315754
1573,lebron,-1.214959
1483,kawhi,-1.205320
4706,tonight,-1.186334
4527,suns,-0.974886
4854,warriors,-0.916840
409,basketball,-0.902750
649,clippers,-0.897251


### Let's throw out these unfair words and rerun

In [48]:
stopwords = set(stopwords.words('english'))

extra_stopwords = ['nba', 'basketball', 'football', 'nfl']

stopwords.update(extra_stopwords)

In [49]:
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=stopwords,
                             max_features=5000,
                             ngram_range=(1, 3))

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

((1249, 5000), (417, 5000))

In [50]:
lr.fit(train_data_features, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [51]:
lr.score(train_data_features, y_train)

0.9967974379503602

In [52]:
lr.score(test_data_features, y_test)

0.9232613908872902

In [53]:
coef_list = lr.coef_.tolist()
coef_list = coef_list[0]

coef_df = pd.DataFrame({'features' : vectorizer.get_feature_names(),
                       'coefs' : coef_list})

coef_df.sort_values(by = ['coefs'])

Unnamed: 0,features,coefs
3198,raptors,-1.407541
1591,knicks,-1.400391
1551,kawhi,-1.198934
1659,lebron,-1.124009
4683,tonight,-1.047564
4492,suns,-1.017546
672,clippers,-0.898872
1294,harden,-0.891899
4839,warriors,-0.885257
4715,trae,-0.883421


### Decision Tree

In [54]:
from sklearn.tree import DecisionTreeClassifier

In [55]:
tree = DecisionTreeClassifier()

In [56]:
tree.fit(train_data_features, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [57]:
tree.score(train_data_features, y_train)

1.0

In [58]:
tree.score(test_data_features, y_test)

0.8273381294964028

### Random Forest

In [59]:
from sklearn.ensemble import RandomForestClassifier

In [60]:
forest = RandomForestClassifier(n_estimators = 100)

In [61]:
forest.fit(train_data_features, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [62]:
forest.score(train_data_features, y_train)

1.0

In [63]:
forest.score(test_data_features, y_test)

0.8848920863309353

###  Matrix on Logistic Regression

In [64]:
from sklearn.metrics import confusion_matrix

In [65]:
y_pred = lr.predict(test_data_features)

In [66]:
cm = confusion_matrix(y_test, y_pred)

In [67]:
cm_df = pd.DataFrame(cm,
                    columns=['predict_neg', 'predict_pos'],
                    index = ['actual_neg', 'actual_pos'])

In [68]:
cm_df

Unnamed: 0,predict_neg,predict_pos
actual_neg,183,11
actual_pos,21,202


## Checking where our model failed

In [69]:
comparison_df = pd.DataFrame({'y_actual' : y_test,
             'y_predicted' : y_pred})

In [70]:
mismatch_df = comparison_df[comparison_df['y_actual'] != comparison_df['y_predicted']]

In [71]:
mismatch2_df = pd.concat([mismatch_df, X_test], axis = 1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [72]:
# All incorrect predictions with titles

mismatches = mismatch2_df.dropna()

In [73]:
mismatches

Unnamed: 0,y_actual,y_predicted,title
t3_duhm72,nfl,nba,highlight he picked that steven parker just ripped it from him
t3_duhu9j,nba,nfl,agness pacers without turner bitadze lamb sumner and oladipo in orlando turner played some 4 on 4 this week but more time is needed to recover
t3_dui3ai,nfl,nba,highlight hoyer gets destroyered
t3_duidfw,nba,nfl,ham marvin bagley said there is still no timetable for his return but he s putting in work while recovering from a broken right thumb wednesday will be the 3 week mark of a 4 6 week injury window sounds like he s still 2 weeks out
t3_duj3ul,nfl,nba,jamal adams mood
t3_dukcoy,nfl,nba,highlight cmc is stopped at the goal line
t3_dumo5m,nfl,nba,the snf top gun segment was using the music from hot shots
t3_dun5r6,nba,nfl,nehm middleton just got kneed in my quad tried to run it off for a second tried to come back here the locker room get it stretched out and i just couldn t do it when i asked if he thought he d be out long he said no and that he was hopeful to be good to go in a couple days
t3_duuf2q,nfl,nba,what goes into creating nfl redzone
t3_duuhb3,nfl,nba,nfl what ifs


### Let's try TF-IDF

Term Frequency / Inverse Document Frequency

TF(w) = (Number of times term w appears in a document) / (Total number of terms in the document)

IDF(w) = log_e(Total number of documents / Number of documents with term w in it)

In [76]:
tfidf_vec = TfidfVectorizer(analyzer="word",
                            tokenizer=None,
                            preprocessor=None,
                            stop_words=stopwords,
                            max_features=5000,
                            ngram_range=(1, 3))

In [77]:
train_data_features = tfidf_vec.fit_transform(clean_train_data)

test_data_features = tfidf_vec.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

((1249, 5000), (417, 5000))

In [78]:
lr.fit(train_data_features, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [79]:
lr.score(train_data_features, y_train)

0.9855884707766213

In [80]:
lr.score(test_data_features, y_test)

0.9112709832134293

### Let's try on some other subreddits

In [81]:
train = pd.concat([politics_test, conservative_test])

In [82]:
X = train[['title']]
y = train['subreddit']

In [83]:
# politics_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/politics.json')
# conservative_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/conservative.json')

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

In [85]:
politics_test = politics_test.drop(columns='selftext')
conservative_test = conservative_test.drop(columns='selftext')

train = pd.concat([politics_test, conservative_test])
tokenizer = RegexpTokenizer(r'\w+')

train['title'] = train['title'].map(lambda x: tokenizer.tokenize(x.lower()))
train['title'] = train['title'].map(lambda x: ' '.join(x))

In [86]:
# create our training data list - this is a list of strings, with each string being a post title

clean_train_data = []

for traindata in X_train['title']:
    clean_train_data.append(traindata)
    
    
# create test data list

clean_test_data = []

for testdata in X_test['title']:
    clean_test_data.append(testdata)

In [87]:
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english',
                             max_features=5000,
                             ngram_range=(1, 3))

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

vocab = vectorizer.get_feature_names()

In [95]:
df = pd.DataFrame(train_data_features, columns=vocab)

In [98]:
df.trump.sum()

419

In [88]:
vocab

['000',
 '000 migrant',
 '000 migrant children',
 '000 scientists',
 '000 scientists just',
 '000 scottish',
 '000 scottish government',
 '10',
 '11',
 '11 000',
 '11 000 scientists',
 '14',
 '14 million',
 '14 million jobs',
 '15',
 '15 gun',
 '15 gun maker',
 '16',
 '16b',
 '18',
 '1st',
 '20',
 '20 series',
 '20 series regular',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2020 democratic',
 '2020 democratic presidential',
 '2020 presidential',
 '2020 presidential campaign',
 '2020 presidential race',
 '2025',
 '21',
 '21 gun',
 '21 gun salute',
 '225',
 '225 000',
 '225 000 scottish',
 '234',
 '234 000',
 '25',
 '30th',
 '30th anniversary',
 '3d',
 '3d printed',
 '3d printed guns',
 '40',
 '41',
 '41 county',
 '41 county seized',
 '470',
 '70',
 '70 000',
 '70 000 migrant',
 'abc',
 'abc insider',
 'abc insider released',
 'abc news',
 'abc news video',
 'abiding',
 'able',
 'aboard',
 'aboard deep',
 'aboard deep state',
 'abortion',
 'abortionist',
 'abortions',
 'abus

### Modeling

In [89]:
lr = LogisticRegression(penalty = 'l2')

In [90]:
train_data_features.shape, y_train.shape

((1315, 5000), (1315,))

In [91]:
lr.fit(train_data_features, y_train)

lr.score(train_data_features, y_train)



0.9832699619771863

In [92]:
lr.score(test_data_features, y_test)

0.7585421412300684

In [93]:
coef_list = lr.coef_.tolist()

coef_list = coef_list[0]

In [94]:
coef_df = pd.DataFrame({'features' : vectorizer.get_feature_names(),
                       'coefs' : coef_list})

coef_df.sort_values(by = ['coefs'])

Unnamed: 0,features,coefs
4912,whistleblower,-1.215440
244,baby,-1.076546
275,believe,-1.006591
122,alabama,-0.976421
4358,schiff,-0.934338
4657,tillerson,-0.914885
58,abc,-0.896134
465,clinton,-0.893849
1286,jr,-0.850057
1458,man,-0.840227


# Understanding word vectors

... for, like, actual poets. By [Allison Parrish](http://www.decontextualize.com/)


## Why word vectors?

Poetry is, at its core, the art of identifying and manipulating linguistic similarity. I have discovered a truly marvelous proof of this, which this notebook is too narrow to contain. (By which I mean: I will elaborate on this some other time)

## Animal similarity and simple linear algebra

We'll begin by considering a small subset of English: words for animals. Our task is to be able to write computer programs to find similarities among these words and the creatures they designate. To do this, we might start by making a spreadsheet of some animals and their characteristics. For example:

![Animal spreadsheet](http://static.decontextualize.com/snaps/animal-spreadsheet.png)

This spreadsheet associates a handful of animals with two numbers: their cuteness and their size, both in a range from zero to one hundred. (The values themselves are simply based on my own judgment. Your taste in cuteness and evaluation of size may differ significantly from mine. As with all data, these data are simply a mirror reflection of the person who collected them.)

These values give us everything we need to make determinations about which animals are similar (at least, similar in the properties that we've included in the data). Try to answer the following question: Which animal is most similar to a capybara? You could go through the values one by one and do the math to make that evaluation, but visualizing the data as points in 2-dimensional space makes finding the answer very intuitive:

![Animal space](http://static.decontextualize.com/snaps/animal-space.png)

The plot shows us that the closest animal to the capybara is the panda bear (again, in terms of its subjective size and cuteness). One way of calculating how "far apart" two points are is to find their *Euclidean distance*. (This is simply the length of the line that connects the two points.) For points in two dimensions, Euclidean distance can be calculated with the following Python function:

In [99]:
import numpy as np
def distance2d(x1, y1, x2, y2):
    return np.linalg.norm(np.array([x1, y1])-np.array([x2, y2]))

So, the distance between "capybara" (70, 30) and "panda" (74, 40):

In [100]:
distance2d(70, 30, 75, 40) # panda and capybara

11.180339887498949

... is less than the distance between "tarantula" and "elephant":

In [101]:
distance2d(8, 3, 65, 90) # tarantula and elephant

104.0096149401583

Modeling animals in this way has a few other interesting properties. For example, you can pick an arbitrary point in "animal space" and then find the animal closest to that point. If you imagine an animal of size 25 and cuteness 30, you can easily look at the space to find the animal that most closely fits that description: the chicken.

Reasoning visually, you can also answer questions like: what's halfway between a chicken and an elephant? Simply draw a line from "elephant" to "chicken," mark off the midpoint and find the closest animal. (According to our chart, halfway between an elephant and a chicken is a horse.)

You can also ask: what's the *difference* between a hamster and a tarantula? According to our plot, it's about seventy five units of cute (and a few units of size).

The relationship of "difference" is an interesting one, because it allows us to reason about *analogous* relationships. In the chart below, I've drawn an arrow from "tarantula" to "hamster" (in blue):

![Animal analogy](http://static.decontextualize.com/snaps/animal-space-analogy.png)

You can understand this arrow as being the *relationship* between a tarantula and a hamster, in terms of their size and cuteness (i.e., hamsters and tarantulas are about the same size, but hamsters are much cuter). In the same diagram, I've also transposed this same arrow (this time in red) so that its origin point is "chicken." The arrow ends closest to "kitten." What we've discovered is that the animal that is about the same size as a chicken but much cuter is... a kitten. To put it in terms of an analogy:

    Tarantulas are to hamsters as chickens are to kittens.
    
A sequence of numbers used to identify a point is called a *vector*, and the kind of math we've been doing so far is called *linear algebra.* (Linear algebra is surprisingly useful across many domains: It's the same kind of math you might do to, e.g., simulate the velocity and acceleration of a sprite in a video game.)

A set of vectors that are all part of the same data set is often called a *vector space*. The vector space of animals in this section has two *dimensions*, by which I mean that each vector in the space has two numbers associated with it (i.e., two columns in the spreadsheet). The fact that this space has two dimensions just happens to make it easy to *visualize* the space by drawing a 2D plot. But most vector spaces you'll work with will have more than two dimensions—sometimes many hundreds. In those cases, it's more difficult to visualize the "space," but the math works pretty much the same.

## Language with vectors: colors

So far, so good. We have a system in place—albeit highly subjective—for talking about animals and the words used to name them. I want to talk about another vector space that has to do with language: the vector space of colors.

Colors are often represented in computers as vectors with three dimensions: red, green, and blue. Just as with the animals in the previous section, we can use these vectors to answer questions like: which colors are similar? What's the most likely color name for an arbitrarily chosen set of values for red, green and blue? Given the names of two colors, what's the name of those colors' "average"?

We'll be working with this [color data](https://github.com/dariusk/corpora/blob/master/data/colors/xkcd.json) from the [xkcd color survey](https://blog.xkcd.com/2010/05/03/color-survey-results/). The data relates a color name to the RGB value associated with that color. [Here's a page that shows what the colors look like](https://xkcd.com/color/rgb/). Download the color data and put it in the same directory as this notebook.

A few notes before we proceed:

* The linear algebra functions implemented below (`addv`, `meanv`, etc.) are slow, potentially inaccurate, and shouldn't be used for "real" code—I wrote them so beginner programmers can understand how these kinds of functions work behind the scenes. Use [numpy](http://www.numpy.org/) for fast and accurate math in Python.
* If you're interested in perceptually accurate color math in Python, consider using the [colormath library](http://python-colormath.readthedocs.io/en/latest/).

Now, import the `json` library and load the color data:

In [102]:
import requests

In [103]:
resp = requests.get('https://raw.githubusercontent.com/dariusk/corpora/master/data/colors/xkcd.json')
color_data = resp.json()

The following function converts colors from hex format (`#1a2b3c`) to a tuple of integers:

In [125]:
def hex_to_int(s):
    return int(s[1:3], 16), int(s[3:5], 16), int(s[5:7], 16)

And the following cell creates a dictionary and populates it with mappings from color names to RGB vectors for each color in the data:

In [126]:
colors = dict()
for item in color_data['colors']:
    colors[item["color"]] = hex_to_int(item["hex"])

Testing it out:

In [127]:
colors['olive']

(110, 117, 14)

In [128]:
colors['red']

(229, 0, 0)

In [129]:
colors['black']

(0, 0, 0)

In [130]:
colors['blue']

(3, 67, 223)

In [109]:
colors.keys()

dict_keys(['cloudy blue', 'dark pastel green', 'dust', 'electric lime', 'fresh green', 'light eggplant', 'nasty green', 'really light blue', 'tea', 'warm purple', 'yellowish tan', 'cement', 'dark grass green', 'dusty teal', 'grey teal', 'macaroni and cheese', 'pinkish tan', 'spruce', 'strong blue', 'toxic green', 'windows blue', 'blue blue', 'blue with a hint of purple', 'booger', 'bright sea green', 'dark green blue', 'deep turquoise', 'green teal', 'strong pink', 'bland', 'deep aqua', 'lavender pink', 'light moss green', 'light seafoam green', 'olive yellow', 'pig pink', 'deep lilac', 'desert', 'dusty lavender', 'purpley grey', 'purply', 'candy pink', 'light pastel green', 'boring green', 'kiwi green', 'light grey green', 'orange pink', 'tea green', 'very light brown', 'egg shell', 'eggplant purple', 'powder pink', 'reddish grey', 'baby shit brown', 'liliac', 'stormy blue', 'ugly brown', 'custard', 'darkish pink', 'deep brown', 'greenish beige', 'manilla', 'off blue', 'battleship gre

### Vector math

Before we keep going, we'll need some functions for performing basic vector arithmetic. These functions will work with vectors in spaces of any number of dimensions.

The first function returns the Euclidean distance between two points:

In [111]:
import math
def distance(coord1, coord2):
    return np.linalg.norm(np.array(coord1)-np.array(coord2))
distance([10, 1, 200], [5, 2, 100])

100.12991560967181

The `subtractv` function subtracts one vector from another:

In [112]:
def subtractv(coord1, coord2):
    return list(np.array(coord1) - np.array(coord2))
subtractv([10, 1], [5, 2])

[5, -1]

The `addv` vector adds two vectors together:

In [113]:
def addv(coord1, coord2):
    return list(np.array(coord1) + np.array(coord2))
addv([10, 1], [5, 2])

[15, 3]

And the `meanv` function takes a list of vectors and finds their mean or average:

In [114]:
def meanv(coords):
    return list(np.mean(coords, axis=0))
meanv([[0, 1], [2, 2], [4, 3]])

[2.0, 2.0]

Just as a test, the following cell shows that the distance from "red" to "green" is greater than the distance from "red" to "pink":

In [115]:
distance(colors['red'], colors['green']) > distance(colors['red'], colors['pink'])

True

### Finding the closest item

Just as we wanted to find the animal that most closely matched an arbitrary point in cuteness/size space, we'll want to find the closest color name to an arbitrary point in RGB space. The easiest way to find the closest item to an arbitrary vector is simply to find the distance between the target vector and each item in the space, in turn, then sort the list from closest to farthest. The `closest()` function below does just that. By default, it returns a list of the ten closest items to the given vector.

> Note: Calculating "closest neighbors" like this is fine for the examples in this notebook, but unmanageably slow for vector spaces of any appreciable size. As your vector space grows, you'll want to move to a faster solution, like SciPy's [kdtree](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.KDTree.html) or [Annoy](https://pypi.python.org/pypi/annoy).

In [131]:
def closest(space, coord, n=10):
    return sorted(space.keys(), key=lambda x: distance(coord, space[x]))[:n]

Testing it out, we can find the ten colors closest to "red":

In [132]:
closest(colors, colors['white'])

['white',
 'pale grey',
 'very light pink',
 'off white',
 'ice blue',
 'very pale blue',
 'ice',
 'very light blue',
 'really light blue',
 'eggshell']

... or the ten colors closest to (150, 60, 150):

In [133]:
closest(colors, [150, 60, 150])

['warm purple',
 'medium purple',
 'ugly purple',
 'light eggplant',
 'purpleish',
 'purplish',
 'purply',
 'light plum',
 'purple',
 'muted purple']

### Color magic

The magical part of representing words as vectors is that the vector operations we defined earlier appear to operate on language the same way they operate on numbers. For example, if we find the word closest to the vector resulting from subtracting "red" from "purple," we get a series of "blue" colors:

In [134]:
subtractv(colors['green'], colors['purple'])

[-105, 146, -130]

In [135]:
closest(colors, subtractv(colors['green'], colors['purple']))

['true green',
 'racing green',
 'bottle green',
 'deep green',
 'darkgreen',
 'forest',
 'emerald green',
 'dark green',
 'vibrant green',
 'british racing green']

This matches our intuition about RGB colors, which is that purple is a combination of red and blue. Take away the red, and blue is all you have left.

You can do something similar with addition. What's blue plus green?

In [136]:
addv(colors['blue'], colors['green'])

[24, 243, 249]

In [137]:
colors['blue']

(3, 67, 223)

In [138]:
closest(colors, addv(colors['blue'], colors['green']))

['bright turquoise',
 'bright light blue',
 'bright aqua',
 'cyan',
 'neon blue',
 'aqua blue',
 'bright cyan',
 'bright sky blue',
 'aqua',
 'bright teal']

That's right, it's something like turquoise or cyan! What if we find the average of black and white? Predictably, we get gray:

In [139]:
# the average of black and white: medium grey
closest(colors, meanv([colors['black'], colors['white']]))

['medium grey',
 'purple grey',
 'steel grey',
 'battleship grey',
 'grey purple',
 'purplish grey',
 'greyish purple',
 'steel',
 'warm grey',
 'green grey']

Just as with the tarantula/hamster example from the previous section, we can use color vectors to reason about relationships between colors. In the cell below, finding the difference between "pink" and "red" then adding it to "blue" seems to give us a list of colors that are to blue what pink is to red (i.e., a slightly lighter, less saturated shade):

In [140]:
# an analogy: pink is to red as X is to blue
pink_to_red = subtractv(colors['pink'], colors['red'])
closest(colors, addv(pink_to_red, colors['blue']))

['neon blue',
 'bright sky blue',
 'bright light blue',
 'cyan',
 'bright cyan',
 'bright turquoise',
 'clear blue',
 'azure',
 'dodger blue',
 'lightish blue']

Another example of color analogies: Navy is to blue as true green/dark grass green is to green:

In [141]:
# another example: 
navy_to_blue = subtractv(colors['navy'], colors['blue'])
closest(colors, addv(navy_to_blue, colors['green']))

['true green',
 'dark grass green',
 'grassy green',
 'racing green',
 'forest',
 'bottle green',
 'dark olive green',
 'darkgreen',
 'forrest green',
 'grass green']

The examples above are fairly simple from a mathematical perspective but nevertheless *feel* magical: they're demonstrating that it's possible to use math to reason about how people use language.

### Interlude: A Love Poem That Loses Its Way

In [142]:
import random
red = colors['red']
blue = colors['blue']
for i in range(14):
    rednames = closest(colors, red)
    bluenames = closest(colors, blue)
    print(f"Roses are {rednames[0]}, violets are {bluenames[0]}")
    red = colors[random.choice(rednames[1:])]
    blue = colors[random.choice(bluenames[1:])]

Roses are red, violets are blue
Roses are fire engine red, violets are bright blue
Roses are bright red, violets are vibrant blue
Roses are cherry red, violets are blue
Roses are red, violets are cerulean blue
Roses are cherry, violets are blue
Roses are pinkish red, violets are electric blue
Roses are pinky red, violets are deep sky blue
Roses are pink red, violets are cerulean
Roses are neon red, violets are azure
Roses are strawberry, violets are bright sky blue
Roses are pinkish red, violets are aqua blue
Roses are cerise, violets are bright sky blue
Roses are pink red, violets are turquoise blue


### Doing bad digital humanities with color vectors

With the tools above in hand, we can start using our vectorized knowledge of language toward academic ends. In the following example, I'm going to calculate the average color of Bram Stoker's *Dracula*.

First, we'll load [spaCy](https://spacy.io/):

In [143]:
import spacy
nlp = spacy.load('en_core_web_sm')

To calculate the average color, we'll follow these steps:

1. Parse the text into words
2. Check every word to see if it names a color in our vector space. If it does, add it to a list of vectors.
3. Find the average of that list of vectors.
4. Find the color(s) closest to that average vector.

The following cell performs steps 1-3:

In [144]:
resp = requests.get('http://www.gutenberg.org/cache/epub/345/pg345.txt')
dracula = nlp(resp.text)
# use word.lower_ to normalize case
drac_colors = [colors[word.lower_] for word in dracula if word.lower_ in colors]
avg_color = meanv(drac_colors)
print(avg_color)

[147.44839067702551, 113.65371809100999, 100.13540510543841]


Now, we'll pass the averaged color vector to the `closest()` function, yielding... well, it's just a brown mush, which is kinda what you'd expect from adding a bunch of colors together willy-nilly.

In [145]:
closest(colors, avg_color)

['reddish grey',
 'brownish grey',
 'brownish',
 'brown grey',
 'mocha',
 'grey brown',
 'puce',
 'dull brown',
 'pinkish brown',
 'dark taupe']

On the other hand, here's what we get when we average the colors of Charlotte Perkins Gilman's classic *The Yellow Wallpaper*. The result definitely reflects the content of the story, so maybe we're on to something here.

In [146]:
resp = requests.get('http://www.gutenberg.org/cache/epub/1952/pg1952.txt')
yellow = nlp(resp.text)
wallpaper_colors = [colors[word.lower_] for word in yellow if word.lower_ in colors]
avg_color = meanv(wallpaper_colors)
closest(colors, avg_color)

['sickly yellow',
 'piss yellow',
 'puke yellow',
 'vomit yellow',
 'dirty yellow',
 'mustard yellow',
 'dark yellow',
 'olive yellow',
 'macaroni and cheese',
 'pea']

Exercise for the reader: Use the vector arithmetic functions to rewrite a text, making it...

* more blue (i.e., add `colors['blue']` to each occurrence of a color word); or
* more light (i.e., add `colors['white']` to each occurrence of a color word); or
* darker (i.e., attenuate each color. You might need to write a vector multiplication function to do this one right.)

## Distributional semantics

In the previous section, the examples are interesting because of a simple fact: colors that we think of as similar are "closer" to each other in RGB vector space. In our color vector space, or in our animal cuteness/size space, you can think of the words identified by vectors close to each other as being *synonyms*, in a sense: they sort of "mean" the same thing. They're also, for many purposes, *functionally identical*. Think of this in terms of writing, say, a search engine. If someone searches for "mauve trousers," then it's probably also okay to show them results for, say,

In [None]:
for cname in closest(colors, colors['mauve']):
    print(cname, "trousers")

That's all well and good for color words, which intuitively seem to exist in a multidimensional continuum of perception, and for our animal space, where we've written out the vectors ahead of time. But what about... arbitrary words? Is it possible to create a vector space for all English words that has this same "closer in space is closer in meaning" property?

To answer that, we have to back up a bit and ask the question: what does *meaning* mean? No one really knows, but one theory popular among computational linguists, computer scientists and other people who make search engines is the [Distributional Hypothesis](https://en.wikipedia.org/wiki/Distributional_semantics), which states that:

    Linguistic items with similar distributions have similar meanings.
    
What's meant by "similar distributions" is *similar contexts*. Take for example the following sentences:

    It was really cold yesterday.
    It will be really warm today, though.
    It'll be really hot tomorrow!
    Will it be really cool Tuesday?
    
According to the Distributional Hypothesis, the words `cold`, `warm`, `hot` and `cool` must be related in some way (i.e., be close in meaning) because they occur in a similar context, i.e., between the word "really" and a word indicating a particular day. (Likewise, the words `yesterday`, `today`, `tomorrow` and `Tuesday` must be related, since they occur in the context of a word indicating a temperature.)

In other words, according to the Distributional Hypothesis, a word's meaning is just a big list of all the contexts it occurs in. Two words are closer in meaning if they share contexts.

## Word vectors by counting contexts

So how do we turn this insight from the Distributional Hypothesis into a system for creating general-purpose vectors that capture the meaning of words? Maybe you can see where I'm going with this. What if we made a *really big* spreadsheet that had one column for every context for every word in a given source text. Let's use a small source text to begin with, such as this excerpt from Dickens:

    It was the best of times, it was the worst of times.

Such a spreadsheet might look something like this:

![dickens contexts](http://static.decontextualize.com/snaps/best-of-times.png)

The spreadsheet has one column for every possible context, and one row for every word. The values in each cell correspond with how many times the word occurs in the given context. The numbers in the columns constitute that word's vector, i.e., the vector for the word `of` is

    [0, 0, 0, 0, 1, 0, 0, 0, 1, 0]
    
Because there are ten possible contexts, this is a ten dimensional space! It might be strange to think of it, but you can do vector arithmetic on vectors with ten dimensions just as easily as you can on vectors with two or three dimensions, and you could use the same distance formula that we defined earlier to get useful information about which vectors in this space are similar to each other. In particular, the vectors for `best` and `worst` are actually the same (a distance of zero), since they occur only in the same context (`the ___ of`):

    [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
    
Of course, the conventional way of thinking about "best" and "worst" is that they're *antonyms*, not *synonyms*. But they're also clearly two words of the same kind, with related meanings (through opposition), a fact that is captured by this distributional model.

### Contexts and dimensionality

Of course, in a corpus of any reasonable size, there will be many thousands if not many millions of possible contexts. It's difficult enough working with a vector space of ten dimensions, let alone a vector space of a million dimensions! It turns out, though, that many of the dimensions end up being superfluous and can either be eliminated or combined with other dimensions without significantly affecting the predictive power of the resulting vectors. The process of getting rid of superfluous dimensions in a vector space is called [dimensionality reduction](https://en.wikipedia.org/wiki/Dimensionality_reduction), and most implementations of count-based word vectors make use of dimensionality reduction so that the resulting vector space has a reasonable number of dimensions (say, 100—300, depending on the corpus and application).

The question of how to identify a "context" is itself very difficult to answer. In the toy example above, we've said that a "context" is just the word that precedes and the word that follows. Depending on your implementation of this procedure, though, you might want a context with a bigger "window" (e.g., two words before and after), or a non-contiguous window (skip a word before and after the given word). You might exclude certain "function" words like "the" and "of" when determining a word's context, or you might [lemmatize](https://en.wikipedia.org/wiki/Lemmatisation) the words before you begin your analysis, so two occurrences with different "forms" of the same word count as the same context. These are all questions open to research and debate, and different implementations of procedures for creating count-based word vectors make different decisions on this issue.

### GloVe vectors

But you don't have to create your own word vectors from scratch! Many researchers have made downloadable databases of pre-trained vectors. One such project is Stanford's [Global Vectors for Word Representation (GloVe)](https://nlp.stanford.edu/projects/glove/). These 300-dimensional vectors are included with spaCy, and they're the vectors we'll be using for the rest of this tutorial.

## Word vectors in spaCy

Okay, let's have some fun with real word vectors. We're going to use the GloVe vectors that come with spaCy to creatively analyze and manipulate the text of Bram Stoker's *Dracula*. First, make sure you've got `spacy` imported:

In [147]:
dracula[560]

hour

In [151]:
dracula[560].vector.shape

(96,)

In [156]:
# previously we've used the _sm model, which doesn't include all vectors.
# !pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.0/en_core_web_lg-2.2.0.tar.gz
import en_core_web_lg
nlp = en_core_web_lg.load()

In [157]:
resp = requests.get('http://www.gutenberg.org/cache/epub/345/pg345.txt')
dracula = nlp(resp.text)


And the cell below creates a list of unique words (or tokens) in the text, as a list of strings.

In [158]:
# all of the words in the text file
tokens = list(set([w.text for w in dracula if w.is_alpha]))

You can see the vector of any word in spaCy's vocabulary using the `vocab` attribute, like so:

In [159]:
nlp.vocab['alligator'].vector

array([-4.9304e-01,  1.2459e-01,  2.6142e-01,  3.8393e-02,  1.4018e-01,
        5.7285e-01, -5.0449e-01,  1.5094e-01,  7.3356e-02,  2.3308e-01,
       -1.6048e-01, -5.1184e-01, -1.4028e-01, -3.0110e-01, -4.3446e-01,
       -2.8257e-02,  1.9100e-03,  6.5157e-01,  1.4855e-01, -3.7255e-01,
       -4.6619e-01, -1.7223e-01, -5.1794e-01, -2.5453e-01, -2.6785e-01,
       -6.7776e-02, -3.0085e-01, -3.3212e-01, -1.5862e-01, -1.2336e-01,
       -3.8935e-01, -3.5551e-01, -3.6182e-01,  2.3197e-02, -1.7486e-01,
        2.2345e-01,  6.7557e-01,  9.6939e-03, -9.5640e-02,  1.2073e-01,
       -1.2016e-01, -1.6161e-01,  1.0661e-01, -4.5233e-01,  4.5991e-01,
        1.0367e-02, -1.0842e-01, -3.6163e-01,  2.5105e-01,  9.5780e-03,
       -6.1363e-01,  2.7494e-01,  2.1783e-01, -1.0663e-01, -9.5146e-03,
        2.7570e-01,  2.5888e-02, -5.6749e-02,  6.7667e-03,  5.3735e-01,
       -1.2224e-01,  8.8448e-02,  4.6094e-01,  4.6881e-02, -6.6557e-01,
       -3.9493e-01,  2.7362e-01,  1.2302e-01,  2.8594e-01, -2.52

For the sake of convenience, the following function gets the vector of a given string from spaCy's vocabulary:

In [160]:
def vec(s):
    return nlp.vocab[s].vector

### Cosine similarity and finding closest neighbors

The cell below defines a function `cosine()`, which returns the [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) of two vectors. Cosine similarity is another way of determining how similar two vectors are, which is more suited to high-dimensional spaces. [See the Encyclopedia of Distances for more information and even more ways of determining vector similarity.](http://www.uco.es/users/ma1fegan/Comunes/asignaturas/vision/Encyclopedia-of-distances-2009.pdf)

In [161]:
from sklearn.metrics.pairwise import cosine_similarity

In [212]:
# Helper function for building cosine similarity
def cos_sim(vec1, vec2):
    if not isinstance(vec1, np.ndarray):
        vec1 = np.asarray(vec1)
    if not isinstance(vec2, np.ndarray):
        vec2 = np.asarray(vec2)
    return cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]

The following cell shows that the cosine similarity between `dog` and `puppy` is larger than the similarity between `trousers` and `octopus`, thereby demonstrating that the vectors are working how we expect them to:

In [204]:
cos_sim(vec('dog'), vec('puppy')) > cos_sim(vec('trousers'), vec('octopus'))

True

The following cell defines a function that iterates through a list of tokens and returns the token whose vector is most similar to a given vector.

In [205]:
def spacy_closest(token_list, vec_to_check, n=10):
    return sorted(token_list,
                  key=lambda x: cos_sim(vec_to_check, vec(x)),
                  reverse=True)[:n]

Using this function, we can get a list of synonyms, or words closest in meaning (or distribution, depending on how you look at it), to any arbitrary word in spaCy's vocabulary. In the following example, we're finding the words in *Dracula* closest to "basketball":

In [206]:
# what's the closest equivalent of basketball?
spacy_closest(tokens, vec("basketball"))

['tennis',
 'coach',
 'game',
 'teams',
 'Junior',
 'junior',
 'Team',
 'school',
 'boys',
 'leagues']

### Fun with spaCy, Dracula, and vector arithmetic

Now we can start doing vector arithmetic and finding the closest words to the resulting vectors. For example, what word is closest to the halfway point between day and night?

In [213]:
# halfway between day and night
spacy_closest(tokens, meanv([vec("day"), vec("night")]))

['night',
 'Day',
 'day',
 'evening',
 'Evening',
 'Morning',
 'morning',
 'afternoon',
 'Nights',
 'nights']

Variations of `night` and `day` are still closest, but after that we get words like `evening` and `morning`, which are indeed halfway between day and night!

Here are the closest words in _Dracula_ to "wine":

In [214]:
spacy_closest(tokens, vec("wine"))

['wine',
 'beer',
 'bottle',
 'Drink',
 'drink',
 'cellar',
 'fruit',
 'bottles',
 'brandy',
 'taste']

If you subtract "alcohol" from "wine" and find the closest words to the resulting vector, you're left with simply a lovely dinner:

In [215]:
spacy_closest(tokens, subtractv(vec("wine"), vec("alcohol")))

['wine',
 'cellar',
 'exquisite',
 'fabulous',
 'splendid',
 'magnificent',
 'delightful',
 'dinner',
 'Dinner',
 'sparkling']

The closest words to "water":

In [216]:
spacy_closest(tokens, vec("water"))

['water',
 'waters',
 'Salt',
 'salt',
 'pond',
 'dry',
 'liquid',
 'ocean',
 'boiling',
 'heat']

But if you add "frozen" to "water," you get "ice":

In [217]:
spacy_closest(tokens, addv(vec("water"), vec("frozen")))

['water',
 'cold',
 'ice',
 'Salt',
 'salt',
 'dry',
 'fresh',
 'liquid',
 'boiling',
 'milk']

You can even do analogies! For example, the words most similar to "grass":

In [218]:
spacy_closest(tokens, vec("grass"))

['grass',
 'lawn',
 'trees',
 'greens',
 'grassy',
 'GARDEN',
 'garden',
 'sand',
 'foliage',
 'tree']

If you take the difference of "blue" and "sky" and add it to grass, you get the analogous word ("green"):

In [219]:
# analogy: blue is to sky as X is to grass
blue_to_sky = subtractv(vec("blue"), vec("sky"))
spacy_closest(tokens, addv(blue_to_sky, vec("grass")))

['grass',
 'Green',
 'GREEN',
 'green',
 'yellow',
 'Red',
 'red',
 'purple',
 'lawn',
 'pink']

## Sentence similarity

To get the vector for a sentence, we simply average its component vectors, like so:

In [220]:
def sentvec(s):
    sent = nlp(s)
    return meanv([w.vector for w in sent])

Let's find the sentence in our text file that is closest in "meaning" to an arbitrary input sentence. First, we'll get the list of sentences:

In [222]:
sentences = list(dracula.sents)

The following function takes a list of sentences from a spaCy parse and compares them to an input sentence, sorting them by cosine similarity.

In [224]:
def spacy_closest_sent(space, input_str, n=10):
    input_vec = sentvec(input_str)
    return sorted(space,
                  key=lambda x: cos_sim(np.array(sentvec(str(x))), input_vec),
                  reverse=True)[:n]

Here are the sentences in *Dracula* closest in meaning to "My favorite food is strawberry ice cream." (Extra linebreaks are present because we didn't strip them out when we originally read in the source text.)

In [225]:
for sent in spacy_closest_sent(sentences, "My favorite food is strawberry ice cream."):
    print(sent.text)
    print("---")

This, with some cheese
and a salad and a bottle of old Tokay, of which I had two glasses, was
my supper.
---
We get hot soup, or coffee, or tea; and
off we go.
---
rather supper, a chicken done up some way with red pepper, which was
very good but thirsty.
---
I got a cup of tea at the Aërated Bread Company
and came down to Purfleet by the next train.


---
I dined on what they
called "robber steak"--bits of bacon, onion, and beef, seasoned with red
pepper, and strung on sticks and roasted over the fire, in the simple
style of the London cat's meat!
---
There is not even a toilet glass on my
table, and I had to get the little shaving glass from my bag before I
could either shave or brush my hair.
---
There was everywhere a bewildering mass of fruit blossom--apple,
plum, pear, cherry; and as we drove by I could see the green grass under
the trees spangled with the fallen petals.
---
I
saw it drip with the fresh blood!
---
Drink it off, like a good
child.
---
I left Quince

## Further resources

* [Word2vec](https://en.wikipedia.org/wiki/Word2vec) is another procedure for producing word vectors which uses a predictive approach rather than a context-counting approach. [This paper](http://clic.cimec.unitn.it/marco/publications/acl2014/baroni-etal-countpredict-acl2014.pdf) compares and contrasts the two approaches. (Spoiler: it's kind of a wash.)
* If you want to train your own word vectors on a particular corpus, the popular Python library [gensim](https://radimrehurek.com/gensim/) has an implementation of Word2Vec that is relatively easy to use. [There's a good tutorial here.](https://rare-technologies.com/word2vec-tutorial/)
* When you're working with vector spaces with high dimensionality and millions of vectors, iterating through your entire space calculating cosine similarities can be a drag. I use [Annoy](https://pypi.python.org/pypi/annoy) to make these calculations faster, and you should consider using it too.