In [13]:
import numpy as np
import pandas as pd
import scipy as sc
import sklearn
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score
import statsmodels.api as sm
import sys
### Gensim is outside the anaconda distribution ###
### uncomment to install Gensim ###
#!{sys.executable} -m pip install gensim
import gensim
import gensim.downloader as model_api

In [14]:
# Load pretrained word embeddings
# This will download 60mb of data the first time it's loaded
word_vectors = model_api.load("glove-wiki-gigaword-50")

There is some good information decomposing word embeddings on [Jay Alammar's blog](http://jalammar.github.io/illustrated-word2vec/).

Word embedding dimensions capture high level concepts, which let algebra "work" in cosine distance:

In [15]:
# Get the most similar word to an expression
word_vectors.most_similar_cosmul(positive=['king', 'woman'], negative=['man'])

[('queen', 0.9288907647132874),
 ('throne', 0.882325291633606),
 ('elizabeth', 0.878950297832489),
 ('princess', 0.876754879951477),
 ('daughter', 0.8705160617828369),
 ('prince', 0.8702554702758789),
 ('kingdom', 0.8607221841812134),
 ('eldest', 0.8595449328422546),
 ('monarch', 0.8584719896316528),
 ('widow', 0.8549266457557678)]

# Sentence embeddings

The simplest and most effective way to represent a sentence is to sum or average the sentence's words. There are [some better methods](https://openreview.net/forum?id=SyK00v5xx) using weights, or using deep learning language models, but sentence embeddings are often just as good while being simpler.

In [16]:
df = pd.read_csv("data/troll.csv")
df.columns = [x.lower() for x in df.columns]
df

Unnamed: 0,insult,date,comment
0,1,20120618192155Z,"""You fuck your dad."""
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ..."
2,0,,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,0,,"""listen if you dont wanna get married to a man..."
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."
...,...,...,...
3942,1,20120502172717Z,"""you are both morons and that is never happening"""
3943,0,20120528164814Z,"""Many toolbars include spell check, like Yahoo..."
3944,0,20120620142813Z,"""@LambeauOrWrigley\xa0\xa0@K.Moss\xa0\nSioux F..."
3945,0,20120528205648Z,"""How about Felix? He is sure turning into one ..."


In [17]:
words = df.comment.str.split()
words = pd.DataFrame(words.tolist())

Let's clean the words up:

In [18]:
import re

replaceDict = dict({
'{':" ", '}':" ", ',':"", '.':" ", '!':" ", '\\':" ", '/':" ", '$':" ", '%':" ",
'^':" ", '?':" ", '\'':" ", '"':" ", '(':" ", ')':" ", '*':" ", '+':" ", '-':" ",
'=':" ", ':':" ", ';':" ", ']':" ", '[':" ", '`':" ", '~':" ",
})

rep = dict((re.escape(k), v) for k, v in replaceDict.items())
pattern = re.compile("|".join(rep.keys()))
def replacer(text):
    return rep[re.escape(text.group(0))]

words = df.comment.str.replace(pattern, replacer).str.lower().str.split()
words = pd.DataFrame(words.tolist())
words

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2481,2482,2483,2484,2485,2486,2487,2488,2489,2490
0,you,fuck,your,dad,,,,,,,...,,,,,,,,,,
1,i,really,don,t,understand,your,point,xa0,it,seems,...,,,,,,,,,,
2,a,xc2,xa0majority,of,canadians,can,and,has,been,wrong,...,,,,,,,,,,
3,listen,if,you,dont,wanna,get,married,to,a,man,...,,,,,,,,,,
4,c,xe1c,b,u1ea1n,xu,u1ed1ng,u0111,u01b0,u1eddng,bi,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3942,you,are,both,morons,and,that,is,never,happening,,...,,,,,,,,,,
3943,many,toolbars,include,spell,check,like,yahoo,for,example,you,...,,,,,,,,,,
3944,@lambeauorwrigley,xa0,xa0@k,moss,xa0,nsioux,falls,s,d,i,...,,,,,,,,,,
3945,how,about,felix,he,is,sure,turning,into,one,hell,...,,,,,,,,,,


### Sentence embeddings quickly

This is a short way to generate sentence embeddings from a column.

It's not very efficient and can be optimized a lot, though

In [19]:
def soft_get(w):
    try:
        return word_vectors[w]
    except KeyError:
        return np.zeros(word_vectors.vector_size)

def map_vectors(row):
    try:
        return np.sum(
            row.loc[words.iloc[0].notna()].apply(soft_get)
        )
    except:
        return np.zeros(word_vectors.vector_size)

emb = pd.DataFrame(words.apply(map_vectors, axis=1).tolist())
emb

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-1.552125,0.848600,0.910060,-2.855960,3.130450,-1.382110,-0.664681,0.110994,0.332940,1.048678,...,1.091860,-0.290650,-2.290890,2.069010,1.017153,-0.045770,1.597997,-2.269570,0.493802,2.511670
1,-0.809713,0.444310,-0.170361,0.519650,1.639630,-1.408988,-1.431460,-1.140770,-2.752543,0.100198,...,0.047036,1.614048,-1.388177,0.799910,0.799906,-0.294300,-0.281051,-2.259776,-0.534389,4.159340
2,0.925580,1.036030,-0.939170,0.281300,1.557990,1.474480,-0.349470,-0.786490,0.271930,-0.043840,...,-0.209140,0.654560,-0.567197,-0.038036,-0.429195,-0.031350,-0.096000,-0.269402,0.316212,-0.624150
3,0.984628,0.410783,1.452170,-2.068327,1.465770,-2.765390,-0.620257,1.445397,-1.201449,1.350200,...,0.443955,0.315920,-0.401230,2.645420,-0.151940,0.170872,0.417923,0.263880,-0.998752,3.392430
4,-0.789980,0.814320,0.615490,1.806600,1.655930,0.836760,1.297190,-2.180100,0.429000,1.341210,...,1.526580,0.654890,0.291433,0.167440,-0.857240,2.045300,0.228305,-1.293180,1.370850,1.473740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3942,1.028558,-0.162314,1.025981,-1.619540,1.495300,-0.094800,-0.645170,-1.350557,-2.186080,0.532649,...,-0.865516,-0.051755,0.820879,3.084972,0.362180,0.253310,-0.971350,-0.306874,-0.933339,0.610880
3943,2.516875,0.420063,-1.881124,-0.264210,-1.111890,0.797190,-2.294540,-1.572460,-0.762668,0.912920,...,-0.409087,1.071230,0.517720,2.127889,0.270960,1.529950,0.236790,0.207239,0.298230,-1.649290
3944,-0.645630,0.635280,-0.245820,0.158380,0.026548,0.614830,-1.722900,-0.663420,0.264600,-0.409590,...,0.474210,0.379990,-0.610310,0.195210,-0.187130,-0.193770,0.361500,-0.368440,-0.372750,-0.179140
3945,2.227540,0.215140,0.155670,-1.126360,2.026350,-0.482939,-1.700610,0.818070,-2.132360,0.212580,...,-1.046870,1.698601,0.623800,0.890860,0.305628,-0.110268,-0.707204,-0.980380,-1.087599,0.942129


In [20]:
sgdr = SGDRegressor()
sgdr.fit(emb, df.insult)
sgdr.score(emb, df.insult)

SGDRegressor()

35% R^2 versus 25% when we were using TF-IDF! A huge win.

We could also augment our embeddings with TF-IDF weights