For this presentation, we will use FLAIR: https://www.analyticsvidhya.com/blog/2019/02/flair-nlp-library-python/?utm_source=blog&utm_medium=top-pretrained-models-nlp-article


In [None]:
!pip install allennlp
!pip install flair

In [1]:
# `Sentence` is defined in `flair.data`; import it from its home module instead
# of relying on `flair.embeddings` re-exporting it.
from flair.data import Sentence

In [40]:
from flair.embeddings import WordEmbeddings
# Load the pre-trained 'glove' word embeddings; later cells reuse `glove_embedding`.
glove_embedding = WordEmbeddings('glove')

# Wrap the raw text in a Flair Sentence so that it can be embedded.
sentence = Sentence('The grass is green .')

# embed() attaches a vector to each token of `sentence`; the vectors are then
# read back from the tokens below.
glove_embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

Token: 1 The
tensor([-0.0382, -0.2449,  0.7281, -0.3996,  0.0832,  0.0440, -0.3914,  0.3344,
        -0.5755,  0.0875,  0.2879, -0.0673,  0.3091, -0.2638, -0.1323, -0.2076,
         0.3340, -0.3385, -0.3174, -0.4834,  0.1464, -0.3730,  0.3458,  0.0520,
         0.4495, -0.4697,  0.0263, -0.5415, -0.1552, -0.1411, -0.0397,  0.2828,
         0.1439,  0.2346, -0.3102,  0.0862,  0.2040,  0.5262,  0.1716, -0.0824,
        -0.7179, -0.4153,  0.2033, -0.1276,  0.4137,  0.5519,  0.5791, -0.3348,
        -0.3656, -0.5486, -0.0629,  0.2658,  0.3020,  0.9977, -0.8048, -3.0243,
         0.0125, -0.3694,  2.2167,  0.7220, -0.2498,  0.9214,  0.0345,  0.4674,
         1.1079, -0.1936, -0.0746,  0.2335, -0.0521, -0.2204,  0.0572, -0.1581,
        -0.3080, -0.4162,  0.3797,  0.1501, -0.5321, -0.2055, -1.2526,  0.0716,
         0.7056,  0.4974, -0.4206,  0.2615, -1.5380, -0.3022, -0.0734, -0.2831,
         0.3710, -0.2522,  0.0162, -0.0171, -0.3898,  0.8742, -0.7257, -0.5106,
        -0.5203, -0.1459,  

Task: Compare the embeddings obtained using GloVe for the same word in different contexts (i.e., different sentences). Are they equal or different?

Now, let's average the vectors into a single vector that would represent our entire document, and use it for classification. We'll build a Logistic Regression classifier on top of it.

In [3]:
import numpy as np
import torch

def get_sentence_embedding(sentence):
    """Embed a text with GloVe and average its token vectors into one vector.

    Parameters
    ----------
    sentence : str
        Raw text; it is wrapped in a Flair ``Sentence`` before embedding.

    Returns
    -------
    numpy.ndarray
        1-D array holding the element-wise mean of the token embeddings. A text
        that yields no tokens gives an all-zero vector of the embedding size
        instead of the NaN scalar that ``np.mean`` returns for an empty list.
    """
    # Use a separate name so the ``sentence`` argument is not shadowed.
    flair_sentence = Sentence(sentence)
    glove_embedding.embed(flair_sentence)
    # Move each tensor to host memory explicitly so the conversion also works
    # when the embeddings are stored on a GPU.
    token_vectors = [token.embedding.detach().cpu().numpy() for token in flair_sentence]
    if not token_vectors:
        return np.zeros(glove_embedding.embedding_length, dtype=np.float32)
    return np.mean(token_vectors, axis=0)

In [4]:
# Sanity check: embed one short sentence and display the averaged vector.
get_sentence_embedding('The grass is green .')

array([-0.48264474,  0.33375996,  0.348696  , -0.5163    ,  0.191962  ,
        0.12714759,  0.013061  ,  0.1766614 , -0.1873308 , -0.093839  ,
        0.0488024 , -0.0484856 ,  0.314986  ,  0.031634  ,  0.2535662 ,
       -0.059972  ,  0.38505   ,  0.06304   ,  0.027378  ,  0.06385148,
       -0.1046188 ,  0.131214  ,  0.39698398,  0.0049592 ,  0.48706597,
        0.27059498,  0.0188544 , -0.780686  , -0.160654  , -0.0207716 ,
       -0.2985124 ,  0.521548  ,  0.371312  ,  0.0037584 ,  0.24874802,
        0.3579286 , -0.187218  ,  0.484008  ,  0.1211252 ,  0.0338024 ,
       -0.32039762, -0.578998  ,  0.1858078 , -0.27883598,  0.07773139,
       -0.14281002,  0.23905559, -0.13043599, -0.1817726 , -0.49833995,
       -0.10820474, -0.30922002,  0.285602  ,  1.1599319 , -0.49102196,
       -2.58022   ,  0.021746  ,  0.043806  ,  1.479552  ,  0.427112  ,
       -0.02804599,  0.67730397, -0.0862168 ,  0.305978  ,  1.0884    ,
       -0.21497002,  0.2661428 , -0.022402  ,  0.3063696 , -0.29

In [7]:
import pandas as pd
# The first CSV column is the saved row index (it otherwise loads as an extra
# 'Unnamed: 0' column), so use it as the DataFrame index.
df = pd.read_csv('20newsgroups.csv', index_col=0)
df.head()

Unnamed: 0,data,target,target_name,data_processed,num_chars
0,Does anyone have the scoop on Scot Erickson? ...,0,rec.sport.baseball,does anyone have the scoop on scot erickson ? ...,82
1,\nNot particularly *in* the World Series. Duri...,0,rec.sport.baseball,not particularly *in* the world series . durin...,3145
2,\nI think the three-headed GM's guiding princi...,1,rec.sport.hockey,i think the three-headed gm's guiding principl...,959
3,\n\n\nA suggestion: cameras panning over plant...,2,talk.politics.guns,a suggestion: cameras panning over planted aut...,151
4,\nDon't you Americans study history...the Fren...,1,rec.sport.hockey,don't you americans study history . . .the fre...,207


In [8]:
# Feature matrix: one averaged GloVe vector per pre-processed document.
vectors = [get_sentence_embedding(x) for x in df['data_processed']]
vectors = np.array(vectors)
# Labels: the `target` column of the dataframe.
y_truth = df['target']

In [9]:
from sklearn import linear_model
from sklearn import metrics

# Fit a logistic-regression classifier on the averaged document vectors.
# NOTE(review): C=1e5 presumably makes the L2 penalty almost inactive — confirm this is intended.
clf = linear_model.LogisticRegression(C=1e5)
clf.fit(vectors, y_truth)



LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [10]:
# NOTE(review): predictions are made on the same `vectors` the classifier was fitted on.
y_predict = clf.predict(vectors)

In [11]:
# Macro-averaged F1 of `y_predict` against `y_truth`. `y_predict` was computed on the
# data used for fitting, so this is a training-set score, not a held-out estimate.
metrics.f1_score(y_truth, y_predict, average='macro')

0.8789065712873826

## Training our own Word Vectors
It's very easy to train our own word vectors based on our custom task. This can lead to an increase in performance if our domain is different from that used for training common word vectors (usually Wikipedia).  

We will train word vectors using the Gensim package:

In [107]:
from sklearn.datasets import fetch_20newsgroups
# Parts of each post that scikit-learn is asked to strip when loading.
types_to_remove = ('headers', 'footers', 'quotes')
# Load the 20 newsgroups corpus with those parts removed; the texts are read from `newsgroups.data` below.
newsgroups = fetch_20newsgroups(remove=types_to_remove)

In [112]:
def preprocess_text(txt):
    """Normalise raw text into lower-case tokens separated by single spaces.

    Newlines, carriage returns and tabs become spaces; each '?', '.' and ','
    gets a space inserted in front of it so that it stands apart from the
    preceding word. The text is then lower-cased and runs of spaces collapsed.
    """
    substitutions = str.maketrans({
        '\n': ' ', '\r': ' ', '\t': ' ',
        '?': ' ?', '.': ' .', ',': ' ,',
    })
    spaced = txt.translate(substitutions).lower().strip()
    # Splitting on a literal space leaves empty strings wherever spaces repeat;
    # dropping them collapses every run into a single separator.
    return " ".join(filter(None, spaced.split(' ')))

In [153]:
import gensim
# Train skip-gram (sg=1) Word2Vec vectors on the pre-processed, whitespace-tokenised posts.
# NOTE(review): `iter` and `size` are the keyword names of gensim < 4.0 (4.x renamed them to
# `epochs` and `vector_size`) — confirm the installed gensim version.
w2v = gensim.models.Word2Vec([preprocess_text(s).split() for s in newsgroups.data], iter=50, sg=1, min_count=5, size=100, window=3, workers=7)
w2v.init_sims(replace=True) # frees memory of word vectors but prevents further training

In [161]:
def evaluate_words(words_to_check=('love', 'hate', 'lonely', 'heartache', 'success', 'guitar', 'god', 'beer', 'gun', 'police'), model=None, topn=5):
    """Print the nearest neighbours of each probe word in a word-vector model.

    Parameters
    ----------
    words_to_check : iterable of str
        Words to look up. The default is an immutable tuple of probe words.
    model : optional
        Object exposing gensim-style keyed vectors as ``model.wv``. Defaults to
        the notebook-level ``w2v`` model.
    topn : int
        Number of neighbours to print per word.

    Notes
    -----
    A word missing from the model's vocabulary prints only its header line.
    Any error other than the vocabulary-miss ``KeyError`` is propagated.
    """
    if model is None:
        model = w2v
    for word in words_to_check:
        print(word, ' -> ')
        try:
            neighbours = model.wv.similar_by_word(word, topn=topn)
        except KeyError:
            # Out-of-vocabulary word: nothing to list. Only this case is
            # tolerated, so genuine failures are no longer silently hidden.
            neighbours = None
        if neighbours is not None:
            print('\n'.join('\t{} ({:.2f}), '.format(tup[0], tup[1]) for tup in neighbours))
        print()

In [162]:
# Print the nearest neighbours of the default probe words in the custom Word2Vec model.
evaluate_words()

love  -> 
	despise (0.61), 
	misread (0.57), 
	mourning (0.56), 
	hate (0.55), 
	trust (0.54), 

hate  -> 
	loathe (0.57), 
	love (0.55), 
	generalize (0.53), 
	prefer (0.53), 
	hating (0.51), 

lonely  -> 

heartache  -> 

success  -> 
	dearly (0.51), 
	results: (0.50), 
	incidence (0.50), 
	markedly (0.50), 
	concern (0.49), 

guitar  -> 
	moe (0.52), 
	puzzle (0.49), 
	correctable (0.48), 
	tetris (0.48), 
	rams (0.47), 

god  -> 
	jesus (0.76), 
	christ (0.73), 
	whosoever (0.72), 
	eternal (0.70), 
	god's (0.70), 

beer  -> 
	drinking (0.52), 
	smoked (0.52), 
	talon (0.49), 
	backyard (0.49), 
	cats (0.49), 

gun  -> 
	firearms (0.66), 
	homicide (0.65), 
	handgun (0.64), 
	handguns (0.63), 
	guns (0.61), 

police  -> 
	cops (0.63), 
	criminal (0.62), 
	precinct (0.60), 
	affidavit (0.60), 
	legislature (0.57), 



Let's compare this to the GloVe embeddings from Flair. Flair doesn't provide a way to get the most similar vectors, but we can implement this in a naive way ourselves. We'll create a dictionary that maps each word from our data to its word embedding; then, given a query word, we compute its cosine similarity to all other words and return the most similar ones.

In [28]:
from collections import Counter
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# All pre-processed documents, taken from the dataframe column.
d = df['data_processed'].values
# Count how often each whitespace-separated token occurs across the documents.
vocabulary = Counter([w for sent in d for w in sent.split()])
# Frequency threshold: only tokens seen strictly more than TH times are kept.
TH = 5
vocabulary = {k:v for k,v in vocabulary.most_common() if v>TH}

In [81]:
# Map each vocabulary word to its GloVe vector, stored as a NumPy array.
word_embeddings = {}
for w in vocabulary.keys():
    # Each word is embedded on its own, wrapped in a one-off Sentence.
    sentence = Sentence(w)
    glove_embedding.embed(sentence)
    for token in sentence:
        # Keyed by the token's text as reported by Flair.
        word_embeddings[token.text] = np.array(token.embedding)

In [91]:
# Word whose neighbours we want to inspect; it must be a key of `word_embeddings`.
query = 'hate'
# Cosine similarity between the query vector (reshaped to a single row) and every stored vector.
cos_sim = cosine_similarity(word_embeddings[query].reshape(1, -1), np.array([v for k,v in word_embeddings.items()]))[0]
# sort_values() sorts ascending, so the most similar words appear at the bottom of the output.
pd.Series(cos_sim, index=word_embeddings.keys()).sort_values()

narimanov       -0.377583
ahola           -0.359337
akgun           -0.354997
lssu            -0.340201
basim           -0.326174
macoun          -0.326096
akhalkalaki     -0.322035
gunduz          -0.321786
davidsson       -0.320443
aslin           -0.317507
bobbs-merrill   -0.312970
khmylev         -0.311811
villalta        -0.306674
babych          -0.305737
microdistrict   -0.304121
anania          -0.303885
.067            -0.302436
puppa           -0.302278
bortnick        -0.295164
snd             -0.293724
light-hitting   -0.293436
dolezal         -0.290676
basar           -0.289941
cuyler          -0.289331
erivan          -0.286078
gaudreau        -0.284338
stanky          -0.283377
fedyk           -0.282819
gurvitz         -0.282730
18:39           -0.282158
                   ...   
someone          0.576079
talk             0.578374
feel             0.578664
racial           0.579516
remember         0.583630
kind             0.585443
do               0.588497
why         

In [163]:
def get_sentence_embedding_from_custom_w2v(sentence, model=None):
    """Average the custom Word2Vec vectors of a sentence's tokens.

    Parameters
    ----------
    sentence : str
        Pre-processed text; tokens are obtained with ``str.split()``.
    model : optional
        Object exposing gensim-style keyed vectors as ``model.wv``. Defaults to
        the notebook-level ``w2v`` model.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.wv.vector_size``. Out-of-vocabulary tokens
        count as zero vectors in the mean; a sentence without any tokens gives
        an all-zero vector.
    """
    if model is None:
        model = w2v
    keyed_vectors = model.wv
    # Read the dimensionality from the model instead of probing the word 'the',
    # which fails if that word is not in the vocabulary.
    dim = keyed_vectors.vector_size
    tokens = sentence.split()
    if not tokens:
        # np.mean over an empty list yields a NaN scalar, which would break the
        # stacked feature matrix built from these vectors.
        return np.zeros(dim)
    token_vectors = [
        keyed_vectors.get_vector(token) if token in keyed_vectors.vocab else np.zeros(dim)
        for token in tokens
    ]
    return np.mean(token_vectors, axis=0)

In [164]:
# Rebuild the feature matrix, this time from the custom Word2Vec vectors.
# This rebinds `vectors`, replacing the GloVe features computed earlier.
vectors = [get_sentence_embedding_from_custom_w2v(x) for x in df['data_processed']]
vectors = np.array(vectors)
y_truth = df['target']

In [165]:
from sklearn import linear_model
from sklearn import metrics

# Same classifier settings as for the GloVe features, refitted on the Word2Vec features.
# This rebinds `clf`, so the earlier GloVe-based model is no longer available.
clf = linear_model.LogisticRegression(C=1e5)
clf.fit(vectors, y_truth)



LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [166]:
# NOTE(review): predictions are made on the same `vectors` the classifier was fitted on.
y_predict = clf.predict(vectors)

In [167]:
# Macro-averaged F1 for the Word2Vec features. `y_predict` was computed on the data
# used for fitting, so this is a training-set score, not a held-out estimate.
metrics.f1_score(y_truth, y_predict, average='macro')

0.838586471495323