In [None]:
# Fetch a shallow copy (--depth 1) of the workshop repository, then make its
# notebooks folder the working directory; later cells read '../data/...'
# relative to that folder.
# NOTE(review): re-running this cell in the same kernel repeats both the clone
# and the relative chdir — presumably it is meant to run once per session.
!git clone https://github.com/omriallouche/text_classification_from_zero_to_hero.git --depth 1
import os
os.chdir('text_classification_from_zero_to_hero/notebooks')

# Word2vec, GloVe and Word Embeddings
## Part 2 of the Workshop "Text Classification - From Zero to Hero", by Dr. Omri Allouche, Gong.io, Bar Ilan University

For this presentation, we will use Flair, an NLP library for Python. An introduction to the library is available here: https://www.analyticsvidhya.com/blog/2019/02/flair-nlp-library-python/?utm_source=blog&utm_medium=top-pretrained-models-nlp-article

In [15]:
!pip install flair

Requirement already up-to-date: gensim in c:\programdata\anaconda3\lib\site-packages (3.8.1)


## Pre-trained word embeddings using Flair

In [None]:
# Sentence is defined in flair.data; importing it from flair.embeddings only
# works where that module happens to re-export it.
from flair.data import Sentence
from flair.embeddings import WordEmbeddings

# Pre-trained GloVe word vectors, loaded through Flair.
glove_embedding = WordEmbeddings('glove')

sentence = Sentence('The grass is green .')

# embed() attaches a vector to every token of the sentence, in place.
glove_embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

Task: Compare the GloVe embeddings obtained for the same word in different contexts (i.e., in different sentences). Are they equal or different?

## Sentence embedding

### Sentence embedding using the average of word vectors
Now, let's average the vectors into a single vector that would represent our entire document, and use it for classification. We'll build a Logistic Regression classifier on top of it.

In [2]:
import numpy as np
def get_sentence_embedding(sentence):
    """Embed a text as the mean of the GloVe vectors of its tokens.

    Args:
        sentence: Raw text (str). It is wrapped in a Flair ``Sentence``, which
            splits it into tokens.

    Returns:
        1-D numpy array holding the element-wise mean of the token vectors.
        A text that yields no tokens returns an all-zero vector of length
        ``glove_embedding.embedding_length`` so every document maps to a
        vector of the same size.
    """
    # Use a separate name so the raw-text argument is not shadowed.
    flair_sentence = Sentence(sentence)
    glove_embedding.embed(flair_sentence)
    # Token embeddings are torch tensors; detach + cpu keeps the conversion
    # working when Flair places its tensors on a GPU.
    token_vectors = [token.embedding.detach().cpu().numpy() for token in flair_sentence]
    if not token_vectors:
        # np.mean over an empty list would give a NaN scalar (with a warning),
        # which cannot be stacked with the other document vectors.
        return np.zeros(glove_embedding.embedding_length, dtype=np.float32)
    return np.mean(token_vectors, axis=0)

In [10]:
# Sanity check: embed one sample sentence and display the resulting vector.
get_sentence_embedding('The grass is green .')

array([-0.48264474,  0.33375996,  0.348696  , -0.5163    ,  0.191962  ,
        0.12714759,  0.013061  ,  0.1766614 , -0.1873308 , -0.093839  ,
        0.0488024 , -0.0484856 ,  0.314986  ,  0.031634  ,  0.2535662 ,
       -0.059972  ,  0.38505   ,  0.06304   ,  0.027378  ,  0.06385148,
       -0.1046188 ,  0.131214  ,  0.39698398,  0.0049592 ,  0.48706597,
        0.27059498,  0.0188544 , -0.780686  , -0.160654  , -0.0207716 ,
       -0.2985124 ,  0.521548  ,  0.371312  ,  0.0037584 ,  0.24874802,
        0.3579286 , -0.187218  ,  0.484008  ,  0.1211252 ,  0.0338024 ,
       -0.32039762, -0.578998  ,  0.1858078 , -0.27883598,  0.07773139,
       -0.14281002,  0.23905559, -0.13043599, -0.1817726 , -0.49833995,
       -0.10820474, -0.30922002,  0.285602  ,  1.1599319 , -0.49102196,
       -2.58022   ,  0.021746  ,  0.043806  ,  1.479552  ,  0.427112  ,
       -0.02804599,  0.67730397, -0.0862168 ,  0.305978  ,  1.0884    ,
       -0.21497002,  0.2661428 , -0.022402  ,  0.3063696 , -0.29

Now, let's load our own data and train a Logistic Regression classifier on its sentence embeddings:

In [11]:
from sklearn import linear_model
from sklearn import metrics
# In scikit-learn, C is the inverse of the regularization strength, so a value
# as large as 1e5 leaves the model almost unregularized.
clf = linear_model.LogisticRegression(C=1e5)

In [12]:
import pandas as pd

# Turn every training document into one averaged GloVe vector and fit the
# classifier on the resulting matrix.
df = pd.read_csv('../data/train.csv')
vectors = np.array(list(map(get_sentence_embedding, df['text'])))
y_truth = df['label']
clf.fit(X=vectors, y=y_truth)

# Accuracy measured on the very same rows the model was fitted on.
y_predict = clf.predict(X=vectors)
metrics.accuracy_score(y_true=y_truth, y_pred=y_predict)



1.0

And now let's check the performance on our validation set (`val.csv`):

In [13]:
# Score the already-fitted classifier on the held-out file; nothing is re-fitted.
# df, vectors, y_truth and y_predict are rebound to the held-out data here.
df = pd.read_csv('../data/val.csv')
vectors = np.array(list(map(get_sentence_embedding, df['text'])))
y_truth = df['label']
y_predict = clf.predict(X=vectors)
metrics.accuracy_score(y_true=y_truth, y_pred=y_predict)

0.7087812901155326

In [14]:
# Per-class precision / recall / F1 for the most recent predictions; print()
# keeps the line breaks of the formatted report.
print(metrics.classification_report(y_true=y_truth, y_pred=y_predict))

                       precision    recall  f1-score   support

   rec.sport.baseball       0.69      0.69      0.69        16
     rec.sport.hockey       0.71      0.75      0.73        20
   talk.politics.guns       0.79      0.68      0.73        22
talk.politics.mideast       0.65      0.72      0.68        18

            micro avg       0.71      0.71      0.71        76
            macro avg       0.71      0.71      0.71        76
         weighted avg       0.72      0.71      0.71        76

