In [None]:
!git clone https://github.com/omriallouche/text_classification_from_zero_to_hero.git
import os
os.chdir('text_classification_from_zero_to_hero')

# Word2vec, GloVe and Word Embeddings
## Part 3 of the Workshop "Text Classification - From Zero to Hero", by Dr. Omri Allouche, Gong.io, Bar Ilan University

For this presentation, we will use FLAIR: https://www.analyticsvidhya.com/blog/2019/02/flair-nlp-library-python/?utm_source=blog&utm_medium=top-pretrained-models-nlp-article

In [None]:
!pip install flair

## Pre-trained word embeddings using Flair

In [8]:
# Sentence lives in flair.data (the same module this notebook imports Corpus
# from); importing it from flair.embeddings only works as an incidental re-export.
from flair.data import Sentence
from flair.embeddings import WordEmbeddings

# Pre-trained GloVe word vectors.
glove_embedding = WordEmbeddings('glove')

sentence = Sentence('The grass is green .')

# embed() works in place: afterwards every token exposes its vector as .embedding
glove_embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

Token: 1 The
tensor([-0.0382, -0.2449,  0.7281, -0.3996,  0.0832,  0.0440, -0.3914,  0.3344,
        -0.5755,  0.0875,  0.2879, -0.0673,  0.3091, -0.2638, -0.1323, -0.2076,
         0.3340, -0.3385, -0.3174, -0.4834,  0.1464, -0.3730,  0.3458,  0.0520,
         0.4495, -0.4697,  0.0263, -0.5415, -0.1552, -0.1411, -0.0397,  0.2828,
         0.1439,  0.2346, -0.3102,  0.0862,  0.2040,  0.5262,  0.1716, -0.0824,
        -0.7179, -0.4153,  0.2033, -0.1276,  0.4137,  0.5519,  0.5791, -0.3348,
        -0.3656, -0.5486, -0.0629,  0.2658,  0.3020,  0.9977, -0.8048, -3.0243,
         0.0125, -0.3694,  2.2167,  0.7220, -0.2498,  0.9214,  0.0345,  0.4674,
         1.1079, -0.1936, -0.0746,  0.2335, -0.0521, -0.2204,  0.0572, -0.1581,
        -0.3080, -0.4162,  0.3797,  0.1501, -0.5321, -0.2055, -1.2526,  0.0716,
         0.7056,  0.4974, -0.4206,  0.2615, -1.5380, -0.3022, -0.0734, -0.2831,
         0.3710, -0.2522,  0.0162, -0.0171, -0.3898,  0.8742, -0.7257, -0.5106,
        -0.5203, -0.1459,  

Task: Compare the embeddings obtained using GloVe for the same word in different contexts (i.e., different sentences). Are they equal or different?

## Sentence embedding

### Sentence embedding using the average of word vectors
Now, let's average the vectors into a single vector that would represent our entire document, and use it for classification. We'll build a Logistic Regression classifier on top of it.

In [9]:
import numpy as np
def get_sentence_embedding(sentence):
    """Return the mean GloVe vector of a text as a NumPy array.

    Args:
        sentence: Raw text (str). It is wrapped in a Flair ``Sentence`` and
            embedded with the module-level ``glove_embedding``.

    Returns:
        numpy.ndarray: 1-D element-wise average of the token embeddings. If the
        text produces no tokens, a zero vector of length
        ``glove_embedding.embedding_length`` is returned, so callers that stack
        the results into a matrix keep a consistent shape instead of getting NaN.
    """
    # Use a separate name: the parameter is a str, the Flair object is not.
    flair_sentence = Sentence(sentence)
    glove_embedding.embed(flair_sentence)

    # Move each tensor to host memory first; np.array() on a CUDA tensor raises.
    token_vectors = [
        token.embedding.detach().cpu().numpy() for token in flair_sentence
    ]
    if not token_vectors:
        return np.zeros(glove_embedding.embedding_length, dtype=np.float32)
    return np.mean(token_vectors, axis=0)

In [10]:
# Sanity check: embed one short sentence and display the averaged vector.
example_embedding = get_sentence_embedding('The grass is green .')
example_embedding

array([-0.48264474,  0.33375996,  0.348696  , -0.5163    ,  0.191962  ,
        0.12714759,  0.013061  ,  0.1766614 , -0.1873308 , -0.093839  ,
        0.0488024 , -0.0484856 ,  0.314986  ,  0.031634  ,  0.2535662 ,
       -0.059972  ,  0.38505   ,  0.06304   ,  0.027378  ,  0.06385148,
       -0.1046188 ,  0.131214  ,  0.39698398,  0.0049592 ,  0.48706597,
        0.27059498,  0.0188544 , -0.780686  , -0.160654  , -0.0207716 ,
       -0.2985124 ,  0.521548  ,  0.371312  ,  0.0037584 ,  0.24874802,
        0.3579286 , -0.187218  ,  0.484008  ,  0.1211252 ,  0.0338024 ,
       -0.32039762, -0.578998  ,  0.1858078 , -0.27883598,  0.07773139,
       -0.14281002,  0.23905559, -0.13043599, -0.1817726 , -0.49833995,
       -0.10820474, -0.30922002,  0.285602  ,  1.1599319 , -0.49102196,
       -2.58022   ,  0.021746  ,  0.043806  ,  1.479552  ,  0.427112  ,
       -0.02804599,  0.67730397, -0.0862168 ,  0.305978  ,  1.0884    ,
       -0.21497002,  0.2661428 , -0.022402  ,  0.3063696 , -0.29

Now, let's load our own data:

In [11]:
from sklearn import linear_model
from sklearn import metrics

# C is scikit-learn's inverse regularisation strength, so a value this large
# leaves the logistic regression almost unregularised.
INVERSE_REG_STRENGTH = 1e5
clf = linear_model.LogisticRegression(C=INVERSE_REG_STRENGTH)

In [12]:
import pandas as pd

# Training split: one averaged GloVe vector per row of the 'text' column,
# with the 'label' column as the target.
df = pd.read_csv('data/train.csv')
y_truth = df['label']
vectors = np.array(list(map(get_sentence_embedding, df['text'])))

clf.fit(vectors, y_truth)

# Macro-F1 on the same rows the model was fitted on (training score).
y_predict = clf.predict(vectors)
metrics.f1_score(
    y_truth,
    y_predict,
    average='macro',
)



1.0

And now let's check the performance on our validation set (`data/val.csv`):

In [13]:
# Score the already-fitted classifier on the held-out file data/val.csv.
df = pd.read_csv('data/val.csv')
y_truth = df['label']
vectors = np.array([get_sentence_embedding(text) for text in df['text']])

# No re-fitting here: only predict, then report macro-F1.
y_predict = clf.predict(vectors)
metrics.f1_score(y_truth, y_predict, average='macro')

0.7087812901155326

In [14]:
# Per-class precision / recall / F1 for the current y_truth and y_predict.
report = metrics.classification_report(y_truth, y_predict)
print(report)

                       precision    recall  f1-score   support

   rec.sport.baseball       0.69      0.69      0.69        16
     rec.sport.hockey       0.71      0.75      0.73        20
   talk.politics.guns       0.79      0.68      0.73        22
talk.politics.mideast       0.65      0.72      0.68        18

            micro avg       0.71      0.71      0.71        76
            macro avg       0.71      0.71      0.71        76
         weighted avg       0.72      0.71      0.71        76



### Sentence Embedding using Flair's DocumentPoolEmbeddings Class
We've used Flair to get embeddings for each word in the sentence. However, for text classification of the entire document, we need a way to integrate all these vectors into a single document embedding. There are several methods for that, and those interested would find this article useful - https://towardsdatascience.com/document-embedding-techniques-fed3e7a6a25d

The most basic approach is averaging the word embeddings into a single document embedding. In FLAIR, we do this using `DocumentPoolEmbeddings`.

In [None]:
from flair.embeddings import DocumentPoolEmbeddings

# Mean-pool the GloVe token vectors into one vector for the whole document.
token_embedders = [glove_embedding]
document_embeddings = DocumentPoolEmbeddings(token_embedders, pooling='mean')

# Embeds in place: the result is stored on the Sentence object.
document_embeddings.embed(sentence)

# Show the embedding object itself as the cell output.
document_embeddings

In [None]:
# Inspect the document-level vector stored on the sentence: shape, then values.
document_vector = sentence.get_embedding()
print(document_vector.shape)
print(document_vector)

#### By using an RNN
Alternatively, we can use an RNN that runs over the word embeddings. We will use the last hidden state as the document embedding. In this case it is very helpful to train the model using the true labels of our task, so that the RNN is optimized for our own data and task:

In [None]:
from flair.embeddings import DocumentRNNEmbeddings, WordEmbeddings

# GloVe token vectors are the input to the recurrent document encoder.
glove_embedding = WordEmbeddings('glove')

# Use an LSTM as the recurrent unit.
document_embeddings = DocumentRNNEmbeddings(
    [glove_embedding],
    rnn_type='LSTM',
)

In [None]:
# Example document consisting of two short sentences.
sentence = Sentence('The grass is green . And the sky is blue .')

# Run the RNN document embedding over it; the result is stored on the sentence.
document_embeddings.embed(sentence)

# Display the resulting document vector.
rnn_document_vector = sentence.get_embedding()
print(rnn_document_vector)

Note that while `DocumentPoolEmbeddings` are immediately meaningful, `DocumentRNNEmbeddings` need to be tuned on the downstream task. This happens automatically in Flair if you train a new model with these embeddings. You can find an example of training a text classification model [here](https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md#training-a-text-classification-model). Once the model is trained, you can access the tuned `DocumentRNNEmbeddings` object directly from the classifier object and use it to embed sentences.

`DocumentRNNEmbeddings` have a number of hyper-parameters that can be tuned to improve learning:

```
:param hidden_size: the number of hidden states in the rnn.
:param rnn_layers: the number of layers for the rnn.
:param reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear
layer before putting them into the rnn or not.
:param reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output
dimension as before will be taken.
:param bidirectional: boolean value, indicating whether to use a bidirectional rnn or not.
:param dropout: the dropout value to be used.
:param word_dropout: the word dropout value to be used, if 0.0 word dropout is not used.
:param locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used.
:param rnn_type: one of 'RNN', 'LSTM' or 'GRU'
```

#### Training RNN on our own dataset - Loading dataset
The simplest way to load our data in Flair is using a CSV file. You can learn about other methods in [the documentation](https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_6_CORPUS.md).

To create a `Corpus` for a text classification task, you need to have three CSV files (train, dev, and test) located in one folder.
This data folder structure could, for example, look like this for our task:
```text
/data/train.csv
/data/val.csv
/data/test.csv
```
Now create a `CSVClassificationCorpus` by pointing to this folder (`/data`). 
Thereby, each line in a file is converted to a `Sentence` object annotated with the labels.

Attention: A text in a line can in fact have multiple sentences. Thus, a `Sentence` object is actually a `Document` and can actually consist of multiple sentences.

In [None]:
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus

# Folder that holds the train, dev and test CSV files.
data_folder = 'data/'

# Column index -> role, for the text column and the label column.
# The original note on this line states that the indices are 1-based, not 0-based.
# NOTE(review): confirm that convention against Flair's documentation and the
# actual column layout of data/train.csv.
column_name_map = {3: "text", 2: "label"}

# Build one corpus from the three split files, skipping each file's header row.
corpus: Corpus = CSVClassificationCorpus(
    data_folder,
    column_name_map,
    train_file='train.csv',
    dev_file='val.csv',
    test_file='test.csv',
    skip_header=True,
)

# Label dictionary derived from the corpus; the classifier is built with it later.
label_dict = corpus.make_label_dictionary()

In [None]:
# Look at the first training example to check that text and label were parsed.
first_training_example = corpus.train[0]
first_training_example

#### Training RNN on our own dataset - Training our own model
We're going to use an RNN to run through the GloVe word embeddings of each document. We will use the hidden state at the end of the document as an embedding for the entire document. We will train the RNN on our labeled dataset, so that the final hidden state carries the most relevant information for our custom classification task.

For more information on training your own model using Flair, see [this tutorial](https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md).

In [None]:
#TODO: Change code below to fit our own dataset

In [None]:
from flair.data import Corpus
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

# 3. token-level embeddings that feed the RNN (GloVe only)
word_embeddings = [WordEmbeddings('glove')]

# 4. document embedding built on top of the word embeddings.
# rnn_type is not passed, so the library default is used (GRU, per the
# original note in this cell); set rnn_type to choose another RNN type.
rnn_settings = dict(
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(word_embeddings, **rnn_settings)

# 5. text classifier over the document embedding, using the corpus label dictionary
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

In [None]:
# 6. wire the classifier and the corpus into a trainer
trainer = ModelTrainer(classifier, corpus)

# 7. run the training with 'data/' as the first argument
# (the plotting cell further down reads 'data/weights.txt')
training_options = dict(
    learning_rate=0.1,
    mini_batch_size=32,
    anneal_factor=0.5,
    patience=5,
    max_epochs=150,
)
trainer.train('data/', **training_options)

In [None]:
# 8. plot weight traces (optional)
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_weights('data/weights.txt')