
## Prerequisites

gensim==3.6.0

In [87]:
import os

from ast import literal_eval

from string import punctuation

import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models import Word2Vec, KeyedVectors

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split


lemmatizer = WordNetLemmatizer() 
stop_words = set(stopwords.words('english'))

In [2]:
df = pd.read_csv("../jigsaw-toxic-comment-classification-challenge/train.csv")

In [3]:
def preprocess_text(tokenizer, lemmatizer, stop_words, punctuation, text): 
    tokens = tokenizer(text.lower())
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return [token for token in lemmas if token not in stop_words and token not in punctuation]

In [4]:
bool_load = True

if not bool_load:
    df['cleaned'] = df.comment_text.apply(lambda x: preprocess_text(word_tokenize, lemmatizer, stop_words, punctuation, x))

In [5]:
bool_save = False

if bool_save:
    df.to_csv("../jigsaw-toxic-comment-classification-challenge/train.csv")

In [6]:
df_sample = df.sample(100000)

In [7]:
df_sample.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned
42030,42030,701b9ab2feabab28,nangparbat ==\njust to let you know this aint ...,0,0,0,0,0,0,"['nangparbat', '==', 'let', 'know', 'aint', 't..."
91896,91896,f5abfbef3a3ba8f7,"""\nDear Lex, Thanks for your clarification of ...",0,0,0,0,0,0,"['``', 'dear', 'lex', 'thanks', 'clarification..."
66820,66820,b2d0d32f5f9ef7c9,You obviously have a penile problem. Think too...,1,0,0,0,1,0,"['obviously', 'penile', 'problem', 'think', 'm..."
19288,19288,32f56a03e9451df3,"Some sites, like PWInsider and Wrestling Obser...",0,0,0,0,0,0,"['site', 'like', 'pwinsider', 'wrestling', 'ob..."
88647,88647,ed240d3bb592e4a6,"}}\n\nI've been blocked longer than this, for ...",0,0,0,0,0,0,"[""'ve"", 'blocked', 'longer', 'le', 'consider',..."


### Train the model from scratch

Train our first model based on the vocabulary from df_sample: 

In [14]:
# With initialization model trained for 5 epochs 

df_sample_cleaned_list = [literal_eval(s) for s in df_sample.cleaned.tolist()]

model = Word2Vec(sentences=df_sample_cleaned_list, 
         size=100,      # embedding vector size
         min_count=5,   # consider words that occured at least 5 times
         window=5)

In [15]:
# Continue training the model 

model.train(sentences=df_sample_cleaned_list, 
            total_examples=model.corpus_count,
            epochs=30
           )

(100024954, 118365900)

In [16]:
model.wv.vocab # to look at vocabulary

{'nangparbat': <gensim.models.keyedvectors.Vocab at 0x1b99446a708>,
 '==': <gensim.models.keyedvectors.Vocab at 0x1b99446a748>,
 'let': <gensim.models.keyedvectors.Vocab at 0x1b99446a788>,
 'know': <gensim.models.keyedvectors.Vocab at 0x1b99446a7c8>,
 'aint': <gensim.models.keyedvectors.Vocab at 0x1b99446a848>,
 'tell': <gensim.models.keyedvectors.Vocab at 0x1b99446a8c8>,
 'flatter': <gensim.models.keyedvectors.Vocab at 0x1b99446a908>,
 'stalking': <gensim.models.keyedvectors.Vocab at 0x1b99446a948>,
 '...': <gensim.models.keyedvectors.Vocab at 0x1b99446a808>,
 '``': <gensim.models.keyedvectors.Vocab at 0x1b99446a888>,
 'dear': <gensim.models.keyedvectors.Vocab at 0x1b99446a988>,
 'lex': <gensim.models.keyedvectors.Vocab at 0x1b99446a9c8>,
 'thanks': <gensim.models.keyedvectors.Vocab at 0x1b99446aa08>,
 'clarification': <gensim.models.keyedvectors.Vocab at 0x1b99446aa48>,
 'judge': <gensim.models.keyedvectors.Vocab at 0x1b99446aa88>,
 'learned': <gensim.models.keyedvectors.Vocab at 0x1

In [19]:
model.wv.most_similar('people')

[('others', 0.637255072593689),
 ('person', 0.5866743326187134),
 ('thing', 0.5739054679870605),
 ('editor', 0.5307891368865967),
 ('everyone', 0.5240690112113953),
 ('admins', 0.5220236778259277),
 ('someone', 0.5200488567352295),
 ('really', 0.494351863861084),
 ('way', 0.4860228896141052),
 ('guy', 0.485309362411499)]

### The next approach is to try to use the already pretrained model, which can be downloaded from here:

https://github.com/RaRe-Technologies/gensim-data

model:   
GoogleNews-vectors-negative300.bin

In [20]:
#os.getcwd()

In [21]:
model = KeyedVectors.load_word2vec_format(
    os.getcwd() + os.sep + "GoogleNews-vectors-negative300.bin", binary=True
)

In [22]:
# You can try to use GloVe model too and experiment with it: <- later
# import gensim.downloader as api
# model = api.load('glove-wiki-gigaword-100')

## Words distance 

# 1 - Cosine similarity

To measure how similar two words are, we need a way to measure the degree of similarity between two embedding vectors for the two words. Given two vectors $u$ and $v$, cosine similarity is defined as follows: 

$$\text{CosineSimilarity(u, v)} = \frac {u . v} {||u||_2 ||v||_2} = cos(\theta) \tag{1}$$

where $u.v$ is the dot product (or inner product) of two vectors, $||u||_2$ is the norm (or length) of the vector $u$, and $\theta$ is the angle between $u$ and $v$. This similarity depends on the angle between $u$ and $v$. If $u$ and $v$ are very similar, their cosine similarity will be close to 1; if they are dissimilar, the cosine similarity will take a smaller value. 

<img src="cosine_sim.png" style="width:800px;height:250px;">
<caption><center> **Figure 1**: The cosine of the angle between two vectors is a measure of how similar they are</center></caption>

**Exercise**: Implement the function `cosine_similarity()` to evaluate similarity between word vectors.

**Reminder**: The norm of $u$ is defined as $ ||u||_2 = \sqrt{\sum_{i=1}^{n} u_i^2}$

In [23]:
def cosine_similarity(w1, w2):
    """
    Cosine similarity between w1 and w2
    
    Arguments:
        w1 : word vector        
        w2 : word vector 
    Returns:
        cosine_similarity 
    """
    if (not np.any(w1) or not np.any(w2)): # check input is not zero-vector
        return 0
    
    # Dot product between w1 and w2
    dot = np.dot(w1, w2)
    # L2 norm of w1
    norm_u = np.linalg.norm(w1) 
    # L2 norm of w2 
    norm_v = np.linalg.norm(w2) 
    # Cosine similarity 
    cosine_similarity = dot / (norm_u * norm_v)
    
    return cosine_similarity

In [24]:
father = model.get_vector("father")
mother = model.get_vector("mother")

ball = model.get_vector("ball")
crocodile = model.get_vector("crocodile")

france = model.get_vector("france")
paris = model.get_vector("paris")
italy = model.get_vector("italy")
rome = model.get_vector("rome")

kiev = model.get_vector("kiev")
ukraine = model.get_vector("ukraine")

In [25]:
fast_print = lambda u, v, tag1, tag2: print(
    "cosine_similarity({t1}, {t2}) = ".format(t1 = tag1, t2 = tag2), cosine_similarity(u, v)
)

fast_print(father, mother, "father", "mother")
fast_print(ball, crocodile, "ball", "crocodile")
fast_print(france - paris, rome - italy, "france - paris", "rome - italy")
fast_print(kiev, ukraine, "kiev", "ukraine")

cosine_similarity(father, mother) =  0.79014826
cosine_similarity(ball, crocodile) =  0.10283584
cosine_similarity(france - paris, rome - italy) =  -0.1988747
cosine_similarity(kiev, ukraine) =  0.3738725


**Approximate expected output**:

<table>
    <tr>
        <td>
            **cosine_similarity(father, mother)** =
        </td>
        <td>
         0.79014826
        </td>
    </tr>
        <tr>
        <td>
            **cosine_similarity(ball, crocodile)** =
        </td>
        <td>
         0.10283585
        </td>
    </tr>
        <tr>
        <td>
            **cosine_similarity(france - paris, rome - italy)** =
        </td>
        <td>
         -0.421037
        </td>
    </tr>
</table>

## 2 - Word analogy task

In the word analogy task, we complete the sentence <font color='brown'>"*a* is to *b* as *c* is to **____**"</font>. An example is <font color='brown'> '*man* is to *woman* as *king* is to *queen*' </font>. In detail, we are trying to find a word *d*, such that the associated word vectors $e_a, e_b, e_c, e_d$ are related in the following manner: $e_b - e_a \approx e_d - e_c$. We will measure the similarity between $e_b - e_a$ and $e_d - e_c$ using cosine similarity. 

**Exercise**: Complete the code below to be able to perform word analogies!

***Note***: here you will need to complete a function in the sections, which are marked as:

```
# ----- Start ----- #
Your code should be written in-between the lines
# ------ End ------ #
```


In [26]:
def find_word_analogy(word_1, word_2, word_3, model):
    """
    Finds the word to complete analogy (see explanation above): a is to b as c is to ____. 
    
    Arguments:
    word_1 -- a word, string
    word_2 -- a word, string
    word_3 -- a word, string
    model -- word embeddings model 
    
    Returns:
    best_word --  the word such that v_1 - v_2 is close to v_best_word - v_3, as measured by cosine similarity
    """
    # convert words to lower case
    word_1, word_2, word_3 = word_1.lower(), word_2.lower(), word_3.lower()
    
    # ----- Start ----- #
    # Get the word embeddings v_a, v_b and v_c (≈1-3 lines)
    fast_get = lambda word: model.get_vector(word)
    e_1, e_2, e_3 = tuple(map(fast_get, [word_1, word_2, word_3]))
    # ------ End ------ #
    
    words = list(model.vocab.keys())
    max_cosine_sim = -100              # Initialize max_cosine_sim to a large negative number
    best_word = None                   # Initialize best_word with None

    # Loop over the whole word vector set
    for w in words:        
        e_j = fast_get(w)
        # to avoid best_word being one of the input words, skip them and continue iteration.
        if w in [word_1, word_2, word_3]:
            continue
        
        # ----- Start ----- #
        # Compute cosine similarity between the vector (e_2 - e_1) and the vector ((w's vector) - e_3)
        cosine_sim = cosine_similarity(e_2 - e_1, e_j - e_3)
        
        # If the cosine_sim is more than the max_cosine_sim seen so far,
        # do not forget to set new max_cosine_sim to the current value and best_word as well
        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            best_word = w
        # ------ End ------ #
        
    return best_word

In [27]:
triads_to_try = [
    ('man', 'woman', 'king'), 
    ('bad', 'good', 'sad'), 
    ('man', 'woman', 'boy'), 
    ('small', 'smaller', 'large')
]

for triad in triads_to_try:
    print('{} -> {} :: {} -> {}'.format(*triad, find_word_analogy(*triad, model)))

man -> woman :: king -> queen
bad -> good :: sad -> wonderful
man -> woman :: boy -> girl
small -> smaller :: large -> larger


**Expected Output**:

<table>
    <tr>
        <td>
            **man -> woman** ::
        </td>
        <td>
         king -> queen
        </td>
    </tr>
        <tr>
        <td>
            **bad -> good** ::
        </td>
        <td>
         sad -> wonderful
        </td>
    </tr>
        <tr>
        <td>
            **man -> woman ** ::
        </td>
        <td>
         boy -> girl
        </td>
    </tr>
        <tr>
        <td>
            **small -> smaller ** ::
        </td>
        <td>
         large -> larger
        </td>
    </tr>
</table>

#### The next part of the task is to:  

1. Train your own W2V model using the proposed method above. Use all of the tokens created after your preprocessing pipeline in the previous tasks. (deleting stop_words, punctuation, lowercasing, etc - play as you want).  
2. Use obtained vectors to obtain text vectors using such pipeline: 
  1. For each word in a preprocessed text, get a word vector from the W2V model. 
  2. Add them together to obtain vectors for texts (sum them together, or get mean vector) 
3. Use obtained text vectors as a text representation to perform a text classification task.  
   Proposed - use binary classification (for example: select only 'obscene' text and clean and try to distinguish them one from another)
4. Calculate the metrics - TP, FP, FN, TN, precision, recall, F1 score, F2 score, accurary. 


In [127]:
from gensim.models.callbacks import CallbackAny2Vec


class callback_custom(CallbackAny2Vec):
    def __init__(self):
         self.epoch = 0

    def on_epoch_end(self, model):
        print("Iteration {:3}".format(self.epoch+1))
        self.epoch += 1

In [128]:
# init w2v model
n_dimensions = 300

model_w2v = Word2Vec(sentences=df_sample_cleaned_list, 
                     size=n_dimensions, min_count=5, window=5,
                     callbacks=[callback_custom()]
                    )

Iteration   1
Iteration   2
Iteration   3
Iteration   4
Iteration   5


In [129]:
# model training
number_of_iterations = 50

model_w2v.train(sentences=df_sample_cleaned_list, 
            total_examples=model_w2v.corpus_count,
            epochs=number_of_iterations
           )

Iteration   6
Iteration   7
Iteration   8
Iteration   9
Iteration  10
Iteration  11
Iteration  12
Iteration  13
Iteration  14
Iteration  15
Iteration  16
Iteration  17
Iteration  18
Iteration  19
Iteration  20
Iteration  21
Iteration  22
Iteration  23
Iteration  24
Iteration  25
Iteration  26
Iteration  27
Iteration  28
Iteration  29
Iteration  30
Iteration  31
Iteration  32
Iteration  33
Iteration  34
Iteration  35
Iteration  36
Iteration  37
Iteration  38
Iteration  39
Iteration  40
Iteration  41
Iteration  42
Iteration  43
Iteration  44
Iteration  45
Iteration  46
Iteration  47
Iteration  48
Iteration  49
Iteration  50
Iteration  51
Iteration  52
Iteration  53
Iteration  54
Iteration  55


(166710008, 197276500)

In [130]:
#model_w2v.wv.vocab

In [131]:
model_w2v.wv.most_similar('people')

[('others', 0.5222204923629761),
 ('thing', 0.5027244687080383),
 ('person', 0.4635576903820038),
 ("n't", 0.4631219506263733),
 ('editor', 0.46135395765304565),
 ("'re", 0.45537233352661133),
 ('way', 0.4543498754501343),
 ('everyone', 0.43567657470703125),
 ('admins', 0.43075278401374817),
 ('someone', 0.41527408361434937)]

In [132]:
model_w2v.wv.most_similar('one')

[('two', 0.5149646997451782),
 ("n't", 0.48453447222709656),
 ('think', 0.4638374149799347),
 ("'s", 0.46338513493537903),
 ('even', 0.4523453712463379),
 ('thing', 0.44775620102882385),
 ('first', 0.44292593002319336),
 ('article', 0.4377855360507965),
 ('way', 0.4306492209434509),
 ('many', 0.42936423420906067)]

In [133]:
bool_save_model = False

if bool_save_model:
    model_w2v.wv.save_word2vec_format('w2v_df_t2_clnd_sample.bin', binary = True)

In [134]:
model_w2v_vectors = model_w2v.wv # getting keyed vectors from trained model

In [135]:
# building text vectors
def form_text_vector(words_from_text, w2v_model_keyed_vectors, num_dim):
    text_vectorized = np.zeros(num_dim)
    for word in words_from_text:
        try:
            text_vectorized += w2v_model_keyed_vectors.get_vector(word)
        except KeyError:
            continue
    return text_vectorized


def form_corpus_matrix(corpus, w2v_model_keyed_vectors, num_dim):
    corpus_len = len(corpus)
    corpus_vectorized = np.empty((corpus_len, num_dim))
    for j in range(corpus_len):
        corpus_vectorized[j] = form_text_vector(
            corpus[j], w2v_model_keyed_vectors, num_dim
        )
    return corpus_vectorized

In [138]:
temp = df[[text_categories[4], 'cleaned']]

temp_n = temp[~df[text_categories[:-1]].any(axis = 'columns')]
temp_i = temp[df_t2_clnd.insult != 0]

insulting_and_neutral = temp_i.append(temp_n).reset_index(drop = True)
insulting_and_neutral.columns = ['label', 'texts']

del temp, temp_n, temp_i

print(
    insulting_and_neutral.head(),
    insulting_and_neutral.tail(),
    sep = '\n\n'
)

   label                                              texts
0      1           ['cocksucker', 'piss', 'around', 'work']
1      1  ['gay', 'antisemmitian', 'archangel', 'white',...
2      1                ['fuck', 'filthy', 'mother', 'dry']
3      1  ['stupid', 'peace', 'shit', 'stop', 'deleting'...
4      1  ['=tony', 'sidaway', 'obviously', 'fistfuckee'...

        label                                              texts
151218      0  ['``', 'second', 'time', 'asking', 'view', 'co...
151219      0  ['ashamed', 'horrible', 'thing', 'put', 'talk'...
151220      0  ['spitzer', 'umm', 'actual', 'article', 'prost...
151221      0  ['look', 'like', 'wa', 'actually', 'put', 'spe...
151222      0  ['``', '...', 'really', "n't", 'think', 'under...


In [139]:
P = 0.25

X_train_t, X_test_t, Y_train, Y_test = train_test_split(
    insulting_and_neutral['texts'], insulting_and_neutral['label'],
    test_size = P,
    random_state = 1
)

X_train_t = [literal_eval(s) for s in X_train_t.reset_index(drop = True)]
X_test_t = [literal_eval(s) for s in X_test_t.reset_index(drop = True)]

Y_train = Y_train.reset_index(drop = True)
Y_test = Y_test.reset_index(drop = True)

X_train = form_corpus_matrix(X_train_t, model_w2v_vectors, n_dimensions)
X_test = form_corpus_matrix(X_test_t, model_w2v_vectors, n_dimensions)

In [140]:
X_train

array([[-21.66816815,  32.38354196,  17.25789585, ...,  19.81849068,
         -8.5553633 , -19.19644991],
       [ -8.36420262,   3.64458993,   4.4211579 , ...,   0.65700775,
         -9.33503565,  -5.41035524],
       [ -2.36577726,  -5.33778548, -12.16261954, ...,  18.81043604,
        -18.20870586,  -7.13032301],
       ...,
       [ -2.95281589,  -0.23064196,   1.23231921, ...,   1.02943078,
         -0.45546171,   0.83795777],
       [ -1.57892848,  -3.81017709,   2.10037866, ...,   5.77412083,
         -4.55522782,  -3.54083675],
       [-10.43035001,  -5.12954828,   1.24636903, ...,   0.16670493,
          2.85205146,  -5.93802206]])

In [141]:
X_test

array([[-1.55650281e+01,  2.50765029e-03, -3.43156406e+00, ...,
        -1.27878880e+00, -2.92159311e+00, -4.96064100e+00],
       [-5.05403483e+00,  1.42044460e+01, -1.05849296e+01, ...,
        -1.14749064e+00,  8.85966440e+00,  5.54408395e+00],
       [-1.42199390e+01,  5.19368354e+00, -4.85291371e+00, ...,
         5.77792327e+00, -5.53064266e+00, -7.61524982e+00],
       ...,
       [-3.84172717e+00, -2.06410417e+00, -2.50927678e+00, ...,
        -4.99234527e-01, -3.24616189e+00, -1.03070017e-01],
       [-4.78686325e-01, -1.94843510e+00,  1.16952186e+01, ...,
         1.91426969e+01, -2.10387569e+01, -5.02872800e+00],
       [ 1.56685558e+00, -2.64528278e+00,  4.36859816e-01, ...,
         1.20622572e+00, -5.29165797e+00,  6.54002298e+00]])

In [142]:
def basic_report(y_test, y_prediction):
    confusion_matr = confusion_matrix(y_test, y_prediction)
    print("CONFUSION MATRIX:\n{matr}".format(matr=confusion_matr))
    accuracy_of_model = accuracy_score(y_test, y_prediction)
    print("ACCURACY:\n{acc}".format(acc = accuracy_of_model))
    sklearn_report = classification_report(y_test, y_prediction)
    print("TABLE:\n{tab}".format(tab=sklearn_report))

In [143]:
# include classifier: RF
random_forest_cls = RandomForestClassifier()
random_forest_cls.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [144]:
Y_prediction = random_forest_cls.predict(X_test)

In [145]:
# tests
basic_report(Y_test, Y_prediction)

CONFUSION MATRIX:
[[35763    52]
 [ 1033   958]]
ACCURACY:
0.9713008517166587
TABLE:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     35815
           1       0.95      0.48      0.64      1991

    accuracy                           0.97     37806
   macro avg       0.96      0.74      0.81     37806
weighted avg       0.97      0.97      0.97     37806



In [146]:
# include classifier: LR
logit_cls = LogisticRegression()
logit_cls.fit(X_train, Y_train) # well

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [147]:
Y_prediction = logit_cls.predict(X_test)

In [148]:
# tests
basic_report(Y_test, Y_prediction)

CONFUSION MATRIX:
[[35645   170]
 [  659  1332]]
ACCURACY:
0.9780722636618526
TABLE:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     35815
           1       0.89      0.67      0.76      1991

    accuracy                           0.98     37806
   macro avg       0.93      0.83      0.88     37806
weighted avg       0.98      0.98      0.98     37806



#### The second part of the task is: 

1. While performing a step 2 for text vectorization, for each word add its vector with tf-idf weight -> weighted average. 
2. Perform a same text classification task as it was required above. 
3. Calculate the metrics, compare with a vectorization approach without weightning. 

In [None]:
### Your code here 

#### The third part of the task is: 

1. Use a pre-trained W2V model for obtaining a word vectors for each of the tokens in your dataset, create text vectors WITHOUT weightning. 
2. Train text classification model.
3. Calculate the metrics.

In [149]:
X_train = form_corpus_matrix(X_train_t, model, n_dimensions)
X_test = form_corpus_matrix(X_test_t, model, n_dimensions)

In [None]:
# include classifier: RF
random_forest_cls = RandomForestClassifier()
random_forest_cls.fit(X_train, Y_train)

In [None]:
Y_prediction = random_forest_cls.predict(X_test)

In [None]:
# tests
basic_report(Y_test, Y_prediction)

#### The fourth part of the task is: 

1. Use a pre-trained W2V model for obtaining a word vectors for each of the tokens in your dataset, create text vectors WITH tf-idf weightning. 
2. Train a text classification model. 
3. Calculate the metrics. 

In [None]:
### Your code here

### Visualizations part 

Use dimentionality reduction methods such as t-SNE or PCA to make your 300 dim vectors available for 2D plotting. 

Select top (10-20) words for each cathegory BY TF-IDF SCORE, not counts!!! 

Plot on the ONE plot all of this words but colors must be different for top-words for obscene cathegory, clean, toxic, etc... 

See, if words from one cathegory are closer to each other than to others. 
Or you observe ~2 clusters: all of the toxic words, clean words.  
Explain what you see and why. 


In [None]:
### Your code here 

### Additional part: 

1. Find a pre-trained FastText vetors, understand it's difference from W2V vectors. 
2. Vectorize all of your texts using FT model, perform a text classification, calculate the metrics, compare with W2V approach. 

Or/And you can:

1. Train your own FT model and make the same. 
2. Compare it with previous approaches.

In [None]:
### Your code here 

### Conclusions: 

Please, provide a clear table or dataframe with all of the metrics for all of the trained/used models available.   

Compare them to each other.   

Make conclusions which one from your models worked better for this particular task.   
BE CAREFUL: Having a better model performance on this particular task does not matter that this model is better than others in GENERAL. You need to make your own conclusions about this particular model applied to this particular task. Please, think and understand WHY.   
Write your thoughts down below: 



In [None]:
### Your conclusions here.

In [None]:
### Your thoughts about the last question here. 