# Домашнее задание № 3. Векторные представления

1. Преобразуйте тексты в векторы в каждой паре 4 методами  - SVD, NMF, Word2Vec, Fastext. Для SVD и NMF сделайте две пары векторов - через TfidfVectorizer и CountVectorizer. Для word2vec сделайте две пары векторов - с взвешиванием по tfidf и без. Для Fastext постройте две модели - без нормализации и с нормализацией, а через каждую модель постройте две пары векторов -  с взвешиванием по tfidf и без. Для обучения этих моделей можете воспользоваться корпусом новостных текстов, с которым мы работали на семинаре. А можете использовать любой другой корпус (сами тексты соревнования использовать не надо).

2. У вас должно получиться 10 пар векторов для каждой строчки в датасете. Между векторами каждой пары вычислите косинусную близость (получится 10 чисел для каждой пары текстов). 

3. Постройте обучающую выборку из этих близостей. Обучите любую модель (Логрег, Рандом форест или что-то ещё) на этой выборке и оцените качество на кросс-валидации (используйте микросреднюю f1-меру).   

4. С помощью кросс-валидации подберите параметры моделей (количество компонент, размерность в w2v, min_n - в fastext и т.д).

In [265]:
import warnings
warnings.filterwarnings("ignore")

In [266]:
import pandas as pd
from lxml import html
from string import punctuation
from pymystem3 import Mystem
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD, NMF, PCA
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import RandomForestClassifier
import gensim
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter,defaultdict
import os
from nltk.corpus import stopwords
%matplotlib inline

**Preprocessing**

In [267]:
russian_stopwords = set(stopwords.words('russian'))

In [268]:
punct = punctuation + '«»—…“”*№–'
mystem = Mystem()

In [269]:
def normalize(text):
    tokens = mystem.lemmatize(text)
    tokens = [token for token in tokens if token not in russian_stopwords \
              and token != ' '  \
              and token.strip() not in punct]
    
    return ' '.join(tokens)

In [270]:
corpus_xml = html.fromstring(open('paraphraser/paraphrases.xml', 'rb').read())
texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])
    
data = pd.DataFrame({'text_1' : texts_1, 'text_2' : texts_2, 'label' : classes})

In [271]:
data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)

In [272]:
data.head()

Unnamed: 0,text_1,text_2,label,text_1_norm,text_2_norm
0,Полицейским разрешат стрелять на поражение по ...,Полиции могут разрешить стрелять по хулиганам ...,0,полицейский разрешать стрелять поражение гражд...,полиция мочь разрешать стрелять хулиган травма...
1,Право полицейских на проникновение в жилище ре...,Правила внесудебного проникновения полицейских...,0,право полицейский проникновение жилище решать ...,правило внесудебный проникновение полицейский ...
2,Президент Египта ввел чрезвычайное положение в...,Власти Египта угрожают ввести в стране чрезвыч...,0,президент египет вводить чрезвычайный положени...,власть египет угрожать вводить страна чрезвыча...
3,Вернувшихся из Сирии россиян волнует вопрос тр...,Самолеты МЧС вывезут россиян из разрушенной Си...,-1,вернуться сирия россиянин волновать вопрос тру...,самолет мчс вывозить россиянин разрушать сирия
4,В Москву из Сирии вернулись 2 самолета МЧС с р...,Самолеты МЧС вывезут россиян из разрушенной Си...,0,москва сирия вернуться 2 самолет мчс россиянин...,самолет мчс вывозить россиянин разрушать сирия


In [273]:
y = data['label'].values
y.shape

(7227,)

**Обучающий корпус**

In [13]:
data_rt = pd.read_csv('news_texts.csv')

In [14]:
data_rt.dropna(inplace=True)

In [15]:
data_rt['content_norm'] = data_rt['content'].apply(normalize)

In [16]:
data_rt.head()

Unnamed: 0,content,content_norm
0,Канцлер Германии Ангела Меркель в ходе брифинг...,канцлер германия ангел меркель ход брифинг пре...
1,Российские и белорусские войска успешно заверш...,российский белорусский войско успешно завершат...
2,"Дзюба, Шатов и Анюков оказались не нужны «Зени...",дзюба шатов анюков оказываться нужный зенит ро...
3,"В Испанию без фанатов\nПожалуй, главной пятнич...",испания фанат пожалуй главный пятничный новост...
4,"Постпред России при ООН Виталий Чуркин, говоря...",постпред россия оон виталий чуркин говорить ве...


`CountVectorizer`

In [322]:
cv = CountVectorizer(max_features=25000, min_df=5, max_df=0.3)
X_cv = cv.fit_transform(data_rt['content_norm'])

In [323]:
X_cv.shape

(7212, 25000)

`TfidfVectorizer`

In [324]:
tfidf = TfidfVectorizer(max_features=25000, min_df=5, max_df=0.3)
X_tfidf = tfidf.fit_transform(data_rt['content_norm'])

In [325]:
X_tfidf.shape

(7212, 25000)

## Матричные разложения

## SVD

In [373]:
# svd = TruncatedSVD(200)

In [398]:
svd = TruncatedSVD(100)

In [399]:
svd_cv = svd.fit(X_cv)

In [400]:
id2word = {i:w for i, w in enumerate(cv.get_feature_names())}
word2id = {w:i for i, w in id2word.items()}

In [401]:
id2vec_svd_cv = svd_cv.components_.T

In [402]:
id2vec_svd_cv

array([[ 2.41534265e-03, -6.48203597e-03, -1.37011631e-03, ...,
        -1.48205509e-02,  3.36881441e-03,  8.27630806e-03],
       [ 7.13254030e-04, -1.60944585e-03, -3.96059363e-05, ...,
         1.34776320e-04, -1.82137359e-03, -1.06975742e-03],
       [ 3.12128376e-05, -1.50032357e-04, -1.85430737e-04, ...,
         8.01392151e-04,  5.56368751e-04,  2.41479701e-04],
       ...,
       [ 1.40193677e-04, -5.29234098e-04, -8.51837961e-04, ...,
         6.03649799e-04, -1.04440254e-04,  8.15950285e-04],
       [ 4.18618801e-04, -7.15927320e-04,  5.19265852e-04, ...,
        -2.09985371e-03,  1.99516325e-03, -2.10304586e-03],
       [ 2.93899296e-04, -9.17392469e-04,  8.87922747e-04, ...,
        -1.05162688e-03, -1.04546109e-03,  4.17432539e-04]])

In [403]:
svd_tfidf = svd.fit(X_tfidf)

In [404]:
id2word_tfidf = {i:w for i, w in enumerate(tfidf.get_feature_names())}
word2id_tfidf = {w:i for i, w in id2word_tfidf.items()}

In [405]:
id2vec_svd_tfidf = svd_tfidf.components_.T

In [406]:
id2vec_svd_tfidf

array([[ 7.69172155e-03, -5.10472874e-03,  3.43478922e-03, ...,
         1.63740964e-03, -3.94333774e-03,  4.77463889e-03],
       [ 2.06438312e-03, -1.16265669e-03, -2.94723222e-05, ...,
        -5.47429764e-03, -1.39607513e-03, -1.87790362e-03],
       [ 2.12949350e-04, -1.82626523e-04,  4.25569335e-04, ...,
         1.13159323e-03, -1.38074693e-03,  2.75504006e-04],
       ...,
       [ 8.95309054e-04, -6.17879668e-04,  2.08424019e-03, ...,
         2.95955444e-04,  1.38401833e-03,  3.83496190e-04],
       [ 1.51850048e-03, -7.88303339e-04, -8.89760545e-04, ...,
        -1.43524020e-03, -1.57015252e-03, -6.56444224e-03],
       [ 1.88179728e-03, -9.39920802e-04, -1.63877147e-03, ...,
        -1.36807687e-03,  9.71108368e-04, -2.64889606e-03]])

## NMF

In [None]:
nmf = NMF(50)

In [326]:
nmf = NMF(200)

In [327]:
nmf_cv = nmf.fit(X_cv)

In [328]:
id2vec_nmf_cv = nmf_cv.components_.T

In [329]:
nmf_tfidf = nmf.fit(X_tfidf)

In [330]:
id2vec_nmf_tfidf = nmf_tfidf.components_.T

In [331]:
def most_similar(word, id2vec):
    similar = [id2word[i] for i in cosine_distances(id2vec[word2id[word]].reshape(1, -1), id2vec).argsort()[0][:10]]
    return similar

In [332]:
most_similar('спорт', id2vec_svd_cv)

['спорт',
 'вид',
 'фехтование',
 'артистка',
 'скалолазание',
 'триатлон',
 'плавание',
 'многократный',
 'поло',
 'синхронистка']

In [333]:
most_similar('спорт', id2vec_svd_tfidf)

['спорт',
 'спортивный',
 'вид',
 'колобок',
 'карате',
 'соревнование',
 'спортсмен',
 'фехтование',
 'журов',
 'плавание']

In [334]:
most_similar('спорт', id2vec_nmf_cv)

['спорт',
 'кратный',
 'спортивный',
 'пловчиха',
 'sport',
 'пятикратный',
 'анфиса',
 'анастасия',
 'тренировочный',
 'уподобляться']

In [335]:
most_similar('спорт', id2vec_nmf_tfidf)

['спорт',
 'фехтование',
 'спортивный',
 'многократный',
 'колобок',
 'ои',
 'гребной',
 'допинговый',
 'допинг',
 'стивенс']

## Word2Vec

In [455]:
w2v = gensim.models.Word2Vec([text.split() for text in data_rt['content_norm']], size=50, sg=1)



In [None]:
# w2v = gensim.models.Word2Vec([text.split() for text in data_rt['content_norm']], size=100, sg=1)

In [456]:
w2v.most_similar('спорт')

[('колобок', 0.8065358996391296),
 ('колобков', 0.8039644956588745),
 ('велосипедный', 0.7796916365623474),
 ('лыжный', 0.7776771187782288),
 ('фигурный', 0.7748597264289856),
 ('синхронный', 0.7699201703071594),
 ('санный', 0.7696366310119629),
 ('атлетика', 0.768450915813446),
 ('гимнастика', 0.7623220682144165),
 ('бутырский', 0.7610881924629211)]

**TF-IDF**

In [74]:
# import math

In [85]:
# def getTF(words_arr):
#     tf_dict = Counter(words_arr)
    
#     for value in tf_dict:
#         tf_dict[value] = tf_dict[value] / float(len(words_arr))
        
#     return tf_dict


# def getIDF(documents):
#     idf_dict = defaultdict(lambda: 0)
#     count_docs = len(documents)
    
#     for doc in documents:
#         for word in doc:
#             idf_dict[word] += 1
#             idf_dict[word] = math.log(count_docs / float(1 + idf_dict[word]))
        
#     return idf_dict

# def getTFIDF(documents):
#     for doc in documents:
#         tfidf_dict = {}
#         tf = getTF(doc)
        
#         for item in tf:
#             tfidf_dict[item] = tf[item] * float([x for x in getIDF(documents).values()][0])
        
#     return tfidf_dict

## fastText

**С нормализацией**

In [496]:
fast_text_norm = gensim.models.FastText([text.split() for text in data_rt['content_norm']], size=50, min_n=4, max_n=8)



In [476]:
# fast_text_norm = gensim.models.FastText([text.split() for text in data_rt['content_norm']], size=100, min_n=4, max_n=8)

**Без нормализации**

In [90]:
def tokenize(text):
    words = [word.strip(punct) for word in text.lower().split()]

    return ' '.join(words)

In [495]:
corpus = [text.split() for text in data_rt['content'].apply(tokenize)]
fast_text = gensim.models.FastText(corpus, size=50, min_n=4, max_n=8)



In [None]:
# corpus = [text.split() for text in data_rt['content'].apply(tokenize)]
# fast_text = gensim.models.FastText(corpus, size=100, min_n=4, max_n=8)

In [92]:
fast_text.most_similar('спорт')

[('спортзале', 0.9600382447242737),
 ('велоспорт', 0.9357381463050842),
 ('спорту', 0.9300536513328552),
 ('спорте', 0.9198392629623413),
 ('спорышев', 0.8934230804443359),
 ('спорта', 0.887935996055603),
 ('спору', 0.8767692446708679),
 ('спортсмен', 0.8596939444541931),
 ('велоспорте', 0.8562847375869751),
 ('р-спорт', 0.8537545800209045)]

In [89]:
fast_text_norm.most_similar('спорт')

[('спортзал', 0.9714047908782959),
 ('велоспорт', 0.9464095234870911),
 ('спортинг', 0.915722668170929),
 ('спорышев', 0.8824635744094849),
 ('автоспорт', 0.8666187524795532),
 ('мегаспорт', 0.8333557844161987),
 ('спор', 0.805184006690979),
 ('атлетика', 0.7830331921577454),
 ('рапорт', 0.7690808773040771),
 ('паралимпиец', 0.7659856677055359)]

## Векторные представления

## SVD

**CV**

In [407]:
X_text_1_cv = svd_cv.transform(cv.transform(data['text_1_norm']))
X_text_2_cv = svd_cv.transform(cv.transform(data['text_2_norm']))

X_text_cv = np.concatenate([X_text_1_cv, X_text_2_cv], axis=1)

In [408]:
train_X_cv, valid_X_cv, train_y_cv, valid_y_cv = train_test_split(X_text_cv, y, random_state=1)

In [385]:
clf_cv = RandomForestClassifier(
    max_depth=100,
    n_estimators=500,
    class_weight='balanced'
)

clf_cv.fit(train_X_cv, train_y_cv)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=100, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [277]:
from sklearn import metrics

**`svd = TruncatedSVD(200)`**

In [278]:
print('micro-f1 test', metrics.f1_score(valid_y_cv, clf_cv.predict(valid_X_cv), average='micro'))

micro-f1 test 0.5107913669064749


**`svd = TruncatedSVD(100)`**

In [358]:
print('micro-f1 test', metrics.f1_score(valid_y_cv, clf_cv.predict(valid_X_cv), average='micro'))

micro-f1 test 0.5069175428887659


**TF-IDF**

In [409]:
X_text_1_tfidf = svd_tfidf.transform(tfidf.transform(data['text_1_norm']))
X_text_2_tfidf = svd_tfidf.transform(tfidf.transform(data['text_2_norm']))

X_text_tfidf = np.concatenate([X_text_1_tfidf, X_text_2_tfidf], axis=1)

In [387]:
train_X_tfidf, valid_X_tfidf, train_y_tfidf, valid_y_tfidf = train_test_split(X_text_tfidf, y, random_state=1)

In [388]:
clf_tfidf = RandomForestClassifier(
    max_depth=100,
    n_estimators=500,
    class_weight='balanced'
)

clf_tfidf.fit(train_X_tfidf, train_y_tfidf)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=100, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

**`svd = TruncatedSVD(200)`**

In [282]:
print('micro-f1 test', metrics.f1_score(valid_y_tfidf, clf_tfidf.predict(valid_X_tfidf), average='micro'))

micro-f1 test 0.5179856115107914


**`svd = TruncatedSVD(100)`**

In [362]:
print('micro-f1 test', metrics.f1_score(valid_y_tfidf, clf_tfidf.predict(valid_X_tfidf), average='micro'))

micro-f1 test 0.5146651909241837


## NMF

**CV**

In [336]:
X_text_1_nmf_cv = nmf_cv.transform(cv.transform(data['text_1_norm']))
X_text_2_nmf_cv = nmf_cv.transform(cv.transform(data['text_2_norm']))

X_text_nmf_cv = np.concatenate([X_text_1_nmf_cv, X_text_2_nmf_cv], axis=1)

In [337]:
train_X_nmf_cv, valid_X_nmf_cv, train_y_nmf_cv, valid_y_nmf_cv = train_test_split(X_text_nmf_cv, y, random_state=1)

In [338]:
clf_nmf_cv = RandomForestClassifier(
    max_depth=100,
    n_estimators=500,
    class_weight='balanced'
)

clf_nmf_cv.fit(train_X_nmf_cv, train_y_nmf_cv)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=100, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

**`nmf = NMF(50)`**

In [286]:
print('micro-f1 test', metrics.f1_score(valid_y_nmf_cv, clf_nmf_cv.predict(valid_X_nmf_cv), average='micro'))

micro-f1 test 0.5146651909241837


**`nmf = NMF(200)`**

In [339]:
print('micro-f1 test', metrics.f1_score(valid_y_nmf_cv, clf_nmf_cv.predict(valid_X_nmf_cv), average='micro'))

micro-f1 test 0.5279468732706143


**TF-IDF**

In [340]:
X_text_1_nmf_tfidf = nmf_tfidf.transform(tfidf.transform(data['text_1_norm']))
X_text_2_nmf_tfidf = nmf_tfidf.transform(tfidf.transform(data['text_2_norm']))

X_text_nmf_tfidf = np.concatenate([X_text_1_nmf_tfidf, X_text_2_nmf_tfidf], axis=1)

In [341]:
train_X_nmf_tfidf, valid_X_nmf_tfidf, train_y_nmf_tfidf, valid_y_nmf_tfidf = train_test_split(
                                                                                X_text_nmf_tfidf, y, random_state=1)

In [342]:
clf_nmf_tfidf = RandomForestClassifier(
    max_depth=100,
    n_estimators=500,
    class_weight='balanced'
)

clf_nmf_tfidf.fit(train_X_nmf_tfidf, train_y_nmf_tfidf)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=100, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

**`nmf = NMF(50)`**

In [290]:
print('micro-f1 test', metrics.f1_score(valid_y_nmf_tfidf, clf_nmf_tfidf.predict(valid_X_nmf_tfidf), average='micro'))

micro-f1 test 0.5080243497509684


**`nmf = NMF(200)`**

In [343]:
print('micro-f1 test', metrics.f1_score(valid_y_nmf_tfidf, clf_nmf_tfidf.predict(valid_X_nmf_tfidf), average='micro'))

micro-f1 test 0.5229662423907028


## Word2Vec

In [457]:
def get_embedding(text, model, dim):
    text = text.split()
    # чтобы не доставать одно слово несколько раз
    # сделаем счетчик, а потом векторы домножим на частоту
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    
    for i,word in enumerate(words):
        try:
            v = model[word]
            vectors[i] = v * (words[word] / total) # просто умножаем вектор на частоту
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

In [458]:
def get_embedding_tfidf(text, model, dim, documents):
    text = text.split()
    # чтобы не доставать одно слово несколько раз
    # сделаем счетчик, а потом векторы домножим на частоту
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    count_docs = len(documents)
    
    for i, word in enumerate(words):
        try:
            v = model[word]
            idf = math.log10(count_docs / sum([1 for i in documents if word in i]))
            vectors[i] = v * (words[word] / total) * idf

        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

**Без TF-IDF**

In [459]:
dim = 50
X_text_1_w2v = np.zeros((len(data['text_1_norm']), dim))
X_text_2_w2v = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_w2v[i] = get_embedding(text, w2v, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_w2v[i] = get_embedding(text, w2v, dim)

In [460]:
X_text_w2v = np.concatenate([X_text_1_w2v, X_text_2_w2v], axis=1)

In [461]:
train_X_w2v, valid_X_w2v, train_y_w2v, valid_y_w2v = train_test_split(X_text_w2v, y, random_state=1)

In [462]:
clf_w2v = RandomForestClassifier(
    max_depth=100,
    n_estimators=500,
    class_weight='balanced'
)

clf_w2v.fit(train_X_w2v, train_y_w2v)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=100, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [463]:
print('micro-f1 test', metrics.f1_score(valid_y_w2v, clf_w2v.predict(valid_X_w2v), average='micro'))

micro-f1 test 0.5478693967902601


**С TF-IDF**

In [464]:
X_text_1_w2v_tfidf = np.zeros((len(data['text_1_norm']), dim))
X_text_2_w2v_tfidf = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_w2v_tfidf[i] = get_embedding_tfidf(text, w2v, dim, data['text_1_norm'])
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_w2v_tfidf[i] = get_embedding_tfidf(text, w2v, dim, data['text_2_norm'])

In [465]:
X_text_w2v_tfidf = np.concatenate([X_text_1_w2v_tfidf, X_text_2_w2v_tfidf], axis=1)

In [466]:
train_X_w2v_tfidf, valid_X_w2v_tfidf, train_y_w2v_tfidf, valid_y_w2v_tfidf = train_test_split(
    X_text_w2v_tfidf, y, random_state=1)

In [467]:
clf_w2v_tfidf = RandomForestClassifier(
    max_depth=100,
    n_estimators=500,
    class_weight='balanced'
)

clf_w2v_tfidf.fit(train_X_w2v_tfidf, train_y_w2v_tfidf)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=100, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [468]:
print('micro-f1 test', metrics.f1_score(valid_y_w2v_tfidf, clf_w2v_tfidf.predict(valid_X_w2v_tfidf), average='micro'))

micro-f1 test 0.557830658550083


## fastText

### Без нормализации

**Без TF-IDF**

In [497]:
data['text_1_notnorm'] = data['text_1'].apply(tokenize)
data['text_2_notnorm'] = data['text_2'].apply(tokenize)

X_text_1_ft = np.zeros((len(data['text_1_notnorm']), dim))
X_text_2_ft = np.zeros((len(data['text_2_notnorm']), dim))

for i, text in enumerate(data['text_1_notnorm'].values):
    X_text_1_ft[i] = get_embedding(text, fast_text, dim)
    
for i, text in enumerate(data['text_2_notnorm'].values):
    X_text_2_ft[i] = get_embedding(text, fast_text, dim)

In [498]:
X_text_ft = np.concatenate([X_text_1_ft, X_text_2_ft], axis=1)

In [499]:
train_X_ft, valid_X_ft, train_y_ft, valid_y_ft = train_test_split(X_text_ft, y, random_state=1)

In [500]:
clf_ft = RandomForestClassifier(
    max_depth=100,
    n_estimators=500,
    class_weight='balanced'
)

clf_ft.fit(train_X_ft, train_y_ft)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=100, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

**`size = 50`**

In [501]:
print('micro-f1 test', metrics.f1_score(valid_y_ft, clf_ft.predict(valid_X_ft), average='micro'))

micro-f1 test 0.5318206972883232


**`size = 100`**

In [481]:
print('micro-f1 test', metrics.f1_score(valid_y_ft, clf_ft.predict(valid_X_ft), average='micro'))

micro-f1 test 0.5279468732706143


**С TF-IDF**

In [482]:
X_text_1_ft_tfidf = np.zeros((len(data['text_1_notnorm']), dim))
X_text_2_ft_tfidf = np.zeros((len(data['text_2_notnorm']), dim))

for i, text in enumerate(data['text_1_notnorm'].values):
    X_text_1_ft_tfidf[i] = get_embedding_tfidf(text, fast_text, dim, data['text_1_notnorm'])
    
for i, text in enumerate(data['text_2_notnorm'].values):
    X_text_2_ft_tfidf[i] = get_embedding_tfidf(text, fast_text, dim, data['text_2_notnorm'])

In [483]:
X_text_ft_tfidf = np.concatenate([X_text_1_ft_tfidf, X_text_2_ft_tfidf], axis=1)

In [484]:
train_X_ft_tfidf, valid_X_ft_tfidf, train_y_ft_tfidf, valid_y_ft_tfidf = train_test_split(
    X_text_ft_tfidf, y, random_state=1)

In [485]:
clf_ft_tfidf = RandomForestClassifier(
    max_depth=100,
    n_estimators=500,
    class_weight='balanced'
)

clf_ft_tfidf.fit(train_X_ft_tfidf, train_y_ft_tfidf)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=100, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

**`size=50`**

In [303]:
print('micro-f1 test', metrics.f1_score(valid_y_ft_tfidf, clf_ft_tfidf.predict(valid_X_ft_tfidf), average='micro'))

micro-f1 test 0.5356945213060321


**`size=100`**

In [486]:
print('micro-f1 test', metrics.f1_score(valid_y_ft_tfidf, clf_ft_tfidf.predict(valid_X_ft_tfidf), average='micro'))

micro-f1 test 0.5323741007194245


### С нормализацией

**Без TF-IDF**

In [140]:
X_text_1_ft_norm = np.zeros((len(data['text_1_norm']), dim))
X_text_2_ft_norm = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_ft_norm[i] = get_embedding(text, fast_text, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_ft_norm[i] = get_embedding(text, fast_text, dim)

In [149]:
X_text_ft_norm = np.concatenate([X_text_1_ft_norm, X_text_2_ft_norm], axis=1)

In [304]:
train_X_ft_norm, valid_X_ft_norm, train_y_ft_norm, valid_y_ft_norm = train_test_split(X_text_ft_norm, y, random_state=1)

In [305]:
clf_ft_norm = RandomForestClassifier(
    max_depth=100,
    n_estimators=500,
    class_weight='balanced'
)

clf_ft_norm.fit(train_X_ft_norm, train_y_ft_norm)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=100, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [306]:
print('micro-f1 test', metrics.f1_score(valid_y_ft_norm, clf_ft_norm.predict(valid_X_ft_norm), average='micro'))

micro-f1 test 0.5428887659103486


**С TF-IDF**

In [139]:
X_text_1_ft_norm_tfidf = np.zeros((len(data['text_1_norm']), dim))
X_text_2_ft_norm_tfidf = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_ft_norm_tfidf[i] = get_embedding_tfidf(text, fast_text, dim, data['text_1_norm'])
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_ft_norm_tfidf[i] = get_embedding_tfidf(text, fast_text, dim, data['text_2_norm'])

In [150]:
X_text_ft_norm_tfidf = np.concatenate([X_text_1_ft_norm_tfidf, X_text_2_ft_norm_tfidf], axis=1)

In [307]:
train_X_ft_norm_tfidf, valid_X_ft_norm_tfidf, train_y_ft_norm_tfidf, valid_y_ft_norm_tfidf = train_test_split(
    X_text_ft_norm_tfidf, y, random_state=1)

In [308]:
clf_ft_norm_tfidf = RandomForestClassifier(
    max_depth=100,
    n_estimators=500,
    class_weight='balanced'
)

clf_ft_norm_tfidf.fit(train_X_ft_norm_tfidf, train_y_ft_norm_tfidf)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=100, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [309]:
print('micro-f1 test', metrics.f1_score(valid_y_ft_norm_tfidf, clf_ft_norm_tfidf.predict(valid_X_ft_norm_tfidf), 
                                        average='micro'))

micro-f1 test 0.5268400664084117


### Все переменные с векторами:

**SVD**

`X_text_cv (X_text_1_cv, X_text_2_cv)
X_text_tfidf (X_text_1_tfidf, X_text_2_tfidf)`

**NMF**

`X_text_nmf_cv (X_text_1_nmf_cv, X_text_2_nmf_cv)
X_text_nmf_tfidf (X_text_1_nmf_tfidf, X_text_2_nmf_tfidf)`

**Word2Vec**

`X_text_w2v (X_text_1_w2v, X_text_2_w2v)
X_text_w2v_tfidf (X_text_1_w2v_tfidf, X_text_2_w2v_tfidf)`

**FastText**

`X_text_ft (X_text_1_ft, X_text_2_ft)
X_text_ft_tfidf (X_text_1_ft_tfidf, X_text_2_ft_tfidf)
X_text_ft_norm (X_text_1_ft_norm, X_text_2_ft_norm)
X_text_ft_norm_tfidf (X_text_1_ft_norm_tfidf, X_text_2_ft_norm_tfidf)`

**Косинусные близости**

In [159]:
from sklearn.metrics.pairwise import cosine_similarity

In [469]:
def compute_cosine_similarity(text1, text2):
    return [float(cosine_similarity([t1], [t2])) for t1, t2 in zip(text1, text2)]

In [487]:
SVD_cv = compute_cosine_similarity(X_text_1_cv, X_text_2_cv)
SVD_tfidf = compute_cosine_similarity(X_text_1_tfidf, X_text_2_tfidf)
NMF_cv = compute_cosine_similarity(X_text_1_nmf_cv, X_text_2_nmf_cv)
NMF_tfidf = compute_cosine_similarity(X_text_1_nmf_tfidf, X_text_2_nmf_tfidf)
W2V = compute_cosine_similarity(X_text_1_w2v, X_text_2_w2v)
W2V_tfidf = compute_cosine_similarity(X_text_1_w2v_tfidf, X_text_2_w2v_tfidf)
FT = compute_cosine_similarity(X_text_1_ft, X_text_2_ft)
FT_tfidf = compute_cosine_similarity(X_text_1_ft_tfidf, X_text_2_ft_tfidf)
FT_norm = compute_cosine_similarity(X_text_1_ft_norm, X_text_2_ft_norm)
FT_norm_tfidf = compute_cosine_similarity(X_text_1_ft_norm_tfidf, X_text_2_ft_norm_tfidf)

In [488]:
cosine_data = list(zip(SVD_cv, SVD_tfidf, NMF_cv, NMF_tfidf, W2V, W2V_tfidf, FT, FT_tfidf, FT_norm, FT_norm_tfidf))

In [489]:
cosine_df = pd.DataFrame(cosine_data, columns=['SVD_cv', 'SVD_tfidf', 'NMF_cv', 'NMF_tfidf', 'W2V',
                                       'W2V_tfidf', 'FT', 'FT_tfidf', 'FT_norm', 'FT_norm_tfidf'])

In [490]:
cosine_df.head()

Unnamed: 0,SVD_cv,SVD_tfidf,NMF_cv,NMF_tfidf,W2V,W2V_tfidf,FT,FT_tfidf,FT_norm,FT_norm_tfidf
0,0.483287,0.623122,0.87426,0.932015,0.936977,0.923466,0.908411,0.613016,0.842846,0.854598
1,0.773074,0.847185,0.932658,0.954364,0.921322,0.935636,0.882652,0.662348,0.802194,0.829099
2,0.630779,0.785893,0.962041,0.991323,0.961131,0.961387,0.951098,0.755105,0.751043,0.737266
3,0.75307,0.688527,0.849303,0.739186,0.763902,0.727795,0.816893,0.802922,0.821385,0.827139
4,0.992755,0.988164,0.979622,0.977436,0.914311,0.908674,0.451732,0.841454,0.611627,0.734232


In [491]:
df_train, df_test, y_train, y_test = train_test_split(cosine_df, y, random_state=1, shuffle=True)
print('train', df_train.shape[0])
print('test', df_test.shape[0])

train 5420
test 1807


In [492]:
clf = RandomForestClassifier(
    max_depth=100,
    n_estimators=500,
    class_weight='balanced'
)

clf.fit(df_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=100, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

**`svd = TruncatedSVD(200), nmf = NMF(50)`**

In [321]:
print('micro-f1 test', metrics.f1_score(y_test, clf.predict(df_test), average='micro'))

micro-f1 test 0.5694521306032098


**`svd = TruncatedSVD(100), nmf = NMF(200)`**

In [372]:
print('micro-f1 test', metrics.f1_score(y_test, clf.predict(df_test), average='micro'))

micro-f1 test 0.5794133923630327


**`svd = TruncatedSVD(200), nmf = NMF(200)`**

In [397]:
print('micro-f1 test', metrics.f1_score(y_test, clf.predict(df_test), average='micro'))

micro-f1 test 0.5722191477587161


**`svd = TruncatedSVD(200), nmf = NMF(200), fastText(size=100)`**

In [493]:
print('micro-f1 test', metrics.f1_score(y_test, clf.predict(df_test), average='micro'))

micro-f1 test 0.5744327614831212


**Итог:** лучше с **`svd = TruncatedSVD(100), nmf = NMF(200)`**

Изменение значения параметра `size` для `Word2Vec` и `fastText` (с 50 на 100) прироста в качестве не дало.