In [4]:
import pandas as pd
import gensim
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
from string import punctuation
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.metrics.pairwise import cosine_distances
from gensim.models.wrappers import FastText
from sklearn.linear_model import LogisticRegression

In [7]:
def opener(fname):
    f = open(fname, encoding = 'utf-8')
    text = f.read()
    f.close()
    return text

morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

def tokenize(text):
    words = [word.strip(punct).lower() for word in text.split()]
    words = [word for word in words if word]
    return words

def opt_normalize(texts):
    uniq = set()
    for text in texts:
        uniq.update(text)
    uniq = list(uniq)
    
    norm_uniq = {word:morph.parse(word)[0].normal_form for word in uniq}
    
    norm_texts = []
    for text in texts:
        norm_words = [norm_uniq.get(word) for word in text]
        norm_words = [word for word in norm_words if word and word not in stops]
        norm_texts.append(norm_words)
        
    return norm_texts

def cleaner(text):
    text = text.split()
    clean = []
    for word in text:
        word = word.split('_')[0]
        clean.append(word)
    return ' '.join(clean)

In [8]:
data = pd.read_csv('data_paraphraser_norm.csv')

In [9]:
data['text_1_norm1'] = data['text_1_norm'].apply(cleaner)
data['text_2_norm1'] = data['text_2_norm'].apply(cleaner)

In [9]:
data.head(1)

Unnamed: 0,label,text_1,text_2,text_1_norm,text_2_norm,text_1_norm1,text_2_norm1
0,0,Полицейским разрешат стрелять на поражение по ...,Полиции могут разрешить стрелять по хулиганам ...,полицейский_NOUN разрешать_VERB стрелять_VERB ...,полиция_NOUN мочь_VERB разрешать_VERB стрелять...,полицейский разрешать стрелять на поражение по...,полиция мочь разрешать стрелять по хулиган с т...


## Предобученная word2vec модель

In [5]:
pretrained_model = gensim.models.KeyedVectors.load_word2vec_format('184/model.bin', binary=True)

In [6]:
def get_embedding(text, model, dim):
    text = text.split()
    
    # чтобы не доставать одно слово несколько раз
    # сделаем счетчик, а потом векторы домножим на частоту
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    
    for i,word in enumerate(words):
        try:
            v = model[word]
            vectors[i] = v*(words[word]/total) # просто умножаем вектор на частоту
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

In [0]:
dim = 300
X_text_1_w2v = np.zeros((len(data['text_1_norm']), dim))
X_text_2_w2v = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_w2v[i] = get_embedding(text, pretrained_model, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_w2v[i] = get_embedding(text, pretrained_model, dim)

In [0]:
X_text_w2v = np.concatenate([X_text_1_w2v, X_text_2_w2v], axis=1)
y = data['label'].values

In [0]:
clf = RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_leaf=15,
                             class_weight='balanced')

In [0]:
scores = cross_val_score(clf, X_text_w2v, y, cv=5, scoring='f1_macro')

In [0]:
print('Macro f1-score')
for score in scores:
    print(score)
print('\nMean score')
print(np.mean(scores))

Macro f1-score
0.4622166050852432
0.48773401791635845
0.4585686319597005
0.3561232968732524
0.3641705559374478

Mean score
0.4257626215544005


## Моя word2vec модель

Корпус - 25000  текстов из Лента.ру

In [0]:
corpus = opener('some_corpus.txt').split('.')

In [0]:
norm_corpus = opt_normalize([tokenize(text)for text in corpus])

In [0]:
my_w2v = gensim.models.Word2Vec(norm_corpus, size=300, sg=1)

In [15]:
dim = 300
X_text_1_w2v_my = np.zeros((len(data['text_1_norm1']), dim))
X_text_2_w2v_my = np.zeros((len(data['text_2_norm1']), dim))

for i, text in enumerate(data['text_1_norm1'].values):
    X_text_1_w2v_my[i] = get_embedding(text, my_w2v, dim)
    
for i, text in enumerate(data['text_2_norm1'].values):
    X_text_2_w2v_my[i] = get_embedding(text, my_w2v, dim)

  if sys.path[0] == '':


In [0]:
X_text_w2v_my = np.concatenate([X_text_1_w2v_my, X_text_2_w2v_my], axis=1)

In [0]:
clf2 = RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_leaf=15,
                             class_weight='balanced')

In [0]:
scores2 = cross_val_score(clf2, X_text_w2v_my, y, cv=5, scoring='f1_macro')

In [0]:
print('Macro f1-score')
for score in scores2:
    print(score)
print('\nMean score')
print(np.mean(scores2))

Macro f1-score
0.4496111957436166
0.4812633364731411
0.47000590697285655
0.3607766732863378
0.36493711732395245

Mean score
0.42531884595998093


Классификатор, обученный на векторах модели с rusvectores показывает macro F-score 0.4257626215544005

Классификатор, обученный на векторах собственной модели показывает macro F-score 0.42531884595998093

## Cosine distances

In [0]:
new_train = np.zeros((7227, 5))

SVD

In [0]:
svd = TruncatedSVD(100, n_iter = 8, random_state = 1)

In [0]:
tfidf1 = TfidfVectorizer(min_df=3, max_df=0.4, max_features=1000)
tfidf1.fit(data['text_1_norm'])
X_text_1 = svd.fit_transform(tfidf1.transform(data['text_1_norm']))

In [0]:
tfidf2 = TfidfVectorizer(min_df=3, max_df=0.4, max_features=1000)
tfidf2.fit(data['text_2_norm'])
X_text_2 = svd.fit_transform(tfidf2.transform(data['text_2_norm']))

In [0]:
for i in range(X_text_1.shape[0]):
    cs = cosine_distances([X_text_1[i]], [X_text_2[i]])
    new_train[i][0] = cs

NMF

In [0]:
nmf = NMF(50, solver='mu', max_iter = 200, random_state = 1)

In [0]:
tfidf1 = TfidfVectorizer(min_df=3, max_df=0.4, max_features=1000)
tfidf1.fit(data['text_1_norm'])
X_text_1_nmf = nmf.fit_transform(tfidf1.transform(data['text_1_norm']))

In [0]:
tfidf2 = TfidfVectorizer(min_df=3, max_df=0.4, max_features=1000)
tfidf2.fit(data['text_2_norm'])
X_text_2_nmf = nmf.transform(tfidf2.transform(data['text_2_norm']))

In [0]:
for i in range(X_text_1_nmf.shape[0]):
    cs = cosine_distances([X_text_1_nmf[i]], [X_text_2_nmf[i]])
    new_train[i][1] = cs

Готовый W2V

In [0]:
for i in range(X_text_1_w2v.shape[0]):
    cs = cosine_distances([X_text_1_w2v[i]], [X_text_2_w2v[i]])
    new_train[i][2] = cs

Мой W2V

In [0]:
for i in range(X_text_1_w2v_my.shape[0]):
    cs = cosine_distances([X_text_1_w2v_my[i]], [X_text_2_w2v_my[i]])
    new_train[i][3] = cs

FastText

In [0]:
fast_text = gensim.models.FastText(norm_corpus, size=300, min_n=4, max_n=8) 

In [29]:
dim = 300
X_text_1_ft = np.zeros((len(data['text_1_norm']), dim))
X_text_2_ft = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_ft[i] = get_embedding(text, fast_text, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_ft[i] = get_embedding(text, fast_text, dim)

  if sys.path[0] == '':


In [0]:
for i in range(X_text_1_ft.shape[0]):
    cs = cosine_distances([X_text_1_ft[i]], [X_text_2_ft[i]])
    new_train[i][4] = cs

Классификатор

In [0]:
clf3 = LogisticRegression(C=1000)

In [0]:
y = data['label'].values

In [None]:
scores3 = cross_val_score(clf3, new_train, y, cv=5, scoring='f1_micro')

In [77]:
print('Micro f1-score')
for score in scores3:
    print(score)
print('\nMean score')
print(np.mean(scores3))

Micro f1-score
0.5712309820193637
0.6203319502074689
0.6221453287197232
0.48304498269896196
0.5169550173010381

Mean score
0.5627416521893112


Удалось немного повысить значение F-score увеличив размерность для SVD и NMF, количество итераций для SVD. Увеличение количества признаков в tfidf векторайзере ухудшало результат