In [1]:
import pandas as pd
from lxml import html

In [86]:
corpus_xml = html.fromstring(open('paraphraser/paraphrases.xml', 'rb').read())

In [87]:
texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])

In [88]:
data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

In [5]:
data

Unnamed: 0,label,text_1,text_2
0,0,Полицейским разрешат стрелять на поражение по ...,Полицейским разрешат стрелять на поражение по ...
1,0,Право полицейских на проникновение в жилище ре...,Право полицейских на проникновение в жилище ре...
2,0,Президент Египта ввел чрезвычайное положение в...,Президент Египта ввел чрезвычайное положение в...
3,-1,Вернувшихся из Сирии россиян волнует вопрос тр...,Вернувшихся из Сирии россиян волнует вопрос тр...
4,0,В Москву из Сирии вернулись 2 самолета МЧС с р...,В Москву из Сирии вернулись 2 самолета МЧС с р...
5,1,Приставы соберут отпечатки пальцев российских ...,Приставы соберут отпечатки пальцев российских ...
6,-1,На саратовского дебошира с борта самолета Моск...,На саратовского дебошира с борта самолета Моск...
7,0,ЦИК хочет отказаться от электронной системы по...,ЦИК хочет отказаться от электронной системы по...
8,-1,Суд Петербурга оставил на потом дело о гибели ...,Суд Петербурга оставил на потом дело о гибели ...
9,-1,Страны ОПЕК сократили добычу нефти на 1 млн ба...,Страны ОПЕК сократили добычу нефти на 1 млн ба...


In [6]:
texts = pd.concat([data['text_1'], data['text_2']], ignore_index=True)

In [89]:

from string import punctuation
import os
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0].normal_form for word in words if word and word not in stops]

    return ' '.join(words)

In [8]:


data_rt = pd.concat([pd.read_json('data/'+file, lines=True) for file in os.listdir('data')], ignore_index=True)

In [9]:
data_rt['content_norm'] = data_rt['content'].apply(normalize)

In [10]:
data_rt.shape

(7217, 6)

In [11]:
texts_norm = texts.apply(normalize)

In [12]:
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import RandomForestClassifier

In [13]:
tfidf = TfidfVectorizer(min_df=3, max_df=0.4, max_features=10000)
X = tfidf.fit_transform(data_rt['content_norm'])

In [14]:
X.shape

(7217, 10000)

In [92]:
svd = TruncatedSVD(50)

In [93]:
nmf = NMF(50)

In [95]:
svd.fit(X)

TruncatedSVD(algorithm='randomized', n_components=50, n_iter=5,
       random_state=None, tol=0.0)

In [96]:
nmf.fit(X)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=50, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [19]:
id2vec = nmf.components_.T

In [20]:
id2vec = svd.components_.T

In [119]:
id2word = {i:w for i,w in enumerate(tfidf.get_feature_names())}
word2id = {w:i for i,w in id2word.items()}

In [22]:
[id2word[i] for i in cosine_distances(id2vec[word2id['налоговый']].reshape(1, -1), id2vec).argsort()[0][:10]]

['налоговый',
 'налог',
 'налогообложение',
 'амнистия',
 'уплата',
 'доход',
 'казна',
 'стимулирование',
 'бремя',
 'декрет']

In [127]:
cluster = MiniBatchKMeans(500, n_init=2, verbose=1, max_no_improvement=100, reassignment_ratio=0.4)
cluster.fit(svd.components_.T)

Init 1/2 with method: k-means++


  init_size=init_size)


Inertia for init 1/2: 0.085021
Init 2/2 with method: k-means++


  init_size=init_size)


Inertia for init 2/2: 0.186657
Minibatch iteration 1/10000: mean batch inertia: 0.002839, ewa inertia: 0.002839 
Minibatch iteration 2/10000: mean batch inertia: 0.001092, ewa inertia: 0.002804 
Minibatch iteration 3/10000: mean batch inertia: 0.000949, ewa inertia: 0.002767 
Minibatch iteration 4/10000: mean batch inertia: 0.000789, ewa inertia: 0.002728 
Minibatch iteration 5/10000: mean batch inertia: 0.001346, ewa inertia: 0.002700 
Minibatch iteration 6/10000: mean batch inertia: 0.001524, ewa inertia: 0.002676 
Minibatch iteration 7/10000: mean batch inertia: 0.001003, ewa inertia: 0.002643 
Minibatch iteration 8/10000: mean batch inertia: 0.000970, ewa inertia: 0.002609 
Minibatch iteration 9/10000: mean batch inertia: 0.001068, ewa inertia: 0.002579 
[MiniBatchKMeans] Reassigning 50 cluster centers.
Minibatch iteration 10/10000: mean batch inertia: 0.000966, ewa inertia: 0.002546 
Minibatch iteration 11/10000: mean batch inertia: 0.001519, ewa inertia: 0.002526 
Minibatch itera

Minibatch iteration 96/10000: mean batch inertia: 0.001080, ewa inertia: 0.001748 
Minibatch iteration 97/10000: mean batch inertia: 0.001576, ewa inertia: 0.001745 
Minibatch iteration 98/10000: mean batch inertia: 0.000589, ewa inertia: 0.001721 
Minibatch iteration 99/10000: mean batch inertia: 0.000601, ewa inertia: 0.001699 
Minibatch iteration 100/10000: mean batch inertia: 0.003360, ewa inertia: 0.001732 
Minibatch iteration 101/10000: mean batch inertia: 0.002749, ewa inertia: 0.001753 
Minibatch iteration 102/10000: mean batch inertia: 0.001112, ewa inertia: 0.001740 
Minibatch iteration 103/10000: mean batch inertia: 0.005703, ewa inertia: 0.001819 
[MiniBatchKMeans] Reassigning 50 cluster centers.
Minibatch iteration 104/10000: mean batch inertia: 0.000650, ewa inertia: 0.001796 
Minibatch iteration 105/10000: mean batch inertia: 0.000851, ewa inertia: 0.001777 
Minibatch iteration 106/10000: mean batch inertia: 0.001660, ewa inertia: 0.001774 
Minibatch iteration 107/10000:

Minibatch iteration 198/10000: mean batch inertia: 0.001032, ewa inertia: 0.001910 
Minibatch iteration 199/10000: mean batch inertia: 0.001195, ewa inertia: 0.001896 
Converged (lack of improvement in inertia) at iteration 199/10000
Computing label assignment and total inertia


MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
        init_size=None, max_iter=100, max_no_improvement=100,
        n_clusters=500, n_init=2, random_state=None,
        reassignment_ratio=0.4, tol=0.0, verbose=1)

In [128]:

cls = defaultdict(list)

for i, cl in enumerate(cluster.labels_):
    cls[cl].append(id2word[i])

f = open('cluster_svd.txt', 'w')
for cl in cls:
    f.write('### '+ str(cl) + ' ###\n')
    f.write('\n'.join(cls[cl]))
    f.write('\n\n')
f.close()

In [122]:
import gensim
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter,defaultdict

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('налоговый', 0.8855769038200378),
 ('уплата', 0.8631002902984619),
 ('подоходный', 0.8487942218780518),
 ('выплачивать', 0.8414701819419861),
 ('льгота', 0.8335127234458923),
 ('выплата', 0.8300567269325256),
 ('списание', 0.8197252750396729),
 ('тунеядство', 0.8170132637023926),
 ('облагаться', 0.8167826533317566),
 ('доход', 0.8118306398391724)]

In [90]:
data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)

In [97]:
X_text_1 = svd.transform(tfidf.transform(data['text_1_norm']))
X_text_2 = svd.transform(tfidf.transform(data['text_2_norm']))

X_text = np.concatenate([X_text_1, X_text_2], axis=1)

In [149]:
X_text.shape

(7227, 400)

In [99]:
y = data['label'].values
print(y.shape)

(7227,)


In [100]:
train_X, valid_X, train_y, valid_y = train_test_split(X_text, y, random_state=1)
clf = LogisticRegression(C=1000, class_weight='balanced',  multi_class='auto')
clf.fit(train_X, train_y)
preds = clf.predict(valid_X)
print(classification_report(valid_y, preds))




              precision    recall  f1-score   support

          -1       0.44      0.42      0.43       629
           0       0.43      0.51      0.47       737
           1       0.33      0.25      0.29       441

   micro avg       0.42      0.42      0.42      1807
   macro avg       0.40      0.39      0.39      1807
weighted avg       0.41      0.42      0.41      1807



In [101]:
train_X, valid_X, train_y, valid_y = train_test_split(X_text, y, random_state=1)
clf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_leaf=10,
                             class_weight='balanced')
clf.fit(train_X, train_y)
preds = clf.predict(valid_X)
print(classification_report(valid_y, preds))


              precision    recall  f1-score   support

          -1       0.58      0.60      0.59       629
           0       0.50      0.47      0.48       737
           1       0.38      0.41      0.40       441

   micro avg       0.50      0.50      0.50      1807
   macro avg       0.49      0.49      0.49      1807
weighted avg       0.50      0.50      0.50      1807



In [102]:
X_text_1_nmf = nmf.transform(tfidf.transform(data['text_1_norm']))
X_text_2_nmf = nmf.transform(tfidf.transform(data['text_2_norm']))

X_text_nmf = np.concatenate([X_text_1_nmf, X_text_2_nmf], axis=1)

In [103]:
train_X, valid_X, train_y, valid_y = train_test_split(X_text_nmf, y,random_state=1)
clf = LogisticRegression(C=10000, class_weight='balanced',  multi_class='auto')
clf.fit(train_X, train_y)
preds = clf.predict(valid_X)
print(classification_report(valid_y, preds))


              precision    recall  f1-score   support

          -1       0.43      0.42      0.43       629
           0       0.44      0.53      0.48       737
           1       0.32      0.22      0.26       441

   micro avg       0.41      0.41      0.41      1807
   macro avg       0.39      0.39      0.39      1807
weighted avg       0.40      0.41      0.41      1807





In [104]:
train_X, valid_X, train_y, valid_y = train_test_split(X_text_nmf, y,random_state=1)
clf = RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_leaf=15,
                             class_weight='balanced')
clf.fit(train_X, train_y)
preds = clf.predict(valid_X)
print(classification_report(valid_y, preds))


              precision    recall  f1-score   support

          -1       0.51      0.48      0.50       629
           0       0.46      0.42      0.44       737
           1       0.36      0.44      0.40       441

   micro avg       0.45      0.45      0.45      1807
   macro avg       0.44      0.45      0.44      1807
weighted avg       0.45      0.45      0.45      1807



In [137]:
fast_text = gensim.models.FastText([text.split() for text in data_rt['content_norm']], size=200, min_n=4, max_n=8)

In [27]:
w2v = gensim.models.Word2Vec([text.split() for text in data_rt['content_norm']], size=200, sg=1)


In [28]:
def get_embedding(text, model, dim):
    text = text.split()
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    
    for i,word in enumerate(words):
        try:
            v = model[word]
            vectors[i] = v*(words[word]/total)
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector
        

In [105]:
dim = 200
X_text_1_w2v = np.zeros((len(data['text_1_norm']), dim))
X_text_2_w2v = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_w2v[i] = get_embedding(text, w2v, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_w2v[i] = get_embedding(text, w2v, dim)

  if __name__ == '__main__':


In [106]:
X_text_w2v = np.concatenate([X_text_1_w2v, X_text_2_w2v], axis=1)

In [107]:
train_X, valid_X, train_y, valid_y = train_test_split(X_text_w2v, y,random_state=3)
clf = RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_leaf=15,
                             class_weight='balanced')
clf.fit(train_X, train_y)
preds = clf.predict(valid_X)
print(classification_report(valid_y, preds))


              precision    recall  f1-score   support

          -1       0.60      0.45      0.51       663
           0       0.46      0.53      0.49       700
           1       0.35      0.40      0.38       444

   micro avg       0.47      0.47      0.47      1807
   macro avg       0.47      0.46      0.46      1807
weighted avg       0.48      0.47      0.47      1807



In [108]:
train_X, valid_X, train_y, valid_y = train_test_split(X_text_w2v, y,random_state=1)
clf = LogisticRegression(C=1000,  multi_class='auto')
clf.fit(train_X, train_y)
preds = clf.predict(valid_X)
print(classification_report(valid_y, preds))




              precision    recall  f1-score   support

          -1       0.48      0.50      0.49       629
           0       0.44      0.55      0.49       737
           1       0.33      0.17      0.23       441

   micro avg       0.44      0.44      0.44      1807
   macro avg       0.42      0.41      0.40      1807
weighted avg       0.43      0.44      0.43      1807



In [None]:
X_text_nmf = np.concatenate([X_text_1_nmf, X_text_2_nmf], axis=1)

In [115]:
i = 5
print(fast_text.wv.wmdistance(data['text_1_norm'][i].split(), data['text_2_norm'][i].split()), data['label'][i])

6.7863368772822925 1


In [84]:
data['text_2_norm'][0].split()

['полицейский',
 'разрешить',
 'стрелять',
 'поражение',
 'гражданин',
 'травматика']

In [215]:
X_text_w2v = np.concatenate([X_text_1_w2v, X_text_2_w2v], axis=1)

AxisError: axis 1 is out of bounds for array of dimension 1

In [138]:
matrix = np.zeros((len(id2word), 200))

for i, word in id2word.items():
    if word in fast_text:
        matrix[i] = fast_text[word]

  after removing the cwd from sys.path.
  """


In [139]:
cluster = MiniBatchKMeans(1000, n_init=2, verbose=1, max_no_improvement=100, reassignment_ratio=0.4)
cluster.fit(matrix)

Init 1/2 with method: k-means++


  init_size=init_size)


Inertia for init 1/2: 2091.569199
Init 2/2 with method: k-means++


  init_size=init_size)


Inertia for init 2/2: 2654.220768
Minibatch iteration 1/10000: mean batch inertia: 44.127543, ewa inertia: 44.127543 
Minibatch iteration 2/10000: mean batch inertia: 37.804940, ewa inertia: 44.001103 
Minibatch iteration 3/10000: mean batch inertia: 38.688051, ewa inertia: 43.894853 
Minibatch iteration 4/10000: mean batch inertia: 26.969574, ewa inertia: 43.556381 
Minibatch iteration 5/10000: mean batch inertia: 37.426144, ewa inertia: 43.433789 
Minibatch iteration 6/10000: mean batch inertia: 34.219725, ewa inertia: 43.249526 
Minibatch iteration 7/10000: mean batch inertia: 37.154670, ewa inertia: 43.127641 
Minibatch iteration 8/10000: mean batch inertia: 36.273954, ewa inertia: 42.990581 
Minibatch iteration 9/10000: mean batch inertia: 30.231793, ewa inertia: 42.735431 
[MiniBatchKMeans] Reassigning 50 cluster centers.
Minibatch iteration 10/10000: mean batch inertia: 28.715429, ewa inertia: 42.455059 
Minibatch iteration 11/10000: mean batch inertia: 46.815302, ewa inertia: 4

Minibatch iteration 105/10000: mean batch inertia: 30.369461, ewa inertia: 34.357798 
Minibatch iteration 106/10000: mean batch inertia: 27.911005, ewa inertia: 34.228875 
Minibatch iteration 107/10000: mean batch inertia: 34.357855, ewa inertia: 34.231455 
Minibatch iteration 108/10000: mean batch inertia: 32.949968, ewa inertia: 34.205827 
Minibatch iteration 109/10000: mean batch inertia: 34.544193, ewa inertia: 34.212594 
[MiniBatchKMeans] Reassigning 50 cluster centers.
Minibatch iteration 110/10000: mean batch inertia: 29.845598, ewa inertia: 34.125263 
Minibatch iteration 111/10000: mean batch inertia: 29.377149, ewa inertia: 34.030310 
Minibatch iteration 112/10000: mean batch inertia: 34.717326, ewa inertia: 34.044049 
Minibatch iteration 113/10000: mean batch inertia: 24.921046, ewa inertia: 33.861607 
Minibatch iteration 114/10000: mean batch inertia: 30.813571, ewa inertia: 33.800653 
Minibatch iteration 115/10000: mean batch inertia: 27.581700, ewa inertia: 33.676286 
Mini

Minibatch iteration 201/10000: mean batch inertia: 39.479277, ewa inertia: 32.622800 
Minibatch iteration 202/10000: mean batch inertia: 36.517420, ewa inertia: 32.700684 
Minibatch iteration 203/10000: mean batch inertia: 31.273548, ewa inertia: 32.672144 
[MiniBatchKMeans] Reassigning 50 cluster centers.
Minibatch iteration 204/10000: mean batch inertia: 33.671338, ewa inertia: 32.692126 
Minibatch iteration 205/10000: mean batch inertia: 22.457188, ewa inertia: 32.487448 
Minibatch iteration 206/10000: mean batch inertia: 38.101097, ewa inertia: 32.599710 
Minibatch iteration 207/10000: mean batch inertia: 34.399462, ewa inertia: 32.635701 
Minibatch iteration 208/10000: mean batch inertia: 38.598266, ewa inertia: 32.754941 
Minibatch iteration 209/10000: mean batch inertia: 26.187133, ewa inertia: 32.623598 
Minibatch iteration 210/10000: mean batch inertia: 33.061973, ewa inertia: 32.632364 
Minibatch iteration 211/10000: mean batch inertia: 23.232047, ewa inertia: 32.444377 
Mini

MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
        init_size=None, max_iter=100, max_no_improvement=100,
        n_clusters=1000, n_init=2, random_state=None,
        reassignment_ratio=0.4, tol=0.0, verbose=1)

In [140]:

cls = defaultdict(list)

for i, cl in enumerate(cluster.labels_):
    cls[cl].append(id2word[i])

f = open('cluster_ft.txt', 'w')
for cl in cls:
    f.write('### '+ str(cl) + ' ###\n')
    f.write('\n'.join(cls[cl]))
    f.write('\n\n')
f.close()