In [1]:
import re
import string
import pandas as pd
from lxml import html
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD, NMF, PCA
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import RandomForestClassifier
import gensim
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter,defaultdict
from string import punctuation
import os
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
%matplotlib inline

morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))


In [2]:
PATH_TO_DATA = './data'

In [3]:
files = [os.path.join(PATH_TO_DATA, file) for file in os.listdir(PATH_TO_DATA)[1:]]

In [4]:
data_rt = pd.concat([pd.read_json(file, lines=True) for file in files], axis=0, ignore_index=True)

In [5]:
data_rt.head()

Unnamed: 0,abstract,content,keywords,summary,title,url
0,Изложен метод проектирования устройств подачи ...,УДК 687.053\n\nМАТЕМАТИЧЕСКАЯ МОДЕЛЬ РАБОЧЕГО ...,"[КРАЕОБМЕТОЧНЫЕ ШВЕЙНЫЕ МАШИНЫ, РАСЧЕТ НИТЕПОД...",,Математическая модель рабочего процесса образо...,https://cyberleninka.ru/article/n/matematiches...
1,В статье представлены исследовательские методы...,"/84\n\nCivil SecurityTechnology, Vol. 8, 2011,...","[панель из ячеистого бетона, аварийное состоян...",,Инженерная безопасность эксплуатации жилых зда...,https://cyberleninka.ru/article/n/inzhenernaya...
2,,Ю.В. Чудодеев\nИВ РАН\n\nМировой финансово-эко...,"[КИТАЙ, МИРОВОЙ КРИЗИС, ФИНАНСЫ, ЭКОНОМИКА]",,Мировой финансово-экономический кризис как выз...,https://cyberleninka.ru/article/n/mirovoy-fina...
3,Представлено исследование актуальной философск...,Методологические проблемы сравнительного анали...,"[ТЕХНИЧЕСКАЯ РЕАЛЬНОСТЬ, ПАРАДИГМА, ФИЛОСОФСКИ...",,Методологические проблемы сравнительного анали...,https://cyberleninka.ru/article/n/metodologich...
4,В статье анализируется суть как и понятия. О...,Социокультурный феномен православного монашест...,"[СОЦИОКУЛЬТУРНЫЙ ФЕНОМЕН, МОНАШЕСТВО, ПРАВОСЛА...",,Социокультурный феномен православного монашества,https://cyberleninka.ru/article/n/sotsiokultur...


In [6]:
def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0].normal_form for word in words if word and word not in stops]

    return ' '.join(words)

def tokenize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]

    return ' '.join(words)

In [7]:
data_rt['content_norm'] = data_rt['content'].apply(normalize)

In [8]:
cv = CountVectorizer(min_df=3, max_df=0.4, max_features=1000)
X = cv.fit_transform(data_rt['content_norm'])

In [9]:
X.shape

(17266, 1000)

In [11]:
tfidf = TfidfVectorizer(min_df=3, max_df=0.4, max_features=1000)
X1 = tfidf.fit_transform(data_rt['content_norm'])

In [12]:
corpus_xml = html.fromstring(open('paraphrases.xml', 'rb').read())
texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])
    
data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

In [13]:
data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)

#### SVD через CountVectorizer

In [14]:
svd = TruncatedSVD(100)
svd.fit(X)

TruncatedSVD(algorithm='randomized', n_components=100, n_iter=5,
       random_state=None, tol=0.0)

In [15]:
X_text_1 = svd.transform(cv.transform(data['text_1_norm']))
X_text_2 = svd.transform(cv.transform(data['text_2_norm']))

X_svd = [X_text_1, X_text_2]

#### SVD через TfidfVectorizer

In [16]:
svd1 = TruncatedSVD(100)
svd1.fit(X1)

TruncatedSVD(algorithm='randomized', n_components=100, n_iter=5,
       random_state=None, tol=0.0)

In [17]:
X_text_1 = svd1.transform(tfidf.transform(data['text_1_norm']))
X_text_2 = svd1.transform(tfidf.transform(data['text_2_norm']))

X_svd1 = [X_text_1, X_text_2]

In [18]:
final_X_svd = [X_svd, X_svd1]

#### NMF через CountVectorizer

In [19]:
nmf = NMF(100)
nmf.fit(X)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=100, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [20]:
X_text_1 = nmf.transform(cv.transform(data['text_1_norm']))
X_text_2  = nmf.transform(cv.transform(data['text_2_norm']))

X_nmf = [X_text_1, X_text_2]

#### NMF через TfidfVectorizer

In [21]:
nmf1 = NMF(100)
nmf1.fit(X1)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=100, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [22]:
X_text_1 = nmf1.transform(tfidf.transform(data['text_1_norm']))
X_text_2  = nmf1.transform(tfidf.transform(data['text_2_norm']))

X_nmf1 = [X_text_1, X_text_2]

#### Word2Vec

In [24]:
w2v = gensim.models.Word2Vec([text.split() for text in data_rt['content_norm']], size=50, sg=1)

In [25]:
def get_embedding(text, model, dim):
    text = text.split()
    
    # чтобы не доставать одно слово несколько раз
    # сделаем счетчик, а потом векторы домножим на частоту
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    
    for i,word in enumerate(words):
        try:
            v = model[word]
            vectors[i] = v*(words[word]/total) # просто умножаем вектор на частоту
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

In [26]:
dim = 50
X_text_1_w2v = np.zeros((len(data['text_1_norm']), dim))
X_text_2_w2v = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_w2v[i] = get_embedding(text, w2v, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_w2v[i] = get_embedding(text, w2v, dim)

  if sys.path[0] == '':


#### FastText с нормализацией

In [None]:
fast_text = gensim.models.FastText([text.split() for text in data_rt['content_norm']], size=50, min_n=4, max_n=8)

In [None]:
dim = 50
X_text_1_ft = np.zeros((len(data['text_1_norm']), dim))
X_text_2_ft = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_ft[i] = get_embedding(text, fast_text, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_ft[i] = get_embedding(text, fast_text, dim)

X_text_ft = [X_text_1_ft, X_text_2_ft]

In [29]:
vectors = [X_text_1_w2v, X_text_2_w2v, X_nmf, X_nmf1, X_svd, X_svd1]

In [31]:
results = {}
for index, pair in enumerate(vectors):
    x = pair[0]
    y = pair[1]
    res = []
    
    for i in range(len(x)):
        ans = cosine_distances(x[i].reshape(1, -1), y[i].reshape(1, -1))[0]
        res.append(ans[0])
    results[index] = res

In [33]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer