In [2]:
import pickle
import pymorphy2
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import random_projection
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_distances

morph = pymorphy2.MorphAnalyzer()
morph_dct = {}

In [3]:
def preprocess_text(text):
    text = text.lower()
    text = text.replace('\r', ' ').replace('\n', ' ')
    text  = only_valid_symb(text)
    norm_text = only_valid_forms(text)
    return norm_text
    
    
def only_valid_symb(text: str)-> str:
    valid = set('йцукенгшщзхъфывапролджэячсмитьбюё ')
    txt =  ''.join(x for x in text if x in valid)
    while '  ' in txt:
        txt = txt.replace('  ', ' ')
    return txt.strip()

def only_valid_forms(text)->str:
    filter_forms = {'ADJF', 'ADJS', 'COMP', 'NUMR', 'NPRO', 'PREP', 'CONJ', 'PRCL', 'INTJ'}
    lst = []
    for word in text.split():
        tag = morph_parse(word)
        if any([x in tag.tag for x in filter_forms]):
            continue
        lst.append(tag.normal_form)
    return ' '.join(lst)
   
def morph_parse(word:str)->str:
    if word not in morph_dct:
        tag = morph.parse(word)[0]
        morph_dct[word] = tag
    return morph_dct[word]

### Sample of data

In [None]:
# data = [{'fulltext':'FULLTEXT',
#            'title': 'TITLE', 
#            'url': 'URL'},
#           {'fulltext':'FULLTEXT',
#            'title': 'TITLE', 
#            'url': 'URL'}]

In [5]:
%%time
f  =open(_SOURCE, 'rb')
data = pickle.load(f)
_text = [preprocess_text(x['fulltext']) for x in data if x['fulltext']!='']
len(_text)

CPU times: user 6min 50s, sys: 3.19 s, total: 6min 53s
Wall time: 6min 54s


In [7]:
vectorized = TfidfVectorizer(min_df=2).fit_transform(_text)
vectorized.shape

(57133, 67288)

In [8]:
X = random_projection.GaussianRandomProjection(n_components=100).fit_transform(vectorized)
__X = np.ceil(X)

In [9]:
dist = pairwise_distances(__X, metric='cosine')
np.fill_diagonal(dist, 1)

In [None]:
pairs = set()
for x, y in np.argwhere(dist<0.15):
    pairs.add(tuple(sorted([x, y])))

In [None]:
print(pairs)