In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

import pandas as pd

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arthursaprykin/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [168]:
# pages_data = pd.read_feather('data/link_page_sentences.ftr')
pages_data = pd.read_csv('data/link_page_sentences.csv')

pages_data.dropna(inplace=True)
# pages_data.drop('index', axis=1, inplace=True)
pages_data.set_index(['link', 'sentence_order'], inplace=True)

pages_data.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,sentence_text
link,sentence_order,Unnamed: 2_level_1
data/raw_pages/www.kinopoisk.ru/media/article/4005174.html,0,6 главный аниме-режиссёр xxi век — статья на к...
data/raw_pages/www.kinopoisk.ru/media/article/4005174.html,1,6 главный аниме-режиссёр xxi век вчера обсудит...
data/raw_pages/www.kinopoisk.ru/media/article/4005174.html,2,масаак юас сатося кон исао такахат мамор хосод...
data/raw_pages/www.kinopoisk.ru/media/article/4005174.html,3,буйный визуальность он работа быть слишком рад...
data/raw_pages/www.kinopoisk.ru/media/article/4005174.html,4,зато « игра разум » заметить масао маруям соуч...


In [29]:
stopwords.words('russian')[:10]

['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со']

In [169]:
pages_data.shape

(2908, 1)

In [175]:
# vectorizer = TfidfVectorizer(stop_words=stopwords.words('russian'), min_df=0.01)
vectorizer = TfidfVectorizer(stop_words=stopwords.words('russian'))
page_indices_array = vectorizer.fit_transform(pages_data['sentence_text']).toarray()
columns = list(vectorizer.vocabulary_.keys())

page_indices = pd.DataFrame(page_indices_array, columns=columns)
page_indices.index = pages_data.index
page_indices.shape

(2834, 8242)

In [176]:
page_indices.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,главный,аниме,режиссёр,xxi,век,статья,кинопоиск,вчера,обсудить,рассказывать,...,098,169,снг,нишевый,клаустрофобы,my,little,pony,65,клаустрофоб
link,sentence_order,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
data/raw_pages/www.kinopoisk.ru/media/article/4005174.html,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data/raw_pages/www.kinopoisk.ru/media/article/4005174.html,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data/raw_pages/www.kinopoisk.ru/media/article/4005174.html,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data/raw_pages/www.kinopoisk.ru/media/article/4005174.html,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data/raw_pages/www.kinopoisk.ru/media/article/4005174.html,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [186]:
ranker = NearestNeighbors(n_neighbors=100, metric='cosine', n_jobs=-1)
ranker.fit(page_indices)

NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=100)

In [179]:
pages_data.values[0][0]

'6 главный аниме-режиссёр xxi век — статья на кинопоиск'

In [78]:
import re
import pymorphy2

from string import punctuation

MORPH = pymorphy2.MorphAnalyzer()

def preprocess_query(query):
    query = query.lower()
    query = re.sub('\s\s+', ' ', query)
    
    wrds = []
    for wrd in nltk.word_tokenize(query):
        if wrd in punctuation:
            continue

        wrd = MORPH.parse(wrd)[0].normal_form
        wrds.append(wrd)

    clear_query = ' '.join(wrds)
    
    return clear_query

In [183]:
query = 'Портрет Незнакомца фильм 2021'
query

'Портрет Незнакомца фильм 2021'

In [181]:
preprocess_query(query)

'портрет незнакомец фильм 2021'

In [187]:
query = preprocess_query(query)
query_vect = vectorizer.transform([query]).toarray()

scores, indices = ranker.kneighbors(query_vect)
scores = scores[0]
indices = indices[0]

In [188]:
scores, indices

(array([0.55114236, 0.6087677 , 0.66223688, 0.67534476, 0.68018343,
        0.71729737, 0.72039753, 0.72890622, 0.7294319 , 0.7294319 ,
        0.7294319 , 0.7294319 , 0.74771221, 0.75404744, 0.78704213,
        0.79424969, 0.79596282, 0.82291179, 0.82338844, 0.82454984,
        0.84528956, 0.85170747, 0.85484129, 0.85618358, 0.8581341 ,
        0.86111766, 0.86111766, 0.86214161, 0.86457662, 0.87018639,
        0.88135155, 0.8835661 , 0.88879483, 0.88892178, 0.89065605,
        0.8917871 , 0.89438294, 0.89470846, 0.89476364, 0.89599814,
        0.89675175, 0.89926407, 0.89988494, 0.90036317, 0.90136784,
        0.90209388, 0.90525519, 0.90534801, 0.90691443, 0.90762005,
        0.91497822, 0.91509751, 0.91557447, 0.91656697, 0.91662917,
        0.91691866, 0.91745842, 0.92032152, 0.92202593, 0.92262168,
        0.92284892, 0.92333823, 0.9236229 , 0.92528187, 0.92639604,
        0.92818243, 0.92892482, 0.928946  , 0.92917703, 0.93013304,
        0.9304157 , 0.93167885, 0.93217449, 0.93

In [189]:
page_indices.iloc[indices]

Unnamed: 0_level_0,Unnamed: 1_level_0,главный,аниме,режиссёр,xxi,век,статья,кинопоиск,вчера,обсудить,рассказывать,...,098,169,снг,нишевый,клаустрофобы,my,little,pony,65,клаустрофоб
link,sentence_order,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
data/raw_pages/www.kinopoisk.ru/media/article/4005172.html,16,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
data/raw_pages/www.kinopoisk.ru/media/article/4005166.html,29,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
data/raw_pages/www.kinopoisk.ru/media/article/4005166.html,0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
data/raw_pages/www.kinopoisk.ru/media/article/4005143.html,58,0.0,0.0,0.0,0.115926,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
data/raw_pages/www.kinopoisk.ru/media/article/4005151.html,38,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
data/raw_pages/www.kinopoisk.ru/media/article/4005145.html,20,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
data/raw_pages/www.kinopoisk.ru/media/article/4005143.html,31,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
data/raw_pages/www.kinopoisk.ru/media/news/4005175.html,7,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.580006,0.0,0.0,0.0,0.0,0.0
data/raw_pages/www.kinopoisk.ru/media/article/4005144.html,2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [190]:
candidate_links = page_indices.iloc[indices].reset_index()['link']
score_df = pd.DataFrame({'candidate_link': candidate_links, 'score': scores})

score_df.head(5)

Unnamed: 0,candidate_link,score
0,data/raw_pages/www.kinopoisk.ru/media/article/...,0.551142
1,data/raw_pages/www.kinopoisk.ru/media/article/...,0.608768
2,data/raw_pages/www.kinopoisk.ru/media/article/...,0.662237
3,data/raw_pages/www.kinopoisk.ru/media/article/...,0.675345
4,data/raw_pages/www.kinopoisk.ru/media/article/...,0.680183


In [191]:
# group_score_df.groupby('candidate_link').mean()['score']
# group_score_df = score_df.groupby('candidate_link').min()['score']
group_score_df = score_df.groupby('candidate_link').sum()['score']
group_score_df = group_score_df.sort_values()

best_link = group_score_df.index[0]
best_link

'data/raw_pages/www.kinopoisk.ru/media/news/4005182.html'

In [194]:
group_score_df[:10].index.tolist()

['data/raw_pages/www.kinopoisk.ru/media/news/4005182.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005146.html',
 'data/raw_pages/www.kinopoisk.ru/media/podcast/4005149.html',
 'data/raw_pages/www.kinopoisk.ru/media/podcast/4005142.html',
 'data/raw_pages/www.kinopoisk.ru/media/podcast/4005170.html',
 'data/raw_pages/www.kinopoisk.ru/media/news/4005167.html',
 'data/raw_pages/www.kinopoisk.ru/media/news/4005165.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005163.html',
 'data/raw_pages/www.kinopoisk.ru/media/news/4005152.html',
 'data/raw_pages/www.kinopoisk.ru/media/news/4005141.html']

In [195]:
pca = PCA(n_components=60)
pca_indices_array = pca.fit_transform(page_indices_array)

In [196]:
pca_indices_array[0]

array([-0.05579107,  0.1586528 , -0.01501149,  0.00971648, -0.19270814,
       -0.00236334, -0.00385838,  0.00043815, -0.00587431, -0.00112893,
        0.01634169, -0.0040918 , -0.00641354,  0.34722111, -0.18581852,
       -0.12000913, -0.02351873,  0.00618232, -0.04716522, -0.07707103,
        0.02372438, -0.04838603,  0.11851169,  0.03676219, -0.05218381,
       -0.07471299, -0.04536047, -0.02678036, -0.07408447,  0.05724542,
       -0.10681919, -0.03867108, -0.00966327,  0.01251088,  0.01289695,
        0.00796495, -0.03075216, -0.01331153,  0.0511176 ,  0.08935133,
        0.08766186,  0.02746931, -0.12428874, -0.04355384, -0.15640179,
       -0.06213313, -0.04822399, -0.01082842, -0.03598109,  0.09678597,
        0.09044876,  0.00270292,  0.07310857, -0.09294209, -0.04216126,
        0.00539889, -0.0186913 ,  0.02545263,  0.09132821, -0.00232028])

In [197]:
ranker_pca = NearestNeighbors(n_neighbors=100, metric='cosine', n_jobs=-1)
ranker_pca.fit(pca_indices_array)

NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=100)

In [198]:
pca.transform(vectorizer.transform([query]).toarray())

array([[-0.02415816,  0.00539845, -0.00164377, -0.04353249, -0.02265387,
        -0.02742472, -0.02959118,  0.10359807,  0.02063569, -0.01424193,
        -0.00133396, -0.01001319, -0.01460749, -0.05448649, -0.09091566,
         0.07798178, -0.18978552, -0.04791192,  0.07965657,  0.04470414,
        -0.09977531,  0.06621161, -0.03243371, -0.03850579, -0.04771212,
        -0.00086189,  0.01986298,  0.0126633 ,  0.00068622,  0.01777741,
        -0.03537532,  0.01233005, -0.02939948,  0.0236982 ,  0.0082996 ,
        -0.03837016,  0.05957157, -0.0093931 , -0.00761916, -0.00930393,
         0.02818713,  0.03667668, -0.01009846, -0.02856856, -0.02133234,
        -0.01499202, -0.0225727 ,  0.01102077,  0.01761455, -0.02967358,
        -0.01425896, -0.03415633, -0.04580957, -0.00603988,  0.03197917,
        -0.03548049,  0.01761347, -0.01312308,  0.05699182,  0.05302916]])

In [218]:
def search_page(query, vectorizer, pca, ranker, page_count=10, score_type='sum'):
    query = preprocess_query(query)
    query_vect = vectorizer.transform([query]).toarray()
    query_vect = pca.transform(query_vect)
    
    scores, indices = ranker.kneighbors(query_vect)
    scores = scores[0]
    indices = indices[0]
    
    candidate_links = page_indices.iloc[indices].reset_index()['link']
    score_df = pd.DataFrame({'candidate_link': candidate_links, 'score': scores})
    
    if score_type == 'mean':
        group_score_df = group_score_df.groupby('candidate_link').mean()['score']
        group_score_df = group_score_df.sort_values(ascending=True)
    elif score_type == 'min':
        group_score_df = score_df.groupby('candidate_link').min()['score']
        group_score_df = group_score_df.sort_values(ascending=True)
    else:
        score_df['score'] = 1 - score_df['score']
        group_score_df = score_df.groupby('candidate_link').sum()['score']
        group_score_df = group_score_df.sort_values(ascending=False)
    
    return group_score_df[:page_count].index.tolist()

In [222]:
search_page('кинотавр фотографии', vectorizer, pca, ranker_pca, page_count=15, score_type='sum')

['data/raw_pages/www.kinopoisk.ru/media/article/4005154.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005172.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005168.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005155.html',
 'data/raw_pages/www.kinopoisk.ru/media/news/4005167.html',
 'data/raw_pages/www.kinopoisk.ru/media/news/4005165.html',
 'data/raw_pages/www.kinopoisk.ru/media/podcast/4005170.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005145.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005143.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005151.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005166.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005156.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005144.html',
 'data/raw_pages/www.kinopoisk.ru/media/news/4005162.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005160.html']

In [223]:
import pickle

with open('search_model', 'wb') as f:
    pickle.dump({'vectorizer': vectorizer, 'pca': pca, 'ranker': ranker_pca}, f)

In [224]:
with open('search_model', 'rb') as f:
    search_model = pickle.load(f)
    
search_model

{'vectorizer': TfidfVectorizer(stop_words=['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с',
                             'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его',
                             'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы',
                             'по', 'только', 'ее', 'мне', ...]),
 'pca': PCA(n_components=60),
 'ranker': NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=100)}

In [225]:
vectorizer = search_model['vectorizer']
pca = search_model['pca']
ranker_pca = search_model['ranker']

vectorizer, pca, ranker_pca

(TfidfVectorizer(stop_words=['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с',
                             'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его',
                             'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы',
                             'по', 'только', 'ее', 'мне', ...]),
 PCA(n_components=60),
 NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=100))

In [226]:
search_page('дюна 2021', vectorizer, pca, ranker_pca, page_count=15, score_type='sum')

['data/raw_pages/www.kinopoisk.ru/media/article/4005136.html',
 'data/raw_pages/www.kinopoisk.ru/media/news/4005148.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005147.html',
 'data/raw_pages/www.kinopoisk.ru/media/news/4005175.html',
 'data/raw_pages/www.kinopoisk.ru/media/video/4005135.html',
 'data/raw_pages/www.kinopoisk.ru/media/podcast/4005142.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005158.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005145.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005154.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005168.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005171.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005178.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005155.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005144.html',
 'data/raw_pages/www.kinopoisk.ru/media/article/4005166.html']