In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Load the train and test title tables. Every missing cell is replaced by an
# empty string so that later string operations on the title columns never
# meet a NaN.
train_data = pd.read_csv(
    'titles_and_lemmatized_titles_with_nltk_and_maru_train.csv',
    encoding='utf-8',
).fillna('')
test_data = pd.read_csv(
    'titles_and_lemmatized_titles_with_nltk_and_maru_test.csv',
    encoding='utf-8',
).fillna('')

train_data.head()

Unnamed: 0,doc_id,pair_id,group_id,target,title,lemmatized_titles_with_nltk_and_maru
0,15731,1,1,0,ВАЗ 21213 | Замена подшипников ступицы | Нива,ваза 21213 замена подшипник ступица нива
1,14829,2,1,0,"Ваз 2107 оптом в Сочи. Сравнить цены, купить ...",ваза 2107 оптом сочи сравнить цена купить потр...
2,15764,3,1,0,Купить ступица Лада калина2. Трансмиссия - пер...,купить ступица лад калина2 трансмиссия переход...
3,17669,4,1,0,Классика 21010 - 21074,классика 21010 21074
4,14852,5,1,0,Ступица Нива — замена подшипника своими руками,ступица нива замена подшипник свой рука


In [52]:
# Per-group "top word" count features for the training titles.
#
# For every group_id:
#   1. fit a TF-IDF model on that group's lemmatized titles;
#   2. keep the (up to) `max_words` terms with the highest mean TF-IDF weight;
#   3. count how often each of those terms occurs in every title of the group.
# Column j therefore holds "occurrences of the group's j-th ranked term", so
# the same column refers to a different word in each group.
max_words = 15
title_col = 'lemmatized_titles_with_nltk_and_maru'

group_frames = []
# Iterate over the group ids that are actually present instead of
# range(1, last_id + 1): a gap in the ids would hand TfidfVectorizer an empty
# corpus and abort the whole run.
for group, documents in train_data.groupby('group_id', sort=False)[title_col]:
    vect = TfidfVectorizer()
    try:
        tfidf_matrix = vect.fit_transform(documents)
    except ValueError as err:
        # Raised when no title of the group yields a single token (e.g. all
        # titles are empty). Such a group simply gets all-zero features;
        # any other ValueError is a real problem and is re-raised.
        if 'empty vocabulary' not in str(err):
            raise
        voc = []
    else:
        # Terms in column order, taken from the fitted vocabulary. This works
        # on every scikit-learn version (get_feature_names() was removed in
        # 1.2, get_feature_names_out() only exists from 1.0 on).
        feature_names = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
        df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
        voc = df.mean(axis=0).sort_values(ascending=False)[:max_words].index

    # Groups with fewer than `max_words` distinct terms keep zeros in the
    # trailing columns.
    intersect_w = np.zeros((len(documents), max_words))
    for i, title in enumerate(documents):
        words = title.strip().split()
        for j, word in enumerate(voc):
            intersect_w[i, j] = words.count(word)

    # Keep the source row labels so the blocks can be put back in input order.
    group_frames.append(pd.DataFrame(intersect_w, index=documents.index))

if group_frames:
    # Single concat instead of DataFrame.append in a loop (quadratic, and
    # removed in pandas 2.0). Rows are restored to the order of train_data so
    # the matrix lines up positionally with the source rows; for input that is
    # already sorted by group_id this equals the group-by-group order.
    tf_feat = pd.concat(group_frames).loc[train_data.index]
else:
    tf_feat = pd.DataFrame(columns=np.arange(max_words))

print(tf_feat.shape)

(11690, 15)


In [53]:
# Persist the training features without the index column.
# The path is passed straight to pandas so that it opens the file itself with
# newline=''; writing through a text-mode handle opened without newline=''
# produces doubled line endings on Windows.
tf_feat.to_csv('Parse_done/tfidf_train_titles_ALL.csv', index=False, encoding='utf-8')

In [54]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the
# training count features.
tf_feat.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,11690.0,11690.0,11690.0,11690.0,11690.0,11690.0,11690.0,11690.0,11690.0,11690.0,11690.0,11690.0,11690.0,11690.0,11690.0
mean,0.472113,0.340633,0.257827,0.207271,0.165184,0.1284,0.113259,0.100257,0.09059,0.078015,0.071685,0.067579,0.066125,0.05834,0.054662
std,0.750818,0.630559,0.473519,0.427554,0.455387,0.396351,0.373432,0.338392,0.371942,0.294363,0.269016,0.286374,0.357973,0.252325,0.237273
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,16.0,15.0,5.0,3.0,11.0,15.0,15.0,9.0,15.0,6.0,3.0,10.0,15.0,4.0,3.0


In [55]:
# Per-group "top word" count features for the test titles.
#
# For every group_id:
#   1. fit a TF-IDF model on that group's lemmatized titles;
#   2. keep the (up to) `max_words` terms with the highest mean TF-IDF weight;
#   3. count how often each of those terms occurs in every title of the group.
# Column j therefore holds "occurrences of the group's j-th ranked term", so
# the same column refers to a different word in each group.
max_words = 15
title_col = 'lemmatized_titles_with_nltk_and_maru'

group_frames_test = []
# Iterate over the group ids that are actually present instead of
# range(first_id, last_id + 1): a gap in the ids would hand TfidfVectorizer an
# empty corpus and abort the whole run.
for group, documents in test_data.groupby('group_id', sort=False)[title_col]:
    vect = TfidfVectorizer()
    try:
        tfidf_matrix = vect.fit_transform(documents)
    except ValueError as err:
        # Raised when no title of the group yields a single token (e.g. all
        # titles are empty). Such a group simply gets all-zero features;
        # any other ValueError is a real problem and is re-raised.
        if 'empty vocabulary' not in str(err):
            raise
        voc = []
    else:
        # Terms in column order, taken from the fitted vocabulary. This works
        # on every scikit-learn version (get_feature_names() was removed in
        # 1.2, get_feature_names_out() only exists from 1.0 on).
        feature_names = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
        df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
        voc = df.mean(axis=0).sort_values(ascending=False)[:max_words].index

    # Groups with fewer than `max_words` distinct terms keep zeros in the
    # trailing columns.
    intersect_w = np.zeros((len(documents), max_words))
    for i, title in enumerate(documents):
        words = title.strip().split()
        for j, word in enumerate(voc):
            intersect_w[i, j] = words.count(word)

    # Keep the source row labels so the blocks can be put back in input order.
    group_frames_test.append(pd.DataFrame(intersect_w, index=documents.index))

if group_frames_test:
    # Single concat instead of DataFrame.append in a loop (quadratic, and
    # removed in pandas 2.0). Rows are restored to the order of test_data so
    # the matrix lines up positionally with the source rows; for input that is
    # already sorted by group_id this equals the group-by-group order.
    tf_feat_test = pd.concat(group_frames_test).loc[test_data.index]
else:
    tf_feat_test = pd.DataFrame(columns=np.arange(max_words))

print(tf_feat_test.shape)

(16627, 15)


In [56]:
# Persist the test features without the index column.
# The path is passed straight to pandas so that it opens the file itself with
# newline=''; writing through a text-mode handle opened without newline=''
# produces doubled line endings on Windows.
tf_feat_test.to_csv('Parse_done/tfidf_test_titles_ALL.csv', index=False, encoding='utf-8')

In [57]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the
# test count features.
tf_feat_test.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0,16627.0
mean,0.430625,0.326758,0.252962,0.203224,0.16786,0.142479,0.115655,0.096349,0.093583,0.081614,0.072773,0.066157,0.065857,0.063451,0.057978
std,0.670238,0.664425,0.506308,0.464829,0.409976,0.435098,0.343049,0.338376,0.380758,0.330517,0.328664,0.261994,0.323959,0.300973,0.257231
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,15.0,16.0,14.0,13.0,7.0,15.0,7.0,14.0,15.0,13.0,13.0,4.0,15.0,11.0,6.0
