In [214]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import jaccard
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing

In [205]:
def compute_jackard(feature):
    idx = feature.shape[0]
    dist_mean = np.ones((idx))
    dist_median = np.ones((idx))
    dist_min = np.ones((idx))
    dist_max = np.ones((idx))
    for i in range(idx):    
        dist_mat = np.array([jaccard(feature[i], feature[j]) for j in range(idx) if j != i])
        dist_mat[np.isnan(dist_mat)] = 1.0
        dist_mean[i] = dist_mat.mean()
        dist_median[i] = np.median(dist_mat)
        dist_min[i] = dist_mat.min()
        dist_max[i] = dist_mat.max()
    return dist_mean, dist_median, dist_min, dist_max

In [228]:
train_data = pd.read_csv('titles_and_lemmatized_titles_with_nltk_and_maru_train.csv', encoding='utf-8')
train_data.fillna('', inplace=True)
test_data = pd.read_csv('titles_and_lemmatized_titles_with_nltk_and_maru_test.csv', encoding='utf-8')
test_data.fillna('', inplace=True)

test_data.head()

Unnamed: 0,doc_id,pair_id,group_id,title,lemmatized_titles_with_nltk_and_maru
0,6710,11691,130,КАК ПРОПИСАТЬ АДМИНКУ В КС 1.6 СЕБЕ ИЛИ ДРУГУ ...,как прописать админка в кс 1.6 себя или друг y...
1,4030,11692,130,Скачать: SGL-RP доработка | Слив мода [MySQL] ...,скачать sgl-rp доработка слива мода mysql rp r...
2,5561,11693,130,Как прописать админку в кс 1.6 - Counter-Strik...,как прописать админка кс 1.6 counter-strike ка...
3,4055,11694,130,Как прописать простую админку в кс 1 6,как прописать простой админка кс 1 6
4,4247,11695,130,Подбор админов для сервера по КОД_4 [Архив] ...,подбор админ сервер код_4 архив форум ozone


In [207]:
# train_data[train_data.doc_id == 2455]

In [209]:
last_group = train_data.group_id.iloc[-1]

f1_mean = np.array([])
f1_med = np.array([])
f1_min = np.array([])
f1_max = np.array([])
f2_mean = np.array([])
f2_med = np.array([])
f2_min = np.array([])
f2_max = np.array([])
f3_mean = np.array([])
f3_med = np.array([])
f3_min = np.array([])
f3_max = np.array([])

f1_lem_mean = np.array([])
f1_lem_med = np.array([])
f1_lem_min = np.array([])
f1_lem_max = np.array([])
f2_lem_mean = np.array([])
f2_lem_med = np.array([])
f2_lem_min = np.array([])
f2_lem_max = np.array([])
f3_lem_mean = np.array([])
f3_lem_med = np.array([])
f3_lem_min = np.array([])
f3_lem_max = np.array([])

for group in range(1, last_group + 1):
    mask_gr = train_data.group_id == group
    t_gr = train_data.title[mask_gr]
    t_lem_gr = train_data.lemmatized_titles_with_nltk_and_maru[mask_gr]

    cv_word_1_1 = CountVectorizer()
    cv_word_1_1.fit(t_gr)
    feat_1 = cv_word_1_1.transform(t_gr).toarray()
    mean_1, med_1, min_1, max_1 = compute_jackard(feat_1)
    
    cv_lem_word_1_1 = CountVectorizer()
    cv_lem_word_1_1.fit(t_lem_gr)
    feat_lem_1 = cv_lem_word_1_1.transform(t_lem_gr).toarray()
    mean_lem_1, med_lem_1, min_lem_1, max_lem_1 = compute_jackard(feat_lem_1)
    
    cv_word_2_2 = CountVectorizer(analyzer='word', ngram_range=(2,2))
    cv_word_2_2.fit(t_gr)
    feat_2 = cv_word_2_2.transform(t_gr).toarray()
    mean_2, med_2, min_2, max_2 = compute_jackard(feat_2)
    
    cv_lem_word_2_2 = CountVectorizer()
    cv_lem_word_2_2.fit(t_lem_gr)
    feat_lem_2 = cv_lem_word_2_2.transform(t_lem_gr).toarray()
    mean_lem_2, med_lem_2, min_lem_2, max_lem_2 = compute_jackard(feat_lem_2)
    
    cv_char_3_4 = CountVectorizer(analyzer='char', ngram_range=(3,4))
    cv_char_3_4.fit(t_gr)
    feat_3 = cv_char_3_4.transform(t_gr).toarray()
    mean_3, med_3, min_3, max_3 = compute_jackard(feat_3)
        
    cv_lem_char_3_4 = CountVectorizer()
    cv_lem_char_3_4.fit(t_lem_gr)
    feat_lem_3 = cv_lem_char_3_4.transform(t_lem_gr).toarray()
    mean_lem_3, med_lem_3, min_lem_3, max_lem_3 = compute_jackard(feat_lem_3)
    
    f1_mean = np.concatenate((f1_mean, mean_1))
    f1_med = np.concatenate((f1_med, med_1))
    f1_min = np.concatenate((f1_min, min_1))
    f1_max = np.concatenate((f1_max, max_1))
    f2_mean = np.concatenate((f2_mean, mean_2))
    f2_med = np.concatenate((f2_med, med_2))
    f2_min = np.concatenate((f2_min, min_2))
    f2_max = np.concatenate((f2_max, max_2))
    f3_mean = np.concatenate((f3_mean, mean_3))
    f3_med = np.concatenate((f3_med, med_3))
    f3_min = np.concatenate((f3_min, min_3))
    f3_max = np.concatenate((f3_max, max_3))
    
    f1_lem_mean = np.concatenate((f1_lem_mean, mean_lem_1))
    f1_lem_med = np.concatenate((f1_lem_med, med_lem_1))
    f1_lem_min = np.concatenate((f1_lem_min, min_lem_1))
    f1_lem_max = np.concatenate((f1_lem_max, max_lem_1))
    f2_lem_mean = np.concatenate((f2_lem_mean, mean_lem_2))
    f2_lem_med = np.concatenate((f2_lem_med, med_lem_2))
    f2_lem_min = np.concatenate((f2_lem_min, min_lem_2))
    f2_lem_max = np.concatenate((f2_lem_max, max_lem_2))
    f3_lem_mean = np.concatenate((f3_lem_mean, mean_lem_3))
    f3_lem_med = np.concatenate((f3_lem_med, med_lem_3))
    f3_lem_min = np.concatenate((f3_lem_min, min_lem_3))
    f3_lem_max = np.concatenate((f3_lem_max, max_lem_3))

feat_to_df = {'mean_word_1_1': f1_mean, 'median_word_1_1': f1_med,
              'min_word_1_1': f1_min, 'max_word_1_1': f1_max,
              'mean_word_2_2': f2_mean, 'median_word_2_2': f2_med,
              'min_word_2_2': f2_min, 'max_word_2_2': f2_max,
              'mean_char_3_4': f3_mean, 'median_char_3_4': f3_med,
              'min_char_3_4': f3_min, 'max_char_3_4': f3_max}
df_features = pd.DataFrame(feat_to_df)

feat_to_df = {'mean_word_1_1': f1_lem_mean, 'median_word_1_1': f1_lem_med,
              'min_word_1_1': f1_lem_min, 'max_word_1_1': f1_lem_max,
              'mean_word_2_2': f2_lem_mean, 'median_word_2_2': f2_lem_med,
              'min_word_2_2': f2_lem_min, 'max_word_2_2': f2_lem_max,
              'mean_char_3_4': f3_lem_mean, 'median_char_3_4': f3_lem_med,
              'min_char_3_4': f3_lem_min, 'max_char_3_4': f3_lem_max}
df_lem_features = pd.DataFrame(feat_to_df)

print(df_lem_features.shape)

(11690, 12)


In [210]:
df_lem_features.head()

Unnamed: 0,mean_word_1_1,median_word_1_1,min_word_1_1,max_word_1_1,mean_word_2_2,median_word_2_2,min_word_2_2,max_word_2_2,mean_char_3_4,median_char_3_4,min_char_3_4,max_char_3_4
0,0.90238,0.9375,0.142857,1.0,0.90238,0.9375,0.142857,1.0,0.90238,0.9375,0.142857,1.0
1,0.941398,0.964286,0.533333,1.0,0.941398,0.964286,0.533333,1.0,0.941398,0.964286,0.533333,1.0
2,0.970934,1.0,0.85,1.0,0.970934,1.0,0.85,1.0,0.970934,1.0,0.85,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.930193,1.0,0.5,1.0,0.930193,1.0,0.5,1.0,0.930193,1.0,0.5,1.0


In [215]:
df_features.isnull().sum()

mean_word_1_1      0
median_word_1_1    0
min_word_1_1       0
max_word_1_1       0
mean_word_2_2      0
median_word_2_2    0
min_word_2_2       0
max_word_2_2       0
mean_char_3_4      0
median_char_3_4    0
min_char_3_4       0
max_char_3_4       0
dtype: int64

In [216]:
with open('jackard_titles_train_not_nan.csv', mode='w', encoding='utf-8') as f_csv:
#     df_features.fillna(1.0, inplace=True)
    df_features.to_csv(f_csv)

In [223]:
# normalized = preprocessing.normalize(df_features)
# normalized

In [222]:
# normalized.sum(axis=1)

In [None]:
# with open('jackard_titles_train_not_nan_normalized.csv', mode='w', encoding='utf-8') as f_csv:
# #     df_features.fillna(1.0, inplace=True)
#     df_features.to_csv(f_csv)

In [224]:
df_lem_features.isnull().sum()

mean_word_1_1      0
median_word_1_1    0
min_word_1_1       0
max_word_1_1       0
mean_word_2_2      0
median_word_2_2    0
min_word_2_2       0
max_word_2_2       0
mean_char_3_4      0
median_char_3_4    0
min_char_3_4       0
max_char_3_4       0
dtype: int64

In [225]:
with open('jackard_titles_lemmatization_train_not_nan.csv', mode='w', encoding='utf-8') as f_csv:
#     df_lem_features.fillna(1.0, inplace=True)
    df_lem_features.to_csv(f_csv)

In [None]:
# normalized__lem = preprocessing.normalize(df_lem_features)

In [226]:
# df_lem_features[df_lem_features.mean_word_2_2.isnull()]

In [229]:
first_group = test_data.group_id.iloc[0]
last_group = test_data.group_id.iloc[-1]

f1_mean = np.array([])
f1_med = np.array([])
f1_min = np.array([])
f1_max = np.array([])
f2_mean = np.array([])
f2_med = np.array([])
f2_min = np.array([])
f2_max = np.array([])
f3_mean = np.array([])
f3_med = np.array([])
f3_min = np.array([])
f3_max = np.array([])

f1_lem_mean = np.array([])
f1_lem_med = np.array([])
f1_lem_min = np.array([])
f1_lem_max = np.array([])
f2_lem_mean = np.array([])
f2_lem_med = np.array([])
f2_lem_min = np.array([])
f2_lem_max = np.array([])
f3_lem_mean = np.array([])
f3_lem_med = np.array([])
f3_lem_min = np.array([])
f3_lem_max = np.array([])

for group in range(first_group, last_group + 1):
    mask_gr = test_data.group_id == group
    t_gr = test_data.title[mask_gr]
    t_lem_gr = test_data.lemmatized_titles_with_nltk_and_maru[mask_gr]

    cv_word_1_1 = CountVectorizer()
    cv_word_1_1.fit(t_gr)
    feat_1 = cv_word_1_1.transform(t_gr).toarray()
    mean_1, med_1, min_1, max_1 = compute_jackard(feat_1)
    
    cv_lem_word_1_1 = CountVectorizer()
    cv_lem_word_1_1.fit(t_lem_gr)
    feat_lem_1 = cv_lem_word_1_1.transform(t_lem_gr).toarray()
    mean_lem_1, med_lem_1, min_lem_1, max_lem_1 = compute_jackard(feat_lem_1)
    
    cv_word_2_2 = CountVectorizer(analyzer='word', ngram_range=(2,2))
    cv_word_2_2.fit(t_gr)
    feat_2 = cv_word_2_2.transform(t_gr).toarray()
    mean_2, med_2, min_2, max_2 = compute_jackard(feat_2)
    
    cv_lem_word_2_2 = CountVectorizer()
    cv_lem_word_2_2.fit(t_lem_gr)
    feat_lem_2 = cv_lem_word_2_2.transform(t_lem_gr).toarray()
    mean_lem_2, med_lem_2, min_lem_2, max_lem_2 = compute_jackard(feat_lem_2)
    
    cv_char_3_4 = CountVectorizer(analyzer='char', ngram_range=(3,4))
    cv_char_3_4.fit(t_gr)
    feat_3 = cv_char_3_4.transform(t_gr).toarray()
    mean_3, med_3, min_3, max_3 = compute_jackard(feat_3)
        
    cv_lem_char_3_4 = CountVectorizer()
    cv_lem_char_3_4.fit(t_lem_gr)
    feat_lem_3 = cv_lem_char_3_4.transform(t_lem_gr).toarray()
    mean_lem_3, med_lem_3, min_lem_3, max_lem_3 = compute_jackard(feat_lem_3)
    
    f1_mean = np.concatenate((f1_mean, mean_1))
    f1_med = np.concatenate((f1_med, med_1))
    f1_min = np.concatenate((f1_min, min_1))
    f1_max = np.concatenate((f1_max, max_1))
    f2_mean = np.concatenate((f2_mean, mean_2))
    f2_med = np.concatenate((f2_med, med_2))
    f2_min = np.concatenate((f2_min, min_2))
    f2_max = np.concatenate((f2_max, max_2))
    f3_mean = np.concatenate((f3_mean, mean_3))
    f3_med = np.concatenate((f3_med, med_3))
    f3_min = np.concatenate((f3_min, min_3))
    f3_max = np.concatenate((f3_max, max_3))
    
    f1_lem_mean = np.concatenate((f1_lem_mean, mean_lem_1))
    f1_lem_med = np.concatenate((f1_lem_med, med_lem_1))
    f1_lem_min = np.concatenate((f1_lem_min, min_lem_1))
    f1_lem_max = np.concatenate((f1_lem_max, max_lem_1))
    f2_lem_mean = np.concatenate((f2_lem_mean, mean_lem_2))
    f2_lem_med = np.concatenate((f2_lem_med, med_lem_2))
    f2_lem_min = np.concatenate((f2_lem_min, min_lem_2))
    f2_lem_max = np.concatenate((f2_lem_max, max_lem_2))
    f3_lem_mean = np.concatenate((f3_lem_mean, mean_lem_3))
    f3_lem_med = np.concatenate((f3_lem_med, med_lem_3))
    f3_lem_min = np.concatenate((f3_lem_min, min_lem_3))
    f3_lem_max = np.concatenate((f3_lem_max, max_lem_3))

feat_to_df = {'mean_word_1_1': f1_mean, 'median_word_1_1': f1_med,
              'min_word_1_1': f1_min, 'max_word_1_1': f1_max,
              'mean_word_2_2': f2_mean, 'median_word_2_2': f2_med,
              'min_word_2_2': f2_min, 'max_word_2_2': f2_max,
              'mean_char_3_4': f3_mean, 'median_char_3_4': f3_med,
              'min_char_3_4': f3_min, 'max_char_3_4': f3_max}
df_features_test = pd.DataFrame(feat_to_df)

feat_to_df = {'mean_word_1_1': f1_lem_mean, 'median_word_1_1': f1_lem_med,
              'min_word_1_1': f1_lem_min, 'max_word_1_1': f1_lem_max,
              'mean_word_2_2': f2_lem_mean, 'median_word_2_2': f2_lem_med,
              'min_word_2_2': f2_lem_min, 'max_word_2_2': f2_lem_max,
              'mean_char_3_4': f3_lem_mean, 'median_char_3_4': f3_lem_med,
              'min_char_3_4': f3_lem_min, 'max_char_3_4': f3_lem_max}
df_lem_features_test = pd.DataFrame(feat_to_df)

print(df_lem_features_test.shape)

(16627, 12)


In [230]:
df_lem_features_test.head()

Unnamed: 0,mean_word_1_1,median_word_1_1,min_word_1_1,max_word_1_1,mean_word_2_2,median_word_2_2,min_word_2_2,max_word_2_2,mean_char_3_4,median_char_3_4,min_char_3_4,max_char_3_4
0,0.938947,1.0,0.5,1.0,0.938947,1.0,0.5,1.0,0.938947,1.0,0.5,1.0
1,0.98246,1.0,0.857143,1.0,0.98246,1.0,0.857143,1.0,0.98246,1.0,0.857143,1.0
2,0.950288,1.0,0.692308,1.0,0.950288,1.0,0.692308,1.0,0.950288,1.0,0.692308,1.0
3,0.924959,1.0,0.2,1.0,0.924959,1.0,0.2,1.0,0.924959,1.0,0.2,1.0
4,0.967093,1.0,0.727273,1.0,0.967093,1.0,0.727273,1.0,0.967093,1.0,0.727273,1.0


In [231]:
df_features_test.isnull().sum()

mean_word_1_1      0
median_word_1_1    0
min_word_1_1       0
max_word_1_1       0
mean_word_2_2      0
median_word_2_2    0
min_word_2_2       0
max_word_2_2       0
mean_char_3_4      0
median_char_3_4    0
min_char_3_4       0
max_char_3_4       0
dtype: int64

In [233]:
with open('jackard_titles_test_not_nan.csv', mode='w', encoding='utf-8') as f_csv:
#     df_features_test.fillna(1.0, inplace=True)
    df_features_test.to_csv(f_csv)

In [232]:
df_lem_features_test.isnull().sum()

mean_word_1_1      0
median_word_1_1    0
min_word_1_1       0
max_word_1_1       0
mean_word_2_2      0
median_word_2_2    0
min_word_2_2       0
max_word_2_2       0
mean_char_3_4      0
median_char_3_4    0
min_char_3_4       0
max_char_3_4       0
dtype: int64

In [234]:
with open('jackard_titles_lemmatization_test_not_nan.csv', mode='w', encoding='utf-8') as f_csv:
#     df_lem_features_test.fillna(1.0, inplace=True)
    df_lem_features_test.to_csv(f_csv)