In [188]:
import numpy as np
import pandas as pd
import os
import codecs
from bs4 import BeautifulSoup as BS
import trafilatura
from tqdm.notebook import tqdm
import warnings
import pickle
from scipy.sparse import hstack, vstack, csr_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GroupKFold
from catboost import CatBoostClassifier
import lightgbm
import xgboost
from nltk.corpus import stopwords

warnings.filterwarnings('ignore')

# Идея 1.
Добавить фич с помощью контента веб-страниц в .dat-файлах. Распарсим htmlки и нормализуем слова.

Будем сохранять данные в процессе, так как ноутбук может вылетать.

In [2]:
def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

Распарсим текст в HTML-ках, чтобы потом получить оттуда фичи.

In [3]:
datapath = os.path.join("..", "project_data")
test_data = pd.read_csv(os.path.join(datapath, "data", "test_groups.csv"), dtype=np.int16)
train_data = pd.read_csv(os.path.join(datapath, 'data', 'train_groups.csv'), dtype=np.int16)

def parse_features(data, savename): # test or train
    description = {}
    keywords = {}
    texts = {}

    contentpath = os.path.join(datapath, 'content')
    for filename in tqdm(os.listdir(contentpath)):
    #for filename in ['10.dat']:
        doc_id = int(filename.strip('.dat'))
        if doc_id not in data.doc_id.values:
            continue
        try:
            with codecs.open(os.path.join(contentpath, filename), 'r', encoding='utf-8') as f:
                url = f.readline().strip()
                html = f.read()
                #print(html)
                bs = BS(html, 'html.parser')
                meta = bs.find('meta', {'name': 'description'})
                if meta:
                    description[doc_id] = meta.get('content', '')
                meta = bs.find('meta', {'name': 'keywords'})
                if meta:
                    keywords[doc_id] = meta.get('content', '')
                text = trafilatura.extract(html, include_comments=False)
                if text:
                    text = text.replace('\n', ' ')
                    text = text.replace('\t', ' ')
                    text = text.replace('\r', ' ')
                    texts[doc_id] = text
        except UnicodeDecodeError:
            print(f'got unicode error with utf-8 in {filename}')
        except Exception as e:
            print(f'{e} in {filename}')

    data['description'] = data['doc_id'].map(description)
    data['keywords'] = data['doc_id'].map(keywords)
    data['texts'] = data['doc_id'].map(texts)
    
    save_obj(data, savename)

parse_features(train_data, 'train_data')
train_data.head()

  0%|          | 0/28024 [00:00<?, ?it/s]

Unnamed: 0,pair_id,group_id,doc_id,target,description,keywords,texts
0,1,1,15731,0,"Замена подшипников ступицы, руководство по рем...","Замена подшипников ступицы, техническое описан...","Сняв рычаг с болтов, отводим его от поворотног..."
1,2,1,14829,0,"Ваз 2107 оптом. Продажа, поиск, поставщики и м...","ваз 2107 оптом, Россия, Сочи, цены, предложени...",при заказе от 5 шт. при заказе от 10 шт. при з...
2,3,1,15764,0,Продажа запчастей ступица для легковых и грузо...,"Цена, замена, тюнинг, купить, продать, ступица...",|Ступица задняя левая ВАЗ ЛАДА Калина 2 С ABS|...
3,4,1,17669,0,,,|Номенклатура ||21010 ||Цена | |00000094000000...
4,5,1,14852,0,Передняя ступица Нива имеет свои особенности в...,,Ступица Нива — как провести ремонт и замену по...


И добавим заголовки к документам.

In [4]:
def add_titles(data):  # test or train
    doc_to_title = {}
    with open(os.path.join(datapath, 'data', 'docs_titles.tsv'), encoding='UTF-8') as f:
        for num_line, line in enumerate(f):
            if num_line == 0:
                continue
            titles = line.strip().split('\t', 1)
            doc_id = int(titles[0])
            if len(titles) == 1:
                title = ''
            else:
                title = titles[1]
            doc_to_title[doc_id] = title
    data['titles'] = data['doc_id'].map(doc_to_title)

add_titles(train_data)

In [5]:
train_data.head()

Unnamed: 0,pair_id,group_id,doc_id,target,description,keywords,texts,titles
0,1,1,15731,0,"Замена подшипников ступицы, руководство по рем...","Замена подшипников ступицы, техническое описан...","Сняв рычаг с болтов, отводим его от поворотног...",ВАЗ 21213 | Замена подшипников ступицы | Нива
1,2,1,14829,0,"Ваз 2107 оптом. Продажа, поиск, поставщики и м...","ваз 2107 оптом, Россия, Сочи, цены, предложени...",при заказе от 5 шт. при заказе от 10 шт. при з...,"Ваз 2107 оптом в Сочи. Сравнить цены, купить п..."
2,3,1,15764,0,Продажа запчастей ступица для легковых и грузо...,"Цена, замена, тюнинг, купить, продать, ступица...",|Ступица задняя левая ВАЗ ЛАДА Калина 2 С ABS|...,Купить ступица Лада калина2. Трансмиссия - пер...
3,4,1,17669,0,,,|Номенклатура ||21010 ||Цена | |00000094000000...,Классика 21010 - 21074
4,5,1,14852,0,Передняя ступица Нива имеет свои особенности в...,,Ступица Нива — как провести ремонт и замену по...,Ступица Нива — замена подшипника своими руками


In [36]:
save_obj(train_data, 'train_data')

Уберем Nan.

In [19]:
train_data = train_data.fillna('')
test_data = test_data.fillna('')

Текст -- это хорошо, но разобьем на слова и приведем к нормальной форме.

In [21]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

def normalize_words(text):
    words = text.split()
    res = [morph.parse(word)[0].normal_form for word in words]
    return ' '.join(res)

In [28]:
def normalize_data(data): # test or train
    data['norm_title'] = data.apply(lambda row: normalize_words(row.titles), axis=1)
    data['norm_keywords'] = data.apply(lambda row: normalize_words(row.keywords), axis=1)
    data['norm_description'] = data.apply(lambda row: normalize_words(row.description), axis=1)
    data['sum_title'] = data.apply(lambda row: row.norm_title + row.norm_description + row.norm_keywords, axis=1)

normalize_data(train_data)

In [37]:
train_data.head()

Unnamed: 0,pair_id,group_id,doc_id,target,description,keywords,texts,titles,norm_title,norm_keywords,norm_description,sum_title
0,1,1,15731,0,"Замена подшипников ступицы, руководство по рем...","Замена подшипников ступицы, техническое описан...","Сняв рычаг с болтов, отводим его от поворотног...",ВАЗ 21213 | Замена подшипников ступицы | Нива,ваз 21213 | замена подшипник ступица | нива,"замена подшипник ступицы, технический описание...","замена подшипник ступицы, руководство по ремон...",ваз 21213 | замена подшипник ступица | нивазам...
1,2,1,14829,0,"Ваз 2107 оптом. Продажа, поиск, поставщики и м...","ваз 2107 оптом, Россия, Сочи, цены, предложени...",при заказе от 5 шт. при заказе от 10 шт. при з...,"Ваз 2107 оптом в Сочи. Сравнить цены, купить п...","ваз 2107 оптом в сочи. сравнить цены, купить п...","ваз 2107 оптом, россия, сочи, цены, предложени...","ваз 2107 оптом. продажа, поиск, поставщик и ма...","ваз 2107 оптом в сочи. сравнить цены, купить п..."
2,3,1,15764,0,Продажа запчастей ступица для легковых и грузо...,"Цена, замена, тюнинг, купить, продать, ступица...",|Ступица задняя левая ВАЗ ЛАДА Калина 2 С ABS|...,Купить ступица Лада калина2. Трансмиссия - пер...,купить ступица лада калина2. трансмиссия - пер...,"цена, замена, тюнинг, купить, продать, ступица...",продажа запчасть ступица для легковой и грузов...,купить ступица лада калина2. трансмиссия - пер...
3,4,1,17669,0,,,|Номенклатура ||21010 ||Цена | |00000094000000...,Классика 21010 - 21074,классика 21010 - 21074,,,классика 21010 - 21074
4,5,1,14852,0,Передняя ступица Нива имеет свои особенности в...,,Ступица Нива — как провести ремонт и замену по...,Ступица Нива — замена подшипника своими руками,ступица нива — замена подшипник свой рука,,передний ступица нива иметь свой особенность в...,ступица нива — замена подшипник свой рукаперед...


# Идея 2.

Теперь нужно на основе нормализованных слов создать фичи. Используем различные способы обработки слов в группах.

In [89]:
def top_n_words(corpus, n=10):
    vectorizer = CountVectorizer(stop_words=stopwords.words('russian')).fit(corpus)
    bag_of_words = vectorizer.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
    freq = sorted(freq, key = lambda x: x[1], reverse=True)
    return freq[:n]

In [96]:
def one_hot_groups(data):
    groups = np.arange(0, 310).reshape(-1, 1)
    onehot = OneHotEncoder()
    onehot.fit(groups)
    return onehot.transform(data.group_id.values.reshape(-1, 1))

Попытка раз: векторизуем с помощью TfidfVectorizer и берем полученные фичи.

In [182]:
def get_features(data):
    scaler = StandardScaler(with_mean=False)
    vectorizer = TfidfVectorizer(preprocessor=normalize_words, stop_words=stopwords.words('russian'), max_features=100)
    init = True
    for group in tqdm(data.group_id.unique()):
        corpus_ids = data.group_id == group
        
        # векторизуем все нормализованные столбцы, сначала фит по заголовкам
        vectorizer.fit(data.norm_title[corpus_ids])
        # затем уже векторизация и скейлинг
        vec_titles = vectorizer.transform(data.norm_title[corpus_ids])
        vec_titles = scaler.fit_transform(vec_titles)
        
        vec_texts = vectorizer.transform(data.texts[corpus_ids])
        vec_texts = scaler.fit_transform(vec_texts)
        
        vec_keywords = vectorizer.transform(data.norm_keywords[corpus_ids])
        vec_keywords = scaler.fit_transform(vec_keywords)
        
        vec_description = vectorizer.transform(data.norm_description[corpus_ids])
        vec_description = scaler.fit_transform(vec_description)
        
        vec_sumtitles = vectorizer.transform(data.sum_title[corpus_ids])
        vec_sumtitles = scaler.fit_transform(vec_sumtitles)
        
        # посчитаем косинусные расстояния между полученными векторами
        distmatr1 = pairwise_distances(vec_titles, vec_titles, metric='cosine')
        distmatr2 = pairwise_distances(vec_texts, vec_texts, metric='cosine')
        distmatr3 = pairwise_distances(vec_keywords, vec_keywords, metric='cosine')
        distmatr4 = pairwise_distances(vec_description, vec_description, metric='cosine')
        distmatr5 = pairwise_distances(vec_sumtitles, vec_sumtitles, metric='cosine')
        distmatr6 = pairwise_distances(vec_titles, vec_texts, metric='cosine')
        distmatr7 = pairwise_distances(vec_texts, vec_keywords, metric='cosine')
        distmatr8 = pairwise_distances(vec_keywords, vec_description, metric='cosine')
        distmatr9 = pairwise_distances(vec_description, vec_titles, metric='cosine')
        
        # возьмем средние
        meandist1 = distmatr1.mean(axis=1)
        meandist2 = distmatr2.mean(axis=1)
        meandist3 = distmatr3.mean(axis=1)
        meandist4 = distmatr4.mean(axis=1)
        meandist5 = distmatr5.mean(axis=1)
        meandist6 = distmatr6.mean(axis=1)
        meandist7 = distmatr7.mean(axis=1)
        meandist8 = distmatr8.mean(axis=1)
        meandist9 = distmatr9.mean(axis=1)
        
        # отсортируем
        distmatr1 = np.sort(distmatr1, axis=1)[:, 0:21]
        distmatr2 = np.sort(distmatr2, axis=1)[:, 0:21]
        distmatr3 = np.sort(distmatr3, axis=1)[:, 0:21]
        distmatr4 = np.sort(distmatr4, axis=1)[:, 0:21]
        distmatr5 = np.sort(distmatr5, axis=1)[:, 0:21]
        distmatr6 = np.sort(distmatr6, axis=1)[:, 0:21]
        distmatr7 = np.sort(distmatr7, axis=1)[:, 0:21]
        distmatr8 = np.sort(distmatr8, axis=1)[:, 0:21]
        distmatr9 = np.sort(distmatr9, axis=1)[:, 0:21]
        
        # и получим фичи
        distmatr1 = np.hstack((distmatr1, meandist1.reshape(-1, 1)))
        distmatr1 = np.hstack((distmatr1, meandist2.reshape(-1, 1)))
        distmatr1 = np.hstack((distmatr1, meandist3.reshape(-1, 1)))
        distmatr1 = np.hstack((distmatr1, meandist4.reshape(-1, 1)))
        distmatr1 = np.hstack((distmatr1, meandist5.reshape(-1, 1)))
        distmatr1 = np.hstack((distmatr1, meandist6.reshape(-1, 1)))
        distmatr1 = np.hstack((distmatr1, meandist7.reshape(-1, 1)))
        distmatr1 = np.hstack((distmatr1, meandist8.reshape(-1, 1)))
        distmatr1 = np.hstack((distmatr1, meandist9.reshape(-1, 1)))
        
        distmatr = np.hstack((distmatr1, distmatr2))
        distmatr = np.hstack((distmatr, distmatr3))
        distmatr = np.hstack((distmatr, distmatr4))
        distmatr = np.hstack((distmatr, distmatr5))
        distmatr = np.hstack((distmatr, distmatr6))
        distmatr = np.hstack((distmatr, distmatr7))
        distmatr = np.hstack((distmatr, distmatr8))
        distmatr = np.hstack((distmatr, distmatr9))

        if init:
            features = distmatr
        else:
            features = vstack((features, distmatr))
        init = False
    return features.tocsr()

In [183]:
onehotfeat = one_hot_groups(train_data)
features = get_features(train_data)
features = hstack([features, one_hot_groups(train_data)])
features = features.tocsr()
target = train_data.target.values

  0%|          | 0/129 [00:00<?, ?it/s]

In [184]:
features.shape

(11690, 508)

In [187]:
save_obj(features, 'features_test')

Посмотрим несколько разных моделей, валидируем и выберем наилучшую.

In [190]:
def get_score(X, y, model, n=3):
    groupkfold = GroupKFold(n_splits=n)
    scores = []
    for train_idx, test_idx in groupkfold.split(X, y, train_data.group_id):
        X_train, X_test, y_train, y_test = X[train_idx], X[test_idx], y[train_idx], y[test_idx]
        
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        score = f1_score(pred, y_test)
        scores.append(score)
    return np.mean(scores)

In [191]:
models = [
    xgboost.XGBClassifier(),
    RandomForestClassifier(),
    CatBoostClassifier(),
    lightgbm.LGBMClassifier(),
    GradientBoostingClassifier()
]
fin_scores = []
for model in models:
    fin_scores.append(get_score(features, target, model))

Learning rate set to 0.024758
0:	learn: 0.6724548	total: 222ms	remaining: 3m 41s
1:	learn: 0.6550013	total: 248ms	remaining: 2m 3s
2:	learn: 0.6372687	total: 273ms	remaining: 1m 30s
3:	learn: 0.6220036	total: 303ms	remaining: 1m 15s
4:	learn: 0.6074692	total: 333ms	remaining: 1m 6s
5:	learn: 0.5930925	total: 361ms	remaining: 59.8s
6:	learn: 0.5799534	total: 388ms	remaining: 55s
7:	learn: 0.5659428	total: 414ms	remaining: 51.4s
8:	learn: 0.5533097	total: 442ms	remaining: 48.6s
9:	learn: 0.5425627	total: 468ms	remaining: 46.4s
10:	learn: 0.5326271	total: 496ms	remaining: 44.6s
11:	learn: 0.5222566	total: 526ms	remaining: 43.3s
12:	learn: 0.5134102	total: 555ms	remaining: 42.1s
13:	learn: 0.5046721	total: 582ms	remaining: 41s
14:	learn: 0.4964965	total: 608ms	remaining: 40s
15:	learn: 0.4892396	total: 637ms	remaining: 39.2s
16:	learn: 0.4817222	total: 666ms	remaining: 38.5s
17:	learn: 0.4741627	total: 691ms	remaining: 37.7s
18:	learn: 0.4681816	total: 725ms	remaining: 37.4s
19:	learn: 0.4

144:	learn: 0.3043499	total: 4.25s	remaining: 25.1s
145:	learn: 0.3039805	total: 4.28s	remaining: 25.1s
146:	learn: 0.3036872	total: 4.32s	remaining: 25s
147:	learn: 0.3033460	total: 4.35s	remaining: 25s
148:	learn: 0.3028872	total: 4.38s	remaining: 25s
149:	learn: 0.3025579	total: 4.41s	remaining: 25s
150:	learn: 0.3021986	total: 4.44s	remaining: 25s
151:	learn: 0.3019325	total: 4.47s	remaining: 25s
152:	learn: 0.3015117	total: 4.5s	remaining: 24.9s
153:	learn: 0.3011721	total: 4.54s	remaining: 24.9s
154:	learn: 0.3008142	total: 4.57s	remaining: 24.9s
155:	learn: 0.3005142	total: 4.59s	remaining: 24.9s
156:	learn: 0.3001293	total: 4.63s	remaining: 24.8s
157:	learn: 0.2999221	total: 4.65s	remaining: 24.8s
158:	learn: 0.2996691	total: 4.68s	remaining: 24.8s
159:	learn: 0.2993055	total: 4.71s	remaining: 24.7s
160:	learn: 0.2989788	total: 4.74s	remaining: 24.7s
161:	learn: 0.2987731	total: 4.77s	remaining: 24.7s
162:	learn: 0.2984737	total: 4.8s	remaining: 24.6s
163:	learn: 0.2981338	tota

307:	learn: 0.2621035	total: 9.28s	remaining: 20.8s
308:	learn: 0.2618201	total: 9.31s	remaining: 20.8s
309:	learn: 0.2615818	total: 9.34s	remaining: 20.8s
310:	learn: 0.2613537	total: 9.37s	remaining: 20.7s
311:	learn: 0.2612195	total: 9.39s	remaining: 20.7s
312:	learn: 0.2609716	total: 9.43s	remaining: 20.7s
313:	learn: 0.2607419	total: 9.46s	remaining: 20.7s
314:	learn: 0.2604163	total: 9.49s	remaining: 20.6s
315:	learn: 0.2603107	total: 9.52s	remaining: 20.6s
316:	learn: 0.2601518	total: 9.54s	remaining: 20.6s
317:	learn: 0.2599241	total: 9.57s	remaining: 20.5s
318:	learn: 0.2597300	total: 9.6s	remaining: 20.5s
319:	learn: 0.2596035	total: 9.63s	remaining: 20.5s
320:	learn: 0.2593939	total: 9.67s	remaining: 20.5s
321:	learn: 0.2591299	total: 9.71s	remaining: 20.4s
322:	learn: 0.2588609	total: 9.75s	remaining: 20.4s
323:	learn: 0.2586734	total: 9.79s	remaining: 20.4s
324:	learn: 0.2584521	total: 9.82s	remaining: 20.4s
325:	learn: 0.2582123	total: 9.85s	remaining: 20.4s
326:	learn: 0

469:	learn: 0.2258863	total: 14.3s	remaining: 16.2s
470:	learn: 0.2256620	total: 14.4s	remaining: 16.1s
471:	learn: 0.2254833	total: 14.4s	remaining: 16.1s
472:	learn: 0.2252912	total: 14.4s	remaining: 16.1s
473:	learn: 0.2250566	total: 14.5s	remaining: 16.1s
474:	learn: 0.2248898	total: 14.5s	remaining: 16s
475:	learn: 0.2246621	total: 14.5s	remaining: 16s
476:	learn: 0.2244287	total: 14.6s	remaining: 16s
477:	learn: 0.2242310	total: 14.6s	remaining: 16s
478:	learn: 0.2239833	total: 14.6s	remaining: 15.9s
479:	learn: 0.2237705	total: 14.7s	remaining: 15.9s
480:	learn: 0.2235352	total: 14.7s	remaining: 15.9s
481:	learn: 0.2233183	total: 14.7s	remaining: 15.8s
482:	learn: 0.2230957	total: 14.8s	remaining: 15.8s
483:	learn: 0.2228451	total: 14.8s	remaining: 15.8s
484:	learn: 0.2226635	total: 14.8s	remaining: 15.7s
485:	learn: 0.2224560	total: 14.9s	remaining: 15.7s
486:	learn: 0.2222500	total: 14.9s	remaining: 15.7s
487:	learn: 0.2220299	total: 14.9s	remaining: 15.6s
488:	learn: 0.221865

633:	learn: 0.1948118	total: 19.5s	remaining: 11.3s
634:	learn: 0.1946471	total: 19.6s	remaining: 11.3s
635:	learn: 0.1944593	total: 19.6s	remaining: 11.2s
636:	learn: 0.1942866	total: 19.6s	remaining: 11.2s
637:	learn: 0.1941411	total: 19.7s	remaining: 11.2s
638:	learn: 0.1940094	total: 19.7s	remaining: 11.1s
639:	learn: 0.1938468	total: 19.7s	remaining: 11.1s
640:	learn: 0.1936610	total: 19.7s	remaining: 11.1s
641:	learn: 0.1934839	total: 19.8s	remaining: 11s
642:	learn: 0.1933259	total: 19.8s	remaining: 11s
643:	learn: 0.1931321	total: 19.8s	remaining: 11s
644:	learn: 0.1929765	total: 19.9s	remaining: 10.9s
645:	learn: 0.1928170	total: 19.9s	remaining: 10.9s
646:	learn: 0.1926221	total: 19.9s	remaining: 10.9s
647:	learn: 0.1924987	total: 20s	remaining: 10.8s
648:	learn: 0.1922969	total: 20s	remaining: 10.8s
649:	learn: 0.1921586	total: 20s	remaining: 10.8s
650:	learn: 0.1919672	total: 20.1s	remaining: 10.8s
651:	learn: 0.1918542	total: 20.1s	remaining: 10.7s
652:	learn: 0.1916992	to

795:	learn: 0.1710940	total: 24.7s	remaining: 6.34s
796:	learn: 0.1709802	total: 24.8s	remaining: 6.31s
797:	learn: 0.1708780	total: 24.8s	remaining: 6.28s
798:	learn: 0.1707291	total: 24.8s	remaining: 6.25s
799:	learn: 0.1706257	total: 24.9s	remaining: 6.22s
800:	learn: 0.1705307	total: 24.9s	remaining: 6.19s
801:	learn: 0.1703930	total: 24.9s	remaining: 6.16s
802:	learn: 0.1702484	total: 25s	remaining: 6.13s
803:	learn: 0.1701534	total: 25s	remaining: 6.09s
804:	learn: 0.1700429	total: 25s	remaining: 6.06s
805:	learn: 0.1699337	total: 25.1s	remaining: 6.03s
806:	learn: 0.1698150	total: 25.1s	remaining: 6s
807:	learn: 0.1696846	total: 25.1s	remaining: 5.97s
808:	learn: 0.1695484	total: 25.2s	remaining: 5.94s
809:	learn: 0.1694126	total: 25.2s	remaining: 5.91s
810:	learn: 0.1693000	total: 25.2s	remaining: 5.88s
811:	learn: 0.1691723	total: 25.3s	remaining: 5.85s
812:	learn: 0.1690106	total: 25.3s	remaining: 5.82s
813:	learn: 0.1689091	total: 25.3s	remaining: 5.79s
814:	learn: 0.1687665

958:	learn: 0.1513736	total: 30s	remaining: 1.28s
959:	learn: 0.1512729	total: 30s	remaining: 1.25s
960:	learn: 0.1511563	total: 30s	remaining: 1.22s
961:	learn: 0.1510643	total: 30.1s	remaining: 1.19s
962:	learn: 0.1509245	total: 30.1s	remaining: 1.16s
963:	learn: 0.1507855	total: 30.1s	remaining: 1.13s
964:	learn: 0.1506513	total: 30.2s	remaining: 1.09s
965:	learn: 0.1505232	total: 30.2s	remaining: 1.06s
966:	learn: 0.1504270	total: 30.2s	remaining: 1.03s
967:	learn: 0.1502904	total: 30.3s	remaining: 1s
968:	learn: 0.1501929	total: 30.3s	remaining: 969ms
969:	learn: 0.1500791	total: 30.3s	remaining: 938ms
970:	learn: 0.1499590	total: 30.4s	remaining: 907ms
971:	learn: 0.1498236	total: 30.4s	remaining: 876ms
972:	learn: 0.1497085	total: 30.4s	remaining: 844ms
973:	learn: 0.1496315	total: 30.5s	remaining: 813ms
974:	learn: 0.1495448	total: 30.5s	remaining: 782ms
975:	learn: 0.1494282	total: 30.5s	remaining: 750ms
976:	learn: 0.1493709	total: 30.5s	remaining: 719ms
977:	learn: 0.1492429

119:	learn: 0.3481563	total: 3.88s	remaining: 28.4s
120:	learn: 0.3477904	total: 3.9s	remaining: 28.4s
121:	learn: 0.3473896	total: 3.94s	remaining: 28.3s
122:	learn: 0.3469857	total: 3.97s	remaining: 28.3s
123:	learn: 0.3466139	total: 4s	remaining: 28.2s
124:	learn: 0.3461172	total: 4.03s	remaining: 28.2s
125:	learn: 0.3456164	total: 4.06s	remaining: 28.2s
126:	learn: 0.3452307	total: 4.09s	remaining: 28.1s
127:	learn: 0.3449282	total: 4.12s	remaining: 28.1s
128:	learn: 0.3445933	total: 4.16s	remaining: 28.1s
129:	learn: 0.3442867	total: 4.19s	remaining: 28s
130:	learn: 0.3439734	total: 4.22s	remaining: 28s
131:	learn: 0.3436362	total: 4.25s	remaining: 28s
132:	learn: 0.3432966	total: 4.29s	remaining: 27.9s
133:	learn: 0.3429592	total: 4.32s	remaining: 27.9s
134:	learn: 0.3424922	total: 4.36s	remaining: 27.9s
135:	learn: 0.3419667	total: 4.39s	remaining: 27.9s
136:	learn: 0.3415862	total: 4.42s	remaining: 27.9s
137:	learn: 0.3412967	total: 4.45s	remaining: 27.8s
138:	learn: 0.3408092	

282:	learn: 0.2989001	total: 9.1s	remaining: 23.1s
283:	learn: 0.2986921	total: 9.13s	remaining: 23s
284:	learn: 0.2984047	total: 9.17s	remaining: 23s
285:	learn: 0.2982402	total: 9.21s	remaining: 23s
286:	learn: 0.2980269	total: 9.25s	remaining: 23s
287:	learn: 0.2978614	total: 9.29s	remaining: 23s
288:	learn: 0.2976634	total: 9.32s	remaining: 22.9s
289:	learn: 0.2974353	total: 9.35s	remaining: 22.9s
290:	learn: 0.2972613	total: 9.38s	remaining: 22.9s
291:	learn: 0.2970424	total: 9.41s	remaining: 22.8s
292:	learn: 0.2968098	total: 9.45s	remaining: 22.8s
293:	learn: 0.2964391	total: 9.48s	remaining: 22.8s
294:	learn: 0.2961581	total: 9.51s	remaining: 22.7s
295:	learn: 0.2958528	total: 9.54s	remaining: 22.7s
296:	learn: 0.2956173	total: 9.57s	remaining: 22.7s
297:	learn: 0.2954006	total: 9.6s	remaining: 22.6s
298:	learn: 0.2951652	total: 9.63s	remaining: 22.6s
299:	learn: 0.2949332	total: 9.66s	remaining: 22.5s
300:	learn: 0.2946877	total: 9.69s	remaining: 22.5s
301:	learn: 0.2945183	to

443:	learn: 0.2603612	total: 14.3s	remaining: 17.9s
444:	learn: 0.2600617	total: 14.4s	remaining: 17.9s
445:	learn: 0.2598059	total: 14.4s	remaining: 17.9s
446:	learn: 0.2595665	total: 14.4s	remaining: 17.9s
447:	learn: 0.2593293	total: 14.5s	remaining: 17.8s
448:	learn: 0.2590744	total: 14.5s	remaining: 17.8s
449:	learn: 0.2589024	total: 14.5s	remaining: 17.8s
450:	learn: 0.2586448	total: 14.6s	remaining: 17.7s
451:	learn: 0.2584126	total: 14.6s	remaining: 17.7s
452:	learn: 0.2581642	total: 14.6s	remaining: 17.6s
453:	learn: 0.2579659	total: 14.6s	remaining: 17.6s
454:	learn: 0.2576750	total: 14.7s	remaining: 17.6s
455:	learn: 0.2574689	total: 14.7s	remaining: 17.5s
456:	learn: 0.2572057	total: 14.7s	remaining: 17.5s
457:	learn: 0.2569087	total: 14.8s	remaining: 17.5s
458:	learn: 0.2567279	total: 14.8s	remaining: 17.5s
459:	learn: 0.2565249	total: 14.8s	remaining: 17.4s
460:	learn: 0.2562775	total: 14.9s	remaining: 17.4s
461:	learn: 0.2559927	total: 14.9s	remaining: 17.4s
462:	learn: 

607:	learn: 0.2247784	total: 19.6s	remaining: 12.6s
608:	learn: 0.2245979	total: 19.6s	remaining: 12.6s
609:	learn: 0.2244321	total: 19.7s	remaining: 12.6s
610:	learn: 0.2242537	total: 19.7s	remaining: 12.5s
611:	learn: 0.2240356	total: 19.7s	remaining: 12.5s
612:	learn: 0.2238260	total: 19.7s	remaining: 12.5s
613:	learn: 0.2236625	total: 19.8s	remaining: 12.4s
614:	learn: 0.2234838	total: 19.8s	remaining: 12.4s
615:	learn: 0.2233423	total: 19.8s	remaining: 12.4s
616:	learn: 0.2231263	total: 19.9s	remaining: 12.3s
617:	learn: 0.2229482	total: 19.9s	remaining: 12.3s
618:	learn: 0.2228077	total: 19.9s	remaining: 12.3s
619:	learn: 0.2226029	total: 20s	remaining: 12.2s
620:	learn: 0.2224686	total: 20s	remaining: 12.2s
621:	learn: 0.2222768	total: 20s	remaining: 12.2s
622:	learn: 0.2221041	total: 20.1s	remaining: 12.1s
623:	learn: 0.2219383	total: 20.1s	remaining: 12.1s
624:	learn: 0.2217174	total: 20.1s	remaining: 12.1s
625:	learn: 0.2215325	total: 20.2s	remaining: 12s
626:	learn: 0.221404

770:	learn: 0.1975041	total: 25.1s	remaining: 7.46s
771:	learn: 0.1973742	total: 25.2s	remaining: 7.43s
772:	learn: 0.1971740	total: 25.2s	remaining: 7.4s
773:	learn: 0.1970144	total: 25.2s	remaining: 7.37s
774:	learn: 0.1968588	total: 25.3s	remaining: 7.34s
775:	learn: 0.1967100	total: 25.3s	remaining: 7.3s
776:	learn: 0.1965762	total: 25.3s	remaining: 7.27s
777:	learn: 0.1964293	total: 25.4s	remaining: 7.24s
778:	learn: 0.1962673	total: 25.4s	remaining: 7.21s
779:	learn: 0.1961212	total: 25.4s	remaining: 7.18s
780:	learn: 0.1959998	total: 25.5s	remaining: 7.15s
781:	learn: 0.1958738	total: 25.5s	remaining: 7.11s
782:	learn: 0.1957555	total: 25.6s	remaining: 7.08s
783:	learn: 0.1955830	total: 25.6s	remaining: 7.05s
784:	learn: 0.1954260	total: 25.6s	remaining: 7.02s
785:	learn: 0.1952860	total: 25.7s	remaining: 6.99s
786:	learn: 0.1951771	total: 25.7s	remaining: 6.95s
787:	learn: 0.1950045	total: 25.7s	remaining: 6.92s
788:	learn: 0.1948951	total: 25.8s	remaining: 6.89s
789:	learn: 0.

934:	learn: 0.1751737	total: 30.7s	remaining: 2.13s
935:	learn: 0.1750677	total: 30.7s	remaining: 2.1s
936:	learn: 0.1749132	total: 30.8s	remaining: 2.07s
937:	learn: 0.1747816	total: 30.8s	remaining: 2.04s
938:	learn: 0.1746446	total: 30.8s	remaining: 2s
939:	learn: 0.1744945	total: 30.9s	remaining: 1.97s
940:	learn: 0.1743647	total: 30.9s	remaining: 1.94s
941:	learn: 0.1742477	total: 30.9s	remaining: 1.9s
942:	learn: 0.1741094	total: 31s	remaining: 1.87s
943:	learn: 0.1739899	total: 31s	remaining: 1.84s
944:	learn: 0.1738444	total: 31s	remaining: 1.8s
945:	learn: 0.1737045	total: 31.1s	remaining: 1.77s
946:	learn: 0.1735389	total: 31.1s	remaining: 1.74s
947:	learn: 0.1734219	total: 31.1s	remaining: 1.71s
948:	learn: 0.1733229	total: 31.2s	remaining: 1.67s
949:	learn: 0.1732161	total: 31.2s	remaining: 1.64s
950:	learn: 0.1730724	total: 31.2s	remaining: 1.61s
951:	learn: 0.1729452	total: 31.3s	remaining: 1.58s
952:	learn: 0.1728046	total: 31.3s	remaining: 1.54s
953:	learn: 0.1726828	to

99:	learn: 0.3646196	total: 3.3s	remaining: 29.7s
100:	learn: 0.3641404	total: 3.33s	remaining: 29.6s
101:	learn: 0.3633725	total: 3.36s	remaining: 29.6s
102:	learn: 0.3628971	total: 3.39s	remaining: 29.6s
103:	learn: 0.3622656	total: 3.43s	remaining: 29.6s
104:	learn: 0.3616901	total: 3.47s	remaining: 29.6s
105:	learn: 0.3610271	total: 3.51s	remaining: 29.6s
106:	learn: 0.3605217	total: 3.55s	remaining: 29.6s
107:	learn: 0.3600918	total: 3.59s	remaining: 29.7s
108:	learn: 0.3595530	total: 3.63s	remaining: 29.7s
109:	learn: 0.3591618	total: 3.68s	remaining: 29.8s
110:	learn: 0.3586912	total: 3.72s	remaining: 29.8s
111:	learn: 0.3581188	total: 3.76s	remaining: 29.8s
112:	learn: 0.3577321	total: 3.8s	remaining: 29.8s
113:	learn: 0.3571871	total: 3.84s	remaining: 29.8s
114:	learn: 0.3566314	total: 3.88s	remaining: 29.8s
115:	learn: 0.3562153	total: 3.92s	remaining: 29.8s
116:	learn: 0.3559343	total: 3.95s	remaining: 29.8s
117:	learn: 0.3553805	total: 3.99s	remaining: 29.8s
118:	learn: 0.3

261:	learn: 0.3069864	total: 8.76s	remaining: 24.7s
262:	learn: 0.3067610	total: 8.79s	remaining: 24.6s
263:	learn: 0.3064469	total: 8.82s	remaining: 24.6s
264:	learn: 0.3062495	total: 8.85s	remaining: 24.5s
265:	learn: 0.3060454	total: 8.88s	remaining: 24.5s
266:	learn: 0.3057964	total: 8.91s	remaining: 24.5s
267:	learn: 0.3055672	total: 8.94s	remaining: 24.4s
268:	learn: 0.3053390	total: 8.97s	remaining: 24.4s
269:	learn: 0.3050969	total: 9s	remaining: 24.3s
270:	learn: 0.3048374	total: 9.03s	remaining: 24.3s
271:	learn: 0.3046283	total: 9.06s	remaining: 24.2s
272:	learn: 0.3042835	total: 9.09s	remaining: 24.2s
273:	learn: 0.3040993	total: 9.12s	remaining: 24.2s
274:	learn: 0.3038955	total: 9.15s	remaining: 24.1s
275:	learn: 0.3036438	total: 9.18s	remaining: 24.1s
276:	learn: 0.3034117	total: 9.22s	remaining: 24.1s
277:	learn: 0.3030823	total: 9.26s	remaining: 24s
278:	learn: 0.3028576	total: 9.29s	remaining: 24s
279:	learn: 0.3026106	total: 9.32s	remaining: 24s
280:	learn: 0.3022752

425:	learn: 0.2665184	total: 13.9s	remaining: 18.7s
426:	learn: 0.2662707	total: 13.9s	remaining: 18.7s
427:	learn: 0.2660218	total: 13.9s	remaining: 18.6s
428:	learn: 0.2657676	total: 14s	remaining: 18.6s
429:	learn: 0.2655158	total: 14s	remaining: 18.6s
430:	learn: 0.2653012	total: 14s	remaining: 18.5s
431:	learn: 0.2650681	total: 14.1s	remaining: 18.5s
432:	learn: 0.2648341	total: 14.1s	remaining: 18.5s
433:	learn: 0.2645186	total: 14.1s	remaining: 18.4s
434:	learn: 0.2642673	total: 14.2s	remaining: 18.4s
435:	learn: 0.2639809	total: 14.2s	remaining: 18.4s
436:	learn: 0.2637452	total: 14.2s	remaining: 18.3s
437:	learn: 0.2635034	total: 14.3s	remaining: 18.3s
438:	learn: 0.2632293	total: 14.3s	remaining: 18.3s
439:	learn: 0.2629625	total: 14.3s	remaining: 18.2s
440:	learn: 0.2627710	total: 14.3s	remaining: 18.2s
441:	learn: 0.2625480	total: 14.4s	remaining: 18.1s
442:	learn: 0.2622772	total: 14.4s	remaining: 18.1s
443:	learn: 0.2620158	total: 14.4s	remaining: 18.1s
444:	learn: 0.2617

589:	learn: 0.2296690	total: 18.9s	remaining: 13.1s
590:	learn: 0.2294215	total: 18.9s	remaining: 13.1s
591:	learn: 0.2292179	total: 19s	remaining: 13.1s
592:	learn: 0.2290345	total: 19s	remaining: 13s
593:	learn: 0.2288188	total: 19s	remaining: 13s
594:	learn: 0.2286518	total: 19.1s	remaining: 13s
595:	learn: 0.2284705	total: 19.1s	remaining: 12.9s
596:	learn: 0.2283067	total: 19.1s	remaining: 12.9s
597:	learn: 0.2281435	total: 19.1s	remaining: 12.9s
598:	learn: 0.2279523	total: 19.2s	remaining: 12.8s
599:	learn: 0.2277731	total: 19.2s	remaining: 12.8s
600:	learn: 0.2276099	total: 19.2s	remaining: 12.8s
601:	learn: 0.2274016	total: 19.3s	remaining: 12.7s
602:	learn: 0.2271965	total: 19.3s	remaining: 12.7s
603:	learn: 0.2269952	total: 19.3s	remaining: 12.7s
604:	learn: 0.2267557	total: 19.4s	remaining: 12.6s
605:	learn: 0.2266190	total: 19.4s	remaining: 12.6s
606:	learn: 0.2264387	total: 19.4s	remaining: 12.6s
607:	learn: 0.2262021	total: 19.5s	remaining: 12.5s
608:	learn: 0.2260150	to

751:	learn: 0.2016279	total: 23.9s	remaining: 7.89s
752:	learn: 0.2014544	total: 23.9s	remaining: 7.85s
753:	learn: 0.2013161	total: 24s	remaining: 7.82s
754:	learn: 0.2011478	total: 24s	remaining: 7.79s
755:	learn: 0.2010072	total: 24s	remaining: 7.75s
756:	learn: 0.2008631	total: 24.1s	remaining: 7.72s
757:	learn: 0.2006964	total: 24.1s	remaining: 7.69s
758:	learn: 0.2005608	total: 24.1s	remaining: 7.66s
759:	learn: 0.2004104	total: 24.1s	remaining: 7.63s
760:	learn: 0.2002571	total: 24.2s	remaining: 7.59s
761:	learn: 0.2000781	total: 24.2s	remaining: 7.56s
762:	learn: 0.1998926	total: 24.2s	remaining: 7.53s
763:	learn: 0.1997281	total: 24.3s	remaining: 7.5s
764:	learn: 0.1995598	total: 24.3s	remaining: 7.46s
765:	learn: 0.1994288	total: 24.3s	remaining: 7.43s
766:	learn: 0.1993299	total: 24.4s	remaining: 7.4s
767:	learn: 0.1992302	total: 24.4s	remaining: 7.37s
768:	learn: 0.1990847	total: 24.4s	remaining: 7.34s
769:	learn: 0.1989470	total: 24.5s	remaining: 7.3s
770:	learn: 0.1987882

910:	learn: 0.1795172	total: 28.9s	remaining: 2.83s
911:	learn: 0.1794204	total: 29s	remaining: 2.8s
912:	learn: 0.1793024	total: 29s	remaining: 2.76s
913:	learn: 0.1791558	total: 29.1s	remaining: 2.73s
914:	learn: 0.1790674	total: 29.1s	remaining: 2.7s
915:	learn: 0.1789114	total: 29.1s	remaining: 2.67s
916:	learn: 0.1787637	total: 29.1s	remaining: 2.64s
917:	learn: 0.1786796	total: 29.2s	remaining: 2.61s
918:	learn: 0.1785459	total: 29.2s	remaining: 2.58s
919:	learn: 0.1784204	total: 29.3s	remaining: 2.54s
920:	learn: 0.1782737	total: 29.3s	remaining: 2.51s
921:	learn: 0.1781205	total: 29.3s	remaining: 2.48s
922:	learn: 0.1779820	total: 29.4s	remaining: 2.45s
923:	learn: 0.1778691	total: 29.4s	remaining: 2.42s
924:	learn: 0.1777443	total: 29.4s	remaining: 2.38s
925:	learn: 0.1776061	total: 29.5s	remaining: 2.35s
926:	learn: 0.1774256	total: 29.5s	remaining: 2.32s
927:	learn: 0.1772436	total: 29.5s	remaining: 2.29s
928:	learn: 0.1771635	total: 29.5s	remaining: 2.26s
929:	learn: 0.1770

In [192]:
fin_scores

[0.6763546255963542,
 0.6678023693602192,
 0.6751688248064891,
 0.684149937155015,
 0.6589346920617668]

И, наконец, сабмит.

In [199]:
model = lightgbm.LGBMClassifier()
model.fit(features, target)

save_obj(model, 'lgbmclf')

In [195]:
onehottest = one_hot_groups(test_data)
feattest = get_features(test_data)
feattest = hstack([feattest, onehottest])
feattest = feattest.tocsr()

pred = model.predict(feattest)
pred.astype(int)
pred.shape

  0%|          | 0/180 [00:00<?, ?it/s]

(16627,)

In [200]:
pred = model.predict(feattest)
pred.astype(int)
pred.shape

(16627,)

In [197]:
save_obj(feattest, 'feature_test')

In [201]:
test_data['target'] = pred
test_data.to_csv('../project_data/predictions/sub1.csv', index=False, columns=['pair_id', 'target'])