In [1]:
import numpy as np
import pandas as pd

import sys
import os
# Put the parent of the working directory on sys.path so the project-local
# `src` package imported below can be resolved.
sys.path.append(os.path.abspath('..'))

from src.feature_generation import FeatureGenerator

from catboost import CatBoostClassifier
from sklearn.metrics import ndcg_score, roc_auc_score
from sklearn.model_selection import train_test_split

from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds

# Show up to 50 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 50)

In [2]:
# Train/test frames and the submission template.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        r'../data/sample_submission.csv',
    )
)

# Side tables: members and songs.
members, songs = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/members.csv', '../data/songs.csv')
)
# song_extra_info = pd.read_csv('../data/song_extra_info.csv')

## Предобработка
В FeatureGenerator создаются следующие переменные:
1) Target-encoding переменные
2) User- и item-фичи, полученные через SVD-разложение user-item матрицы
3) Условные вероятности, например вероятность прослушивания пользователем жанра

In [3]:
# Build the project-local feature generator from the loaded train and songs frames.
feature_generator = FeatureGenerator(train, songs)

# NOTE(review): `train` and `test` are rebound to their transformed versions here,
# so re-running this cell without first re-running the loading cell feeds the
# already-transformed frames back into FeatureGenerator.
train = feature_generator.fit_transform(train)
test = feature_generator.transform(test)
# Preview the first two rows of the transformed training frame.
train.head(2)

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,lyricist,language,target_artist,target_composer,target_lyricist,source_system_tab_probability,source_screen_name_probability,source_type_probability,genre_ids_probability,artist_name_probability,composer_probability,lyricist_probability,language_probability,source_system_tab_probability_song,source_screen_name_probability_song,...,15_x,16_x,17_x,18_x,19_x,0_y,1_y,2_y,3_y,4_y,5_y,6_y,7_y,8_y,9_y,10_y,11_y,12_y,13_y,14_y,15_y,16_y,17_y,18_y,19_y
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,206471.0,359,Bastille,Dan Smith| Mark Crew,,52.0,0.463158,0.49499,,0.161132,0.122301,0.2593,0.029577,0.000544,0.000363,0.0,0.530938,0.018605,0.009302,...,6.562958,-5.22406,9.178658,-3.28815,23.121849,0.002763,-0.003956,-0.003529,-0.006336,-0.003363,0.002412,-0.002755,-0.003791,-0.003431,0.007523,-0.011094,-0.004392,-0.001807,0.004514,-0.003239,0.003841,-0.001198,0.004246,-0.004608,0.002201
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,284584.0,1259,Various Artists,,,52.0,0.509851,,,0.885852,0.885852,0.21865,0.067524,0.038585,0.0,0.0,0.18328,1.0,1.0,...,-1.152435,0.115976,4.304136,3.096813,9.598484,1e-05,8.3e-05,-9e-06,2e-06,-4e-05,-4.3e-05,0.000107,4.3e-05,-2.4e-05,-2.6e-05,-1.2e-05,-4.3e-05,1.4e-05,-8.2e-05,2.4e-05,-2.1e-05,2e-06,4.5e-05,2.5e-05,1.1e-05


In [4]:
# Columns passed to CatBoost as categorical features.
cat_features = ['source_system_tab', 'source_screen_name', 'source_type', 'language']

# SVD factor columns: '0_x' ... '19_x' followed by '0_y' ... '19_y'.
svd_features = [
    f'{component}_{suffix}'
    for suffix in ('x', 'y')
    for component in range(20)
]

# Numeric features, then the SVD factors, then the categorical columns.
train_features = [
    'song_length',
    'target_artist',
    'target_composer',
    'target_lyricist',
    'source_system_tab_probability',
    'source_screen_name_probability',
    'source_type_probability',
    'genre_ids_probability',
    'artist_name_probability',
    'composer_probability',
    'lyricist_probability',
    'language_probability',
    'source_system_tab_probability_song',
    'source_screen_name_probability_song',
    'source_type_probability_song',
]
train_features += svd_features + cat_features

# Apply the same categorical clean-up to both frames: stringify `language`,
# then replace remaining missing categorical values with the literal 'nan'.
for frame in (train, test):
    frame['language'] = frame['language'].map(str)
    frame[cat_features] = frame[cat_features].fillna('nan')

## Training

In [5]:
# Hold out 20% of the rows for validation with a fixed seed.
# NOTE(review): the feature generator was fitted on the full `train` frame before
# this split, so target-derived features may have seen validation rows and the
# validation metrics could be optimistic - confirm.
train_set, val_set = train_test_split(
    train,
    test_size=0.2,
    random_state=36,
)

In [6]:
# Configure the classifier: 700 trees, fixed seed, silent training.
model = CatBoostClassifier(n_estimators=700, random_seed=12, verbose=0)

# Fit on the training split, marking which columns are categorical.
model.fit(
    train_set[train_features],
    train_set['target'],
    cat_features=cat_features,
)

In [7]:
# ROC-AUC on the hold-out split.
# `assign` returns a new frame rather than writing a column into the slice
# produced by train_test_split, which avoids pandas' SettingWithCopyWarning
# and keeps this cell safe to re-run.
val_set = val_set.assign(
    predict=model.predict_proba(val_set[train_features])[:, 1]
)
roc_auc_score(val_set['target'], val_set['predict'])

0.7936462311252559

In [8]:
# nDCG
score = val_set.groupby('msno').apply(
    lambda x: (float('nan') if len(x) < 2 else
               ndcg_score(x['target'].values.reshape(1, -1),
                          x['predict'].values.reshape(1, -1)))
).mean()
score

0.7823310604084982

In [9]:
# Positive-class probability for every test row.
# NOTE(review): assumes `test` rows are in the same order as `sample_submission` - confirm.
test_probabilities = model.predict_proba(test[train_features])
sample_submission['target'] = test_probabilities[:, 1]

In [10]:
def save(df, pattern='../predictions/pred{}.csv', start_index=1):
    """Write `df` to the first unused numbered path and return that path.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write as CSV (without the index).
    pattern : str
        Path template with one `{}` placeholder for the running index.
    start_index : int
        First index to try; it is incremented until the formatted path
        does not exist yet.

    Returns
    -------
    str
        The path the file was written to.
    """
    index = start_index
    while os.path.exists(pattern.format(index)):
        index += 1
    path = pattern.format(index)

    # Create the target directory on first use so to_csv does not fail
    # when the predictions folder has not been created yet.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    df.to_csv(path, index=False)
    return path
# Persist the submission; the returned file path is shown as the cell output.
save(sample_submission)

'../predictions/pred8.csv'

## Public score: 0.65