In [1]:
import numpy as np
import pandas as pd
import os

from catboost import CatBoostClassifier
from sklearn.metrics import ndcg_score, roc_auc_score
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', 50)

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
sample_submission = pd.read_csv(r'../data/sample_submission.csv')

members = pd.read_csv('../data/members.csv')
songs = pd.read_csv('../data/songs.csv')
song_extra_info = pd.read_csv('../data/song_extra_info.csv')

## Предобработка

In [1]:
class FeatureGenerator:
    def __init__(self, train):
        self.train = train.copy()
        
        self.artist_target = None
        self.composer_target = None
        self.lyricist_target = None
        
        self.total_counts = None
        self.genre_counts = None
        self.tab_counts = None
        self.screen_counts = None
        
        self.user_features = None
        self.item_features = None
    
    def conditional_probabilities(self, predictor, condition, data):
        total_counts = data.groupby(condition).size().reset_index(name='count')
        predictor_counts = data.groupby([condition, predictor]).size().reset_index(name=f'{predictor}_count')
        predictor_counts = pd.merge(predictor_counts, total_counts, on=condition)
        predictor_counts[f'{predictor}_probability'] = predictor_counts[f'{predictor}_count'] / predictor_counts['count']
        predictor_counts = predictor_counts.drop([f'{predictor}_count', 'count'], axis=1)
        return predictor_counts
    
    def svd_features(self):
        user_item = self.train.groupby(['msno', 'song_id']).size().reset_index(name='count')
        user_item['count'] = user_item['count'].astype(float)
        
        # unique users and songs
        idx_user = user_item['msno'].unique()
        idx_item = user_item['song_id'].unique()
        
        # hash-map to indexes
        user_idx = {user_id: idx for idx, user_id in enumerate(idx_user)}
        item_idx = {item_id: idx for idx, item_id in enumerate(idx_item)}
        
        # replace ids with indexes
        user_item['msno'] = user_item['msno'].map(lambda x: user_idx[x])
        user_item['song_id'] = user_item['song_id'].map(lambda x: item_idx[x])
        
        # sparse matrix
        user_item_matrix = coo_matrix(
            (user_item['count'], (user_item['msno'], user_item['song_id']))
        )
        user_item_matrix = user_item_matrix.tocsr()
        
        # compute svd features
        u, s, vt = svds(user_item_matrix, k=20)
        user_features = np.dot(u, np.diag(s))
        item_features = vt
        
        user_features = pd.DataFrame(user_features).reset_index().rename(columns={'index': 'msno'})
        user_features['msno'] = user_features['msno'].map(lambda x: idx_user[x])
        
        item_features = pd.DataFrame(item_features).reset_index().rename(columns={'index': 'song_id'})
        item_features['song_id'] = item_features['song_id'].map(lambda x: idx_item[x])
        
        self.user_features = user_features
        self.item_features = item_features
        return
    
    def fit_transform(self, df):
        df.copy()
        df = pd.merge(df, songs, on='song_id', how='left')

        # target features
        self.artist_target = df.groupby('artist_name').agg({'target': 'mean'}) \
            .rename(columns={'target': 'target_artist'}).reset_index()
        self.composer_target = df.groupby('composer').agg({'target': 'mean'}) \
            .rename(columns={'target': 'target_composer'}).reset_index()
        self.lyricist_target = df.groupby('lyricist').agg({'target': 'mean'}) \
            .rename(columns={'target': 'target_lyricist'}).reset_index()
        
        df = pd.merge(df, self.artist_target, on='artist_name', how='left')
        df = pd.merge(df, self.composer_target, on='composer', how='left')
        df = pd.merge(df, self.lyricist_target, on='lyricist', how='left')

        # conditional probabilities features
        predictors = ['source_system_tab', 'source_screen_name', 'source_type',
              'genre_ids', 'artist_name', 'composer', 'lyricist', 'language']
        for predictor in predictors:
            predictor_counts = self.conditional_probabilities(predictor, 'msno', df)
            df = pd.merge(df, predictor_counts, on=['msno', predictor], how='left')
            df[f'{predictor}_probability'] = df[f'{predictor}_probability'].fillna(0)

        predictors = ['source_system_tab', 'source_screen_name', 'source_type']
        for predictor in predictors:
            predictor_counts = self.conditional_probabilities(predictor, 'song_id', df)
            df = pd.merge(df, predictor_counts, on=['song_id', predictor], how='left', suffixes=('', '_song'))
            df[f'{predictor}_probability_song'] = df[f'{predictor}_probability_song'].fillna(0)

        # svd features
        svd_features()
        df = pd.merge(df, self.user_features, on='msno')
        df = pd.merge(df, self.item_features, on='song_id')
        
        return df
    def transform(self, df):
        df = pd.merge(df, songs, on='song_id', how='left')
        df = pd.merge(df, self.artist_target, on='artist_name', how='left')
        df = pd.merge(df, self.composer_target, on='composer', how='left')
        df = pd.merge(df, self.lyricist_target, on='lyricist', how='left')

        predictors = ['source_system_tab', 'source_screen_name', 'source_type',
              'genre_ids', 'artist_name', 'composer', 'lyricist', 'language']
        for predictor in predictors:
            predictor_counts = self.conditional_probabilities(predictor, 'msno', pd.concat((df, self.train)))
            df = pd.merge(df, predictor_counts, on=['msno', predictor], how='left')
            df[f'{predictor}_probability'] = df[f'{predictor}_probability'].fillna(0)

        predictors = ['source_system_tab', 'source_screen_name', 'source_type']
        for predictor in predictors:
            predictor_counts = self.conditional_probabilities(predictor, 'song_id', pd.concat((df, self.train)))
            df = pd.merge(df, predictor_counts, on=['song_id', predictor], how='left', suffixes=('', '_song'))
            df[f'{predictor}_probability_song'] = df[f'{predictor}_probability_song'].fillna(0)
        return df

In [4]:
feature_generator = FeatureGenerator(train)

In [5]:
train = feature_generator.fit_transform(train)
test = feature_generator.transform(test)

In [6]:
train.head(2)

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,lyricist,language,target_artist,target_composer,target_lyricist,source_system_tab_probability,source_screen_name_probability,source_type_probability,genre_ids_probability,artist_name_probability,composer_probability,lyricist_probability,language_probability,source_system_tab_probability_song,source_screen_name_probability_song,source_type_probability_song
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,206471.0,359,Bastille,Dan Smith| Mark Crew,,52.0,0.463158,0.49499,,0.161132,0.122301,0.2593,0.029577,0.000544,0.000363,0.0,0.530938,0.018605,0.009302,0.24186
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,284584.0,1259,Various Artists,,,52.0,0.509851,,,0.885852,0.885852,0.21865,0.067524,0.038585,0.0,0.0,0.18328,1.0,1.0,1.0


In [7]:
cat_features = ['source_system_tab', 'source_screen_name', 'source_type', 'language']
train_features = cat_features + ['song_length', 'target_artist', 'target_composer',
                                 'target_lyricist', 'source_system_tab_probability',
                                 'source_screen_name_probability', 'source_type_probability',
                                 'genre_ids_probability', 'artist_name_probability',
                                 'composer_probability', 'lyricist_probability',
                                 'language_probability', 'source_system_tab_probability_song',
                                 'source_screen_name_probability_song', 'source_type_probability_song']

train['language'] = train['language'].map(str)
train[cat_features] = train[cat_features].fillna('nan')
test['language'] = test['language'].map(str)
test[cat_features] = test[cat_features].fillna('nan')

In [8]:
train_set, val_set = train_test_split(train, test_size = 0.2, random_state=36)

In [9]:
# train_set = train_set.sample(1000000)

In [10]:
model = CatBoostClassifier(
    n_estimators=1100,
    random_seed=12,
    verbose = 0
).fit(train_set[train_features],
      train_set['target'],
      cat_features = cat_features)

In [11]:
val_set['predict'] = model.predict_proba(val_set[train_features])[:, 1]

In [12]:
roc_auc_score(val_set['target'], val_set['predict'])

0.7570425094760802

In [13]:
score = val_set.groupby('msno').apply(
    lambda x: (float('nan') if len(x) < 2 else
               ndcg_score(x['target'].values.reshape(1, -1),
                          x['predict'].values.reshape(1, -1)))
).mean()

In [14]:
score

0.771786404114766

In [15]:
sample_submission['target'] = model.predict_proba(test[train_features])[:, 1]

In [16]:
def save(df, start_index=1):
    index = start_index
    while os.path.exists(rf'../predictions/pred{index}.csv'):
        index += 1
    df.to_csv(rf'../predictions/pred{index}.csv', index=False)
    return rf'../predictions/pred{index}.csv'

In [17]:
save(sample_submission)

'../predictions/pred6.csv'