In [1]:
import numpy as np
import pandas as pd
import os

from catboost import CatBoostClassifier
from sklearn.metrics import ndcg_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
# Labelled interactions, the rows to score, and the submission template.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv',
                     '../data/test.csv',
                     r'../data/sample_submission.csv')
)

# Auxiliary tables; `songs` is the one FeatureGenerator joins on `song_id`.
members, songs, song_extra_info = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/members.csv',
                     '../data/songs.csv',
                     '../data/song_extra_info.csv')
)

## Предобработка

In [3]:
class FeatureGenerator:
    """Attach song metadata, target encodings and per-user frequency features.

    ``fit_transform`` learns the statistics from a labelled frame and returns
    that frame with the features attached; ``transform`` attaches the already
    learned statistics to another frame. Neither method modifies the frame it
    is given (every step goes through ``pd.merge``, which returns a new frame).

    Learned state (all ``None`` until ``fit_transform`` has run):
        artist_target / composer_target / lyricist_target:
            mean ``target`` per ``artist_name`` / ``composer`` / ``lyricist``,
            exposed as ``target_artist`` / ``target_composer`` /
            ``target_lyricist``.
        total_counts:
            number of fitted rows per ``msno``.
        genre_counts / tab_counts / screen_counts:
            for each ``msno``, the share of that user's fitted rows that have
            a given ``genre_ids`` / ``source_system_tab`` /
            ``source_screen_name``, exposed as ``genre_probability`` /
            ``tab_probability`` / ``screen_probability``.

    Args:
        songs_df: song metadata keyed by ``song_id``. When omitted, the
            module-level ``songs`` frame is looked up at call time, which is
            how the class behaved before this parameter existed.

    NOTE(review): the target means are computed on, and merged back onto, the
    same rows inside ``fit_transform``, so each fitted row's own label
    contributes to its ``target_*`` features. Metrics measured on rows that
    took part in the fit are therefore likely optimistic; consider
    out-of-fold encoding.
    """

    # (grouping column, attribute holding the fitted table, output column)
    _TARGET_SPECS = (
        ('artist_name', 'artist_target', 'target_artist'),
        ('composer', 'composer_target', 'target_composer'),
        ('lyricist', 'lyricist_target', 'target_lyricist'),
    )
    # (grouping column, attribute holding the fitted table, column prefix)
    _SHARE_SPECS = (
        ('genre_ids', 'genre_counts', 'genre'),
        ('source_system_tab', 'tab_counts', 'tab'),
        ('source_screen_name', 'screen_counts', 'screen'),
    )

    def __init__(self, songs_df=None):
        self.songs_df = songs_df
        self.artist_target = None
        self.composer_target = None
        self.lyricist_target = None
        self.total_counts = None
        self.genre_counts = None
        self.tab_counts = None
        self.screen_counts = None

    def _with_song_metadata(self, df):
        """Left-join the song metadata onto ``df`` by ``song_id``."""
        song_table = songs if self.songs_df is None else self.songs_df
        return pd.merge(df, song_table, on='song_id', how='left')

    def _user_share(self, df, key, prefix):
        """Return the per-(msno, key) share of each user's rows in ``df``."""
        count_col = f'{prefix}_count'
        shares = df.groupby(['msno', key]).size().reset_index(name=count_col)
        shares = pd.merge(shares, self.total_counts, on='msno')
        shares[f'{prefix}_probability'] = shares[count_col] / shares['count']
        return shares.drop([count_col, 'count'], axis=1)

    def _fit(self, df):
        """Learn every lookup table from ``df`` (song metadata already joined)."""
        for key, attr, out_col in self._TARGET_SPECS:
            encoding = (
                df.groupby(key)
                .agg({'target': 'mean'})
                .rename(columns={'target': out_col})
                .reset_index()
            )
            setattr(self, attr, encoding)

        # Must be set before the share tables: they divide by these totals.
        self.total_counts = df.groupby('msno').size().reset_index(name='count')
        for key, attr, prefix in self._SHARE_SPECS:
            setattr(self, attr, self._user_share(df, key, prefix))

    def _apply(self, df):
        """Merge the learned tables onto ``df`` (song metadata already joined)."""
        for key, attr, _ in self._TARGET_SPECS:
            # Keys not seen during fitting are left as NaN.
            df = pd.merge(df, getattr(self, attr), on=key, how='left')
        for key, attr, prefix in self._SHARE_SPECS:
            df = pd.merge(df, getattr(self, attr), on=['msno', key], how='left')
            # (msno, key) pairs not seen during fitting get a share of 0.
            share_col = f'{prefix}_probability'
            df[share_col] = df[share_col].fillna(0)
        return df

    def fit_transform(self, df):
        """Learn the statistics from ``df`` and return ``df`` with features.

        Args:
            df: frame with at least ``msno``, ``song_id``,
                ``source_system_tab``, ``source_screen_name`` and ``target``.

        Returns:
            A new frame: ``df`` plus the song metadata columns and the six
            engineered feature columns, in the original row order.
        """
        df = self._with_song_metadata(df)
        self._fit(df)
        return self._apply(df)

    def transform(self, df):
        """Return ``df`` with the features learned by ``fit_transform``.

        Args:
            df: frame with at least ``msno``, ``song_id``,
                ``source_system_tab`` and ``source_screen_name``.

        Returns:
            A new frame: ``df`` plus the song metadata columns and the six
            engineered feature columns.

        Raises:
            RuntimeError: if ``fit_transform`` has not been called yet.
        """
        if self.total_counts is None:
            raise RuntimeError(
                'FeatureGenerator.transform called before fit_transform'
            )
        return self._apply(self._with_song_metadata(df))

In [4]:
# One shared generator: fitted on `train` and then reused for `test` in the next cell.
feature_generator = FeatureGenerator()

In [5]:
# Learn the encodings from the labelled rows and attach them to those rows.
# NOTE: `train` and `test` are rebound to the feature-augmented frames, so
# re-running this cell would feed already-merged frames back in; reload the
# CSVs first if the cell has to be executed again.
train = feature_generator.fit_transform(train)
# Attach the statistics learned from `train`; nothing is re-fitted on `test`.
test = feature_generator.transform(test)

In [22]:
# Spot-check the engineered columns on the first few test rows.
test.head(n=4)

Unnamed: 0,id,msno,song_id,source_system_tab,source_screen_name,source_type,song_length,genre_ids,artist_name,composer,lyricist,language,target_artist,target_composer,target_lyricist,genre_probability,tab_probability,screen_probability
0,0,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=,my library,Local playlist more,local-library,224130.0,458,梁文音 (Rachel Liang),Qi Zheng Zhang,,3.0,0.520141,0.501397,,0.274648,1.0,0.992958
1,1,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=,my library,Local playlist more,local-library,320470.0,465,林俊傑 (JJ Lin),林俊傑,孫燕姿/易家揚,3.0,0.586352,0.577564,0.623988,0.535211,1.0,0.992958
2,2,/uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=,8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=,discover,,song-based-playlist,315899.0,2022,Yu Takahashi (高橋優),Yu Takahashi,Yu Takahashi,17.0,0.289181,0.281001,0.281001,0.25,0.833333,0.0
3,3,1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=,ztCf8thYsS4YN3GcIL/bvoxLm/T5mYBVKOO4C9NiVfQ=,radio,Radio,radio,285210.0,465,U2,The Edge| Adam Clayton| Larry Mullen| Jr.,,52.0,0.326648,0.216667,,0.407216,0.654639,0.654639


In [9]:
# Columns passed to CatBoost as categorical features.
cat_features = ['source_system_tab', 'source_screen_name', 'source_type', 'language']
# Full model input: the categorical columns followed by the numeric ones.
train_features = [
    *cat_features,
    'song_length',
    'target_artist', 'target_composer', 'target_lyricist',
    'genre_probability', 'tab_probability', 'screen_probability',
]

# Same cleanup for both frames: `language` is stringified value by value
# (missing entries included), then any remaining gaps in the categorical
# columns are filled with the literal 'nan'.
for split_frame in (train, test):
    split_frame['language'] = split_frame['language'].map(str)
    split_frame[cat_features] = split_frame[cat_features].fillna('nan')

In [12]:
# Hold out 20% of the labelled rows for validation (fixed seed for repeatability).
# NOTE(review): the split happens after feature_generator.fit_transform(train),
# so the encodings were fitted on rows that end up in val_set as well —
# validation scores are likely optimistic compared with unseen data.
train_set, val_set = train_test_split(train, test_size = 0.2, random_state=36)

In [13]:
# Cap the training split at 1,000,000 rows to keep fitting time manageable.
# A fixed random_state makes the subsample (and therefore the model and the
# scores below) reproducible; min() avoids the ValueError that sample() raises
# when asked for more rows than the frame holds.
train_set = train_set.sample(n=min(1000000, len(train_set)), random_state=36)

In [14]:
# Build the classifier first, then fit it on the subsampled training split.
model = CatBoostClassifier(n_estimators=600, random_seed=12, verbose=0)
model.fit(
    train_set[train_features],
    train_set['target'],
    cat_features=cat_features,
);  # trailing ';' keeps the cell output empty, as before

In [15]:
# Positive-class probability for every validation row. `assign` builds a new
# frame instead of writing a column into the object returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
val_set = val_set.assign(predict=model.predict_proba(val_set[train_features])[:, 1])

In [16]:
# ROC AUC over all validation rows pooled together.
roc_auc_score(y_true=val_set['target'], y_score=val_set['predict'])

0.7174243862322778

In [17]:
def _ndcg_for_user(user_rows):
    """NDCG of one user's validation rows; NaN when the user has fewer than 2 rows."""
    if len(user_rows) < 2:
        return float('nan')
    # ndcg_score expects 2-D (n_samples, n_labels) input: one "query" per user.
    y_true = user_rows['target'].values.reshape(1, -1)
    y_score = user_rows['predict'].values.reshape(1, -1)
    return ndcg_score(y_true, y_score)


# Average over users; the NaN entries are skipped by Series.mean().
per_user_ndcg = val_set.groupby('msno').apply(_ndcg_for_user)
score = per_user_ndcg.mean()

In [18]:
# Mean per-user NDCG on the validation split (users with fewer than 2 rows are excluded).
score

0.7612359400469136

In [19]:
# Positive-class probability for each test row.
test_positive_proba = model.predict_proba(test[train_features])[:, 1]
# NOTE(review): this relies on `test` and `sample_submission` having the same
# number of rows in the same order — presumably true for the competition files;
# confirm, since `test` went through several merges.
sample_submission['target'] = test_positive_proba

In [20]:
def save(df, start_index=1, directory='../predictions'):
    """Write ``df`` to the first unused ``pred{N}.csv`` inside ``directory``.

    Numbering starts at ``start_index`` and increases until a file name that
    does not exist yet is found, so earlier predictions are never overwritten.

    Args:
        df: object with a pandas-style ``to_csv(path, index=False)`` method.
        start_index: first number to try.
        directory: target folder; created (with parents) if it is missing.

    Returns:
        The path that was written, e.g. ``'../predictions/pred3.csv'``.
    """
    # Without this, to_csv fails on a fresh checkout with no predictions folder.
    os.makedirs(directory, exist_ok=True)

    index = start_index
    path = f'{directory}/pred{index}.csv'
    while os.path.exists(path):
        index += 1
        path = f'{directory}/pred{index}.csv'

    df.to_csv(path, index=False)
    return path

In [21]:
# Write the submission to the next free ../predictions/pred{N}.csv; the cell output is the path used.
save(sample_submission)

'../predictions/pred3.csv'