In [1]:
%matplotlib inline
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the member table from the notebook-relative Data/ directory.
members = pd.read_csv('Data/members.csv')

In [3]:
# Preview the first five member rows (five is the default for head()).
members.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,expiration_date
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,,7,20110820,20170920
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,0,,7,20150628,20170622
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,0,,4,20160411,20170712
3,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,1,0,,9,20150906,20150907
4,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=,1,0,,4,20170126,20170613


In [4]:
# Show the dtype pandas inferred for each member column.
members.dtypes

msno                      object
city                       int64
bd                         int64
gender                    object
registered_via             int64
registration_init_time     int64
expiration_date            int64
dtype: object

In [5]:
# Missing-value count per members column, computed in one pass and printed
# one column per line.
for name, n_missing in members.isnull().sum().items():
    print(name + ": ", n_missing)

msno:  0
city:  0
bd:  0
gender:  19902
registered_via:  0
registration_init_time:  0
expiration_date:  0


In [6]:
# Count the entries equal to 0 in each members column; a 0 may be a
# placeholder for "unknown" rather than a real value.
for name in members.columns:
    is_zero = members[name] == 0
    print(name + ": ", is_zero.sum())

msno:  0
city:  0
bd:  19932
gender:  0
registered_via:  0
registration_init_time:  0
expiration_date:  0


In [7]:
# Load the song table from the notebook-relative Data/ directory.
songs = pd.read_csv('Data/songs.csv')

In [8]:
# Missing-value count per songs column, computed in one pass and printed
# one column per line.
for name, n_missing in songs.isnull().sum().items():
    print(name + ": ", n_missing)

song_id:  0
song_length:  0
genre_ids:  94116
artist_name:  0
composer:  1071354
lyricist:  1945268
language:  1


In [9]:
# Count the entries equal to 0 in each songs column.
for name in songs.columns:
    is_zero = songs[name] == 0
    print(name + ": ", is_zero.sum())

song_id:  0
song_length:  0
genre_ids:  0
artist_name:  0
composer:  0
lyricist:  0
language:  0


In [10]:
# Load the extra per-song information from the notebook-relative Data/ directory.
song_extra_info = pd.read_csv('Data/song_extra_info.csv')

In [11]:
# Missing-value count per song_extra_info column, computed in one pass and
# printed one column per line.
for name, n_missing in song_extra_info.isnull().sum().items():
    print(name + ": ", n_missing)

song_id:  0
name:  2
isrc:  136548


In [12]:
# Count the entries equal to 0 in each song_extra_info column.
for name in song_extra_info.columns:
    is_zero = song_extra_info[name] == 0
    print(name + ": ", is_zero.sum())

song_id:  0
name:  0
isrc:  0


In [13]:
# Load the labelled rows (they include the `target` column) from Data/.
train = pd.read_csv('Data/train.csv')

In [14]:
# Missing-value count per train column, computed in one pass and printed
# one column per line.
for name, n_missing in train.isnull().sum().items():
    print(name + ": ", n_missing)

msno:  0
song_id:  0
source_system_tab:  24849
source_screen_name:  414804
source_type:  21539
target:  0


In [15]:
# Count the entries equal to 0 in each train column; for `target` this is
# the number of rows labelled 0.
for name in train.columns:
    is_zero = train[name] == 0
    print(name + ": ", is_zero.sum())

msno:  0
song_id:  0
source_system_tab:  0
source_screen_name:  0
source_type:  0
target:  3662762


In [16]:
# Load the unlabelled rows to be scored from Data/.
test = pd.read_csv('Data/test.csv')

In [17]:
# Missing-value count per test column, computed in one pass and printed
# one column per line.
for name, n_missing in test.isnull().sum().items():
    print(name + ": ", n_missing)

id:  0
msno:  0
song_id:  0
source_system_tab:  8442
source_screen_name:  162883
source_type:  7297


In [18]:
# Count the entries equal to 0 in each test column.
for name in test.columns:
    is_zero = test[name] == 0
    print(name + ": ", is_zero.sum())

id:  1
msno:  0
song_id:  0
source_system_tab:  0
source_screen_name:  0
source_type:  0


### Join training and test data with members and songs data 

In [19]:
# Left-join member attributes onto every train row, matching on `msno`.
# NOTE: re-running this cell on the already-joined frame raises on the
# overlapping column names, so run it exactly once per kernel session.
train = train.join(members.set_index('msno'), on='msno', how='left')

In [20]:
# Left-join song attributes onto every train row, matching on `song_id`.
# Rows whose song_id is absent from `songs` get NaN in the song columns.
train = train.join(songs.set_index('song_id'), on='song_id', how='left')

In [21]:
# Left-join member attributes onto every test row, matching on `msno`.
test = test.join(members.set_index('msno'), on='msno', how='left')

In [22]:
# Left-join song attributes onto every test row, matching on `song_id`.
test = test.join(songs.set_index('song_id'), on='song_id', how='left')

In [23]:
# One-letter dtype kind per column of the joined train frame
# (O = object, i = integer, f = float).
for name, dtype in train.dtypes.items():
    print(name + ": " + dtype.kind)

msno: O
song_id: O
source_system_tab: O
source_screen_name: O
source_type: O
target: i
city: i
bd: i
gender: O
registered_via: i
registration_init_time: i
expiration_date: i
song_length: f
genre_ids: O
artist_name: O
composer: O
lyricist: O
language: f


In [24]:
# Treat 'city', 'registered_via' and 'language' as categorical by casting the
# numeric codes to strings.
#
# A plain astype('str') also turns missing values into the literal text 'nan',
# which the later "fill categorical NaN with 'NA'" step cannot see. Missing
# `language` would then get a different placeholder than every other
# categorical column. The mask restores real NaN wherever the source value was
# missing, so the fill step treats these columns like the others.
for column in ['city', 'registered_via', 'language']:
    for frame in (train, test):
        was_present = frame[column].notna()
        frame[column] = frame[column].astype('str').where(was_present)

In [25]:
# Replace each YYYYMMDD integer date column with three string-typed columns
# (Year_<col>, Month_<col>, Day_<col>) in both frames. The original column is
# removed once its parts have been extracted.
for column in ['registration_init_time', 'expiration_date']:
    for frame in (train, test):
        parsed = pd.to_datetime(frame[column], format='%Y%m%d')
        frame['Year_' + column] = parsed.dt.year.astype(str)
        frame['Month_' + column] = parsed.dt.month.astype(str)
        frame['Day_' + column] = parsed.dt.day.astype(str)
        frame.drop(columns=[column], inplace=True)

In [26]:
#check if there is inconsistant data type in data again
# List the dtype kind of every train column after the conversions above.
for name, dtype in train.dtypes.items():
    print(name + ": " + dtype.kind)

msno: O
song_id: O
source_system_tab: O
source_screen_name: O
source_type: O
target: i
city: O
bd: i
gender: O
registered_via: O
song_length: f
genre_ids: O
artist_name: O
composer: O
lyricist: O
language: O
Year_registration_init_time: O
Month_registration_init_time: O
Day_registration_init_time: O
Year_expiration_date: O
Month_expiration_date: O
Day_expiration_date: O


In [27]:
# Fill missing values in every object-typed (categorical) column with the
# placeholder 'NA'.
#
# The result is assigned back to the frame instead of calling
# `frame[col].fillna(..., inplace=True)`. That form mutates a temporary Series
# selected from the frame: it is deprecated in pandas 2.2 and has no effect on
# the frame under Copy-on-Write, which would silently leave NaN in place.
for frame in (train, test):
    object_columns = [name for name in frame.columns if frame[name].dtype.kind == 'O']
    for name in object_columns:
        frame[name] = frame[name].fillna('NA')

In [28]:
# Report how many nulls remain in each numeric (float / int) column, train
# frame first and then test frame.
for frame in (train, test):
    numeric_columns = [name for name in frame.columns if frame[name].dtype.kind in ['f', 'i']]
    for name in numeric_columns:
        print(name + ": ", frame[name].isnull().sum())

# Impute missing song lengths with the median learned from the TRAIN frame and
# reuse that same value for the test frame. Both frames then get an identical
# fill value, and no statistic computed on the test set enters preprocessing.
song_length_median = train['song_length'].median()
train['song_length'] = train['song_length'].fillna(song_length_median)
test['song_length'] = test['song_length'].fillna(song_length_median)

target:  0
bd:  0
song_length:  114
id:  0
bd:  0
song_length:  25


In [29]:
"""convert all categorical features to 'category data type'"""
# Cast every object-typed column to pandas' `category` dtype, train frame
# first and then test frame. The column list is built before any cast.
for frame in (train, test):
    object_columns = [name for name in frame.columns if frame[name].dtype.kind == 'O']
    for name in object_columns:
        frame[name] = frame[name].astype('category')

In [30]:
# Preview the fully prepared train frame before modelling.
train.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,city,bd,gender,registered_via,...,artist_name,composer,lyricist,language,Year_registration_init_time,Month_registration_init_time,Day_registration_init_time,Year_expiration_date,Month_expiration_date,Day_expiration_date
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,1,0,,7,...,Bastille,Dan Smith| Mark Crew,,52.0,2012,1,2,2017,10,5
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,13,24,female,9,...,Various Artists,,,52.0,2011,5,25,2017,9,11
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1,13,24,female,9,...,Nas,N. Jones、W. Adams、J. Lordan、D. Ingle,,52.0,2011,5,25,2017,9,11
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1,13,24,female,9,...,Soundway,Kwadwo Donkoh,,-1.0,2011,5,25,2017,9,11
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1,1,0,,7,...,Brett Young,Brett Young| Kelly Archer| Justin Ebach,,52.0,2012,1,2,2017,10,5


# Full Set Model

In [31]:
"""split training data into train and test set"""
# Separate the label from the features, then hold out 20% of the rows as a
# final test split. Stratifying on the label keeps the class ratio equal in
# both parts; the fixed seed makes the split repeatable.
y = train['target']
X = train.drop(columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=123
)

In [32]:
# Carve a validation split out of the remaining training rows: 25% of the
# 80% left after the previous split, i.e. 20% of all rows.
X_train, X_cv, y_train, y_cv = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    stratify=y_train,
    random_state=123,
)

In [33]:
"""prepare and train the model"""
# Positions of the columns whose dtype kind is 'O'; these are passed to
# CatBoost as categorical features.
cat_features_index = [
    position for position, dtype in enumerate(X_train.dtypes) if dtype.kind == 'O'
]
# use_best_model=True relies on the eval_set passed to fit() below.
model_FULL = CatBoostClassifier(
    iterations=50,
    learning_rate=0.3,
    eval_metric='AUC',
    max_ctr_complexity=2,
    boosting_type='Plain',
    bootstrap_type='Bernoulli',
    use_best_model=True,
    random_seed=123,
)
model_FULL.fit(
    X_train,
    y_train,
    cat_features=cat_features_index,
    eval_set=(X_cv, y_cv),
)

0:	test: 0.7773163	best: 0.7773163 (0)	total: 15.5s	remaining: 12m 37s
1:	test: 0.7893558	best: 0.7893558 (1)	total: 31.4s	remaining: 12m 33s
2:	test: 0.7954023	best: 0.7954023 (2)	total: 43.1s	remaining: 11m 15s
3:	test: 0.7983946	best: 0.7983946 (3)	total: 57.6s	remaining: 11m 2s
4:	test: 0.7999809	best: 0.7999809 (4)	total: 1m 16s	remaining: 11m 27s
5:	test: 0.8007297	best: 0.8007297 (5)	total: 1m 24s	remaining: 10m 22s
6:	test: 0.8103959	best: 0.8103959 (6)	total: 1m 37s	remaining: 10m
7:	test: 0.8147370	best: 0.8147370 (7)	total: 1m 48s	remaining: 9m 30s
8:	test: 0.8159288	best: 0.8159288 (8)	total: 2m 4s	remaining: 9m 27s
9:	test: 0.8165684	best: 0.8165684 (9)	total: 2m 25s	remaining: 9m 43s
10:	test: 0.8183613	best: 0.8183613 (10)	total: 2m 42s	remaining: 9m 34s
11:	test: 0.8200969	best: 0.8200969 (11)	total: 2m 50s	remaining: 9m
12:	test: 0.8213597	best: 0.8213597 (12)	total: 3m 6s	remaining: 8m 49s
13:	test: 0.8221411	best: 0.8221411 (13)	total: 3m 26s	remaining: 8m 49s
14:	te

<catboost.core.CatBoostClassifier at 0x7fd3d5cdfc90>

In [34]:
"""Check result"""
# Score the held-out split: ROC AUC is computed from the second column of
# predict_proba.
y_pred = model_FULL.predict_proba(X_test)
positive_proba = y_pred[:, 1]
score_FULL = metrics.roc_auc_score(y_test, positive_proba)
print("Model's roc_auc_score: ", score_FULL)

Model's roc_auc_score:  0.8315188565259726


# Full Set Model without Gender

In [35]:
# Feature matrix for the ablation without `gender`; the label is unchanged.
y_woG = train['target']
X_woG = X.drop(columns=['gender'])

In [36]:
# Preview the feature matrix with `gender` removed.
X_woG.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,city,bd,registered_via,song_length,genre_ids,artist_name,composer,lyricist,language,Year_registration_init_time,Month_registration_init_time,Day_registration_init_time,Year_expiration_date,Month_expiration_date,Day_expiration_date
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,0,7,206471.0,359,Bastille,Dan Smith| Mark Crew,,52.0,2012,1,2,2017,10,5
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,13,24,9,284584.0,1259,Various Artists,,,52.0,2011,5,25,2017,9,11
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,13,24,9,225396.0,1259,Nas,N. Jones、W. Adams、J. Lordan、D. Ingle,,52.0,2011,5,25,2017,9,11
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,13,24,9,255512.0,1019,Soundway,Kwadwo Donkoh,,-1.0,2011,5,25,2017,9,11
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1,0,7,187802.0,1011,Brett Young,Brett Young| Kelly Archer| Justin Ebach,,52.0,2012,1,2,2017,10,5


In [37]:
# Same 80/20 stratified split and seed as the full model, applied to the
# feature matrix without `gender`.
X_woG_train, X_woG_test, y_woG_train, y_woG_test = train_test_split(
    X_woG,
    y_woG,
    test_size=0.2,
    stratify=y_woG,
    random_state=123,
)

In [38]:
# Validation split for the no-gender model: 25% of the remaining training rows.
X_woG_train, X_woG_cv, y_woG_train, y_woG_cv = train_test_split(
    X_woG_train,
    y_woG_train,
    test_size=0.25,
    stratify=y_woG_train,
    random_state=123,
)

In [39]:
# Model without gender: same hyper-parameters as the full model, so any score
# difference comes from the removed column.
cat_features_index_woG = [
    position for position, dtype in enumerate(X_woG_train.dtypes) if dtype.kind == 'O'
]
model_FULL_woG = CatBoostClassifier(
    iterations=50,
    learning_rate=0.3,
    eval_metric='AUC',
    max_ctr_complexity=2,
    boosting_type='Plain',
    bootstrap_type='Bernoulli',
    use_best_model=True,
    random_seed=123,
)
model_FULL_woG.fit(
    X_woG_train,
    y_woG_train,
    cat_features=cat_features_index_woG,
    eval_set=(X_woG_cv, y_woG_cv),
)

0:	test: 0.7751961	best: 0.7751961 (0)	total: 15.8s	remaining: 12m 53s
1:	test: 0.7919814	best: 0.7919814 (1)	total: 35.3s	remaining: 14m 7s
2:	test: 0.7940979	best: 0.7940979 (2)	total: 52s	remaining: 13m 34s
3:	test: 0.8078154	best: 0.8078154 (3)	total: 58.4s	remaining: 11m 11s
4:	test: 0.8090265	best: 0.8090265 (4)	total: 1m 4s	remaining: 9m 41s
5:	test: 0.8121318	best: 0.8121318 (5)	total: 1m 21s	remaining: 9m 55s
6:	test: 0.8127590	best: 0.8127590 (6)	total: 1m 32s	remaining: 9m 28s
7:	test: 0.8163307	best: 0.8163307 (7)	total: 1m 44s	remaining: 9m 8s
8:	test: 0.8180301	best: 0.8180301 (8)	total: 1m 55s	remaining: 8m 45s
9:	test: 0.8189031	best: 0.8189031 (9)	total: 2m 6s	remaining: 8m 25s
10:	test: 0.8204627	best: 0.8204627 (10)	total: 2m 14s	remaining: 7m 58s
11:	test: 0.8214132	best: 0.8214132 (11)	total: 2m 30s	remaining: 7m 56s
12:	test: 0.8219638	best: 0.8219638 (12)	total: 2m 42s	remaining: 7m 42s
13:	test: 0.8227642	best: 0.8227642 (13)	total: 3m 3s	remaining: 7m 52s
14:	t

<catboost.core.CatBoostClassifier at 0x7fd3ce31b350>

In [40]:
"""Check result"""
# ROC AUC of the no-gender model on its held-out split, computed from the
# second column of predict_proba.
y_woG_pred = model_FULL_woG.predict_proba(X_woG_test)
positive_proba_woG = y_woG_pred[:, 1]
score_woG = metrics.roc_auc_score(y_woG_test, positive_proba_woG)
print("Model's roc_auc_score: ", score_woG)

Model's roc_auc_score:  0.8318957908183386


# Full Set Model without Age

In [41]:
# Feature matrix for the ablation without `bd`, followed by the same 80/20
# stratified split and seed used for the other models.
y_woA = train['target']
X_woA = X.drop(columns=['bd'])
X_woA_train, X_woA_test, y_woA_train, y_woA_test = train_test_split(
    X_woA,
    y_woA,
    test_size=0.2,
    stratify=y_woA,
    random_state=123,
)

In [42]:
# Validation split for the no-age model: 25% of the remaining training rows.
X_woA_train, X_woA_cv, y_woA_train, y_woA_cv = train_test_split(
    X_woA_train,
    y_woA_train,
    test_size=0.25,
    stratify=y_woA_train,
    random_state=123,
)

In [43]:
# Model without age (the `bd` column): same hyper-parameters as the full
# model, so any score difference comes from the removed column.
cat_features_index_woA = [
    position for position, dtype in enumerate(X_woA_train.dtypes) if dtype.kind == 'O'
]
model_FULL_woA = CatBoostClassifier(
    iterations=50,
    learning_rate=0.3,
    eval_metric='AUC',
    max_ctr_complexity=2,
    boosting_type='Plain',
    bootstrap_type='Bernoulli',
    use_best_model=True,
    random_seed=123,
)
model_FULL_woA.fit(
    X_woA_train,
    y_woA_train,
    cat_features=cat_features_index_woA,
    eval_set=(X_woA_cv, y_woA_cv),
)

0:	test: 0.7770941	best: 0.7770941 (0)	total: 17.4s	remaining: 14m 13s
1:	test: 0.7890060	best: 0.7890060 (1)	total: 28.4s	remaining: 11m 22s
2:	test: 0.7952741	best: 0.7952741 (2)	total: 42.3s	remaining: 11m 3s
3:	test: 0.8088808	best: 0.8088808 (3)	total: 1m 2s	remaining: 12m 2s
4:	test: 0.8096140	best: 0.8096140 (4)	total: 1m 8s	remaining: 10m 20s
5:	test: 0.8124514	best: 0.8124514 (5)	total: 1m 23s	remaining: 10m 13s
6:	test: 0.8156134	best: 0.8156134 (6)	total: 1m 36s	remaining: 9m 53s
7:	test: 0.8161664	best: 0.8161664 (7)	total: 1m 48s	remaining: 9m 31s
8:	test: 0.8166264	best: 0.8166264 (8)	total: 1m 55s	remaining: 8m 46s
9:	test: 0.8181737	best: 0.8181737 (9)	total: 2m 3s	remaining: 8m 15s
10:	test: 0.8197075	best: 0.8197075 (10)	total: 2m 16s	remaining: 8m 3s
11:	test: 0.8207621	best: 0.8207621 (11)	total: 2m 31s	remaining: 7m 59s
12:	test: 0.8218061	best: 0.8218061 (12)	total: 2m 51s	remaining: 8m 8s
13:	test: 0.8227190	best: 0.8227190 (13)	total: 2m 59s	remaining: 7m 41s
14

<catboost.core.CatBoostClassifier at 0x7fd3ccebc1d0>

In [44]:
"""Check result"""
# ROC AUC of the no-age model on its held-out split, computed from the
# second column of predict_proba.
y_woA_pred = model_FULL_woA.predict_proba(X_woA_test)
positive_proba_woA = y_woA_pred[:, 1]
score_FULL_woA = metrics.roc_auc_score(y_woA_test, positive_proba_woA)
print("Full Scale Model's roc_auc_score without Age: ", score_FULL_woA)

Full Scale Model's roc_auc_score without Age:  0.8320130497142425
