In [None]:
"""
The training dataset contains 30755 unique users who contribute to the entire training data of 
song listens. And the test set has 25131 unique users who contribute to the entiriety of the test set.

This goes on to show the importance of user bias in this model. To predict if a known user would listen to 
a song again, we cant ignore the other songs he/she has liked and re-listened before and would need to 
factor that in as well.

This makes the problem a good fit for a latent factor model where we take user and product bias into account,
to make a decision.


However, lets first start with a simpler model by removing the msno(user id) column. This way, we would be
predicting if a song is trendy and is good for a relisten by any user, not just the user in question. 
Since the user bias would not be taken into account, we would just predict if a song is probable for a 
relisten.

"""

In [3]:
%matplotlib inline
import operator
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from collections import Counter
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [4]:
pd.set_option("display.max_columns",40)

In [5]:
def read_data(file_path):
    return pd.read_csv(file_path)

In [6]:
def get_merged_df_with_user_song_info(df, user_data_df, song_data_df, song_aux_data_df):
    """
    Join all the dataframes(on matching columns) to form a single merged dataframe with all the user and song 
    information
    """
    agg_user_df = pd.merge(left=df, right=user_data_df, how='left')
    
    agg_user_song_df = pd.merge(left=agg_user_df, right=song_data_df, how='left')
    
    merged_df = pd.merge(left=agg_user_song_df, right=song_aux_data_df, how='left')
    
    return merged_df

In [40]:
def data_massaging(df):
    
    df = df.assign(
        unique_user_id=(df['msno']).astype('category').cat.codes,
        unique_song_id=(df['song_id']).astype('category').cat.codes,
    )
    
    # drop the msno(user id) field for the current, user-agnostic, simpler model
    df = df.drop('msno', 1)
    df = df.drop('song_id', 1)    
    df = df.drop('id', 1) if 'id' in df.columns else df
    df = df.drop('isrc', 1)
    
    df['registration_init_year'] = df['registration_init_time'].apply(lambda x: str(x)[:4]).astype('int64')
    df['registration_init_month'] = df['registration_init_time'].apply(lambda x: str(x)[4:6]).astype('int64')
    df['registration_init_date'] = df['registration_init_time'].apply(lambda x: str(x)[6:8]).astype('int64')
    
    df['expiration_year'] = df['expiration_date'].apply(lambda x: str(x)[:4]).astype('int64')
    df['expiration_month'] = df['expiration_date'].apply(lambda x: str(x)[4:6]).astype('int64')
    df['expiration_date_'] = df['expiration_date'].apply(lambda x: str(x)[6:8]).astype('int64')
    
    df['age_of_account'] = df['expiration_year'].apply(int) - df['registration_init_year'].apply(int)
    
    df = df.drop('registration_init_time', 1)
    df = df.drop('expiration_date', 1)

    # missing data
    string_missing_value_columns = [
        'lyricist', 'gender', 'composer', 'source_screen_name', 'genre_ids',
        'source_type', 'source_system_tab', 'name', 'language', 'artist_name'
    ]
    numeric_missing_value_columns = ['song_length']
    
    for column in string_missing_value_columns:
        df[column] = df[column].fillna('missing')
    for column in numeric_missing_value_columns:
        df[column] = df[column].fillna(df[column].median())
        
    
    def genre_id_count(genre_ids):
        return 0 if genre_ids == 'missing' else genre_ids.count('|') + 1
    df['genre_count'] = df['genre_ids'].apply(genre_id_count)

    def composer_count(composer):
        return 0 if composer == 'missing' else composer.count('|') + 1
    df['composer_count'] = df['composer'].apply(composer_count)

    def lyricist_count(lyricist):
        return 0 if lyricist == 'missing' else lyricist.count('|') + 1
    df['lyricist_count'] = df['lyricist'].apply(lyricist_count)

    def artist_count(artist_name):
        if artist_name == 'missing':   return 0
        elif artist_name == 'Various Artists':   return 2
        else:   return 1
    df['artist_count'] = df['artist_name'].apply(artist_count)
    
    df['ranked_artist'] = df.groupby('artist_name')['artist_name'].transform(pd.Series.value_counts)
    
    
    scaler = MinMaxScaler()
    df[['song_length']] = scaler.fit_transform(df[['song_length']])

    # fix dtypes
    categorical_columns = [
        'source_system_tab', 'source_screen_name', 'source_type', 'city', 'gender', 
        'registered_via', 'genre_ids', 'artist_name', 'composer', 'lyricist', 'language', 'name'
    ]
    for column in categorical_columns:
        df[column] = df[column].astype('category')
        
    return df

In [9]:
def generate_dummies_for_data(train_X, test_X):
    limit = train_X.shape[0]
    
    df = pd.concat([train_X, test_X])
    dummy_df = pd.get_dummies(df)
    
    dummyfied_train_X = dummy_df.iloc[:, :limit + 1]
    dummyfied_test_X = dummy_df.iloc[:, limit + 1:]
    
    assert dummyfied_train_X.shape[0] == train_X.shape[0]
    assert dummyfied_test_X.shape[0] == test_X.shape[0]
    assert dummyfied_train_X.shape[1] == dummyfied_test_X.shape[1]
    
    return dummyfied_train_X, dummyfied_test_X

In [10]:
def grid_search_report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [62]:
def lgbm_classifier(train_X, train_y, test_X, test_y=None, **kwargs):
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'auc'},
        'num_leaves': 80,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'bagging_freq': 5,
        'max_bin': 256,
        'min_samples_split': 500,
        'min_samples_leaf': 50,
        'max_depth': 10
    }
    lgb_train = lgb.Dataset(train_X, train_y.values)

    try:
        lgb_val = lgb.Dataset(test_X, test_y.values, reference=lgb_train)    
        gbm = lgb.train(
            params,
            lgb_train,
            num_boost_round=200,
            valid_sets=lgb_val,
            early_stopping_rounds=5
        )    
        pred_y = gbm.predict(test_X, num_iteration=gbm.best_iteration)
        print('The rmse of prediction is:', mean_squared_error(test_y, pred_y) ** 0.5)
        print('The AUC is:', roc_auc_score(test_y, pred_y))
    except:
        gbm = lgb.train(
            params,
            lgb_train,
            num_boost_round=300
        )
        pred_y = gbm.predict(test_X, num_iteration=gbm.best_iteration)
        
    return pred_y

In [12]:
training_data = 'train.csv'
test_data = 'test.csv'
song_data = 'songs.csv'
user_data = 'members.csv'
song_aux_data = 'song_extra_info.csv'

In [13]:
# read data
train_X = read_data(training_data)
test_X = read_data(test_data)

song_data_df = read_data(song_data)
user_data_df = read_data(user_data)
song_aux_data_df = read_data(song_aux_data)

In [14]:
test_data_ids = test_X['id']

train_merged_df = get_merged_df_with_user_song_info(
    train_X,
    user_data_df,
    song_data_df,
    song_aux_data_df
)

test_merged_df = get_merged_df_with_user_song_info(
    test_X,
    user_data_df,
    song_data_df,
    song_aux_data_df
)

In [41]:
refined_train_X = data_massaging(train_merged_df)
refined_test_X = data_massaging(test_merged_df)

refined_train_y = refined_train_X['target']
refined_train_X = refined_train_X.drop('target', axis=1)

In [42]:
refined_train_X.to_csv('refined_train_X.csv')
refined_train_y.to_csv('refined_train_y.csv')
refined_test_X.to_csv('refined_test_X.csv')

In [43]:
# split the training data into train and validation sets into 2/3 and 1/3
ref_train_X, ref_val_X, ref_train_y, ref_val_y = train_test_split(
    refined_train_X,
    refined_train_y,
    test_size=0.33,
    random_state=42,
)

In [238]:
grid_search_params = {
    'boosting_type': ['dart'],
    'objective': ['binary'],
    'metric': ['auc'],
    'num_leaves': [31],
    'learning_rate': [0.1],
    'feature_fraction': [0.9],
    'bagging_fraction': [0.8],
    'bagging_freq': [5],
    'min_samples_split': [500],
    'min_samples_leaf': [50],
    'max_depth': [8],
    'max_features': ['sqrt'],
    'subsample': [0.8]
}

In [239]:
grid_search = GridSearchCV(
    estimator=lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, min_samples_split=500, min_samples_leaf=50, 
                                 max_depth=8, max_features='sqrt', subsample=0.8, random_state=10), 
    param_grid=grid_search_params,
    scoring='roc_auc',
    iid=False,
    cv=5
)

grid_search.fit(refined_train_X, refined_train_y)
grid_search_report(grid_search.cv_results_)

Model with rank: 1
Mean validation score: 0.676 (std: 0.046)
Parameters: {'n_estimators': 50}

Model with rank: 2
Mean validation score: 0.676 (std: 0.047)
Parameters: {'n_estimators': 100}

Model with rank: 3
Mean validation score: 0.676 (std: 0.047)
Parameters: {'n_estimators': 150}



In [63]:
# training
pred_val_y = lgbm_classifier(ref_train_X, ref_train_y, ref_val_X, ref_val_y)

[1]	valid_0's auc: 0.682322
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's auc: 0.684582
[3]	valid_0's auc: 0.685638
[4]	valid_0's auc: 0.687767
[5]	valid_0's auc: 0.689404
[6]	valid_0's auc: 0.690068
[7]	valid_0's auc: 0.690793
[8]	valid_0's auc: 0.691736
[9]	valid_0's auc: 0.692206
[10]	valid_0's auc: 0.692732
[11]	valid_0's auc: 0.69398
[12]	valid_0's auc: 0.694369
[13]	valid_0's auc: 0.694859
[14]	valid_0's auc: 0.695286
[15]	valid_0's auc: 0.695729
[16]	valid_0's auc: 0.696435
[17]	valid_0's auc: 0.696946
[18]	valid_0's auc: 0.697625
[19]	valid_0's auc: 0.697949
[20]	valid_0's auc: 0.698182
[21]	valid_0's auc: 0.698443
[22]	valid_0's auc: 0.699051
[23]	valid_0's auc: 0.699519
[24]	valid_0's auc: 0.699886
[25]	valid_0's auc: 0.700156
[26]	valid_0's auc: 0.700486
[27]	valid_0's auc: 0.700778
[28]	valid_0's auc: 0.701404
[29]	valid_0's auc: 0.701518
[30]	valid_0's auc: 0.701772
[31]	valid_0's auc: 0.702223
[32]	valid_0's auc: 0.702426
[33]	valid_0's auc: 0

In [58]:
# testing
pred_test_y = lgbm_classifier(refined_train_X, refined_train_y, refined_test_X, None)

In [59]:
predictions = pd.DataFrame()
predictions['id'] = test_data_ids
predictions['target'] = pred_test_y
predictions.to_csv('KKBox_submission.csv', index=False)

In [55]:
refined_train_X

Unnamed: 0,source_system_tab,source_screen_name,source_type,city,bd,gender,registered_via,song_length,genre_ids,artist_name,composer,lyricist,language,name,unique_song_id,unique_user_id,registration_init_year,registration_init_month,registration_init_date,expiration_year,expiration_month,expiration_date_,age_of_account,genre_count,composer_count,lyricist_count,artist_count,ranked_artist
0,explore,Explore,online-playlist,1,0,missing,7,0.018901,359,Bastille,Dan Smith| Mark Crew,missing,52,Good Grief,74679,8158,2012,1,2,2017,10,5,5,1,2,0,1,1140
1,my library,Local playlist more,local-playlist,13,24,female,9,0.026100,1259,Various Artists,missing,missing,52,Lords of Cardboard,223479,17259,2011,5,25,2017,9,11,6,1,0,0,2,303616
2,my library,Local playlist more,local-playlist,13,24,female,9,0.020645,1259,Nas,N. Jones、W. Adams、J. Lordan、D. Ingle,missing,52,Hip Hop Is Dead(Album Version (Edited)),120758,17259,2011,5,25,2017,9,11,6,1,1,0,1,289
3,my library,Local playlist more,local-playlist,13,24,female,9,0.023420,1019,Soundway,Kwadwo Donkoh,missing,-1,Disco Africa,23707,17259,2011,5,25,2017,9,11,6,1,1,0,1,1
4,explore,Explore,online-playlist,1,0,missing,7,0.017180,1011,Brett Young,Brett Young| Kelly Archer| Justin Ebach,missing,52,Sleep Without You,33308,8158,2012,1,2,2017,10,5,5,1,3,0,1,427
5,explore,Explore,online-playlist,1,0,missing,7,0.022710,1259,Desiigner,Sidney Selby| Adnan Khan,missing,52,Panda,30191,8158,2012,1,2,2017,10,5,5,1,2,0,1,1692
6,my library,Local playlist more,local-playlist,13,24,female,9,0.021068,465,BIGBANG TAEYANG,TEDDY| DEE.P| Rebecca Johnson,TEDDY| TAEYANG,31,眼| 鼻| 口 (Eyes| Nose| Lips),190019,17259,2011,5,25,2017,9,11,6,1,3,2,1,8188
7,explore,Explore,online-playlist,1,0,missing,7,0.016564,1011,Thomas Rhett,Thomas Rhett| Rhett Akins| Ben Hayslip,missing,52,Star Of The Show,221827,8158,2012,1,2,2017,10,5,5,1,3,0,1,1172
8,my library,Local playlist more,local-library,15,26,male,9,0.025582,2022,OneRepublic,Ryan Tedder,missing,52,Dreaming Out Loud,9180,28058,2011,11,7,2018,3,4,7,1,1,0,1,9211
9,my library,Local playlist more,local-library,15,26,male,9,0.023592,465,OneRepublic,Ryan Tedder,missing,52,Counting Stars,93827,28058,2011,11,7,2018,3,4,7,1,1,0,1,9211


In [47]:
refined_train_X.dtypes

source_system_tab          category
source_screen_name         category
source_type                category
city                       category
bd                            int64
gender                     category
registered_via             category
song_length                 float64
genre_ids                  category
artist_name                category
composer                   category
lyricist                   category
language                   category
name                       category
unique_song_id                int32
unique_user_id                int16
registration_init_year        int64
registration_init_month       int64
registration_init_date        int64
expiration_year               int64
expiration_month              int64
expiration_date_              int64
age_of_account                int64
genre_count                   int64
composer_count                int64
lyricist_count                int64
artist_count                  int64
ranked_artist               