In [None]:
"""
The training dataset contains 30755 unique users who contribute to the entire training data of 
song listens. And the test set has 25131 unique users who contribute to the entiriety of the test set.

This goes on to show the importance of user bias in this model. To predict if a known user would listen to 
a song again, we cant ignore the other songs he/she has liked and re-listened before and would need to 
factor that in as well.

This makes the problem a good fit for a latent factor model where we take user and product bias into account,
to make a decision.


However, lets first start with a simpler model by removing the msno(user id) column. This way, we would be
predicting if a song is trendy and is good for a relisten by any user, not just the user in question. 
Since the user bias would not be taken into account, we would just predict if a song is probable for a 
relisten.

"""

In [204]:
%matplotlib inline
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [205]:
def read_data(file_path):
    return pd.read_csv(file_path)

In [206]:
def get_merged_df_with_user_song_info(df, user_data_df, song_data_df, song_aux_data_df):
    """
    Join all the dataframes(on matching columns) to form a single merged dataframe with all the user and song 
    information
    """
    agg_user_df = pd.merge(left=df, right=user_data_df, how='left')
    
    agg_user_song_df = pd.merge(left=agg_user_df, right=song_data_df, how='left')
    
    merged_df = pd.merge(left=agg_user_song_df, right=song_aux_data_df, how='left')
    
    return merged_df

In [164]:
def data_massaging(df):
    
    # drop the msno(user id) field for the current, user-agnostic, simpler model
    df = df.drop('msno', 1)    
    df = df.drop('id', 1) if 'id' in df.columns else df
    
    
    # missing data
    string_missing_value_columns = [
        'lyricist', 'gender', 'composer', 'isrc', 'source_screen_name', 'genre_ids',
        'source_type', 'source_system_tab', 'name', 'language', 'artist_name'
    ]
    numeric_missing_value_columns = ['song_length']
    for column in string_missing_value_columns:
        df[column] = df[column].fillna('missing')
    for column in numeric_missing_value_columns:
        df[column] = df[column].fillna(df[column].median())
    
    
    # fix dtypes
    categorical_columns = [
        'song_id', 'source_system_tab', 'source_screen_name', 'source_type', 'city', 'gender', 'isrc', 
        'registered_via', 'genre_ids', 'artist_name', 'composer', 'lyricist', 'language', 'name'
    ]
    for column in categorical_columns:
        df[column] = df[column].astype('category')
        
    return df

In [153]:
def generate_dummies_for_data(train_X, test_X):
    limit = train_X.shape[0]
    
    df = pd.concat([train_X, test_X])
    dummy_df = pd.get_dummies(df)
    
    dummyfied_train_X = dummy_df.iloc[:, :limit + 1]
    dummyfied_test_X = dummy_df.iloc[:, limit + 1:]
    
    assert dummyfied_train_X.shape[0] == train_X.shape[0]
    assert dummyfied_test_X.shape[0] == test_X.shape[0]
    
    return dummyfied_train_X, dummyfied_test_X

In [150]:
def xgb_classifier(train_X, train_y, test_X, **kwargs):    
    lm = xgb.XGBClassifier(
        reg_alpha=kwargs['reg_alpha'],
        colsample_bytree=kwargs['colsample_bytree'],
        learning_rate=kwargs['learning_rate'],
        min_child_weight=kwargs['min_child_weight'],
        n_estimators=kwargs['n_estimators'],
        subsample=kwargs['subsample'],
        max_depth=kwargs['max_depth'],
        gamma=kwargs['gamma']
    )
    lm.fit(train_X, train_y)
    predicted_test_y = lm.predict(test_X)
    return lm, predicted_test_y, train_X, test_X

In [47]:
training_data = 'train.csv'
test_data = 'test.csv'
song_data = 'songs.csv'
user_data = 'members.csv'
song_aux_data = 'song_extra_info.csv'

In [84]:
# read data
train_X = read_data(training_data)
test_X = read_data(test_data)

song_data_df = read_data(song_data)
user_data_df = read_data(user_data)
song_aux_data_df = read_data(song_aux_data)

In [112]:
train_merged_df = get_merged_df_with_user_song_info(
    train_X,
    user_data_df,
    song_data_df,song_aux_data_df
)

test_merged_df = get_merged_df_with_user_song_info(
    test_X,
    user_data_df,
    song_data_df,song_aux_data_df
)

In [165]:
refined_train_X = data_massaging(train_merged_df)
refined_test_X = data_massaging(test_merged_df)

refined_train_y = pd.DataFrame(refined_train_X['target'])
refined_train_X = refined_train_X.drop('target', axis=1)

In [None]:
refined_train_X, refined_test_X = generate_dummies_for_data(
    refined_train_X,
    refined_test_X
)

In [183]:
pd.concat([refined_train_X, refined_test_X])

ValueError: incompatible categories in categorical concat

In [143]:
# split the training data into train and validation sets into 2/3 and 1/3
ref_train_X, ref_val_X, ref_train_y, ref_val_y = train_test_split(
    refined_train_X,
    refined_train_y,
    test_size=0.33,
    random_state=42,
)

In [181]:
xgb_parameters = dict(
     colsample_bytree=0.2,
     gamma=0.0,
     learning_rate=0.01,
     max_depth=4,
     min_child_weight=1.5,
     n_estimators=7200,                                                                  
     reg_alpha=0.9,
     reg_lambda=0.6,
     subsample=0.2,
     seed=42,
     silent=1
)

model, predicted_val_y, _, _ = xgb_classifier(
    ref_train_X,
    ref_train_y,
    ref_val_X,
    **xgb_parameters
)
print "Mean squared error: %.2f" % np.sqrt(mean_squared_error(ref_val_y, predicted_val_y))
print "Variance score: %.2f" % r2_score(ref_val_y, predicted_val_y)

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields song_id, source_system_tab, source_screen_name, source_type, city, gender, registered_via, genre_ids, artist_name, composer, lyricist, language, name, isrc

In [203]:
refined_train_X.song_id.dtype

category

In [202]:
refined_train_X['song_id'].append(refined_test_X['song_id'])

ValueError: incompatible categories in categorical concat

In [199]:
refined_train_X.reset_index(drop=True, inplace=True)
refined_test_X.reset_index(drop=True, inplace=True)


dataset = pd.concat([refined_train_X, refined_test_X], axis=0)

ValueError: incompatible categories in categorical concat

In [187]:
pd.concat([refined_train_X, refined_test_X], ignore_index=True)

ValueError: incompatible categories in categorical concat