In [330]:
# To manipulate data set
import pandas as pd

# For mathematical computations
import numpy as np

# For preprocessing the data
from sklearn.preprocessing import Imputer
from sklearn import preprocessing

# To split the dataset into train and test datasets
from sklearn.model_selection import train_test_split

# Gaussian Navie Bayes classifier
from sklearn.naive_bayes import GaussianNB

# Measure the accuracy score of the model
from sklearn.metrics import accuracy_score

# time functions
from datetime import datetime
fromtimestamp = datetime.fromtimestamp

<h3>Reading the data set with pandas</h3>

In [331]:
train = pd.read_csv("data/train_sample_0.csv")
train.head(3)

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,is_listened
0,0,1478720068,127246869,13420269,0,20160624,0,2,184,0,0,9786,5992092,21,1
1,107695,1480179075,11927605,1093242,12,20081001,0,0,5626,0,0,10846,1313280,23,1
2,0,1478268865,130856006,13890672,0,20160826,0,0,186,0,1,431,5885427,25,1


In [332]:
test = pd.read_csv("data/test.csv")
test.head(3)

Unnamed: 0,sample_id,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age
0,0,50,1478104371,683078,82356,1,20021008,0,0,542,1,0,17698,2076,30
1,1,2744,1479317140,876497,99692,1,19851231,0,0,307,1,0,10525,26,28
2,2,2744,1479546361,876497,99692,1,19851231,0,0,307,1,0,8716,26,27


<h3>Preprocessing</h3>

In [333]:
def add_times2categorical(dataframe):
    """Converts timestamp columns to categorical columns"""
    # dataframe['year'] = dataframe['ts_listen'].apply(lambda x: fromtimestamp(x).year)
    # dataframe['month'] = dataframe['ts_listen'].apply(lambda x: fromtimestamp(x).month)
    # dataframe['day'] = dataframe['ts_listen'].apply(lambda x: fromtimestamp(x).day)
    dataframe['hour'] = dataframe['ts_listen'].apply(lambda x: fromtimestamp(x).hour)
    # dataframe['minute'] = dataframe['ts_listen'].apply(lambda x: fromtimestamp(x).minute)
    dataframe['weekday'] = dataframe['ts_listen'].apply(lambda x: fromtimestamp(x).isoweekday())
    dataframe['is_weekend'] = dataframe['ts_listen'].apply(lambda x: 1 if fromtimestamp(x).isoweekday() > 5 else 0)
    return dataframe

def add_releaseyear(dataframe):
    """extract release year from release date"""
    dataframe['release_year'] = dataframe['release_date'].apply(lambda x: str(x)[:4])
    return dataframe

train = add_times2categorical(train)
train = add_releaseyear(train)

test = add_times2categorical(test)
test = add_releaseyear(test)

train.head(3)

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,is_listened,hour,weekday,is_weekend,release_year
0,0,1478720068,127246869,13420269,0,20160624,0,2,184,0,0,9786,5992092,21,1,20,3,0,2016
1,107695,1480179075,11927605,1093242,12,20081001,0,0,5626,0,0,10846,1313280,23,1,17,6,1,2008
2,0,1478268865,130856006,13890672,0,20160826,0,0,186,0,1,431,5885427,25,1,15,5,0,2016


In [334]:
train.isnull().sum()

genre_id           0
ts_listen          0
media_id           0
album_id           0
context_type       0
release_date       0
platform_name      0
platform_family    0
media_duration     0
listen_type        0
user_gender        0
user_id            0
artist_id          0
user_age           0
is_listened        0
hour               0
weekday            0
is_weekend         0
release_year       0
dtype: int64

In [335]:
#for col in ['context_type', 'platform_name','platform_family','listen_type', 'user_gender']:
#    print(col,":", sum(train[col] == 0))

In [336]:
train_cpy = train
test_cpy = test

<h3>One-Hot Encoder</h3>

Transform categorical features into continuous

In [337]:
encoder = preprocessing.LabelEncoder()
release_year_cat = encoder.fit_transform(train.release_year)

train_cpy['release_year_cat'] = release_year_cat

del_cols = ['release_year']
train_cpy = train_cpy.drop(del_cols, axis = 1)

train_cpy.head(3)

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,is_listened,hour,weekday,is_weekend,release_year_cat
0,0,1478720068,127246869,13420269,0,20160624,0,2,184,0,0,9786,5992092,21,1,20,3,0,72
1,107695,1480179075,11927605,1093242,12,20081001,0,0,5626,0,0,10846,1313280,23,1,17,6,1,64
2,0,1478268865,130856006,13890672,0,20160826,0,0,186,0,1,431,5885427,25,1,15,5,0,72


In [338]:
encoder = preprocessing.LabelEncoder()
release_year_cat = encoder.fit_transform(test.release_year)

test_cpy['release_year_cat'] = release_year_cat

del_cols = ['sample_id','release_year']
test_cpy = test_cpy.drop(del_cols, axis = 1)

test_cpy.head(3)

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,hour,weekday,is_weekend,release_year_cat
0,50,1478104371,683078,82356,1,20021008,0,0,542,1,0,17698,2076,30,17,3,0,48
1,2744,1479317140,876497,99692,1,19851231,0,0,307,1,0,10525,26,28,18,3,0,31
2,2744,1479546361,876497,99692,1,19851231,0,0,307,1,0,8716,26,27,10,6,1,31


<h3>Standarization / Normalization</h3>

${x}_i=\frac{{x}_i - mean()}{\sigma(x)}$

In [339]:
cols = ['genre_id','ts_listen','media_id','album_id',
                'context_type','release_date','platform_name',
                'platform_family','media_duration','listen_type',
                'user_gender','user_id','artist_id','user_age', 
                'hour','weekday','is_weekend', 'release_year_cat']

scaled_features = {}
for col in cols:
    mean, std = train_cpy[col].mean(), train_cpy[col].std()
    scaled_features[col] = [mean, std]
    train_cpy.loc[:, col] = (train_cpy[col] - mean)/std
    
train_cpy.head(5)

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,is_listened,hour,weekday,is_weekend,release_year_cat
0,-0.170663,-0.020751,0.950496,0.966623,-0.50728,0.628626,-0.64408,3.179157,-0.660236,-0.666515,-0.806853,1.452019,1.788029,-0.878669,1,1.066268,-0.485467,-0.585072,0.646845
1,8.045596,0.091537,-1.294661,-1.289886,2.079592,-0.440212,-0.64408,-0.467175,75.815309,-0.666515,-0.806853,1.719911,-0.073349,-0.3487,1,0.544191,1.085855,1.709184,-0.459247
2,-0.170663,-0.055477,1.020762,1.052732,-0.50728,0.631338,-0.64408,-0.467175,-0.63213,-0.666515,1.239378,-0.912253,1.745594,0.181268,1,0.196139,0.562081,-0.585072,0.646845
3,-0.170663,-0.067786,0.996938,1.022671,-0.50728,0.631056,0.71514,-0.467175,-0.196492,-0.666515,-0.806853,3.237543,-0.403757,-1.408637,1,0.718216,-0.485467,-0.585072,0.646845
4,-0.169595,0.03356,-0.095345,-0.164316,-0.50728,-1.780935,0.71514,-0.467175,0.520202,-0.666515,-0.806853,-0.892793,-0.529337,0.976221,1,-2.414246,0.562081,-0.585072,-1.841861


In [340]:
scaled_features = {}
for col in cols:
    mean, std = test_cpy[col].mean(), test_cpy[col].std()
    scaled_features[col] = [mean, std]
    test_cpy.loc[:, col] = (test_cpy[col] - mean)/std
    
test_cpy.head(5)

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,hour,weekday,is_weekend,release_year_cat
0,-0.16357,-1.760345,-1.459542,-1.421649,-0.342558,-1.272473,-0.70935,-0.355778,4.405334,0.007086,-0.874262,1.346006,-0.574872,1.586467,0.449219,-0.53125,-0.57034,-1.315047
1,0.084402,-0.324646,-1.455687,-1.418385,-0.342558,-3.634592,-0.70935,-0.355778,1.055208,0.007086,-0.874262,0.098522,-0.575718,1.070645,0.63081,-0.53125,-0.57034,-3.747568
2,0.084402,-0.05329,-1.455687,-1.418385,-0.342558,-3.634592,-0.70935,-0.355778,1.055208,0.007086,-0.874262,-0.216088,-0.575718,0.812734,-0.82192,1.111319,1.753251,-3.747568
3,0.084402,-1.342033,-1.455687,-1.418385,-0.342558,-3.634592,2.482885,1.63143,0.456462,0.007086,-0.874262,-0.785308,-0.575718,1.586467,0.812402,1.658842,1.753251,-3.747568
4,0.084402,1.01475,-1.455687,-1.418385,-0.342558,-3.634592,2.482885,1.63143,1.753745,0.007086,-0.874262,-0.410176,-0.575718,1.328556,0.993993,-1.078773,-0.57034,-3.747568


<h3>Preparing Trainig and Test data sets</h3>

'context_type 4 (need processioning)', 'platform_name 6 (need processioning)','platform_family 7(improved)','listen_type 9(improved)', 'user_gender 10'(improved), 'user_age 13 (improved)', 'is_weekend' 17 decreased slightly, weekday 16 (nothing), hour 15 (in test is column 14), release_year_cat 18 (decreases slightly) (col 17 in test)


In [341]:
#features = train_cpy.values[:,:14]

features_train = train_cpy.values[:,(7,9,10,13,15)]
target_train = train_cpy.values[:,14]

#features_train, features_test, target_train, target_test = train_test_split(features,
#                                                                   target, test_size = 0.33, random_state = 10)

features_test = test_cpy.values[:,(7,9,10,13,14)]

In [342]:
features_train

array([[ 3.17915714, -0.66651463, -0.80685342, -0.87866861,  1.06626778],
       [-0.46717542, -0.66651463, -0.80685342, -0.34870016,  0.54419078],
       [-0.46717542, -0.66651463,  1.23937753,  0.18126829,  0.19613945],
       ..., 
       [-0.46717542, -0.66651463, -0.80685342, -1.67362129, -0.84801454],
       [-0.46717542, -0.66651463, -0.80685342, -1.40863707,  1.24029344],
       [-0.46717542, -0.66651463, -0.80685342,  1.24120519, -0.15191188]])

<h3>Gaussian Naive Bayes</h3>

In [343]:
clf = GaussianNB()
clf.fit(features_train, target_train)

prediction = clf.predict(features_test)

<h3>Accuracy</h3>

In [344]:
#accuracy_score(target_test, prediction, normalize = True)

In [345]:
df = pd.DataFrame(prediction)
df.to_csv("gaussian_naive_bayes_pred.csv")