In [1]:
%matplotlib inline
import pandas as pd 

from sklearn import metrics

import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

sns.set_style('whitegrid')

In [3]:
tracks = pd.read_csv('../../data/processed/tracks_processed.csv', dtype=object)
tracks.head()

Unnamed: 0.1,Unnamed: 0,track_id,album_date_created,album_date_released,album_favorites,album_id,album_listens,album_tags,album_title,album_tracks,...,track_language_code,track_listens,track_number,track_title,track_year_created,bit_rate_factor,listens_factor,interest_factor,track_price,track_length
0,0,2,2008-11-26 1:44,2009-01-05 0:00,4,1,6073,[],AWOL - A Way Of Life,7,...,en,1293,3,Food,2008,0.7,2,1.0,3.49,less than 3 minutes
1,1,3,2008-11-26 1:44,2009-01-05 0:00,4,1,6073,[],AWOL - A Way Of Life,7,...,en,514,4,Electric Ave,2008,0.7,3,0.1,0.35,between 3 and 5 minutes
2,2,5,2008-11-26 1:44,2009-01-05 0:00,4,1,6073,[],AWOL - A Way Of Life,7,...,en,1151,6,This World,2008,0.7,2,0.7,2.45,between 3 and 5 minutes
3,3,10,2008-11-26 1:45,2008-02-06 0:00,4,6,47632,[],Constant Hitmaker,2,...,en,50135,1,Freeway,2008,0.5,1,1.0,2.5,less than 3 minutes
4,4,20,2008-11-26 1:45,2009-01-06 0:00,2,4,2710,[],Niris,13,...,en,361,3,Spiritual Level,2008,0.7,4,0.1,0.35,between 5 and 7 minutes


# Feature Engineering

In [4]:
X_columns=[tracks.columns.get_loc(c) for c in tracks.columns if c in ["track_listens", "track_year_created", "artist_id"]]
y_column = tracks.columns.get_loc('interest_factor')

# Model Training

In [6]:
# split the data

threshold = 0.8
absolute_threshold = int(len(tracks)*threshold)

X_train = tracks.ix[:absolute_threshold, X_columns]
y_train = tracks.ix[:absolute_threshold, y_column]

X_test = tracks.ix[absolute_threshold:, X_columns]
y_test = tracks.ix[absolute_threshold:, y_column]

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (84720, 3)
y_train (84720,)
X_test (21180, 3)
y_test (21180,)


In [7]:
models = [
    ('Naive Bayes', GaussianNB()),
    ('RandomForestClassifier10', RandomForestClassifier(n_estimators=10)),
    ('RandomForestClassifier100', RandomForestClassifier(n_estimators=100)),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('DecisionTreeClassifier', DecisionTreeClassifier())
]
results = []
for m in models:
    print('MODEL', m[0])
    model = m[1]
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred, average = 'micro')
    recall = recall_score(y_test, y_pred, average = 'micro')
    print(confusion_matrix(y_test, y_pred))
    print('Precision', precision)
    print('Recall', recall)
    results.append([m[0], precision, recall])
    
    # if there is a feature importance, print top 5
    importance = []
    if hasattr(model, 'feature_importances_'):
        print('Feature Importance')
        importance = []
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.feature_importances_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head())
    elif hasattr(model, 'coef_'):
        print('Feature Importance')
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.coef_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head())
        
    print('')

# sort the results and print as a table
df_results = pd.DataFrame(results)
df_results.columns = ['model', 'precision', 'recall']
df_results = df_results.sort_values(by='precision', ascending=False)
df_results

MODEL Naive Bayes
[[1606 1960   22 1889    0    0]
 [   1 2191 1154    0    0    0]
 [1823   41  730    0    0    0]
 [  14    2    4 2934   94    0]
 [   1    1    0  186  401   95]
 [  37   11   14   97  177 5695]]
Precision 0.6400849858356941
Recall 0.6400849858356941

MODEL RandomForestClassifier10
[[3958  818  143  557    0    1]
 [  33 3055  257    0    0    1]
 [ 504  248 1751   90    0    1]
 [  90    3    6 2655  233   61]
 [   1    1    0  205  234  243]
 [  49   19   14   84  109 5756]]
Precision 0.8219546742209631
Recall 0.8219546742209631
Feature Importance
    0         1
1  41  0.693387
0  18  0.294504
2  44  0.012109

MODEL RandomForestClassifier100
[[3846 1003   90  536    0    2]
 [  26 3038  282    0    0    0]
 [ 627  225 1742    0    0    0]
 [  97    3    5 2775  140   28]
 [   1    1    0  194  326  162]
 [  53   18   13   80  152 5715]]
Precision 0.8235127478753541
Recall 0.8235127478753541
Feature Importance
    0         1
1  41  0.685391
0  18  0.297966
2  44

Unnamed: 0,model,precision,recall
2,RandomForestClassifier100,0.823513,0.823513
1,RandomForestClassifier10,0.821955,0.821955
4,DecisionTreeClassifier,0.810104,0.810104
3,KNeighborsClassifier,0.728234,0.728234
0,Naive Bayes,0.640085,0.640085
