In [1]:
import pandas as pd 

from sklearn import metrics

import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

sns.set_style('whitegrid')

In [2]:
tracks = pd.read_csv('../../data/processed/tracks_processed.csv', dtype=object)
tracks.head()

Unnamed: 0.1,Unnamed: 0,track_id,album_date_created,album_date_released,album_favorites,album_id,album_listens,album_tags,album_title,album_tracks,...,track_genres_all,track_interest,track_language_code,track_listens,track_number,track_title,track_year_created,bit_rate_factor,interest_factor,track_price
0,0,2,2008-11-26 1:44,2009-01-05 0:00,4,1,6073,[],AWOL - A Way Of Life,7,...,[21],4656,en,1293,3,Food,2008,0.57,1.0,1.7
1,1,3,2008-11-26 1:44,2009-01-05 0:00,4,1,6073,[],AWOL - A Way Of Life,7,...,[21],1470,en,514,4,Electric Ave,2008,0.57,1.0,1.7
2,2,5,2008-11-26 1:44,2009-01-05 0:00,4,1,6073,[],AWOL - A Way Of Life,7,...,[21],1933,en,1151,6,This World,2008,0.57,1.0,1.7
3,3,10,2008-11-26 1:45,2008-02-06 0:00,4,6,47632,[],Constant Hitmaker,2,...,[10],54881,en,50135,1,Freeway,2008,0.43,1.0,1.29
4,4,20,2008-11-26 1:45,2009-01-06 0:00,2,4,2710,[],Niris,13,...,"[17, 10, 76, 103]",978,en,361,3,Spiritual Level,2008,0.57,1.0,1.7


# Feature Engineering

In [5]:
X_columns=[tracks.columns.get_loc(c) for c in tracks.columns if c in ["track_listens"]]
y_column = tracks.columns.get_loc('interest_factor')

# Model Training

In [6]:
# split the data

threshold = 0.8
absolute_threshold = int(len(tracks)*threshold)

X_train = tracks.ix[:absolute_threshold, X_columns]
y_train = tracks.ix[:absolute_threshold, y_column]

X_test = tracks.ix[absolute_threshold:, X_columns]
y_test = tracks.ix[absolute_threshold:, y_column]

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (84720, 1)
y_train (84720,)
X_test (21180, 1)
y_test (21180,)


In [8]:
models = [
    ('Naive Bayes', GaussianNB()),
   # ('RandomForestClassifier10', RandomForestClassifier(n_estimators=10)),
    ('RandomForestClassifier100', RandomForestClassifier(n_estimators=100)),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('DecisionTreeClassifier', DecisionTreeClassifier())
]
results = []
for m in models:
    print('MODEL', m[0])
    model = m[1]
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred, average = 'micro')
    recall = recall_score(y_test, y_pred, average = 'micro')
    print(confusion_matrix(y_test, y_pred))
    print('Precision', precision)
    print('Recall', recall)
    results.append([m[0], precision, recall])
    
    # if there is a feature importance, print top 5
    importance = []
    if hasattr(model, 'feature_importances_'):
        print('Feature Importance')
        importance = []
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.feature_importances_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head())
    elif hasattr(model, 'coef_'):
        print('Feature Importance')
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.coef_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head())
        
    print('')

# sort the results and print as a table
df_results = pd.DataFrame(results)
df_results.columns = ['model', 'precision', 'recall']
df_results = df_results.sort_values(by='precision', ascending=False)
df_results

MODEL Naive Bayes
[[  593  1361     0     0]
 [    8  2307  1031     0]
 [    0    50  2532    12]
 [    1    20   600 12665]]
Precision 0.854438149197356
Recall 0.854438149197356

MODEL RandomForestClassifier100
[[  708  1204     0    42]
 [    5  2469   791    81]
 [    0    64   928  1602]
 [    0    28    38 13220]]
Precision 0.8179886685552408
Recall 0.8179886685552408
Feature Importance
    0    1
0  41  1.0

MODEL KNeighborsClassifier
[[  839   986    70    59]
 [   32  2291   733   290]
 [    1    88   942  1563]
 [    0    27    53 13206]]
Precision 0.8157695939565628
Recall 0.8157695939565628

MODEL DecisionTreeClassifier
[[  720  1207     0    27]
 [    5  2487   774    80]
 [    0    64   997  1533]
 [    0    28    39 13219]]
Precision 0.8226156751652502
Recall 0.8226156751652502
Feature Importance
    0    1
0  41  1.0



Unnamed: 0,model,precision,recall
0,Naive Bayes,0.854438,0.854438
3,DecisionTreeClassifier,0.822616,0.822616
1,RandomForestClassifier100,0.817989,0.817989
2,KNeighborsClassifier,0.81577,0.81577
