In [1]:
import pandas as pd 

from sklearn import metrics

import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

sns.set_style('whitegrid')

In [2]:
tracks = pd.read_csv('../../data/processed/tracks_processed.csv', dtype=object)
tracks.head()

Unnamed: 0.1,Unnamed: 0,track_id,album_date_created,album_date_released,album_favorites,album_id,album_listens,album_tags,album_title,album_tracks,...,track_language_code,track_listens,track_number,track_title,track_year_created,bit_rate_factor,listens_factor,interest_factor,track_price,track_length
0,0,2,2008-11-26 1:44,2009-01-05 0:00,4,1,6073,[],AWOL - A Way Of Life,7,...,en,1293,3,Food,2008,0.7,2,1.0,3.49,less than 3 minutes
1,1,3,2008-11-26 1:44,2009-01-05 0:00,4,1,6073,[],AWOL - A Way Of Life,7,...,en,514,4,Electric Ave,2008,0.7,3,0.1,0.35,between 3 and 5 minutes
2,2,5,2008-11-26 1:44,2009-01-05 0:00,4,1,6073,[],AWOL - A Way Of Life,7,...,en,1151,6,This World,2008,0.7,2,0.7,2.45,between 3 and 5 minutes
3,3,10,2008-11-26 1:45,2008-02-06 0:00,4,6,47632,[],Constant Hitmaker,2,...,en,50135,1,Freeway,2008,0.5,1,1.0,2.5,less than 3 minutes
4,4,20,2008-11-26 1:45,2009-01-06 0:00,2,4,2710,[],Niris,13,...,en,361,3,Spiritual Level,2008,0.7,4,0.1,0.35,between 5 and 7 minutes


# Feature Engineering

In [4]:
X_columns=[tracks.columns.get_loc(c) for c in tracks.columns if c in ["track_listens"]]
y_column = tracks.columns.get_loc('interest_factor')

# Model Training

In [6]:
# split the data

threshold = 0.8
absolute_threshold = int(len(tracks)*threshold)

X_train = tracks.ix[:absolute_threshold, X_columns]
y_train = tracks.ix[:absolute_threshold, y_column]

X_test = tracks.ix[absolute_threshold:, X_columns]
y_test = tracks.ix[absolute_threshold:, y_column]

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (84720, 1)
y_train (84720,)
X_test (21180, 1)
y_test (21180,)


In [8]:
models = [
    ('Naive Bayes', GaussianNB()),
    ('RandomForestClassifier10', RandomForestClassifier(n_estimators=10)),
    ('RandomForestClassifier100', RandomForestClassifier(n_estimators=100)),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('DecisionTreeClassifier', DecisionTreeClassifier())
]
results = []
for m in models:
    print('MODEL', m[0])
    model = m[1]
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred, average = 'micro')
    recall = recall_score(y_test, y_pred, average = 'micro')
    print(confusion_matrix(y_test, y_pred))
    print('Precision', precision)
    print('Recall', recall)
    results.append([m[0], precision, recall])
    
    # if there is a feature importance, print top 5
    importance = []
    if hasattr(model, 'feature_importances_'):
        print('Feature Importance')
        importance = []
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.feature_importances_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head())
    elif hasattr(model, 'coef_'):
        print('Feature Importance')
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.coef_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head())
        
    print('')

# sort the results and print as a table
df_results = pd.DataFrame(results)
df_results.columns = ['model', 'precision', 'recall']
df_results = df_results.sort_values(by='precision', ascending=False)
df_results

MODEL Naive Bayes
[[2166 1961   27 1323    0    0]
 [   0 2315 1031    0    0    0]
 [1581   50  963    0    0    0]
 [  20    2    4 2975   47    0]
 [   1    1    0  234  444    4]
 [  44   11   15  102  405 5454]]
Precision 0.6759678942398489
Recall 0.6759678942398489

MODEL RandomForestClassifier10
[[3360 1105   63  946    0    3]
 [  30 2496  820    0    0    0]
 [1122   66 1386   20    0    0]
 [  49    2    5 2297  150  545]
 [   1    1    0   51   91  540]
 [  41   14   14   68   34 5860]]
Precision 0.7313503305004722
Recall 0.7313503305004722
Feature Importance
    0    1
0  41  1.0

MODEL RandomForestClassifier100
[[3448 1071   51  905    0    2]
 [   8 2488  850    0    0    0]
 [1088   66 1426   14    0    0]
 [  52    3    4 2355  134  500]
 [   1    1    0   48   89  545]
 [  41   14   16   68   28 5864]]
Precision 0.7398489140698773
Recall 0.7398489140698773
Feature Importance
    0    1
0  41  1.0

MODEL KNeighborsClassifier
[[3435  949   61  979   21   32]
 [ 647 2085 

Unnamed: 0,model,precision,recall
4,DecisionTreeClassifier,0.74594,0.74594
2,RandomForestClassifier100,0.739849,0.739849
1,RandomForestClassifier10,0.73135,0.73135
3,KNeighborsClassifier,0.697545,0.697545
0,Naive Bayes,0.675968,0.675968
