In [1]:
%matplotlib inline
import pandas as pd 

from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

import seaborn as sns

sns.set_style('whitegrid')

In [2]:
tracks = pd.read_csv('../../data/processed/tracks_processed.csv', dtype=object)
tracks.head()

Unnamed: 0.1,Unnamed: 0,track_id,album_date_created,album_date_released,album_favorites,album_id,album_listens,album_tags,album_title,album_tracks,...,track_language_code,track_listens,track_number,track_title,track_year_created,bit_rate_factor,listens_factor,interest_factor,track_price,track_length
0,0,2,2008-11-26 1:44,2009-01-05 0:00,4,1,6073,[],AWOL - A Way Of Life,7,...,en,1293,3,Food,2008,0.7,2,1.0,3.49,less than 3 minutes
1,1,3,2008-11-26 1:44,2009-01-05 0:00,4,1,6073,[],AWOL - A Way Of Life,7,...,en,514,4,Electric Ave,2008,0.7,3,0.1,0.35,between 3 and 5 minutes
2,2,5,2008-11-26 1:44,2009-01-05 0:00,4,1,6073,[],AWOL - A Way Of Life,7,...,en,1151,6,This World,2008,0.7,2,0.7,2.45,between 3 and 5 minutes
3,3,10,2008-11-26 1:45,2008-02-06 0:00,4,6,47632,[],Constant Hitmaker,2,...,en,50135,1,Freeway,2008,0.5,1,1.0,2.5,less than 3 minutes
4,4,20,2008-11-26 1:45,2009-01-06 0:00,2,4,2710,[],Niris,13,...,en,361,3,Spiritual Level,2008,0.7,4,0.1,0.35,between 5 and 7 minutes


# Feature Engineering

In [3]:
# Modeling based on following features: "album_listens","track_bit_rate","artist_id", "track_listens"
X_columns=[tracks.columns.get_loc(c) for c in tracks.columns if c in ["album_listens","track_bit_rate","artist_id", "track_listens"]]
y_column = tracks.columns.get_loc('track_interest')

# Model Training #1

In [5]:
# split the data

threshold = 0.8
absolute_threshold = int(len(tracks)*threshold)

X_train = tracks.ix[:absolute_threshold, X_columns]
y_train = tracks.ix[:absolute_threshold, y_column]

X_test = tracks.ix[absolute_threshold:, X_columns]
y_test = tracks.ix[absolute_threshold:, y_column]

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (84720, 4)
y_train (84720,)
X_test (21180, 4)
y_test (21180,)


In [6]:
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [51]:
# Model Accuracy for a model based on following features: "album_listens","track_bit_rate","artist_id", "track_listens"
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, y_pred)))

0.02%



# Feature Engineering #2 based on track_bit_rate and track_listens

In [44]:
# Modeling based on following features: "track_bit_rate","track_listens"
X_columns=[tracks.columns.get_loc(c) for c in tracks.columns if c in ["track_bit_rate","track_listens"]]

# Model Training #2 

In [45]:
# split the data
X_train = tracks.ix[:absolute_threshold, X_columns]
y_train = tracks.ix[:absolute_threshold, y_column]

X_test = tracks.ix[absolute_threshold:, X_columns]
y_test = tracks.ix[absolute_threshold:, y_column]

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (84720, 2)
y_train (84720,)
X_test (21180, 2)
y_test (21180,)


In [46]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [47]:
# Model accuracy for a one based on following features: "track_bit_rate","track_listens"
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, y_pred)))

0.05%



# Feature Engineering #3 based on Track_listens

In [40]:
# Modeling based on following features: track_listens"
X_columns=[tracks.columns.get_loc(c) for c in tracks.columns if c in ["track_listens"]]

# Model Training #3

In [41]:
X_train = tracks.ix[:absolute_threshold, X_columns]
y_train = tracks.ix[:absolute_threshold, y_column]

X_test = tracks.ix[absolute_threshold:, X_columns]
y_test = tracks.ix[absolute_threshold:, y_column]

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (84720, 1)
y_train (84720,)
X_test (21180, 1)
y_test (21180,)


In [42]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [43]:
# Modeling based on track_listens
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, y_pred)))

0.13%



# Feature Engineering #4 prediction based on track_listens and track_year_created

In [60]:
# Modeling based on following features: "track_year_created","track_listens"
X_columns=[tracks.columns.get_loc(c) for c in tracks.columns if c in ["track_listens", "track_year_created"]]

# Model Training #4

In [62]:
X_train = tracks.ix[:absolute_threshold, X_columns]
y_train = tracks.ix[:absolute_threshold, y_column]

X_test = tracks.ix[absolute_threshold:, X_columns]
y_test = tracks.ix[absolute_threshold:, y_column]

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (84720, 2)
y_train (84720,)
X_test (21180, 2)
y_test (21180,)


In [63]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [64]:
# Model accuracy for a one based on following features: "track_year_created","track_listens"
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, y_pred)))

0.18%



# Feature Engineering #5 prediction based on bit_rate_facor 

In [53]:
# Modeling based on bit_rate_factor
X_columns=[tracks.columns.get_loc(c) for c in tracks.columns if c in ["bit_rate_factor"]]

# Model Training #5

In [54]:
X_train = tracks.ix[:absolute_threshold, X_columns]
y_train = tracks.ix[:absolute_threshold, y_column]

X_test = tracks.ix[absolute_threshold:, X_columns]
y_test = tracks.ix[absolute_threshold:, y_column]

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (84720, 1)
y_train (84720,)
X_test (21180, 1)
y_test (21180,)


In [55]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [56]:
# Model accuracy for a one based on bit_rate_factor
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, y_pred)))

0.00%

