In [58]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from dmba import classificationSummary
from sklearn import preprocessing

In [2]:
tracks = pd.read_csv('tracks.csv')

In [3]:
#create a target variable "charted"

mask = tracks['chart_status'] > 0
tracks['charted'] = np.where(mask, 1, 0)

In [4]:
# define predictors and target

predictors = ['danceability', 'energy', 'key', 'loudness' , 'mode' , 'speechiness', 'acousticness', 
              'instrumentalness', 'liveness', 'valence','tempo', 'time_signature', 'artist_lifetime_wins', 
              'artist_lifetime_nominations', 'artist_lifetime_releases', 'recording_award_nominee', 
              'recording_award_winner', 'album_award_nominee', 'album_award_winner', 
              'artist_lifetime_chart_months', 'artist_chart_tracks','artist_chart_peak', 
              'artist_chart_months_recently']


In [5]:
#Fill NA values with zero

tracks.fillna(0, inplace = True)

In [13]:
#define x and y

X = tracks[predictors]
y = tracks['charted']

In [14]:
#fit logistic regression model

train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.2, random_state=23)

logit_reg = LogisticRegression(penalty="l2", C=1e42, solver='liblinear')
logit_reg.fit(train_X, train_y)

LogisticRegression(C=1e+42, solver='liblinear')

In [16]:
classificationSummary(train_y, logit_reg.predict(train_X))

Confusion Matrix (Accuracy 0.8147)

       Prediction
Actual    0    1
     0 5690  574
     1 1055 1471


In [17]:
classificationSummary(valid_y, logit_reg.predict(valid_X))

Confusion Matrix (Accuracy 0.8057)

       Prediction
Actual    0    1
     0 1394  135
     1  292  377


In [35]:
#create top word list from genre column

tracks_charted = tracks.loc[tracks['chart_status'] >= 1].copy()

from collections import Counter
top_genres = Counter(" ".join(tracks_charted['genre']).split()).most_common(50)
top_word, top_count = zip(*top_genres)

top_word = list(top_word)
print(top_word)

for word in top_word:
    mask = tracks["genre"].str.contains(word)
    tracks[word] = np.where(mask, 1, 0)



['pop', 'other', 'hip', 'hop', 'country', 'rock', 'rap', 'canadian', 'r&b', 'post-teen', 'contemporary', 'road', 'dance', 'modern', 'metal', 'uk', 'dawn', 'east', 'coast', 'german', 'boy', 'band', 'house', 'soul', 'art', 'neo', 'new', 'chicago', 'crunk', 'funk', 'nu', 'atl', 'wave', 'australian', 'permanent', 'alternative', 'indie', 'group', 'urban', 'barbadian', 'miami', 'mellow', 'girl', 'orleans', 'detroit', '8-bit', 'trap', 'old', 'school', 'south']


In [36]:
#add dummy genre variables to predictors

X = tracks[predictors + top_word]
y = tracks['charted']

In [52]:
#fit second logistic regression

train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.3, random_state=23)

logit_reg = LogisticRegression(penalty="l1", C=1e42, solver='liblinear')
logit_reg.fit(train_X, train_y)

LogisticRegression(C=1e+42, penalty='l1', solver='liblinear')

In [53]:
classificationSummary(train_y, logit_reg.predict(train_X))

classificationSummary(valid_y, logit_reg.predict(valid_X))

Confusion Matrix (Accuracy 0.8416)

       Prediction
Actual    0    1
     0 4996  479
     1  739 1477
Confusion Matrix (Accuracy 0.8350)

       Prediction
Actual    0    1
     0 2116  202
     1  342  637


# Trying ensemble methods

## Voting set to 'hard'

In [39]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

In [40]:
log_clf = LogisticRegression(solver="liblinear", random_state=23)
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=23)
svm_clf = SVC(gamma="auto", random_state=23)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')

In [42]:
voting_clf.fit(train_X, train_y)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(random_state=23,
                                                 solver='liblinear')),
                             ('rf',
                              RandomForestClassifier(n_estimators=10,
                                                     random_state=23)),
                             ('svc', SVC(gamma='auto', random_state=23))])

In [44]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(train_X, train_y)
    pred_y = clf.predict(valid_X)
    print(clf.__class__.__name__, accuracy_score(valid_y, pred_y))

LogisticRegression 0.8421292083712466
RandomForestClassifier 0.8412192902638762
SVC 0.8234758871701547
VotingClassifier 0.8475887170154686


## Voting set to 'soft'

In [47]:
log_clf = LogisticRegression(solver="liblinear", random_state=23)
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=23)
svm_clf = SVC(gamma="auto", probability=True, random_state=23)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')

voting_clf.fit(train_X, train_y)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(random_state=23,
                                                 solver='liblinear')),
                             ('rf',
                              RandomForestClassifier(n_estimators=10,
                                                     random_state=23)),
                             ('svc',
                              SVC(gamma='auto', probability=True,
                                  random_state=23))],
                 voting='soft')

In [48]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(train_X, train_y)
    pred_y = clf.predict(valid_X)
    print(clf.__class__.__name__, accuracy_score(valid_y, pred_y))

LogisticRegression 0.8421292083712466
RandomForestClassifier 0.8412192902638762
SVC 0.8234758871701547
VotingClassifier 0.848498635122839


In [49]:
classificationSummary(train_y, voting_clf.predict(train_X))

classificationSummary(valid_y, voting_clf.predict(valid_X))

Confusion Matrix (Accuracy 0.9732)

       Prediction
Actual    0    1
     0 6168   96
     1  140 2386
Confusion Matrix (Accuracy 0.8485)

       Prediction
Actual    0    1
     0 1376  153
     1  180  489


# Neural Net

In [54]:
from sklearn.neural_network import MLPClassifier

In [56]:
clf_NN = MLPClassifier(hidden_layer_sizes=(10), activation='logistic', solver='lbfgs', 
                    random_state=1)


clf_NN.fit(train_X, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(activation='logistic', hidden_layer_sizes=10, random_state=1,
              solver='lbfgs')

In [57]:
classificationSummary(train_y, clf_NN.predict(train_X))

classificationSummary(valid_y, clf_NN.predict(valid_X))

Confusion Matrix (Accuracy 0.8363)

       Prediction
Actual    0    1
     0 4461 1014
     1  245 1971
Confusion Matrix (Accuracy 0.8232)

       Prediction
Actual    0    1
     0 1857  461
     1  122  857


# Multinomial

In [59]:
X = tracks[predictors + top_word]
y = tracks['chart_status']

In [61]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.2, random_state=23)

two_logit_reg = LogisticRegression(penalty="l2", C=1e42, solver='lbfgs', max_iter=2000, multi_class='multinomial')
two_logit_reg.fit(train_X, train_y)

classificationSummary(train_y, two_logit_reg.predict(train_X))

classificationSummary(valid_y, two_logit_reg.predict(valid_X))

Confusion Matrix (Accuracy 0.7970)

       Prediction
Actual    0    1    2
     0 5856  169  239
     1  385  597  209
     2  661  121  553
Confusion Matrix (Accuracy 0.7894)

       Prediction
Actual    0    1    2
     0 1438   40   51
     1  103  170   47
     2  185   37  127


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
