In [30]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from dmba import classificationSummary
from sklearn import preprocessing

In [2]:
tracks = pd.read_csv('tracks.csv')

In [3]:
#create a target variable "charted"

mask = tracks['chart_status'] > 0
tracks['charted'] = np.where(mask, 1, 0)

In [4]:
# define predictors and target

predictors = ['danceability', 'energy', 'key', 'loudness' , 'mode' , 'speechiness', 'acousticness', 
              'instrumentalness', 'liveness', 'valence','tempo']


In [5]:
#Fill NA values with zero

tracks.fillna(0, inplace = True)

In [6]:
#define x and y

X = tracks[predictors]
y = tracks['charted']

In [7]:
#fit logistic regression model

train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.2, random_state=23)

logit_reg = LogisticRegression(penalty="l2", C=1e42, solver='liblinear')
logit_reg.fit(train_X, train_y)

LogisticRegression(C=1e+42, solver='liblinear')

In [8]:
classificationSummary(train_y, logit_reg.predict(train_X))

Confusion Matrix (Accuracy 0.7253)

       Prediction
Actual    0    1
     0 6072  192
     1 2223  303


In [9]:
classificationSummary(valid_y, logit_reg.predict(valid_X))

Confusion Matrix (Accuracy 0.7111)

       Prediction
Actual    0    1
     0 1490   39
     1  596   73


In [10]:
#create top word list from genre column

tracks_charted = tracks.loc[tracks['chart_status'] >= 1].copy()

from collections import Counter
top_genres = Counter(" ".join(tracks_charted['genre']).split()).most_common(50)
top_word, top_count = zip(*top_genres)

top_word = list(top_word)
print(top_word)

for word in top_word:
    mask = tracks["genre"].str.contains(word)
    tracks[word] = np.where(mask, 1, 0)



['pop', 'other', 'hip', 'hop', 'country', 'rock', 'rap', 'canadian', 'r&b', 'post-teen', 'contemporary', 'road', 'dance', 'modern', 'metal', 'uk', 'dawn', 'east', 'coast', 'german', 'boy', 'band', 'house', 'soul', 'art', 'neo', 'new', 'chicago', 'crunk', 'funk', 'nu', 'atl', 'wave', 'australian', 'permanent', 'alternative', 'indie', 'group', 'urban', 'barbadian', 'miami', 'mellow', 'girl', 'orleans', 'detroit', '8-bit', 'trap', 'old', 'school', 'south']


In [11]:
#add dummy genre variables to predictors

X = tracks[predictors + top_word]
y = tracks['charted']

In [12]:
#fit second logistic regression

train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.2, random_state=23)

logit_reg = LogisticRegression(penalty="l2", C=1e42, solver='liblinear')
logit_reg.fit(train_X, train_y)

LogisticRegression(C=1e+42, solver='liblinear')

In [13]:
classificationSummary(train_y, logit_reg.predict(train_X))

classificationSummary(valid_y, logit_reg.predict(valid_X))

Confusion Matrix (Accuracy 0.8011)

       Prediction
Actual    0    1
     0 5768  496
     1 1252 1274
Confusion Matrix (Accuracy 0.7816)

       Prediction
Actual    0    1
     0 1402  127
     1  353  316


# Trying ensemble methods

## Voting set to 'hard'

In [14]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

In [15]:
log_clf = LogisticRegression(solver="liblinear", random_state=23)
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=23)
svm_clf = SVC(gamma="auto", random_state=23)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')

In [16]:
voting_clf.fit(train_X, train_y)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(random_state=23,
                                                 solver='liblinear')),
                             ('rf',
                              RandomForestClassifier(n_estimators=10,
                                                     random_state=23)),
                             ('svc', SVC(gamma='auto', random_state=23))])

In [17]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(train_X, train_y)
    pred_y = clf.predict(valid_X)
    print(clf.__class__.__name__, accuracy_score(valid_y, pred_y))

LogisticRegression 0.7834394904458599
RandomForestClassifier 0.7711555959963603
SVC 0.7029117379435851
VotingClassifier 0.7720655141037307


## Voting set to 'soft'

In [18]:
log_clf = LogisticRegression(solver="liblinear", random_state=23)
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=23)
svm_clf = SVC(gamma="auto", probability=True, random_state=23)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')

voting_clf.fit(train_X, train_y)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(random_state=23,
                                                 solver='liblinear')),
                             ('rf',
                              RandomForestClassifier(n_estimators=10,
                                                     random_state=23)),
                             ('svc',
                              SVC(gamma='auto', probability=True,
                                  random_state=23))],
                 voting='soft')

In [19]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(train_X, train_y)
    pred_y = clf.predict(valid_X)
    print(clf.__class__.__name__, accuracy_score(valid_y, pred_y))

LogisticRegression 0.7834394904458599
RandomForestClassifier 0.7711555959963603
SVC 0.7029117379435851
VotingClassifier 0.7793448589626933


In [20]:
classificationSummary(train_y, voting_clf.predict(train_X))

classificationSummary(valid_y, voting_clf.predict(valid_X))

Confusion Matrix (Accuracy 0.8681)

       Prediction
Actual    0    1
     0 6149  115
     1 1044 1482
Confusion Matrix (Accuracy 0.7793)

       Prediction
Actual    0    1
     0 1455   74
     1  411  258


# Neural Net

In [35]:
from sklearn.neural_network import MLPClassifier

clf_NN = MLPClassifier(hidden_layer_sizes=(20), max_iter = 1000, activation='logistic', solver='lbfgs', 
                    random_state=1)

scaler = preprocessing.StandardScaler().fit(train_X)
X_scaled = scaler.transform(train_X)

clf_NN.fit(X_scaled, train_y)

MLPClassifier(activation='logistic', hidden_layer_sizes=20, max_iter=1000,
              random_state=1, solver='lbfgs')

In [36]:
classificationSummary(train_y, clf_NN.predict(X_scaled))

X_scaled_val = scaler.transform(valid_X)

classificationSummary(valid_y, clf_NN.predict(X_scaled_val))

Confusion Matrix (Accuracy 0.8918)

       Prediction
Actual    0    1
     0 5921  343
     1  608 1918
Confusion Matrix (Accuracy 0.7334)

       Prediction
Actual    0    1
     0 1264  265
     1  321  348


In [46]:
#attempt at pipeline
from sklearn.pipeline import make_pipeline

nn_clf = MLPClassifier(hidden_layer_sizes=(20), max_iter = 1000, activation='logistic', solver='lbfgs', 
                    random_state=1)

for clf in (log_clf, rnd_clf, svm_clf, nn_clf):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    pipe = make_pipeline(preprocessing.StandardScaler(), clf)
    pipe.fit(X_train, y_train)  
    print(f'{clf} score is {pipe.score(X_test, y_test)}')


LogisticRegression(random_state=23, solver='liblinear') score is 0.7970882620564149
RandomForestClassifier(n_estimators=10, random_state=23) score is 0.764634516226873
SVC(gamma='auto', probability=True, random_state=23) score is 0.8092205034880194
MLPClassifier(activation='logistic', hidden_layer_sizes=20, max_iter=1000,
              random_state=1, solver='lbfgs') score is 0.7561419472247498


In [47]:
for clf in (log_clf, rnd_clf, svm_clf, nn_clf):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    pipe = make_pipeline(preprocessing.StandardScaler(), clf)
    pipe.fit(X_train, y_train)  
    classificationSummary(y_test, pipe.predict(X_test))


Confusion Matrix (Accuracy 0.7971)

       Prediction
Actual    0    1
     0 2151  187
     1  482  477
Confusion Matrix (Accuracy 0.7646)

       Prediction
Actual    0    1
     0 2144  194
     1  582  377
Confusion Matrix (Accuracy 0.8092)

       Prediction
Actual    0    1
     0 2171  167
     1  462  497
Confusion Matrix (Accuracy 0.7561)

       Prediction
Actual    0    1
     0 1996  342
     1  462  497


# Multinomial

In [48]:
X = tracks[predictors + top_word]
y = tracks['chart_status']

In [49]:
y.head(10)

0    0
1    0
2    0
3    2
4    0
5    0
6    1
7    0
8    0
9    0
Name: chart_status, dtype: int64

In [58]:
#maybe there is value in this. If we're looking for just songs that will chart, we can ignore the predicted zeros.
#We will miss some hits, but we're trying to identify songs that have a good chance of being a hit

train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.2, random_state=23)

logit_reg = LogisticRegression(penalty="l2", C=1e42, solver='lbfgs', max_iter=2000, multi_class='multinomial')
logit_reg.fit(train_X, train_y)

classificationSummary(train_y, logit_reg.predict(train_X))

classificationSummary(valid_y, logit_reg.predict(valid_X))


Confusion Matrix (Accuracy 0.7644)

       Prediction
Actual    0    1    2
     0 5991   86  187
     1  673  296  222
     2  836   67  432
Confusion Matrix (Accuracy 0.7461)

       Prediction
Actual    0    1    2
     0 1456   27   46
     1  193   85   42
     2  228   22   99


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [60]:
(85+22+42+99)/(27+85+22+42+99+46)

0.7725856697819314

In [57]:
#ordinal logistic regression - this didn't work at all

from mord import LogisticIT

logit = LogisticIT(alpha=0)
logit.fit(train_X, train_y)

classificationSummary(train_y, logit.predict(train_X))

classificationSummary(valid_y, logit.predict(valid_X))

Confusion Matrix (Accuracy 0.7403)

       Prediction
Actual    0    1    2
     0 6045    0  219
     1  817    0  374
     2  873    0  462
Confusion Matrix (Accuracy 0.7197)

       Prediction
Actual    0    1    2
     0 1478    0   51
     1  234    0   86
     2  245    0  104
