In [6]:
from __future__ import print_function    # (at top of module)
import warnings
warnings.filterwarnings('ignore')
from spotipy.oauth2 import SpotifyClientCredentials
import json
import spotipy
import time
import csv
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [10, 10]
from matplotlib.pyplot import figure
import math
import seaborn as sns
import io, os, sys, types

# Initial preparations

In [7]:
# Load the song dataset from CSV. The commented-out path is the alternative
# (larger) dataset file; enable it instead of the line below to switch.
# data = pd.read_csv('Data/data_3000_entries_youtube.csv')
data = pd.read_csv('Data/data_500_entries_youtube.csv')

# Report how many rows were read, then preview the first rows of the frame.
n_original_rows = data.shape[0]
print("Number of entries in original data: " + str(n_original_rows))
data.head()

Number of entries in original data: 570


Unnamed: 0,song_id,song_title,artist,popularity,energy,liveness,tempo,speechiness,acousticness,instrumentalness,time_signature,danceability,key,duration,loudness,valence,mode,youtube_view_count,youtube_video_title
0,spotify:track:5ygDXis42ncn6kYG14lEVG,Baby Shark,[Pinkfong],77,0.84,0.341,115.062,0.227,0.245,0.0,4,0.825,7,96333,-3.651,0.52,1,1956582159,Baby Shark Dance | Sing and Dance! | Animal So...
1,spotify:track:7fa9MBXhVfQ8P8Df9OEbD8,Girls Like You (feat. Cardi B),"[Maroon 5, Cardi B]",86,0.541,0.13,124.959,0.0505,0.568,0.0,4,0.851,0,235545,-6.825,0.448,1,1300452389,Maroon 5 - Girls Like You ft. Cardi B
2,spotify:track:6De0lHrwBfPfrhorm9q1Xl,Me Rehúso,[Danny Ocean],83,0.804,0.0494,104.823,0.0677,0.0231,0.0,4,0.744,1,205715,-6.327,0.426,1,1229501096,Danny Ocean - Me Rehúso (Official Audio)
3,spotify:track:1j6xOGusnyXq3l6IryKF3G,Déjala Que Vuelva (feat. Manuel Turizo),"[Piso 21, Manuel Turizo]",74,0.788,0.0753,170.019,0.0785,0.0482,0.0,4,0.681,1,220117,-4.323,0.839,1,1216075058,Piso 21 - Déjala Que Vuelva (feat. Manuel Turi...
4,spotify:track:2ijef6ni2amuunRoKTlgww,Sin Pijama,"[Becky G, Natti Natasha]",90,0.745,0.104,94.014,0.0464,0.354,2.9e-05,4,0.791,11,188560,-3.695,0.82,0,1071141995,Becky G Natti Natasha - Sin Pijama (Video Ofic...


In [8]:
from project_modules import *

# The labeling path depends on which dataset was loaded; the presence of a
# 'total_no_streams' column is what distinguishes the two.
has_stream_counts = 'total_no_streams' in data.columns

if not has_stream_counts:
    # 500 song dataset: 2 features can be used for labeling,
    # popularity and youtube_view_count.
    final_data = label_data_yt(data, 89, 1000000000)
else:
    # 3000 song dataset: 3 features can be used for labeling,
    # popularity, total_no_streams and youtube_view_count.
    # Keep only the rows that clear all three minimum values before labeling.
    keep_rows = (
        (data.popularity > 10)
        & (data.total_no_streams > 999999)
        & (data.youtube_view_count > 999999)
    )
    data = data[keep_rows]
    # NOTE(review): the numeric arguments are presumably the labeling
    # thresholds - confirm against project_modules.
    final_data = label_data_combined(data, 89, 700000000, 700000000)

Number of popular examples after thresholding :  59
Number of not popular examples after thresholding :  511


In [9]:
# Drop unnecessary text columns and the columns that were used for labeling,
# so they cannot be used as model inputs.
columns_to_drop = ['song_id', 'song_title', 'artist', 'popularity',
                   'youtube_view_count', 'youtube_video_title']
if 'total_no_streams' in data.columns:
    # This column only exists in the dataset variant that has stream counts.
    columns_to_drop.append('total_no_streams')

# Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
# is rejected by pandas >= 2.0. Reassigning instead of `inplace=True` also
# avoids mutating a frame that may be a view of `data`.
final_data = final_data.drop(columns=columns_to_drop)

In [10]:
# Separate the label from the features: y is the 'is_popular' column and
# X is every remaining column of final_data.
y = final_data['is_popular']
X = final_data.drop(columns='is_popular')

# Sanity checks: both printed row counts are expected to match.
print("Number of entries in actual data: " + str(X.shape[0]))
print("Number of entries in label data: " + str(y.shape[0]))

Number of entries in actual data: 570
Number of entries in label data: 570


# Grid search

In [11]:
# this code is from http://www.davidsbatista.net/blog/2018/02/23/model_optimization/
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV

class EstimatorSelectionHelper:
    """Run one GridSearchCV per estimator and tabulate the per-split test scores.

    `models` maps a display name to an estimator and `params` maps the same
    names to the parameter grid searched for that estimator.
    """

    def __init__(self, models, params):
        """Store the estimators and grids.

        Raises:
            ValueError: if an estimator in `models` has no entry in `params`.
                Extra entries in `params` are allowed.
        """
        missing_params = list(set(models.keys()) - set(params.keys()))
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=10, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Fit a GridSearchCV for every estimator and keep it in `grid_searches`.

        All keyword arguments are forwarded unchanged to GridSearchCV.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter combination).

        Each row holds the min / mean / max / std of the per-split test scores
        plus the parameter values; rows are sorted by `sort_by`, descending.

        Raises:
            ValueError: if no grid search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError("No fitted grid searches to summarize; call fit() first.")

        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k, gs in self.grid_searches.items():
            print(k)
            results = gs.cv_results_
            params = results['params']

            # Use the number of splits actually run. `gs.cv` is only usable
            # in range() when an int was passed; it may be a splitter or None.
            n_splits = getattr(gs, 'n_splits_', None)
            if n_splits is None:
                n_splits = gs.cv

            # One column per split, one row per parameter combination.
            scores = [
                results["split{}_test_score".format(i)].reshape(len(params), 1)
                for i in range(n_splits)
            ]
            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append(row(k, s, p))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        # Summary columns first, then whatever parameter columns are present.
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [12]:
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score, recall_score, precision_score, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB

# Estimators tuned through EstimatorSelectionHelper (GridSearchCV).
#
# The SMOTE + MLP pipeline is deliberately not listed here. Its previous entry
# was built from activ/solv/al/lrn/hdn, which are only assigned by the loop in
# the "Grid search for SMOTE pipelines" cell further down, so this cell raised
# NameError on a fresh kernel. The 'Multilayer Perceptron' grid in params1 also
# uses bare MLPClassifier parameter names, which GridSearchCV cannot set on a
# pipeline (it expects '<step>__<param>'). That model is tuned by the dedicated
# SMOTE cell instead; params1 may keep its grid, since extra keys are allowed.
models1 = {
    "KNN": KNeighborsClassifier(),
    'Logistic regression': LogisticRegression(class_weight='balanced', random_state=3),
    'SVM balanced weights': svm.SVC(class_weight='balanced'),
    'Random Forest Classifier': RandomForestClassifier(n_estimators=100, max_depth=2, random_state=3, class_weight="balanced"),
}

# Parameter grids, keyed by the same display names used for the estimators.
# The 'C' grids previously listed 0.0001 twice, so every combination using
# that value was fitted and reported twice; each value now appears once.
params1 = {
    'KNN': {'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]},
    'Logistic regression': {
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'C': [0.0001, 0.001, 0.01, 0.1, 1],
    },
    'SVM balanced weights': {
        'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
        'gamma': ['auto', 'scale'],
        # NOTE(review): scikit-learn documents decision_function_shape as
        # ignored for binary classification; if the label is binary this
        # doubles the SVM grid for no gain - confirm before removing.
        'decision_function_shape': ['ovo', 'ovr'],
        'C': [0.0001, 0.001, 0.01, 0.1, 1],
    },
    # Keys are bare MLPClassifier parameter names; the SMOTE grid-search cell
    # reads these lists directly.
    'Multilayer Perceptron': {
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'solver': ['lbfgs', 'sgd', 'adam'],
        'alpha': [1e-5, 1e-4, 1, 1e-3, 1e-2, 1e-1],
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
        'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
    },
    'Random Forest Classifier': {
        'n_estimators': [10, 100, 200, 500, 1000],
        'max_depth': [2, 5, 7, 10, 13],
        'criterion': ['gini', 'entropy'],
    },
}

In [None]:
from sklearn import preprocessing

# Standardize the features. Column names and the row index of X are kept so
# the scaled frame stays aligned with y and remains readable when inspected.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# each validation fold influences the scaling statistics. Fitting the scaler
# inside a pipeline would avoid that, but requires '<step>__<param>' grid keys.
scaler2 = preprocessing.StandardScaler()
scaled_X = pd.DataFrame(scaler2.fit_transform(X), columns=X.columns, index=X.index)

# Initialize Grid Search class
helper1 = EstimatorSelectionHelper(models1, params1)
# Run Grid Search, scoring every candidate by ROC AUC
helper1.fit(scaled_X, y, scoring='roc_auc', n_jobs=10, refit=True)
# Show summary sorted by mean score
helper1.score_summary(sort_by='mean_score')


In [None]:
# Write the grid-search summary, sorted by mean score, to a CSV file.
score_table = helper1.score_summary(sort_by='mean_score')
score_table.to_csv("scores.csv")

# Grid search for imbalanced pipelines

In [None]:
# Grid search for SMOTE pipelines
# The imblearn SMOTE + MLP pipeline is searched by hand here, one
# cross_val_score call per parameter combination, instead of going through
# the GridSearchCV-based helper.
import itertools

from sklearn import preprocessing
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score

scaler2 = preprocessing.StandardScaler()
scaled_X = pd.DataFrame(scaler2.fit_transform(X), columns=X.columns, index=X.index)

mlp_grid = params1['Multilayer Perceptron']

# Six names for the six values stored per candidate. Previously the last two
# were fused into the single string "hidden_layer_sizes, score", which made
# the DataFrame construction below fail with a column-count mismatch.
param_columns = ["activation", "solver", "alpha", "learning_rate", "hidden_layer_sizes", "score"]

# One seeded splitter shared by all candidates, so every parameter combination
# is scored on the same splits and the run is reproducible.
cv_splitter = StratifiedShuffleSplit(n_splits=10, random_state=3)

scores_array = []
# itertools.product iterates in the same order as the equivalent nested loops
# (the last grid varies fastest).
for activ, solv, al, lrn, hdn in itertools.product(
        mlp_grid['activation'],
        mlp_grid['solver'],
        mlp_grid['alpha'],
        mlp_grid['learning_rate'],
        mlp_grid['hidden_layer_sizes']):
    mlp_smote = make_pipeline_imb(
        SMOTE(random_state=5),
        MLPClassifier(activation=activ, solver=solv, alpha=al, learning_rate=lrn,
                      hidden_layer_sizes=hdn, random_state=3))
    # `score` is the array of per-split ROC AUC values; later cells average it.
    score = cross_val_score(mlp_smote, scaled_X, y, cv=cv_splitter, scoring='roc_auc')
    scores_array.append([activ, solv, al, lrn, hdn, score])

scores_grid = pd.DataFrame(scores_array, columns=param_columns)
scores_grid.head(50)

In [None]:
# Build one row per candidate: its first five stored values (the
# hyper-parameters) followed by the mean of its per-split score array.
new_array = [
    [*entry[:5], entry[5].mean()]
    for entry in scores_array
]

In [None]:
# Each row of new_array holds five hyper-parameter values followed by the mean
# score, so exactly these six labels apply. The previous list started with a
# stray "index" and ended with the fused string "hidden_layer_sizes, score",
# which shifted every label by one column.
param_columns = ["activation", "solver", "alpha", "learning_rate", "hidden_layer_sizes", "score"]
scores_grid = pd.DataFrame(new_array, columns=param_columns)

In [None]:
# Preview the first 50 rows of the averaged score table.
scores_grid.iloc[:50]

In [None]:
# Write the averaged SMOTE + MLP score table to a CSV file.
mlp_scores_path = "mlp_smote_scores.csv"
scores_grid.to_csv(mlp_scores_path)