### Imports

In [None]:
from __future__ import print_function    # (at top of module)
import warnings
warnings.filterwarnings('ignore')
from spotipy.oauth2 import SpotifyClientCredentials
import json
import spotipy
import time
import csv
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [10, 10]
from matplotlib.pyplot import figure
import math
import seaborn as sns
import io, os, sys, types

### This notebook fits each model and saves it to a file as a pickle so that it can be loaded later by the web app.

### Data preparations

In [None]:
# Load the song dataset from the CSV file and report how many rows it holds
data = pd.read_csv('Data/data_500_entries_youtube.csv')
print("Number of entries in original data: {}".format(len(data.index)))
data.head()

In [None]:
# NOTE(review): the wildcard import hides which names come from project_modules.
# This cell only calls label_data_yt (label_data_combined appears in the
# disabled alternative below); other cells may rely on further names.
from project_modules import *

# Alternative labelling call, currently disabled:
#final_data = label_data_combined(data, 90, 1000000000)
# Label the raw data. The meaning of the arguments 89 and 1000000000 is not
# visible here - presumably labelling thresholds; confirm in project_modules.
final_data = label_data_yt(data, 89, 1000000000)

In [None]:
# Drop the columns that are not used as model inputs.
# The first branch handles the current data format (with song id and YouTube
# fields); the second is legacy support for an older format without them.
# The axis is passed by keyword: the positional form `drop(labels, 1)` was
# deprecated in pandas 1.3 and no longer works in pandas 2.0.
if 'song_id' in data.columns:
    final_data.drop(['song_id', 'song_title', 'artist', 'popularity', 'youtube_view_count', 'youtube_video_title'],
                    axis=1, inplace=True)
else:
    final_data.drop(['song_title', 'artist', 'popularity'], axis=1, inplace=True)

In [None]:
# Split the labelled frame: y holds the 'is_popular' labels,
# X holds every remaining column (the examples).
y = final_data['is_popular']
X = final_data.drop('is_popular', axis=1)
# Sanity checks: print the number of rows in each
print("Number of entries in actual data: {}".format(len(X.index)))
print("Number of entries in label data: {}".format(len(y.index)))
X.head()

In [None]:
from sklearn import preprocessing
import pickle
# Names of the 13 feature columns. The order matches the order in which the
# test cell further down assembles its rows (there Spotify's "duration_ms"
# takes the place of "duration").
# NOTE(review): this constant is not referenced by any cell visible here.
COLUMNS_TO_SCALE = ["energy", "liveness", "tempo", 
                    "speechiness", "acousticness", "instrumentalness", 
                    "time_signature", "danceability", "key", 
                    "duration", "loudness", "valence", "mode"]

In [None]:
from sklearn.neural_network import MLPClassifier
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
# Print the class distribution of the labels before oversampling
print(Counter(y))
# Oversample with SMOTE (no random_state is set, so the synthetic rows differ between runs)
sm = SMOTE()
X_res, y_res = sm.fit_resample(X, y)
# Print the class distribution after oversampling
print(Counter(y_res))

### Use pickle to write the models and a scaler to files

We also save a scaler for each model. A scaler that was fitted on data with all the features won't work for a model that uses only some of them, so each feature subset needs its own scaler. We also need the saved scaler to scale new examples in exactly the same way as we scaled our training data.

In [None]:
# Model 1: class-balanced logistic regression on a six-feature subset.
# A model that uses fewer features needs its own scaler, fitted on exactly
# those columns, and that scaler is exported alongside the model.
features_log = ['energy', 'tempo', 'instrumentalness', 'danceability', 'loudness', 'valence']

scaler1 = preprocessing.StandardScaler()
scaler1.fit(X[features_log])
# `with` closes the file as soon as the dump is done, so the pickle is fully
# written to disk and no file handle is leaked.
with open("3yp_scaler_log.pkl", "wb") as scaler_file:
    pickle.dump(scaler1, scaler_file)

# Scale the training data with the scaler that was just exported
X_log = scaler1.transform(X[features_log])

model1 = LogisticRegression(solver='lbfgs', class_weight='balanced', random_state=3)
model1.fit(X_log, y)

with open("3yp_log.pkl", "wb") as model_file:
    pickle.dump(model1, model_file)

In [None]:
# Model 2: class-balanced SVM (with probability estimates) on a seven-feature subset.
features_svm = ['energy', 'tempo', 'speechiness', 'instrumentalness', 'time_signature', 'duration', 'loudness']

# Fit a scaler on exactly the columns this model uses and export it
scaler2 = preprocessing.StandardScaler()
scaler2.fit(X[features_svm])
# `with` guarantees the file is flushed and closed after the dump
with open("3yp_scaler_svm.pkl", "wb") as scaler_file:
    pickle.dump(scaler2, scaler_file)

# Scale the training data with the scaler that was just exported
X_svm = scaler2.transform(X[features_svm])

model2 = svm.SVC(probability=True, gamma='scale', class_weight='balanced')
model2.fit(X_svm, y)

with open("3yp_svm.pkl", "wb") as model_file:
    pickle.dump(model2, model_file)

In [None]:
# Model 3: SMOTE + MLP pipeline trained on all features.

# Fit a scaler on every column of X and export it
scaler3 = preprocessing.StandardScaler()
scaler3.fit(X)
# `with` guarantees the file is flushed and closed after the dump
with open("3yp_scaler_mlp_smote.pkl", "wb") as scaler_file:
    pickle.dump(scaler3, scaler_file)

# Scale the training data with the scaler that was just exported
X_mlp_smote = scaler3.transform(X)

# The original call was missing the closing parenthesis of make_pipeline_imb(...),
# which made the whole cell a syntax error.
model3 = make_pipeline_imb(
    SMOTE(),
    MLPClassifier(solver="lbfgs", activation="relu", alpha=1, learning_rate="constant"),
)
model3.fit(X_mlp_smote, y)

with open("3yp_mlp_smote.pkl", "wb") as model_file:
    pickle.dump(model3, model_file)

In [None]:
# Model 4: MLP (SGD solver) trained on all features.
# Try to also fit with oversampled data: here SMOTE is applied once up front
# and the MLP is fitted on the resampled set.

# Fit a scaler on every column of X and export it
scaler4 = preprocessing.StandardScaler()
scaler4.fit(X)
# `with` guarantees the file is flushed and closed after the dump
with open("3yp_scaler_mlp_online.pkl", "wb") as scaler_file:
    pickle.dump(scaler4, scaler_file)
X_mlp_online = scaler4.transform(X)

# Class distribution before and after oversampling the scaled data
print(Counter(y))
sm2 = SMOTE()
X_res_mlp, y_res_mlp = sm2.fit_resample(X_mlp_online, y)
# Report the labels produced by THIS resampling (previously this printed
# y_res, a leftover variable from an earlier cell).
print(Counter(y_res_mlp))

model4 = MLPClassifier(activation='relu', solver='sgd', alpha=0.0001, learning_rate="constant")
model4.fit(X_res_mlp, y_res_mlp)

with open("3yp_mlp_online.pkl", "wb") as model_file:
    pickle.dump(model4, model_file)

In [None]:
# Model 5: plain MLP (adam solver), no oversampling.
# ['energy', 'liveness', 'tempo', 'acousticness', 'instrumentalness', 'time_signature', 'duration', 'loudness', 'valence']
# NOTE(review): the feature list above names nine columns, but the scaler and
# the model below are fitted on ALL columns of X. Confirm which one is intended
# before changing either; the behaviour is left as it was.
scaler5 = preprocessing.StandardScaler()
scaler5.fit(X)
# `with` guarantees the file is flushed and closed after the dump
with open("3yp_scaler_mlp_simple.pkl", "wb") as scaler_file:
    pickle.dump(scaler5, scaler_file)

# Scale the training data with the scaler that was just exported
X_mlp_simple = scaler5.transform(X)
model5 = MLPClassifier(learning_rate="constant", solver="adam", alpha=0.001)
model5.fit(X_mlp_simple, y)

with open("3yp_mlp_simple.pkl", "wb") as model_file:
    pickle.dump(model5, model_file)

In [None]:
# Model 6: SMOTE + SVM pipeline on a five-feature subset.
# make_pipeline_imb(SMOTE(random_state=4), svm.SVC(probability=True, gamma='scale', random_state=3))
# NOTE(review): the configuration noted above sets random_state=3 on the SVC,
# but the code below does not. Confirm whether the seed should be added; the
# behaviour is left as it was.
features_svm_smote = ['energy', 'tempo', 'speechiness', 'loudness', 'valence']

# Fit a scaler on exactly the columns this model uses and export it
scaler6 = preprocessing.StandardScaler()
scaler6.fit(X[features_svm_smote])
# `with` guarantees the file is flushed and closed after the dump
with open("3yp_scaler_svm_smote.pkl", "wb") as scaler_file:
    pickle.dump(scaler6, scaler_file)

# Scale the training data with the scaler that was just exported
X_svm_smote = scaler6.transform(X[features_svm_smote])
model6 = make_pipeline_imb(SMOTE(random_state=4), svm.SVC(probability=True, gamma='scale'))
model6.fit(X_svm_smote, y)

with open("3yp_svm_smote.pkl", "wb") as model_file:
    pickle.dump(model6, model_file)

## Testing with a set of new examples

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import config

In [None]:
# Build the Spotify API client; the client id and secret are read from the
# local `config` module rather than being written into the notebook.
client_credentials_manager = SpotifyClientCredentials(
    client_id=config.client_id,
    client_secret=config.client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
# Request the Spotify audio features for the new example tracks.
# Every URI must be followed by a comma: Python silently joins two adjacent
# string literals, which previously fused the first two URIs into one invalid
# entry so that only eight of the nine tracks were requested.
test_song = sp.audio_features([
    "spotify:track:4kV4N9D1iKVxx1KLvtTpjS",
    "spotify:track:6ocbgoVGwYJhOv1GgI9NsF",
    "spotify:track:5WHTFyqSii0lmT9R21abT8",
    "spotify:track:2TIlqbIneP0ZY1O0EzYLlc",
    "spotify:track:5itOtNx0WxtJmi1TQ3RuRd",
    "spotify:track:1dAw715CaUd1HKGKXCzimK",
    "spotify:track:5n2KsLTepK1vPeIkMw7UpV",
    "spotify:track:6scFQGR5c6XYV33pcLbBIt",
    "spotify:track:38df12R7YuZj8fIkhS3nRp",
])

In [None]:
# Turn the audio-feature dicts into one row of numbers per song.
# The keys are listed in the same order as COLUMNS_TO_SCALE (Spotify's
# "duration_ms" takes the place of "duration").
test_feature_keys = ["energy", "liveness", "tempo",
                     "speechiness", "acousticness", "instrumentalness",
                     "time_signature", "danceability", "key",
                     "duration_ms", "loudness", "valence", "mode"]

# Iterate over every returned entry instead of a hard-coded count of 8, and
# skip entries for which no features came back (None) rather than crashing
# on the subscript.
test_song1 = [
    [song_features[key] for key in test_feature_keys]
    for song_features in test_song
    if song_features is not None
]
test_df = pd.DataFrame(test_song1)

In [None]:
# Display up to the first 10 rows of the assembled test examples
test_df.head(10)

In [None]:
# Print the raw feature rows, then scale them with scaler3 (the scaler fitted
# on all columns of X for model 3) and print the scaled array.
# NOTE(review): test_df has no column names, so this relies on its column
# order matching the column order of X - confirm against the training data.
print(test_song1)
data_np = scaler3.transform(test_df)
print(data_np)

In [None]:
# Load a pickled model from the web server's model directory and run it on the
# scaled test examples.
# NOTE(review): this path differs from where the Model 3 cell writes
# "3yp_mlp_smote.pkl" - presumably the file is copied over by hand; confirm.
# SECURITY: pickle.load can execute arbitrary code, so only load files that
# this project produced itself.
with open("Webserver/models/3yp_mlp_smote.pkl", "rb") as model_file:
    model7 = pickle.load(model_file)

predicted_labels = model7.predict(data_np)
print(predicted_labels)
print()
print()
predicted_probabilities = model7.predict_proba(data_np)
print(predicted_probabilities)
print()
print()
# Print label and class probabilities song by song, for however many examples
# were scored (previously hard-coded to exactly 8).
for label, probabilities in zip(predicted_labels, predicted_probabilities):
    print(label)
    print(probabilities)

In [None]:
# Display the `steps` attribute of the loaded model to check what was unpickled
model7.steps