# Imports and data preparations

In [None]:
from __future__ import print_function    # (at top of module)
import warnings
#warnings.filterwarnings('always')
from spotipy.oauth2 import SpotifyClientCredentials
import json
import spotipy
import time
import sys
import csv
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline
plt.rcParams['figure.figsize'] = [10, 10]
from matplotlib.pyplot import figure
import math
import seaborn as sns
import config

In [None]:
# Load the raw dataset from the CSV file
data = pd.read_csv('Data/data_500_entries_youtube.csv')
print("Number of entries in original data: {}".format(len(data.index)))
# Last expression of the cell: shows the first rows as the cell output
data.head()

In [None]:
# De-duplicate the songs: key on 'song_id' when that column exists, otherwise
# fall back to 'song_title'. The first occurrence of every key is kept.
dedup_key = 'song_id' if 'song_id' in data.columns else 'song_title'
data = data.drop_duplicates(subset=[dedup_key], keep='first')

print("Number of entries in original data after cleaning: {}".format(len(data.index)))

In [None]:
from project_modules import *

# Build the labelled data set. `label_data` comes from project_modules; the
# next cell reads an 'is_popular' column from its result.
# NOTE(review): confirm what the threshold argument 89 means and whether
# `label_data` returns a copy or the original frame (the drop below is in place).
final_data = label_data(data, 89)

# Identifier / bookkeeping columns that must not be used as features
NON_FEATURE_COLUMNS = ['song_id', 'song_title', 'artist', 'popularity',
                       'total_no_streams', 'youtube_view_count', 'youtube_video_title']
# Columns that are not present in every input file: the de-duplication cell
# falls back to 'song_title' when 'song_id' is missing, and 'total_no_streams'
# was already treated as optional here. Dropping them unconditionally would
# raise a KeyError.
OPTIONAL_COLUMNS = {'song_id', 'total_no_streams'}

columns_to_drop = [name for name in NON_FEATURE_COLUMNS
                   if name not in OPTIONAL_COLUMNS or name in final_data.columns]

# Use the `columns=` keyword: passing the axis positionally to DataFrame.drop
# was deprecated in pandas 1.x and is an error from pandas 2.0 on.
final_data.drop(columns=columns_to_drop, inplace=True)


In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column (y) from the feature columns (X)
y = final_data['is_popular']
X = final_data.drop(columns=['is_popular'])

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_fs, X_test, y_fs, y_test = train_test_split(X, y, random_state=5, test_size=0.2)

# Sanity checks: both counts must be identical
print("Number of entries in actual data: {}".format(len(X.index)))
print("Number of entries in label data: {}".format(len(y.index)))

In [None]:
from sklearn.preprocessing import scale, MinMaxScaler
from sklearn import preprocessing

# Names of the features this notebook works with. Kept for reference and for
# backward compatibility; the scaled frames below take their labels from the
# data itself rather than from this list.
COLUMNS_TO_SCALE = ["energy", "liveness", "tempo",
                    "speechiness", "acousticness", "instrumentalness",
                    "time_signature", "danceability", "key",
                    "duration", "loudness", "valence", "mode"]

# Standardise every feature. The scaler is fitted on the training split only,
# so no statistics of the test split leak into the transformation.
scaler = preprocessing.StandardScaler()
scaler.fit(X_fs)

# Copy data back. Column names and index are taken from the frames that were
# scaled: a hard-coded name list silently mislabels the features whenever the
# column order of X differs from it, and a fresh RangeIndex would no longer
# match the index of y_fs / y_test.
X_fs = pd.DataFrame(scaler.transform(X_fs), columns=X_fs.columns, index=X_fs.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
# Show the first rows of the training features after scaling
X_fs.head()

# Wrapper methods

## Exhaustive search

### Generating all the different possible feature subsets

In [None]:
import itertools

# Candidate features for the exhaustive search
columns = ["energy", "liveness", "tempo", "speechiness", "acousticness", "instrumentalness", "time_signature", "danceability",
          "key", "duration", "loudness", "valence", "mode"]

# allSubsets[k] is the list of every combination of k + 1 features (as tuples)
allSubsets = [
    list(itertools.combinations(columns, subset_size))
    for subset_size in range(1, len(columns) + 1)
]

# Number of non-empty feature subsets over all sizes
total_no_subsets = sum(len(same_size_subsets) for same_size_subsets in allSubsets)
print(total_no_subsets)

### Printing scores

In [None]:
def print_scores(fs_scores):
    """Print, per model, the feature subsets with the best scores.

    Args:
        fs_scores: sequence of ``[model_title, entries]`` items, where every
            entry is ``[feature_subset, generalization_score, cv_score]``.

    Two winners are reported for every model: the subset with the highest
    generalization score (together with its CV score) and the subset with
    the highest average CV score (together with its generalization score).
    Only scores strictly greater than 0 can win; when no entry qualifies,
    scores of 0 and an empty feature list are printed.
    """
    for model_entry in fs_scores:
        title = model_entry[0]
        entries = model_entry[1]
        print(title)
        print()

        # Reset everything for each model. Without initialising the selected
        # feature sets, a model with no (positive) scores would either raise
        # an UnboundLocalError or report the previous model's subset.
        max_score_gen = 0
        max_score_cv = 0
        score_cv = 0
        score_gen = 0
        sel_feature_set_gen = []
        sel_feature_set_cv = []

        for entry in entries:
            feature_set = entry[0]
            cur_score_gen = entry[1]
            cur_score_cv = entry[2]

            # Strict comparison: on ties the first entry seen stays selected
            if cur_score_gen > max_score_gen:
                max_score_gen = cur_score_gen
                score_cv = cur_score_cv
                sel_feature_set_gen = feature_set

            if cur_score_cv > max_score_cv:
                max_score_cv = cur_score_cv
                score_gen = cur_score_gen
                sel_feature_set_cv = feature_set

        print("Max generalized score: ")
        print(max_score_gen)
        print("Average cv score for that feature subset: ")
        print(score_cv)
        print()
        print(sel_feature_set_gen)
        print("-------------------")
        print("Max average cv score: ")
        print(max_score_cv)
        print("Generalized score for that feature subset: ")
        print(score_gen)
        print()
        print(sel_feature_set_cv)
        print()
        print("=========================================================")

## Comparing same-size feature subsets against each other

In [None]:
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score, recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler

# Classifiers to compare. Every entry pairs a display title with an estimator
# (or an imblearn pipeline that oversamples with SMOTE before the estimator).
models = [
          {'title':"Logistic regression", 'model':LogisticRegression(random_state=3)},
          {'title':"Logistic regression balanced weights", 'model':LogisticRegression(class_weight='balanced', random_state=3)},
          {'title':"Oversampling logistic regression", 'model':make_pipeline_imb(SMOTE(random_state=4), LogisticRegression(random_state=3))},
          {'title':"Oversampling logistic regression balanced weights", 'model':make_pipeline_imb(SMOTE(random_state=4), LogisticRegression(class_weight='balanced',random_state=3))},
          {'title':"KNN", 'model':KNeighborsClassifier(n_neighbors = 17)},
          {'title':"Oversampling KNN", 'model':make_pipeline_imb(SMOTE(random_state=4), KNeighborsClassifier(n_neighbors = 17))},
          {'title':"SVM", 'model':svm.SVC(probability=True, gamma='scale', random_state=3)},
          {'title':"SVM balanced weights", 'model':svm.SVC(probability=True, gamma='scale', random_state=3, class_weight='balanced')},
          {'title':"Oversampling SVM", 'model':make_pipeline_imb(SMOTE(random_state=4), svm.SVC(probability=True, gamma='scale', random_state=3))}
         ]
models_scores_list = []

# Re-split with a 30% hold-out for this experiment.
X_fs, X_test, y_fs, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

# The split above starts again from the raw X, which would silently discard
# the standardisation applied during data preparation. Re-apply it here,
# fitted on the new training split only, because the distance- and
# margin-based models (KNN, SVM, regularised logistic regression) depend on
# the feature scale.
split_scaler = StandardScaler().fit(X_fs)
X_fs = pd.DataFrame(split_scaler.transform(X_fs), columns=X_fs.columns, index=X_fs.index)
X_test = pd.DataFrame(split_scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Loop invariants: the training labels as an array and the CV strategy.
# StratifiedKFold is used without shuffling, so no random_state is passed:
# it has no effect there and current scikit-learn raises a ValueError for it.
cur_labels_np = np.array(y_fs)
cv = StratifiedKFold(n_splits=10)

for item in models:
    # fit() refits the estimator from scratch on every call, so the same
    # estimator object can be reused for all feature subsets.
    model = item['model']
    steps_taken = 0
    scores_list = []

    # Loop through the groups of subsets with 1, 2, 3, ..., (#features) elements.
    # Iterating allSubsets directly keeps this independent of len(columns).
    for same_size_subsets in allSubsets:
        best_score = -np.inf
        best_feature_set = []

        # Loop through all the subsets of the current size
        for cur_comb in same_size_subsets:
            cur_feature_set = list(cur_comb)

            # Training data restricted to the current features, as an array
            cur_data_np = np.array(X_fs[cur_feature_set])

            # Counter of number of iterations
            steps_taken += 1

            # ROC AUC of the positive class on every CV fold
            fold_scores = []
            for train, test in cv.split(cur_data_np, cur_labels_np):
                model.fit(cur_data_np[train], cur_labels_np[train])
                probas_ = model.predict_proba(cur_data_np[test])
                fold_scores.append(roc_auc_score(cur_labels_np[test], probas_[:, 1]))
            mean_score = np.mean(fold_scores)

            # For brevity, only the best feature subset of each size is kept
            if mean_score > best_score:
                best_score = mean_score
                best_feature_set = cur_feature_set

        # Refit the winner of this size on the whole training split and
        # measure its ROC AUC on the held-out test split
        model.fit(X_fs[best_feature_set], y_fs)
        y_pred_prob = model.predict_proba(X_test[best_feature_set])[:, 1]
        test_after_auc = roc_auc_score(y_test, y_pred_prob)

        # Entry layout expected by print_scores:
        # [feature subset, generalization score, average CV score]
        scores_list.append([best_feature_set, test_after_auc, best_score])

    print("Number of iterations: ", steps_taken)
    models_scores_list.append([item['title'], scores_list])

#### Scoring looks like: 
[Model Name, [[feature subset, generalization score, average CV score], ...]]

In [None]:
# Report, for every model, the best feature subsets collected in models_scores_list
print_scores(models_scores_list)

## Comparing all subsets against each other

In [None]:
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score, recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
# Classifiers evaluated in this run. The commented-out entries are earlier
# configurations kept for reference. The 'feature_set' key is not read by the
# search below.
models = [
#          {'title':"Logistic regression", 'model':LogisticRegression(random_state=3), 'feature_set':['liveness', 'tempo', 'acousticness', 'instrumentalness', 'time_signature', 'danceability', 'key', 'loudness', 'valence']},
#           {'title':"Logistic regression balanced weights", 'model':LogisticRegression(class_weight='balanced', n_jobs=-1, solver="lbfgs"), 'feature_set':['tempo', 'acousticness', 'danceability', 'valence']},
#          {'title':"Oversampling logistic regression", 'model':make_pipeline_imb(SMOTE(random_state=4), LogisticRegression(random_state=3, n_jobs=-1)), 'feature_set':['tempo', 'acousticness', 'danceability', 'key', 'valence']},
#          {'title':"Oversampling logistic regression balanced weights", 'model':make_pipeline_imb(SMOTE(random_state=4), LogisticRegression(class_weight='balanced',random_state=3)), 'feature_set':['tempo', 'acousticness', 'danceability', 'key', 'valence']},
#          {'title':"KNN", 'model':KNeighborsClassifier(n_neighbors = 17), 'feature_set':['liveness', 'tempo', 'acousticness', 'danceability']},
#          {'title':"Oversampling KNN", 'model':make_pipeline_imb(SMOTE(random_state=4), KNeighborsClassifier(n_neighbors = 17)), 'feature_set':['energy', 'instrumentalness', 'duration', 'valence', 'mode']},
#          {'title':"SVM", 'model':svm.SVC(probability=True, gamma='scale', random_state=3), 'feature_set':['liveness', 'tempo', 'acousticness', 'danceability', 'loudness', 'mode']},
#           {'title':"SVM balanced weights", 'model':svm.SVC(probability=True, gamma='scale',class_weight='balanced', kernel='rbf'), 'feature_set':['speechiness', 'key']},
#          {'title':"Oversampling SVM", 'model':make_pipeline_imb(SMOTE(random_state=4), svm.SVC(probability=True, gamma='scale', random_state=3, kernel='rbf')), 'feature_set':['tempo', 'time_signature']},
#           {'title':"Multilayer Perceptron", 'model':MLPClassifier(solver='lbfgs', alpha=1e-2), 'feature_set':['energy', 'acousticness', 'danceability', 'duration', 'valence', 'mode']},
#          {'title':"Multilayer Perceptron", 'model':MLPClassifier(), 'feature_set':['energy', 'acousticness', 'danceability', 'duration', 'valence', 'mode']},
          {'title':"Oversampling Multilayer Perceptron", 'model':make_pipeline_imb(SMOTE(random_state=4), MLPClassifier(solver="lbfgs", activation="relu", alpha=1, learning_rate="constant")), 'feature_set':['loudness', 'speechiness']},
          {'title':"SGD Multilayer Perceptron", 'model':make_pipeline_imb(SMOTE(random_state=4), MLPClassifier(activation = 'relu', solver='sgd', alpha=0.0001, learning_rate="constant")), 'feature_set':['loudness', 'speechiness']},
#          {'title':"Random Forest Classifier balanced weights", 'model':RandomForestClassifier(n_estimators=100, max_depth=2, random_state=3, class_weight="balanced", n_jobs=-1), 'feature_set':['loudness', 'speechiness']}
         ]
models_scores_list = []

# Loop invariants: the training labels as an array and the CV strategy.
# With an integer random_state the splitter produces the same splits on every
# split() call, so one instance can serve all feature subsets.
cur_labels_np = np.array(y_fs)
cv = StratifiedShuffleSplit(n_splits=10, random_state=5)

for item in models:
    # fit() refits the estimator from scratch on every call, so the same
    # estimator object can be reused for all feature subsets.
    model = item['model']
    steps_taken = 0
    scores_list = []

    # Loop through the groups of subsets with 1, 2, 3, ..., (#features) elements
    for same_size_subsets in allSubsets:

        # Loop through all the subsets of the current size
        for cur_comb in same_size_subsets:
            cur_feature_set = list(cur_comb)

            # Training data restricted to the current features, as an array
            cur_data_np = np.array(X_fs[cur_feature_set])

            # Counter of number of iterations (with a progress print)
            steps_taken += 1
            if steps_taken % 1000 == 0:
                print(steps_taken)

            # Do CV for the current feature set: ROC AUC of the positive
            # class on every split
            fold_scores = []
            for train, test in cv.split(cur_data_np, cur_labels_np):
                model.fit(cur_data_np[train], cur_labels_np[train])
                probas_ = model.predict_proba(cur_data_np[test])
                fold_scores.append(roc_auc_score(cur_labels_np[test], probas_[:, 1]))

            # Average CV score
            mean_score = np.mean(fold_scores)

            # Generalization score: refit on the whole training split and
            # score on the held-out test split. The test features are passed
            # as an array as well, matching how the model was fitted.
            model.fit(cur_data_np, cur_labels_np)
            y_pred_prob = model.predict_proba(np.array(X_test[cur_feature_set]))[:, 1]
            test_after_auc = roc_auc_score(y_test, y_pred_prob)

            # Keep the scores of every subset:
            # [feature subset, generalization score, average CV score]
            scores_list.append([cur_feature_set, test_after_auc, mean_score])

    print("Number of iterations: ", steps_taken)
    models_scores_list.append([item['title'], scores_list])

    # Report the winners for this model right away, reusing print_scores
    # instead of repeating its selection and printing logic inline.
    # NOTE(review): "Max generalized score" is selected on the test split
    # itself, so it is an optimistic estimate.
    print_scores([[item['title'], scores_list]])

#### Scoring looks like: 
[Model Name, [[feature subset, generalization score, average CV score], ...]]

# Filter methods

### SelectKBest

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split

# Fresh 50/50 split of X and y for the filter methods.
# Note that this rebinds X_test / y_test, which the wrapper-method cells also use.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=5)

# Keep the 5 features with the highest chi-squared statistic w.r.t. the label.
# NOTE(review): chi2 only accepts non-negative feature values -- confirm that
# X has no negative columns (loudness may be one), otherwise rescale first.
sel = SelectKBest(chi2, k = 5)
sel.fit(X_train, y_train)

selected = sel.transform(X_train)
#print(selected.shape)
#print(X_train.shape)

mask = sel.get_support() # boolean mask aligned with the columns of X_train

# The list of the K best features. Names are read from X_train itself, so they
# always line up with the mask, and the loop variable no longer shadows the
# builtin `bool`.
new_features = [feature for feature, is_selected in zip(X_train.columns, mask) if is_selected]
print(new_features)

### Pearson correlation

In [None]:
import scipy
from scipy.stats import pearsonr

# Pearson correlation (with its p-value) between the 'valence' feature and the label
print(pearsonr(X['valence'], y))

### Mutual information

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Estimated mutual information between every feature and the label
feature_scores = mutual_info_classif(X_train, y_train)

# Print in ascending order of score. The names come from X_train itself so
# that every score is paired with the column it was computed for.
for score, fname in sorted(zip(feature_scores, X_train.columns)):
    print(fname, score)

### Joint mutual information

In [None]:
# Only features that are present in X_train can be scored. An earlier version
# of this cell added 'youtube_view_count' to `columns`, but that column is
# dropped during data preparation, so indexing X_train with it raised a
# KeyError. `columns` itself is left untouched because the exhaustive-search
# cells rely on its length.
jmi_columns = [name for name in columns if name in X_train.columns]

for col in jmi_columns:
    # Start from zero for every feature; otherwise each printed value would
    # be the running total over all features processed so far.
    jmi = 0
    for next_col in jmi_columns:
        if col != next_col:
            # NOTE(review): mutual_info_classif scores every column on its
            # own, so element [0] is the estimate for `col` alone and this sum
            # is not a true joint mutual information -- confirm the intent.
            jmi += mutual_info_classif(X_train[[col, next_col]], y_train)[0]
    print("JMI for {} is {}".format(col, jmi))

### ANOVA

In [None]:
from sklearn.feature_selection import f_classif

# ANOVA F-statistic of every feature against the label
# (f_classif returns the F-statistics first, then the p-values)
feature_scores = f_classif(X_train, y_train)[0]

# Print in ascending order of score. The names come from X_train itself so
# that every score is paired with the column it was computed for.
for score, fname in sorted(zip(feature_scores, X_train.columns)):
    print(fname, score)

# Embedded methods

### LASSO regression

In [None]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
# Lasso with the penalty strength chosen by 10-fold cross-validation.
# StratifiedKFold is used without shuffling, so no random_state is passed to
# it: it has no effect there and current scikit-learn raises a ValueError.
# NOTE: LassoCV is a regressor fitted on the class labels, so score() below
# is the R^2 of that regression, not a classification accuracy.
lasso = LassoCV(cv=StratifiedKFold(n_splits=10), random_state=3)
lasso.fit(X_train,y_train)
train_score=lasso.score(X_train,y_train)
test_score=lasso.score(X_test,y_test)
coeff_used = np.sum(lasso.coef_!=0)

print("training score:", train_score)
print("test score: ", test_score)
print("number of features used: ", coeff_used)

#print(lasso.coef_)

# Features whose coefficient was not shrunk to exactly zero. Names are read
# from X_train so that they line up with the coefficients.
sel_f = [fname for fname, coef in zip(X_train.columns, lasso.coef_) if coef != 0]
print("Alpha(amount of penalization) chosen by CV: ", lasso.alpha_)
print(sel_f)

In [None]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegression
# L1-penalised logistic regression used as an embedded feature selector.
# The solver is set explicitly: the default solver of current scikit-learn
# ('lbfgs') does not support the L1 penalty and raises a ValueError, while
# 'liblinear' does support it.
loglasso = LogisticRegression(penalty='l1', solver='liblinear', class_weight='balanced', random_state=3)
loglasso.fit(X_train,y_train)
train_score=loglasso.score(X_train,y_train)
test_score=loglasso.score(X_test,y_test)
coeff_used = np.sum(loglasso.coef_!=0)

print("training score:", train_score)
print("test score: ", test_score)
print("number of features used: ", coeff_used)

print(loglasso.coef_)

# Features whose coefficient was not shrunk to exactly zero. coef_ holds one
# row of coefficients for this binary problem; names are read from X_train so
# that they line up with the coefficients.
sel_f = [fname for fname, coef in zip(X_train.columns, loglasso.coef_[0]) if coef != 0]
print(sel_f)