<a href="https://colab.research.google.com/github/yashrajkakkad/ml-spotify/blob/master/SVM_GridSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# Colab-only step: attach the user's Google Drive so the dataset stored there
# can be read with ordinary file paths.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Read the merged dataset from the mounted Drive folder and preview the first
# rows as the cell output.
DATA_PATH = '/content/drive/My Drive/ml-spotify/data_merged.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Display pandas' default summary statistics for the loaded frame.
df.describe()

In [None]:
# Show how many columns the raw frame has, followed by their names.
df.shape[1], df.columns

In [None]:
# Binarise the target: 1 when popularity reaches the threshold, 0 otherwise.
mean_popularity = 42

# Vectorised comparison; missing values compare as False and therefore map to 0.
is_popular = df["popularity"].ge(mean_popularity)

# Keep the same three objects the rest of the notebook may rely on:
#   y_prime - plain Python list of 0/1 labels
#   y_popl  - one-column DataFrame named "popularity"
#   y       - the label Series used for modelling
y_prime = is_popular.astype(int).tolist()
y_popl = pd.DataFrame({"popularity": y_prime})
y = y_popl["popularity"]

In [None]:
# Feature matrix: everything except the target and the listed columns.
drop_cols = ["popularity", "mode", "loudness", "loudness_ar", "loudness_yr"]
X = df.drop(columns=drop_cols)

# Display the number of remaining feature columns and their names.
X.shape[1], X.columns

In [None]:
# Sanity check: features and labels should have the same number of rows.
tuple(arr.shape for arr in (X, y))

In [None]:
import matplotlib.pyplot as plt

# Cross-validate an SVM pipeline for several feature-selection percentiles and
# plot mean accuracy (with its standard deviation) against the percentile.
#
# X_train / y_train are only created by the train/test split further down the
# notebook, so using them here fails with NameError on a fresh
# "Restart & Run All". Build a local development split instead, with the same
# test_size / stratify / random_state as that later split, so the held-out
# test rows never influence the choice of percentile.
CV_SAMPLE_SIZE = 20000  # cap on rows used per sweep to keep the SVC fits tractable
X_dev, _X_holdout, y_dev, _y_holdout = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)
X_cv, y_cv = X_dev[:CV_SAMPLE_SIZE], y_dev[:CV_SAMPLE_SIZE]

# The first step is named 'anova' (the name the set_params key below relies
# on) although the scoring function actually used is chi2.
clf = Pipeline([('anova', SelectPercentile(chi2)),
                ('scaler', StandardScaler()),
                ('svc', SVC(gamma="auto"))])

# #############################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = list()
score_stds = list()
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

for percentile in percentiles:
    clf.set_params(anova__percentile=percentile)
    # Selection and scaling live inside the pipeline, so they are re-fit on
    # each training fold by cross_val_score.
    this_scores = cross_val_score(clf, X_cv, y_cv)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

fig, ax = plt.subplots()
ax.errorbar(percentiles, score_means, np.array(score_stds))
ax.set_title(
    'Performance of the SVM-Anova varying the percentile of features selected')
ax.set_xticks(np.linspace(0, 100, 11, endpoint=True))
ax.set_xlabel('Percentile')
ax.set_ylabel('Accuracy Score')
ax.axis('tight')
plt.show()

In [None]:
# Keep the top 10% of features by chi2 score, then standardise them.
#
# Fitting the selector (which uses the labels) and the scaler on every row
# leaks information from the future test rows into preprocessing. Fit both on
# the training rows only. Those rows are recovered by repeating the parameters
# of the train/test split that follows (test_size=0.20, stratify=y,
# random_state=42): the stratified shuffle depends only on y and the row
# count, so splitting the row positions yields the same partition.
train_rows, _test_rows = train_test_split(
    np.arange(len(X)), test_size=0.20, stratify=y, random_state=42
)
X_fit, y_fit = X.iloc[train_rows], y.iloc[train_rows]

selector = SelectPercentile(chi2, percentile=10).fit(X_fit, y_fit)
scaler = StandardScaler().fit(selector.transform(X_fit))

# Transform all rows with the train-fitted steps. X is overwritten because the
# split cell expects the processed matrix under this name.
# NOTE: this makes the cell non-idempotent -- re-run the cell that builds X
# before running this one again.
X = scaler.transform(selector.transform(X))
X.shape

In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
%%time
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, classification_report

# Set the parameters by cross-validation
tuned_parameters = [
#                         'kernel': ['rbf'], 
#                         'gamma': [1e-3, 1e-4],
#                         'C': [1, 10, 100, 1000]
#                     },
                    {
                        'kernel': ['linear'],
                        'C': [1000, 1e4]
                    }]

scores = ['recall', 'f1']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(
        SVC(), tuned_parameters, scoring='%s' % score
    )
    clf.fit(X_train[:10000], y_train[:10000])

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_true, y_pred))
    print()

# svm = LinearSVC(random_state=42, tol=1e-5, max_iter=10000)
# svm.fit(X_train[:50000], y_train[:50000])
# # print("Train accuracy of svm algo:", svm.score(X_train, y_train))
# # print("Test accuracy of svm algo:", svm.score(X_test, y_test))


# cv_scores = cross_val_score(svm, X_train[:100000], y_train[:100000], cv=3)
# print("Cross Validation scores - ", cv_scores)

# y_train_pred = cross_val_predict(svm, X_train[:100000], y_train[:100000])

# confusion_matrix = confusion_matrix(y_train[:100000], y_train_pred)
# print("Confusion matrix\n", confusion_matrix)

# p_score = precision_score(y_train[:100000], y_train_pred)
# rc_score = recall_score(y_train[:100000], y_train_pred)
# f1_score = f1_score(y_train[:100000], y_train_pred)
# p_score, rc_score, f1_score
# print("Recall score = ", rc_score)
# print("Precision score = ", p_score)
# print("F1 score = ", f1_score)

# print("Classification report\n")
# print(classification_report(y_train[:100000], y_train_pred))