In [None]:
# Try out diff gridsearchcv parameters

Importing necessary tools, loading dataset, and preprocessing the data:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import math
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import precision_score, accuracy_score, f1_score, confusion_matrix, classification_report, roc_curve, auc, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier

In [4]:
data = pd.read_csv('data/dataset-of-10s.csv')

In [5]:
X = data.drop(columns=['target', 'uri', 'artist', 'track'], axis=1)
y = data['target']

X = X.apply(lambda x : (x - x.min()) /(x.max() - x.min()), axis=0)
X.head()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.25, random_state= 42)

Trying other models for baseline (KNN, logistic regression):

In [6]:
knn_model = KNeighborsClassifier()

knn_model.fit(X_train, y_train)

train_preds = knn_model.predict(X_train)
test_preds = knn_model.predict(X_test)


In [7]:
print("Training Precision Score for KNN Classifier: {:.4}%".format(precision_score(y_train, train_preds) * 100))
print("Testing Precision Score for KNN Classifier: {:.4}%".format(precision_score(y_test, test_preds) * 100))

print("Training Accuracy Score for KNN Classifier: {:.4}%".format(accuracy_score(y_train, train_preds) * 100))
print("Testing Accuracy Score for KNN Classifier: {:.4}%".format(accuracy_score(y_test, test_preds) * 100))

Training Precision Score for KNN Classifier: 79.18%
Testing Precision Score for KNN Classifier: 71.64%
Training Accuracy Score for KNN Classifier: 84.14%
Testing Accuracy Score for KNN Classifier: 76.0%


In [8]:
logreg = LogisticRegression(fit_intercept=False, C=1e12, solver='liblinear')

model_log = logreg.fit(X_train, y_train)

y_hat_train_log = logreg.predict(X_train)
y_hat_test_log = logreg.predict(X_test)


In [9]:
print("Training precision score for logistic regression model: {:.4}%".format(precision_score(y_train, y_hat_train_log) * 100))
print("Testing precision score logistic regression model: {:.4}%".format(precision_score(y_test, y_hat_test_log) * 100))

print("Training accuracy score logistic regression model: {:.4}%".format(accuracy_score(y_train, y_hat_train_log)* 100))
print("Testing accuracy score logistic regression model:{:.4}%".format(accuracy_score(y_test, y_hat_test_log)* 100))

Training precision score for logistic regression model: 73.67%
Testing precision score logistic regression model: 72.91%
Training accuracy score logistic regression model: 78.93%
Testing accuracy score logistic regression model:78.12%


##### Trying out various different parameters for GridSearchCV (baseline dt):

In [10]:
baseline_tree = DecisionTreeClassifier(criterion='entropy', max_depth=5)
baseline_tree.fit(X_train, y_train)
y_hat_train = baseline_tree.predict(X_train)
y_hat_test = baseline_tree.predict(X_test)


In [11]:
dt_param_grid ={
    "criterion": ["gini", "entropy"],
    "max_depth": [None,5,10],
    "min_samples_split": [0.2, 0.4, 0.6],
    "max_features": [5, 10, 15],
    'min_samples_leaf': [2, 4, 6, 8]
    
    
}
# Options: max. depth, min. samples split, min. leaf sample size, max. leaf nodes, max. features 


In [12]:
dt_grid_search = GridSearchCV(baseline_tree , dt_param_grid, cv=3, return_train_score=True)

dt_grid_search.fit(X_train, y_train)

# Mean training score
dt_gs_training_score = np.mean(dt_grid_search.cv_results_["mean_train_score"])

# Mean test score
dt_gs_testing_score = dt_grid_search.score(X_test, y_test)

print(f"Mean Training Score: {dt_gs_training_score :.2%}")
print(f"Mean Test Score: {dt_gs_testing_score :.2%}")
print("Best Parameter Combination Found During Grid Search:")
dt_grid_search.best_params_

Mean Training Score: 79.81%
Mean Test Score: 79.38%
Best Parameter Combination Found During Grid Search:


{'criterion': 'gini',
 'max_depth': 10,
 'max_features': 10,
 'min_samples_leaf': 4,
 'min_samples_split': 0.2}

In [26]:
dt_param_grid2 ={
    "max_depth": [8, 10, 12],
    "min_samples_split": [0.1, 0.2, 0.3],
    "max_features": [8, 10, 12],
    'min_samples_leaf': [3, 4, 5],
    
    
}
# Options: criterion, max. depth, min. samples split, min. leaf sample size, max. leaf nodes, max. features 


In [27]:
dt_grid_search2 = GridSearchCV(baseline_tree , dt_param_grid2, cv=3, return_train_score=True)

dt_grid_search2.fit(X_train, y_train)

# Mean training score
dt_gs_training_score2 = np.mean(dt_grid_search2.cv_results_["mean_train_score"])

# Mean test score
dt_gs_testing_score2 = dt_grid_search2.score(X_test, y_test)

print(f"Mean Training Score: {dt_gs_training_score2 :.2%}")
print(f"Mean Test Score: {dt_gs_testing_score2 :.2%}")
print("Best Parameter Combination Found During Grid Search:")
dt_grid_search2.best_params_

Mean Training Score: 80.74%
Mean Test Score: 79.31%
Best Parameter Combination Found During Grid Search:


{'max_depth': 10,
 'max_features': 10,
 'min_samples_leaf': 4,
 'min_samples_split': 0.1}

In [104]:
dt_param_grid3 ={
    "max_depth": [None,5,10],
    "max_features": [5, 10, 15],
    'min_samples_leaf': [4, 6, 8, 10],
    "min_samples_split": [ 0.2, 0.4, 0.6],
    
}
# Options: max. depth, min. samples split, min. leaf sample size, max. leaf nodes, max. features 


In [105]:
dt_grid_search3 = GridSearchCV(baseline_tree , dt_param_grid3, cv=3, return_train_score=True)

dt_grid_search3.fit(X_train, y_train)

# Mean training score
dt_gs_training_score3 = np.mean(dt_grid_search3.cv_results_["mean_train_score"])

# Mean test score
dt_gs_testing_score3 = dt_grid_search3.score(X_test, y_test)

print(f"Mean Training Score: {dt_gs_training_score3 :.2%}")
print(f"Mean Test Score: {dt_gs_testing_score3 :.2%}")
print("Best Parameter Combination Found During Grid Search:")
dt_grid_search3.best_params_

Mean Training Score: 79.62%
Mean Test Score: 78.06%
Best Parameter Combination Found During Grid Search:


{'max_depth': 10,
 'max_features': 5,
 'min_samples_leaf': 6,
 'min_samples_split': 0.2}

In [107]:
# Train a classifier with optimal values identified above
dt = DecisionTreeClassifier(
                            max_depth=5,
                            min_samples_leaf=6,
                            max_features = 5,
                            random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

0.8048249006564565

In [94]:
dt = DecisionTreeClassifier(
                            max_depth=3,
                            min_samples_leaf=4,
                            max_features = 10,
                            min_samples_split = 0.2,
                            random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

0.7788393173852373

##### Trying out various different parameters for GridSearchCV (random forests model):

In [109]:
forest = RandomForestClassifier(max_features='sqrt',
                               max_samples = 0.5,
                               random_state=42)
forest.fit(X_train, y_train)

y_hat_train3 = forest.predict(X_train)
y_hat_test3 = forest.predict(X_test)


In [114]:
rf_param_grid = {
    "max_depth": [5, 7, 10],
    "max_features": [5, 10, 15],
    'min_samples_leaf': [4, 6, 8, 10],
    "min_samples_split": [ 0.2, 0.4, 0.6],
}


In [115]:
rf_grid_search = GridSearchCV(forest, rf_param_grid, cv=3)
rf_grid_search.fit(X_train, y_train)

print(f"Testing Accuracy: {rf_grid_search.best_score_ :.2%}")
print("")
print(f"Optimal Parameters: {rf_grid_search.best_params_}")

Testing Accuracy: 82.20%

Optimal Parameters: {'max_depth': 5, 'max_features': 5, 'min_samples_leaf': 4, 'min_samples_split': 0.2}


In [146]:
rf_param_grid2 = {
    "max_depth": [None, 5],
    "max_features": [5, 7, 10],
    'min_samples_leaf': [4, 6, 8, 10]
}


In [147]:
rf_grid_search2 = GridSearchCV(forest, rf_param_grid2, cv=3)
rf_grid_search2.fit(X_train, y_train)

print(f"Testing Accuracy: {rf_grid_search2.best_score_ :.2%}")
print("")
print(f"Optimal Parameters: {rf_grid_search2.best_params_}")

Testing Accuracy: 84.39%

Optimal Parameters: {'max_depth': None, 'max_features': 10, 'min_samples_leaf': 6}


In [148]:
rf = RandomForestClassifier(max_depth=None,
                            max_features = 10,
                            min_samples_leaf= 6,
                            random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_rf)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

0.8329465849729119