# Loading Dataset

In [None]:
# Install the `datasets` package (provides `load_dataset`, imported below) into the kernel's environment.
%pip install datasets

Traceback (most recent call last):
  File "/usr/local/bin/pip3", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/main.py", line 78, in main
    command = create_command(cmd_name, isolated=("--isolated" in cmd_args))
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/__init__.py", line 114, in create_command
    module = importlib.import_module(module_path)
  File "/usr/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_

In [None]:
import seaborn as sns
import matplotlib as plt
import pandas as pd
import numpy as np
import plotly.express as px

from datasets import load_dataset

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Download the Spotify tracks dataset from the Hugging Face Hub and convert it
# to a pandas DataFrame for easier EDA.

HF_SPOTIFY_DATASET_PATH = "maharshipandya/spotify-tracks-dataset"
# https://huggingface.co/datasets/maharshipandya/spotify-tracks-dataset

ds = load_dataset(HF_SPOTIFY_DATASET_PATH)  # only has 'train' split as key

# Dataset.to_pandas() converts the underlying table in one step, avoiding the
# slow Python-level row iteration that pd.DataFrame(ds["train"]) performs.
df = ds["train"].to_pandas()
df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [None]:
# Display only the columns that hold exactly two distinct values.
# dropna=False keeps missing values in the count, matching len(Series.unique()).
distinct_counts = df.nunique(dropna=False)
binary_columns = distinct_counts[distinct_counts == 2].index
df[binary_columns]

Unnamed: 0,explicit,mode
0,False,0
1,False,1
2,False,1
3,False,1
4,False,1
...,...,...
113995,False,1
113996,False,0
113997,False,0
113998,False,1


In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis="index", how="any")

In [None]:
# Count both classes of the 'explicit' label and use their ratio
# (explicit / not explicit, rounded to 3 decimals) as the decision threshold.
explicit_flags = df['explicit']
num_explicit = explicit_flags.eq(1).sum()
num_not_explicit = explicit_flags.eq(0).sum()
THRESHOLD = round(num_explicit / num_not_explicit, 3)
print(f"ratio of explicit to not explicit: {THRESHOLD}")


ratio of explicit to not explicit: 0.093


There appears to be a significant class imbalance. Therefore, our threshold for classifying a track as explicit will be 0.093 instead of 0.5.

# Partitioning the Dataset

In [None]:
# Response variable for the binary classification task.
y = df['explicit']

# Predictors: the three audio features the author selected for having the
# highest absolute correlation with 'explicit' (> |0.1|).
FEATURE_COLUMNS = ['speechiness', 'danceability', 'instrumentalness']
X = df[FEATURE_COLUMNS]

# Step 1: hold out 20% of all rows as the test set.
x, X_test, Y, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Step 2: carve 25% of the remaining 80% off as the validation set,
# giving a 60/20/20 train/validation/test partition overall.
X_train, X_val, y_train, y_val = train_test_split(x, Y, random_state=42, test_size=0.25)


In [None]:
# Sanity-check the partition: print the shape of each feature split, then each label split.
for split in (X_train, X_test, X_val, y_train, y_test, y_val):
    print(split.shape)

(68399, 3)
(22800, 3)
(22800, 3)
(68399,)
(22800,)
(22800,)


# Initializing Models

## KNN

In [None]:
# Evaluate KNN for every k from 1 to 9 with 5-fold cross-validation on the
# training set, recording the mean score per k so the best k can be chosen.
k_values = list(range(1, 10))
scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    # cross_val_score returns one score per fold; average them once and reuse the value.
    score = cross_val_score(knn, X_train, y_train, cv=5)
    mean_score = np.mean(score)
    scores.append(mean_score)
    print("K:", k, "score:", mean_score)

K: 1 score: 0.902966465008685
K: 2 score: 0.9195310137321518
K: 3 score: 0.9088291043726479
K: 4 score: 0.9169139809217954
K: 5 score: 0.9121186005013021
K: 6 score: 0.9168408569245811
K: 7 score: 0.9140922986060076
K: 8 score: 0.9170455588003807
K: 9 score: 0.9151157032442695


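We choose K = 4. K = 2 has the highest cross-validation score, but such a small K is likely overfitting the data set, so K = 4 is a safer bet. (K = 8 scores marginally higher than K = 4, 0.9170 vs. 0.9169, but the difference is negligible.)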

In [None]:
# Fit the final KNN model on the training set with the k chosen above (k=4).
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)

# Predicted class probabilities on the validation set; keep only column 1,
# the probability of the positive (explicit) class.
knn_val_pred = knn.predict_proba(X_val)[:, 1]

## CART


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [None]:
# Tune min_samples_split for the decision tree: try 10, 20, ..., 100 at a fixed
# max_depth of 30 and record the mean 5-fold cross-validation score of each.
min_samples_range = np.arange(10, 101, 10)
samples_scores = []

for samples in min_samples_range:
    # random_state is fixed so the search gives the same scores on every run
    # (the tree permutes features at each split, so unseeded fits can differ).
    c = DecisionTreeClassifier(min_samples_split=samples, max_depth=30, random_state=42)
    score = cross_val_score(c, X_train, y_train, cv=5)
    mean_score = np.mean(score)
    samples_scores.append(mean_score)
    print("samples:", samples, "score:", mean_score)

samples: 10 score: 0.9028055736179887
samples: 20 score: 0.9061243501264797
samples: 30 score: 0.9089460110666071
samples: 40 score: 0.9102471881387315
samples: 50 score: 0.9110366778546874
samples: 60 score: 0.9116653531323251
samples: 70 score: 0.9121916892286764
samples: 80 score: 0.9124256137700295
samples: 90 score: 0.9124402422033346
samples: 100 score: 0.9122648089507587


We use a `min_samples_split` of 70; the cross-validation score changes very little beyond this point.

In [None]:
# Tune max_depth for the decision tree: try 3, 6, ..., 30 with min_samples_split
# held at 70 and record the mean 5-fold cross-validation score of each.
depth_range = np.arange(3, 31, 3)
depth_scores = []

for d in depth_range:
    # random_state is fixed so the search gives the same scores on every run
    # (the tree permutes features at each split, so unseeded fits can differ).
    c = DecisionTreeClassifier(min_samples_split=70, max_depth=d, random_state=42)
    score = cross_val_score(c, X_train, y_train, cv=5)
    mean_score = np.mean(score)
    depth_scores.append(mean_score)
    print("depth:", d, "score:", mean_score)

depth: 3 score: 0.9146624922139157
depth: 6 score: 0.9147209685397304
depth: 9 score: 0.9152472907419021
depth: 12 score: 0.9149110120563
depth: 15 score: 0.9137560434336323
depth: 18 score: 0.9130542708783558
depth: 21 score: 0.9123378901966518
depth: 24 score: 0.9122501676920571
depth: 27 score: 0.9122209289947583
depth: 30 score: 0.9122209289947583


The best depth is 9.

In [None]:
# Fit the final decision tree with the tuned hyperparameters
# (min_samples_split=70, max_depth=9). random_state is fixed so the fitted
# tree, and therefore the validation metrics, are reproducible across runs.
cart = DecisionTreeClassifier(min_samples_split=70, max_depth=9, random_state=42)
cart.fit(X_train, y_train)

# Predicted class probabilities on the validation set; keep only column 1,
# the probability of the positive (explicit) class.
cart_val_pred = cart.predict_proba(X_val)[:, 1]

## RF

In [None]:
# Fit a random forest on the training set; only the seed is set, every other
# hyperparameter is left at its library default.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted class probabilities on the validation set; keep only column 1,
# the probability of the positive (explicit) class.
rf_val_pred = rf.predict_proba(X_val)[:, 1]

# Evaluating CART vs KNN vs RF

In [None]:
def _validation_summary(model_name, positive_scores):
    """Return one result row for a model scored on the validation set.

    model_name: label stored in the 'model' column.
    positive_scores: predicted probability of the positive class for each row of X_val.

    Accuracy, true positive rate and true negative rate are computed after
    cutting the probabilities at THRESHOLD; AUC uses the raw probabilities.
    """
    predicted_positive = positive_scores > THRESHOLD
    return {'model': model_name,
            'accuracy': metrics.accuracy_score(y_val, predicted_positive),
            'true_positive_rate': metrics.recall_score(y_val, predicted_positive),
            # recall of the negative class: flip both the labels and the predictions
            'true_negative_rate': metrics.recall_score(y_val == False, positive_scores <= THRESHOLD),
            'auc': metrics.roc_auc_score(y_val, positive_scores)}


# one row per fitted model, in the order cart, rf, knn
val_results = [
    _validation_summary(model_name, model_scores)
    for model_name, model_scores in (('cart', cart_val_pred),
                                     ('rf', rf_val_pred),
                                     ('knn', knn_val_pred))
]

# collect the rows into a data frame and display it
val_results_df = pd.DataFrame(val_results)
val_results_df

Unnamed: 0,model,accuracy,true_positive_rate,true_negative_rate,auc
0,cart,0.782018,0.638947,0.795024,0.792101
1,rf,0.823684,0.742105,0.8311,0.862067
2,knn,0.818772,0.678947,0.831483,0.776628


In [None]:
def _roc_frame(fpr, tpr, label):
    """Build a long-format frame of ROC points tagged with the model label."""
    return pd.DataFrame({
        'False Positive Rate': fpr,
        'True Positive Rate': tpr,
        'Model': label
    })


# ROC curve points (false positive rate, true positive rate, thresholds) per model,
# computed from the validation-set probabilities
cart_fpr, cart_tpr, cart_thresholds = metrics.roc_curve(y_val, cart_val_pred)
rf_fpr, rf_tpr, rf_thresholds = metrics.roc_curve(y_val, rf_val_pred)
knn_fpr, knn_tpr, knn_thresholds = metrics.roc_curve(y_val, knn_val_pred)

# one frame per model
roc_cart = _roc_frame(cart_fpr, cart_tpr, 'CART')
roc_rf = _roc_frame(rf_fpr, rf_tpr, 'RF')
roc_knn = _roc_frame(knn_fpr, knn_tpr, 'KNN')

# stack the per-model frames (RF first, then CART, then KNN)
roc_df = pd.concat([roc_rf, roc_cart, roc_knn])

# draw all three ROC curves on one figure, one colour per model
px.line(roc_df, x='False Positive Rate', y='True Positive Rate',
        color='Model',
        width=700, height=500)
