In [1]:

import numpy as np
import pandas as pd
from joblib import dump
from matplotlib import pyplot as plt
from numpy import sqrt, argmax, save
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import auc, precision_recall_curve, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier



In [2]:
# Load the merged song dataset and turn the numeric popularity score into a
# binary label: 1 when the score is at least `popularity_threshold`, else 0.
# NOTE(review): the CSV path is absolute and machine-specific; consider a path
# relative to the project root so the notebook runs on other machines.
data = pd.read_csv(r'C:\Users\Neil\OneDrive\Desktop\song-popularity-prediction-master\Dataset\data_merged.csv')
popularity_threshold = 42
# Vectorized comparison instead of a per-row Python lambda; the boolean mask is
# cast to int64 so the label column holds 0/1 integers.
data["popularity"] = (data["popularity"] >= popularity_threshold).astype("int64")



In [3]:
# Separate the label column from the model inputs.
# 'mode' is left out of the inputs together with the label itself.
target = data['popularity']
features = data.drop(columns=['popularity', 'mode'])



In [4]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions approximately equal in both partitions, and the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)



In [None]:
# Candidate hyperparameter values explored by the grid search
# (3 x 3 x 4 = 36 combinations).
hyperparameters = {
    'max_depth': [8, 10, 12],
    'min_samples_split': [8, 10, 12],
    'min_samples_leaf': [1, 2, 3, 4],
}

# Seed the tree: scikit-learn randomly permutes the features at every split, so
# without a fixed random_state equally good splits can be picked differently
# between runs and the search result would not be reproducible.
dt_model = DecisionTreeClassifier(random_state=42)
# Exhaustive cross-validated search over the grid, using GridSearchCV's default
# CV scheme and the estimator's default scorer.
grid_search = GridSearchCV(estimator=dt_model, param_grid=hyperparameters)
grid_search.fit(X_train, y_train)



In [None]:
# Report the complete parameter set of the best estimator found by the search
# (get_params() returns every constructor parameter, not only the tuned ones).
best_tree = grid_search.best_estimator_
optimal_params = best_tree.get_params()
print(f"Optimal parameters: {optimal_params}")



In [None]:
# Fit a standalone tree on the training split using the tuned values.
# random_state is fixed so repeated runs build the same tree; scikit-learn
# otherwise breaks ties between equally good splits at random.
final_model = DecisionTreeClassifier(
    max_depth=optimal_params['max_depth'],
    min_samples_split=optimal_params['min_samples_split'],
    min_samples_leaf=optimal_params['min_samples_leaf'],
    criterion='gini',
    random_state=42,
)
final_model.fit(X_train, y_train)



In [None]:
# Persist the fitted model to disk with joblib (`dump` is imported in the
# notebook's import cell).
# NOTE(review): the destination is an absolute, machine-specific path and its
# directory must already exist; consider a project-relative path.
dump(final_model, r'C:\Users\Neil\OneDrive\Desktop\song-popularity-prediction-master\models\optimized_decision_tree_model.joblib')



In [None]:
# Score the fitted tree on both splits. Train accuracy is printed next to test
# accuracy so a large gap (overfitting) is easy to spot; the confusion matrix
# and per-class report are computed on the test split only.
y_pred_train = final_model.predict(X_train)
y_pred_test = final_model.predict(X_test)

test_confusion = confusion_matrix(y_test, y_pred_test)
test_report = classification_report(y_test, y_pred_test)

print(f"Test Accuracy: {accuracy_score(y_test, y_pred_test)}")
print(f"Train Accuracy: {accuracy_score(y_train, y_pred_train)}")
print(test_confusion)
print(test_report)



In [None]:
# ROC curve and AUC on the test split (`roc_curve` and `roc_auc_score` are
# imported in the notebook's import cell).
# Column 1 of predict_proba is the probability of the second entry in
# final_model.classes_, i.e. the positive label 1 for this 0/1 target.
predicted_probabilities = final_model.predict_proba(X_test)[:, 1]
fpr, tpr, roc_thresholds = roc_curve(y_test, predicted_probabilities)
roc_auc = roc_auc_score(y_test, predicted_probabilities)
print(f"AUC: {roc_auc}")



In [None]:
# Persist the ROC points for later reuse (np.save appends ".npy" to the given
# names and writes relative to the current working directory).
np.save("fpr_dt", fpr)
np.save("tpr_dt", tpr)

# Draw the ROC curve against the dashed chance diagonal.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--')
ax.plot(fpr, tpr, marker='.')
ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve',
)
plt.show()



In [None]:
# Precision-recall curve on the test split (`precision_recall_curve` and `auc`
# are imported in the notebook's import cell).
precision, recall, pr_thresholds = precision_recall_curve(y_test, predicted_probabilities)
# Area under the PR curve, integrated with the trapezoidal rule over recall.
pr_auc = auc(recall, precision)
print(f"Precision-Recall AUC: {pr_auc}")

# Persist the curve points for later reuse (np.save appends ".npy").
np.save("precision_dt", precision)
np.save("recall_dt", recall)

# Plot precision against recall using the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(recall, precision, marker='.')
ax.set(xlabel='Recall', ylabel='Precision', title='Precision-Recall Curve')
plt.show()
