In [10]:
# Step 1: Import libraries and load the dataset

import pandas as pd  # For handling data
import numpy as np   # For numerical operations

# Load the dataset
df = pd.read_csv("spotifydataset.csv")

# Show the shape of the dataset (rows, columns)
print("Dataset shape:", df.shape)

# Display the first 5 rows of the dataset
print("\nFirst 5 rows:")
print(df.head())

# Show info about each column: data type and non-null values
print("\nDataset info:")
print(df.info())

# Check if any column has missing values
print("\nMissing values in each column:")
print(df.isnull().sum())


Dataset shape: (2017, 17)

First 5 rows:
   Unnamed: 0  acousticness  danceability  duration_ms  energy  \
0           0        0.0102         0.833       204600   0.434   
1           1        0.1990         0.743       326933   0.359   
2           2        0.0344         0.838       185707   0.412   
3           3        0.6040         0.494       199413   0.338   
4           4        0.1800         0.678       392893   0.561   

   instrumentalness  key  liveness  loudness  mode  speechiness    tempo  \
0          0.021900    2    0.1650    -8.795     1       0.4310  150.062   
1          0.006110    1    0.1370   -10.401     1       0.0794  160.083   
2          0.000234    2    0.1590    -7.148     1       0.2890   75.044   
3          0.510000    5    0.0922   -15.236     1       0.0261   86.468   
4          0.512000    5    0.4390   -11.648     0       0.0694  174.004   

   time_signature  valence  target      song_title            artist  
0               4    0.286       1

In [11]:
# Step 2: Select Features and Target for the Classifier

# Define the list of feature columns we'll use to predict the song's popularity
features = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
    'speechiness', 'tempo', 'time_signature', 'valence'
]

# Create the input feature matrix X from the DataFrame using the selected features
X = df[features]

# Define the target variable y using the existing 'target' column
# This column should contain 0 (not popular) or 1 (popular)
y = df['target']

# Print the shapes of X and y to confirm
print("Features shape (X):", X.shape)
print("Target shape (y):", y.shape)

# Print a preview of the input features
print("\n Preview of features (X):")
print(X.head())

# Print a preview of the target values
print("\n Preview of target (y):")
print(y.head())


Features shape (X): (2017, 13)
Target shape (y): (2017,)

 Preview of features (X):
   acousticness  danceability  duration_ms  energy  instrumentalness  key  \
0        0.0102         0.833       204600   0.434          0.021900    2   
1        0.1990         0.743       326933   0.359          0.006110    1   
2        0.0344         0.838       185707   0.412          0.000234    2   
3        0.6040         0.494       199413   0.338          0.510000    5   
4        0.1800         0.678       392893   0.561          0.512000    5   

   liveness  loudness  mode  speechiness    tempo  time_signature  valence  
0    0.1650    -8.795     1       0.4310  150.062               4    0.286  
1    0.1370   -10.401     1       0.0794  160.083               4    0.588  
2    0.1590    -7.148     1       0.2890   75.044               4    0.173  
3    0.0922   -15.236     1       0.0261   86.468               4    0.230  
4    0.4390   -11.648     0       0.0694  174.004               4   

In [12]:
from sklearn.model_selection import train_test_split

# Split data: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the shapes of the splits
print("Training features shape:", X_train.shape)
print("Testing features shape:", X_test.shape)
print("Training target shape:", y_train.shape)
print("Testing target shape:", y_test.shape)


Training features shape: (1613, 13)
Testing features shape: (404, 13)
Training target shape: (1613,)
Testing target shape: (404,)


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy on Test Set: {accuracy:.2f}")
print("\nClassification Report:")
print(report)


Accuracy on Test Set: 0.77

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.76      0.77       206
           1       0.76      0.78      0.77       198

    accuracy                           0.77       404
   macro avg       0.77      0.77      0.77       404
weighted avg       0.77      0.77      0.77       404



In [14]:
#Predict New Songs' Popularity

In [15]:
# Predict Manually

In [16]:
import numpy as np

# Example new song's features (dummy values)
# Make sure the order of features matches X.columns
new_song = np.array([[0.35, 0.75, 210000, 0.55, 0.0, 5, 0.1, -6.0, 1, 0.04, 120.0, 4, 0.6]])

# Predict popularity
prediction = model.predict(new_song)

if prediction[0] == 1:
    print("The song is likely to be POPULAR!")
else:
    print("The song is likely to be NOT popular.")


The song is likely to be NOT popular.




In [17]:
#Test on Real Songs from Your Dataset

In [18]:
# Pick a song row (e.g., row 10)
song_features = df[features].iloc[10].values.reshape(1, -1)

# Predict
prediction = model.predict(song_features)
title = df.iloc[10]['song_title']
artist = df.iloc[10]['artist']

if prediction[0] == 1:
    print(f"'{title}' by {artist} is likely to be POPULAR!")
else:
    print(f"'{title}' by {artist} is likely to be NOT popular.")


'Subways - In Flagranti Extended Edit' by The Avalanches is likely to be POPULAR!




In [19]:
#Improve Model Performance
#Check Feature Importance
import pandas as pd
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print(feature_importances)


             Feature  Importance
4   instrumentalness    0.135026
7           loudness    0.124215
9        speechiness    0.103947
3             energy    0.096407
2        duration_ms    0.092485
1       danceability    0.092418
0       acousticness    0.090106
12           valence    0.078812
10             tempo    0.069444
6           liveness    0.064945
5                key    0.037040
8               mode    0.010887
11    time_signature    0.004267


In [20]:
# retraining the data after finding out the important features

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# ----------------------
# 1. Load dataset
# ----------------------
df = pd.read_csv("spotifydataset.csv")
df = df.dropna()

# ----------------------
# 2. Separate target and features
# ----------------------
target_col = "target"  # change this to your actual target column
y = df[target_col]
X = df.drop(columns=[target_col])

# ----------------------
# 3. Keep only numeric columns for ML
# ----------------------
X = X.select_dtypes(include=["number"])

# ----------------------
# 4. Train-test split
# ----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ----------------------
# 5. Random Forest for feature importance
# ----------------------
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

importances = rf.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print("Feature Importance:")
print(feature_importance_df)

# ----------------------
# 6. Retrain with top features
# ----------------------
top_n = 5
top_features = feature_importance_df['Feature'].head(top_n)

rf_top = RandomForestClassifier(random_state=42)
rf_top.fit(X_train[top_features], y_train)

y_pred = rf_top.predict(X_test[top_features])

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Feature Importance:
             Feature  Importance
0         Unnamed: 0    0.781442
5   instrumentalness    0.048220
8           loudness    0.031081
1       acousticness    0.022988
4             energy    0.021247
10       speechiness    0.019680
2       danceability    0.019432
3        duration_ms    0.018254
13           valence    0.012834
11             tempo    0.010873
7           liveness    0.007591
6                key    0.004682
9               mode    0.001175
12    time_signature    0.000501

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       206
           1       1.00      1.00      1.00       198

    accuracy                           1.00       404
   macro avg       1.00      1.00      1.00       404
weighted avg       1.00      1.00      1.00       404



In [22]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# -----------------------
# Load your dataset
# -----------------------
df = pd.read_csv("spotifydataset.csv")  # change to your CSV name

# -----------------------
# Automatically encode non-numeric columns
# -----------------------
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

# -----------------------
# Define features and target
# -----------------------
X = df.drop("target", axis=1)  # replace target_column with your target column name
y = df["target"]

# -----------------------
# Train-test split
# -----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------
# Train model
# -----------------------
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# -----------------------
# Feature importance
# -----------------------
feature_importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": model.feature_importances_
}).sort_values(by="Importance", ascending=False)

print("\nFeature Importance:\n", feature_importance)

# -----------------------
# Accuracy & classification report
# -----------------------
y_pred = model.predict(X_test)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Feature Importance:
              Feature  Importance
0         Unnamed: 0    0.755383
5   instrumentalness    0.047639
8           loudness    0.038610
2       danceability    0.024267
3        duration_ms    0.021402
1       acousticness    0.021355
10       speechiness    0.020814
4             energy    0.020673
13           valence    0.013620
15            artist    0.008904
11             tempo    0.008728
7           liveness    0.007559
14        song_title    0.006256
6                key    0.003341
9               mode    0.001024
12    time_signature    0.000426

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       206
           1       1.00      1.00      1.00       198

    accuracy                           1.00       404
   macro avg       1.00      1.00      1.00       404
weighted avg       1.00      1.00      1.00       404



In [23]:
# Drop ID-like columns that leak info
cols_to_drop = ["Unnamed: 0", "song_title", "artist"]  # you can add/remove as needed
df = df.drop(columns=cols_to_drop, errors='ignore')

# Redo train-test split & training here...


In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("spotifydataset.csv")

# Drop non-numeric columns (optional if your data is already clean)
df = df.select_dtypes(include=['number'])

# Define features (X) and target (y)
X = df.drop(columns=['target'])  # replace with your actual target column name
y = df['target']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and train RandomForest
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Feature importance
importances = pd.Series(model.feature_importances_, index=X.columns)
importances = importances.sort_values(ascending=False)
print("\nFeature Importances:\n", importances)


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       206
           1       1.00      1.00      1.00       198

    accuracy                           1.00       404
   macro avg       1.00      1.00      1.00       404
weighted avg       1.00      1.00      1.00       404


Feature Importances:
 Unnamed: 0          0.781442
instrumentalness    0.048220
loudness            0.031081
acousticness        0.022988
energy              0.021247
speechiness         0.019680
danceability        0.019432
duration_ms         0.018254
valence             0.012834
tempo               0.010873
liveness            0.007591
key                 0.004682
mode                0.001175
time_signature      0.000501
dtype: float64


In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load data
df = pd.read_csv("spotifydataset.csv")

# Drop leakage/index columns
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Features & target
X = df.drop(columns=["target"])
y = df["target"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Feature importances
importances = pd.Series(model.feature_importances_, index=X.columns)
print("\nFeature Importances:\n", importances.sort_values(ascending=False))


ValueError: could not convert string to float: 'Pleasure Power (DJ Smash Disco Remix)'

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load data
df = pd.read_csv("spotifydataset.csv")

# Drop leakage/index columns
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Features & target
X = df.drop(columns=["target"])
y = df["target"]

# Keep only numeric columns
X = X.select_dtypes(include=["number"])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Feature importances
importances = pd.Series(model.feature_importances_, index=X.columns)
print("\nFeature Importances:\n", importances.sort_values(ascending=False))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Load dataset
file_path = ("spotifydataset.csv")
data = pd.read_csv(file_path)

# Encode categorical features
label_encoders = {}
for col in ["artist", "song_title"]:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le  # store encoder if needed later

# Define features & target
X = data.drop(columns=["target"])
y = data["target"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# RandomForest with tuned parameters
model = RandomForestClassifier(
    n_estimators=300,       # more trees
    max_depth=15,           # limit depth to prevent overfitting
    min_samples_split=5,    # require more samples to split
    min_samples_leaf=2,     # leaf size
    random_state=42
)

# Train model
model.fit(X_train, y_train)

# Evaluate accuracy
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)

print(f"Training Accuracy: {train_acc:.4f}")
print(f"Testing Accuracy: {test_acc:.4f}")


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv("spotifydataset.csv")

# Target column
target_col = "target"

# Step 1: Detect leakage columns
def find_leakage_columns(df, target_col, threshold=0.999):
    leakage_cols = []
    for col in df.columns:
        if col != target_col and pd.api.types.is_numeric_dtype(df[col]):
            corr = df[col].corr(df[target_col])
            if abs(corr) >= threshold:  # suspiciously high correlation
                leakage_cols.append(col)
    return leakage_cols

# Step 2: Detect ID-like columns
def find_id_columns(df):
    id_like_cols = []
    for col in df.columns:
        if df[col].nunique() == len(df):  # unique for every row
            id_like_cols.append(col)
        elif "id" in col.lower() or "index" in col.lower():
            id_like_cols.append(col)
    return id_like_cols

# Find leakage & ID-like columns
leakage_columns = find_leakage_columns(df, target_col)
id_like_columns = find_id_columns(df)

# Combine and drop duplicates
columns_to_drop = list(set(leakage_columns + id_like_columns))

print("🔍 Detected leakage columns:", leakage_columns)
print("🔍 Detected ID-like columns:", id_like_columns)
print("🗑 Dropping columns:", columns_to_drop)

# Drop leakage columns
df_clean = df.drop(columns=columns_to_drop)

# Step 3: Train-Test split
X = df_clean.drop(columns=[target_col])
y = df_clean[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 5: Evaluate
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("\n✅ Training Accuracy:", accuracy_score(y_train, y_train_pred))
print("✅ Testing Accuracy:", accuracy_score(y_test, y_test_pred))


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv("spotifydataset.csv")

target_col = "target"

# Step 1: Detect leakage columns
def find_leakage_columns(df, target_col, threshold=0.999):
    leakage_cols = []
    for col in df.columns:
        if col != target_col and pd.api.types.is_numeric_dtype(df[col]):
            corr = df[col].corr(df[target_col])
            if abs(corr) >= threshold:  # suspiciously high correlation
                leakage_cols.append(col)
    return leakage_cols

# Step 2: Detect ID-like columns
def find_id_columns(df):
    id_like_cols = []
    for col in df.columns:
        if df[col].nunique() == len(df):  # all unique
            id_like_cols.append(col)
        elif "id" in col.lower() or "index" in col.lower():
            id_like_cols.append(col)
    return id_like_cols

# Step 3: Detect non-numeric columns (to drop or encode)
def find_non_numeric_columns(df, target_col):
    return [col for col in df.columns if col != target_col and not pd.api.types.is_numeric_dtype(df[col])]

# Find problematic columns
leakage_columns = find_leakage_columns(df, target_col)
id_like_columns = find_id_columns(df)
non_numeric_columns = find_non_numeric_columns(df, target_col)

# Combine drop list
columns_to_drop = list(set(leakage_columns + id_like_columns + non_numeric_columns))

print("🔍 Detected leakage columns:", leakage_columns)
print("🔍 Detected ID-like columns:", id_like_columns)
print("🔍 Detected non-numeric columns:", non_numeric_columns)
print("🗑 Dropping columns:", columns_to_drop)

# Drop the problematic columns
df_clean = df.drop(columns=columns_to_drop)

# Train-test split
X = df_clean.drop(columns=[target_col])
y = df_clean[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("\n✅ Training Accuracy:", accuracy_score(y_train, y_train_pred))
print("✅ Testing Accuracy:", accuracy_score(y_test, y_test_pred))


In [None]:
#

In [33]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import numpy as np

# Assuming you already have X (features) and y (target)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# XGBoost model
xgb_model = xgb.XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)

# Parameter grid for tuning
param_dist = {
    'n_estimators': np.arange(50, 500, 50),           # Number of trees
    'max_depth': np.arange(2, 12, 1),                 # Tree depth
    'learning_rate': np.linspace(0.01, 0.3, 10),      # Step size shrinkage
    'subsample': np.linspace(0.6, 1.0, 5),            # Fraction of samples
    'colsample_bytree': np.linspace(0.6, 1.0, 5),     # Fraction of features
    'gamma': np.linspace(0, 5, 10),                   # Minimum loss reduction
    'reg_lambda': np.linspace(0.1, 5, 10),            # L2 regularization
    'reg_alpha': np.linspace(0, 5, 10)                # L1 regularization
}

# Randomized Search
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=50,                # Number of random combinations
    scoring='accuracy',
    cv=5,                     # 5-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the search
random_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", random_search.best_params_)

# Evaluate
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


ModuleNotFoundError: No module named 'xgboost'

In [35]:
!pip install xgboost




In [40]:
import sys
print(sys.executable)


C:\Users\Dell\anaconda3\envs\tf_env\python.exe


In [42]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import numpy as np

# Assuming you already have X (features) and y (target)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# XGBoost model
xgb_model = xgb.XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)

# Parameter grid for tuning
param_dist = {
    'n_estimators': np.arange(50, 500, 50),           # Number of trees
    'max_depth': np.arange(2, 12, 1),                 # Tree depth
    'learning_rate': np.linspace(0.01, 0.3, 10),      # Step size shrinkage
    'subsample': np.linspace(0.6, 1.0, 5),            # Fraction of samples
    'colsample_bytree': np.linspace(0.6, 1.0, 5),     # Fraction of features
    'gamma': np.linspace(0, 5, 10),                   # Minimum loss reduction
    'reg_lambda': np.linspace(0.1, 5, 10),            # L2 regularization
    'reg_alpha': np.linspace(0, 5, 10)                # L1 regularization
}

# Randomized Search
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=50,                # Number of random combinations
    scoring='accuracy',
    cv=5,                     # 5-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the search
random_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", random_search.best_params_)

# Evaluate
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 50 candidates, totalling 250 fits


ValueError: 
All the 250 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
250 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\data.py", line 407, in pandas_feature_info
    new_feature_types.append(_pandas_dtype_mapper[dtype.name])
KeyError: 'object'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\sklearn.py", line 1664, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\sklearn.py", line 628, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\sklearn.py", line 1137, in _create_dmatrix
    return QuantileDMatrix(
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\core.py", line 1614, in __init__
    self._init(
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\core.py", line 1678, in _init
    it.reraise()
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\core.py", line 572, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\core.py", line 553, in _handle_exception
    return fn()
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\core.py", line 640, in <lambda>
    return self._handle_exception(lambda: int(self.next(input_data)), 0)
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\data.py", line 1654, in next
    input_data(**self.kwargs)
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\core.py", line 620, in input_data
    new, cat_codes, feature_names, feature_types = _proxy_transform(
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\data.py", line 1707, in _proxy_transform
    df, feature_names, feature_types = _transform_pandas_df(
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\data.py", line 640, in _transform_pandas_df
    feature_names, feature_types = pandas_feature_info(
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\data.py", line 409, in pandas_feature_info
    _invalid_dataframe_dtype(data)
  File "C:\Users\Dell\anaconda3\envs\tf_env\lib\site-packages\xgboost\data.py", line 372, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:song_title: object, artist: object


In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# ===== Load Dataset =====
df = pd.read_csv("spotifydataset.csv")

# ===== Detect Leakage Columns =====
def detect_leakage(df):
    leakage_cols = []
    id_like_cols = []
    non_numeric_cols = []

    for col in df.columns:
        if df[col].nunique() == df.shape[0]:  # Unique values = rows → ID-like
            id_like_cols.append(col)
        elif any(keyword in col.lower() for keyword in ["id", "uuid", "number"]):
            id_like_cols.append(col)

        if not np.issubdtype(df[col].dtype, np.number):
            non_numeric_cols.append(col)

    return leakage_cols, id_like_cols, non_numeric_cols

leakage_cols, id_like_cols, non_numeric_cols = detect_leakage(df)

# Columns to drop
drop_cols = list(set(leakage_cols + id_like_cols + non_numeric_cols))
print(f"🗑 Dropping columns: {drop_cols}")

df_clean = df.drop(columns=drop_cols)

# ===== Features & Target =====
X = df_clean.drop(columns=["target"])  # Change 'target' to your label column
y = df_clean["target"]

# ===== Train-Test Split =====
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ===== Model & Hyperparameter Tuning =====
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.3],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

xgb = XGBClassifier(eval_metric="mlogloss", random_state=42)
grid_search = GridSearchCV(xgb, param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# ===== Best Params & Accuracy =====
print("🎯 Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

print(f"✅ Training Accuracy: {accuracy_score(y_train, y_pred_train):.4f}")
print(f"✅ Testing Accuracy: {accuracy_score(y_test, y_pred_test):.4f}")


🗑 Dropping columns: ['artist', 'Unnamed: 0', 'song_title']
Fitting 3 folds for each of 108 candidates, totalling 324 fits
🎯 Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
✅ Training Accuracy: 0.9616
✅ Testing Accuracy: 0.7723
