In [1]:
# Step 1: Import libraries and load the dataset

import pandas as pd  # For handling data
import numpy as np   # For numerical operations

# Load the dataset from a CSV file located in the notebook's working directory
df = pd.read_csv("spotifydataset.csv")

# Show the shape of the dataset (rows, columns)
print("Dataset shape:", df.shape)

# Display the first 5 rows of the dataset
print("\nFirst 5 rows:")
print(df.head())

# Show info about each column: data type and non-null counts.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called on its own; wrapping it in print() would emit a stray "None".
print("\nDataset info:")
df.info()

# Count the missing values in each column
print("\nMissing values in each column:")
print(df.isnull().sum())


Dataset shape: (2017, 17)

First 5 rows:
   Unnamed: 0  acousticness  danceability  duration_ms  energy  \
0           0        0.0102         0.833       204600   0.434   
1           1        0.1990         0.743       326933   0.359   
2           2        0.0344         0.838       185707   0.412   
3           3        0.6040         0.494       199413   0.338   
4           4        0.1800         0.678       392893   0.561   

   instrumentalness  key  liveness  loudness  mode  speechiness    tempo  \
0          0.021900    2    0.1650    -8.795     1       0.4310  150.062   
1          0.006110    1    0.1370   -10.401     1       0.0794  160.083   
2          0.000234    2    0.1590    -7.148     1       0.2890   75.044   
3          0.510000    5    0.0922   -15.236     1       0.0261   86.468   
4          0.512000    5    0.4390   -11.648     0       0.0694  174.004   

   time_signature  valence  target      song_title            artist  
0               4    0.286       1

In [17]:
# Step 2: Choose the model inputs (X) and the label (y)

# Columns used as predictors, one per line; this order is the column order of X
features = [
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'speechiness',
    'tempo',
    'time_signature',
    'valence',
]

# Feature matrix: every row, restricted to the predictor columns
X = df.loc[:, features]

# Label vector taken from the existing 'target' column.
# This notebook interprets 1 as "popular" and 0 as "not popular".
y = df.loc[:, 'target']

# Confirm the dimensions of both pieces
print(f"Features shape (X): {X.shape}")
print(f"Target shape (y): {y.shape}")

# Preview the first rows of the inputs, then of the labels
print("\n Preview of features (X):", X.head(), sep="\n")
print("\n Preview of target (y):", y.head(), sep="\n")


Features shape (X): (2017, 13)
Target shape (y): (2017,)

 Preview of features (X):
   acousticness  danceability  duration_ms  energy  instrumentalness  key  \
0        0.0102         0.833       204600   0.434          0.021900    2   
1        0.1990         0.743       326933   0.359          0.006110    1   
2        0.0344         0.838       185707   0.412          0.000234    2   
3        0.6040         0.494       199413   0.338          0.510000    5   
4        0.1800         0.678       392893   0.561          0.512000    5   

   liveness  loudness  mode  speechiness    tempo  time_signature  valence  
0    0.1650    -8.795     1       0.4310  150.062               4    0.286  
1    0.1370   -10.401     1       0.0794  160.083               4    0.588  
2    0.1590    -7.148     1       0.2890   75.044               4    0.173  
3    0.0922   -15.236     1       0.0261   86.468               4    0.230  
4    0.4390   -11.648     0       0.0694  174.004               4   

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (80% remain for training);
# the fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each piece of the split
for caption, dims in (
    ("Training features shape:", X_train.shape),
    ("Testing features shape:", X_test.shape),
    ("Training target shape:", y_train.shape),
    ("Testing target shape:", y_test.shape),
):
    print(caption, dims)


Training features shape: (1613, 13)
Testing features shape: (404, 13)
Training target shape: (1613,)
Testing target shape: (404,)


In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build a 100-tree random forest (seeded for repeatable results) and fit it
# on the training split; fit() returns the fitted estimator itself.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out test rows
y_pred = model.predict(X_test)

# Summarise performance: overall accuracy plus per-class precision/recall/F1
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy on Test Set: {accuracy:.2f}")
print("\nClassification Report:", report, sep="\n")


Accuracy on Test Set: 0.77

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.76      0.77       206
           1       0.76      0.77      0.77       198

    accuracy                           0.77       404
   macro avg       0.77      0.77      0.77       404
weighted avg       0.77      0.77      0.77       404



In [23]:
# Predict New Songs' Popularity

In [25]:
# Predict Manually

In [35]:
import numpy as np

# Example new song's features (dummy values), given in the same order as `features`:
# acousticness, danceability, duration_ms, energy, instrumentalness, key, liveness,
# loudness, mode, speechiness, tempo, time_signature, valence
new_song = np.array([[0.35, 0.75, 210000, 0.55, 0.0, 5, 0.1, -6.0, 1, 0.04, 120.0, 4, 0.6]])

# The model was fitted on a DataFrame, so it expects named columns at predict time.
# Labelling the raw values with `features` avoids scikit-learn's
# "X does not have valid feature names" warning and makes the value-to-column
# mapping explicit (the constructor raises if the number of values does not match).
new_song_df = pd.DataFrame(new_song, columns=features)

# Predict popularity (1 = popular, 0 = not popular)
prediction = model.predict(new_song_df)

if prediction[0] == 1:
    print("The song is likely to be POPULAR!")
else:
    print("The song is likely to be NOT popular.")


The song is likely to be NOT popular.




In [29]:
# Test on Real Songs from Your Dataset

In [33]:
# Pick a song by positional row index (e.g., row 10); change this one value to try another song
row_idx = 10

# Indexing with a list ([[row_idx]]) keeps the selection as a one-row DataFrame,
# so the column names the model was fitted with are preserved and predict()
# does not warn about missing feature names.
song_features = df[features].iloc[[row_idx]]

# NOTE: the row is drawn from the full dataset, so it may be one the model was
# trained on; pick a row from X_test for an unbiased check.
prediction = model.predict(song_features)

# Look up the song's display metadata from the same row
song_row = df.iloc[row_idx]
title = song_row['song_title']
artist = song_row['artist']

if prediction[0] == 1:
    print(f"'{title}' by {artist} is likely to be POPULAR!")
else:
    print(f"'{title}' by {artist} is likely to be NOT popular.")


'Subways - In Flagranti Extended Edit' by The Avalanches is likely to be POPULAR!


