In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

### Load the data

In [2]:
# Load the data
final_tracks_df = pd.read_csv('CSVs/final_CM_consolidatedTracksForML.csv', low_memory=False)
new_tracks_df = pd.read_csv('CSVs/Final_ML_ListOfSpotifyTracksThatArePotentialNewHits.csv', low_memory=False)

### Process both dataframes with the same encoding and cleaning steps

In [3]:
# Combine for uniform preprocessing
combined_df = pd.concat([final_tracks_df, new_tracks_df])

# Handle categorical variables using Label Encoding for simplicity
cat_columns = combined_df.select_dtypes(include=['object']).columns

In [4]:
# Apply Label Encoder to each categorical column
label_encoders = {}
for col in cat_columns:
    le = LabelEncoder()
    combined_df[col] = le.fit_transform(combined_df[col].astype(str))  # Ensure conversion to string
    label_encoders[col] = le  # Store the label encoder

# Split combined_df back into original datasets
final_tracks_df = combined_df.iloc[:len(final_tracks_df)]
new_tracks_df = combined_df.iloc[len(final_tracks_df):]

# Check for missing values and fill them
combined_df.fillna(combined_df.mean(), inplace=True)

### Final df and train/test split

In [5]:
# Define features and target in the training data
X = final_tracks_df.drop('IsHit', axis=1)
Y = final_tracks_df['IsHit']

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

## Classification

In [6]:
# Initialize and train the classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, Y_train)

# Predict on the testing data from final_tracks_df
predictions = model.predict(X_test)

### Metrics

In [7]:
# Calculate metrics
precision = precision_score(Y_test, predictions)
recall = recall_score(Y_test, predictions)
f1 = f1_score(Y_test, predictions)
accuracy = accuracy_score(Y_test, predictions)

# Print out the scores
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")

Precision: 0.9287387085981934
Recall: 0.8821099459802987
F1 Score: 0.9048239895697523
Accuracy: 0.8921514312096029


### Predicting 'IsHit' for new tracks

In [8]:
# Now predict the IsHit status for new tracks
new_tracks_features = new_tracks_df.drop('IsHit', axis=1, errors='ignore')  # Drop IsHit if it exists
new_hits_predictions = model.predict(new_tracks_features)

# Append predictions to the new_tracks_df
new_tracks_df['PredictedIsHit'] = new_hits_predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_tracks_df['PredictedIsHit'] = new_hits_predictions


## Save predictions

In [9]:
# Save or display predictions
new_tracks_df.to_csv('CSVs/Final_PredictedNewHits.csv', index=False)
print("Predictions for new tracks saved to 'Final_PredictedNewHits.csv'.")

Predictions for new tracks saved to 'Final_PredictedNewHits.csv'.
