In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.utils import resample

# Load the data
final_tracks_df = pd.read_csv('final_CM_consolidatedTracksForML.csv', low_memory=False)
new_tracks_df = pd.read_csv('ListOfSpotifyTracksThatArePotentialNewHits.csv', low_memory=False)

# Process both dataframes with the same encoding and cleaning steps
combined_df = pd.concat([final_tracks_df, new_tracks_df], ignore_index=True)


# Handle categorical variables using Label Encoding for simplicity
cat_columns = combined_df.select_dtypes(include=['object']).columns

# Apply Label Encoder to each categorical column
label_encoders = {}
for col in cat_columns:
    le = LabelEncoder()
    combined_df[col] = le.fit_transform(combined_df[col].astype(str))  # Ensure conversion to string
    label_encoders[col] = le  # Store the label encoder
    
# Check for missing values and fill them
combined_df.fillna(combined_df.mean(), inplace=True)

# Define features and target
X = combined_df.drop('IsHit', axis=1)
Y = combined_df['IsHit']

# Address potential class imbalance
majority = combined_df[combined_df.IsHit == 0]
minority = combined_df[combined_df.IsHit == 1]
minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)
upsampled = pd.concat([majority, minority_upsampled])
X_upsampled = upsampled.drop('IsHit', axis=1)
Y_upsampled = upsampled['IsHit']

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X_upsampled, Y_upsampled, test_size=0.3, random_state=42)

# Initialize and train the classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, Y_train)

# Predict on the testing data
predictions = model.predict(X_test)

# Calculate metrics
precision = precision_score(Y_test, predictions)
recall = recall_score(Y_test, predictions)
f1 = f1_score(Y_test, predictions)
accuracy = accuracy_score(Y_test, predictions)
cm = confusion_matrix(Y_test, predictions)

# Perform cross-validation
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
cv_scores = cross_val_score(model, X_upsampled, Y_upsampled, cv=cv, scoring='accuracy')

# Print out the scores and cross-validation results
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{cm}")
print(f"Cross-Validation Accuracy: {cv_scores.mean()}")

# Now predict the IsHit status for new tracks
new_hits_predictions = model.predict(new_tracks_df.drop('IsHit', axis=1, errors='ignore'))

# Append predictions to the new_tracks_df
new_tracks_df['PredictedIsHit'] = new_hits_predictions

# Save or display predictions
new_tracks_df.to_csv('PredictedNewHits.csv', index=False)
print("Predictions for new tracks saved to 'PredictedNewHits.csv'.")


Precision: 0.9898767605633803
Recall: 0.9881370826010545
F1 Score: 0.9890061565523307
Accuracy: 0.98901340364755
Confusion Matrix:
[[2252   23]
 [  27 2249]]
Cross-Validation Accuracy: 0.9918259723137772


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- added_at
- artist_covers_y
- artist_type
- chart_name
- chart_type
- ...
