# Generate the model from processed.csv

imports

In [13]:
from sklearn import clone
from sklearn.base import RegressorMixin, TransformerMixin, BaseEstimator
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import r_regression, SelectPercentile
from sklearn.model_selection import KFold, GridSearchCV
from xgboost import XGBRegressor
import pandas as pd
import pickle
import numpy as np
import json

Read CSV

In [14]:

df = pd.read_csv('data/processed.csv')

# makes models perform worse
# df['difficulty_transform'], lambda_value = stats.boxcox(df['difficulty'])

print(df.head())


   difficulty  average_tempo  average_bpm  note_count  tick_count  \
0           4  413197.921548   145.208862        3656      155036   
1           5  600000.000000   100.000000        3926       28352   
2           5  600000.000000   100.000000        3926       28352   
3           5  350818.280820   171.028716        2674      151680   
4           5  441740.064844   135.826484        3570       22816   

   note_density  tempo_deviation  unique_note_count  total_duration  \
0      0.023582     97297.985266                 65          155036   
1      0.138473         0.000000                 57           28352   
2      0.138473         0.000000                 57           28352   
3      0.017629     11112.029407                 73          452520   
4      0.156469      7352.818211                 63           43392   

   overlapping_notes  ...  odd_time_signature_count  consecutive_note_std  \
0                283  ...                         1             43.313047   
1   

## Selecting best features

In [15]:
X = df.drop(columns=['difficulty'])
y = df['difficulty']

features_names = X.columns

# Keep the best features
selector = SelectPercentile(r_regression, percentile=40)
X = selector.fit_transform(X, y)
features_kept = features_names[selector.get_support()]
print("Features kept:", features_kept)

Features kept: Index(['note_count', 'note_density', 'unique_note_count', 'notes_per_second',
       'pitch_range', 'tempo_change_count', 'note_to_note_transition',
       'note_to_chord_transition', 'chord_to_note_transition',
       'chord_to_chord_transition'],
      dtype='object')


## Scaling data and splitting
features have a large difference in scale, scaling makes model perform slightly better

In [16]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Removes randomness from the model across runs
rng = np.random.RandomState(0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rng)


## Tuning hyperparameters

In [21]:
model_params = {
  'XGB': {
    'model': XGBRegressor(random_state=rng),
    'params': {
        'n_estimators': [100, 200, 500, 1000],
        'gamma': [0.005, 0.01, 0.1, 0],
        'max_depth': [1, 2, 3, 6, 9],
        'learning_rate': [0.001, 0.01, 0.1, 0.015, 1],
        'min_child_weight': [1, 2, 3],
    }
  },
  'RandomForest': {
    'model': RandomForestRegressor(random_state=rng),
    'params': {
      'n_estimators': [100, 200, 500],
      'max_features': ['sqrt', 'log2', None],
      'min_samples_split': [2, 5, 10],
      'max_depth': [None, 1, 2, 10, 20, 30],
      'max_leaf_nodes': [None, 2, 5, 10],
    }
  },
  'SVR': {
    'model': SVR(),
    'params': {
      'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
      'C': [0.1, 1, 10, 20, 50, 100],
      'gamma': ['scale', 'auto'],
    }
  }
}

results =  {}

for name, config in model_params.items():
  print('tuning hyperparamerts for ', name)
  # Can edit cv to be more or less, less is faster
  grid_search = GridSearchCV(config['model'], config['params'], cv=3, scoring='r2')
  grid_search.fit(X_train, y_train)
  
  results[name] = {
    'best_params': grid_search.best_params_,
    'best_score': grid_search.best_score_,
  }
  
  print(f"Best parameters for {name}:", grid_search.best_params_)
  print(f"Best cross-validation score for {name}:", grid_search.best_score_)

with open(f'model_params.json', 'w') as f:
    json.dump(results, f)

tuning hyperparamerts for  XGB
Best parameters for XGB: {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 2, 'n_estimators': 1000}
Best cross-validation score for XGB: 0.6936408629359881
tuning hyperparamerts for  RandomForest
Best parameters for RandomForest: {'max_depth': 10, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation score for RandomForest: 0.7179281339930079
tuning hyperparamerts for  SVR
Best parameters for SVR: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Best cross-validation score for SVR: 0.669228374002386


In [22]:
with open ('model_params.json', 'r') as f:
    params = json.load(f)
    
models = {
    'XGB': XGBRegressor(**params['XGB']['best_params']),
    'RandomForest': RandomForestRegressor(**params['RandomForest']['best_params']),
    'SVR': SVR(**params['SVR']['best_params'])
}


Validation Function

In [23]:
n_folds = 5

def rmse_cross_validation(model):
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse= np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [24]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {name}")
    # Round and clip to ensure valid difficulty levels
    y_pred_rounded = np.clip(np.round(y_pred), 1, 5)
    print("Mean squared error:", mean_squared_error(y_test, y_pred))
    print("Mean absolute error:", mean_absolute_error(y_test, y_pred))
    print("R^2 score:", r2_score(y_test, y_pred))
    print("Root mean squared error:", rmse_cross_validation(model).mean())
    print("======================")
    # Save the model to disk
    with open(f'./models/{name}_saved_model.pkl', 'wb') as file:
      pickle.dump(model, file)

Model: XGB
Mean squared error: 0.2417248541571918
Mean absolute error: 0.36344913549201435
R^2 score: 0.8882067895606184
Root mean squared error: 0.7845504811494957
Model: RandomForest
Mean squared error: 0.3185553868269104
Mean absolute error: 0.42526585360743696
R^2 score: 0.8526741094940077
Root mean squared error: 0.717761674469414
Model: SVR
Mean squared error: 0.23036365208346002
Mean absolute error: 0.3998919642275555
R^2 score: 0.8934611323906159
Root mean squared error: 0.768929666256393


## AveragingModel using the models trained earlier

In [33]:
# referenced from https://www.kaggle.com/code/serigne/stacked-regressions-top-4-on-leaderboard
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models, scaler):
        self.models = models
        self.scaler = scaler

    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)   

Validation:

In [36]:
averaged_models = AveragingModels(models=list(models.values()), scaler = scaler)
score = rmse_cross_validation(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

averaged_models.fit(X_train, y_train)
y_pred = averaged_models.predict(X_test)
print("Mean squared error:", mean_squared_error(y_test, y_pred))
print("Mean absolute error:", mean_absolute_error(y_test, y_pred))
print("R^2 score:", r2_score(y_test, y_pred))

with open(f'./models/averaged_models.pkl', 'wb') as file:
    pickle.dump(averaged_models, file)



 Averaged base models score: 0.7145 (0.1652)

Mean squared error: 0.21052522639447882
Mean absolute error: 0.35158797977946504
R^2 score: 0.9026360321152097


# Predicting the difficulty for the rest of the songs

In [None]:
with open('./models/averaged_models.pkl', 'rb') as file:
    averaged_models = pickle.load(file)

df = pd.read_csv('data/all_song_features.csv')

# Select only the columns used during training
X_unseen = df.loc[:, features_kept]

# Scale the data using the same scaler from training
X_unseen = scaler.transform(X_unseen)

y_pred = averaged_models.predict(X_unseen)

predictions = pd.DataFrame({
    'file': df['file'],
    'predicted_difficulty': y_pred
}).to_csv('predictions.csv', index=False)



