In [1]:
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint
import pickle as pkl
# Suppress FutureWarning messages from here on; all other warning categories
# are still shown.
warnings.filterwarnings(action="ignore", category=FutureWarning)

In [2]:
# Read the prepared dataset, using the first CSV column as the row index.
df = pd.read_csv(
    "formatted_data.csv",
    index_col=0,
)

In [3]:
# Separate the regression target ("Score") from the remaining feature columns.
y = df["Score"]
X = df.drop(columns=["Score"])

In [4]:
# Hold out 35% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=42,
)

In [5]:
# Keep the row labels of the held-out features so that the predictions can be
# given the same index later on.
X_test_index = X_test.index

In [6]:
# Hyper-parameter search for the random-forest regressor.
#
# The estimator passed to RandomizedSearchCV is intentionally left unfitted:
# the search clones it for every candidate and, with the default refit=True,
# retrains the best configuration on the whole training set. Fitting it here
# first would only train a forest that is immediately discarded.
# random_state pins the forest's internal randomness so that a
# Restart & Run All reproduces the same model.
model = RandomForestRegressor(random_state=42)

# Search space. scipy's randint(low, high) excludes `high`, so the sampled
# ranges are 100-499 trees, 2-9 for min_samples_split and 1-9 for
# min_samples_leaf.
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10)
}

# 100 sampled candidates, each evaluated with 5-fold cross-validation.
# Seeded so the same candidates are drawn on every run.
random_search = RandomizedSearchCV(
    model,
    param_dist,
    n_iter=100,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Best configuration found, already refitted on all of X_train / y_train.
model = random_search.best_estimator_

In [7]:
# Predict on the held-out features and wrap the result in a Series carrying
# the test-set row labels, so it lines up with y_test.
y_pred = pd.Series(
    data=model.predict(X_test),
    index=X_test_index,
    name="Predicted Score",
)

In [8]:
# Signed error of each prediction, expressed as a percentage of the true score.
# y_test and y_pred share the test-set index, so the arithmetic is row-aligned.
# NOTE(review): a true score of 0 would produce inf/NaN here; confirm that
# "Score" is never 0 in the data.
error = y_test.sub(y_pred).div(y_test).mul(100)

In [9]:
# Summarise the magnitude of the percentage errors: worst case and mean.
max_error = abs(error).max()
avg_error = abs(error).mean()

In [10]:
# Score of the tuned model on the held-out data (for scikit-learn regressors
# this is the coefficient of determination, R^2).
score = model.score(X=X_test, y=y_test)

In [11]:
# Persist the evaluation artefacts: true scores, predictions and the
# per-row percentage error, each as its own CSV.
y_test.to_csv("y_test.csv")
y_pred.to_csv("y_pred.csv")
error.to_csv("error.csv")

# Report the headline metrics.
print(f"Max Error: {max_error}")
print(f"Average Error: {avg_error}")
print(f"Score: {score}")

# Serialise the tuned model for reuse. Pickle files can execute arbitrary
# code when loaded, so only load this file from a trusted location.
with open("model.pkl", "wb") as model_file:
    pkl.dump(model, model_file)

Max Error: 16.075767336347937
Average Error: 8.282723073170297
Score: 0.8093531043567285
