In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [2]:
# Step 1: Load the data
data = pd.read_csv("synth_2020_22.csv.zip", compression="zip")

# Step 2: Preprocess the data
# You'll need to specify the columns that are categorical and numerical.
# Replace "categorical_columns" and "numerical_columns" with your actual column names.
# For this example, I'll assume "categorical_columns" and "numerical_columns" are defined properly.

# Handle missing values if needed
# data.fillna(0, inplace=True)  # You can replace 0 with an appropriate value

# # Encode categorical variables using one-hot encoding
# data = pd.get_dummies(data, columns=categorical_columns)
BASIC_COLS = ['PTS', 'REB', 'AST', 'STL', 'BLK', 'min_sec', 'FGM', 'FG3M', 'FTM', 'TO', 'PLUS_MINUS']
PAIR_COLS = ['pts_plus_reb', 'pts_plus_ast', 'ast_plus_reb', 'ast_minus_to', 'str_plus_blk']
COLS = BASIC_COLS + PAIR_COLS

In [3]:
# Split the dataset into training and testing sets based on GAME_DATE
split_date = "2022-02-01"  # Replace with your desired date
train_data = data[data["GAME_DATE"] < split_date]
test_data = data[data["GAME_DATE"] >= split_date]

# Define your target variable (y) and features (X)
X_train = train_data[COLS]
y_train = train_data[["synth_max_season", "synth_max_league"]].max(axis=1)

X_test = test_data[COLS]
y_test = test_data[["synth_max_season", "synth_max_league"]].max(axis=1)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(39300, 16) (9793, 16) (39300,) (9793,)


In [6]:
# Step 3: Train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=300, n_jobs=5, random_state=42)
rf_classifier.fit(X_train, y_train)

# Step 4: Evaluate the model
y_pred = rf_classifier.predict(X_test)

# Calculate proper metrics for unbalanced binary classification
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9976513836413765
Precision: 0.7307692307692307
Recall: 0.9661016949152542
F1 Score: 0.8321167883211679
ROC AUC Score: 0.981972154217438
Confusion Matrix:
 [[9713   21]
 [   2   57]]
