In [None]:
# rf2 will use differential team stats instead of 2 sets of team stats.
# Also, instead of quantiles, using avg/std

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [7]:
# load data

training_data = pd.read_csv(f"v2_data/training_data_2025-08-14.csv")
training_data = training_data.drop(['Unnamed: 0_Team0', 'Unnamed: 0_Team1'], axis=1)
training_data = training_data.round(2)
training_data['ph_kd_ratio_max_Team0'] = training_data['ph_kd_ratio_max_Team0'].replace('inf', 0,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  training_data['ph_kd_ratio_max_Team0'] = training_data['ph_kd_ratio_max_Team0'].replace('inf', 0,inplace=True)


In [8]:
training_data.to_csv("quick_Test.csv")

PermissionError: [Errno 13] Permission denied: 'quick_Test.csv'

In [None]:
# This was resolved earlier in the pipeline for future datasets
# # Check for infinite values
# inf_values = np.isinf(training_data.select_dtypes(include=['float64', 'float32'])).sum()
# print("Infinite values:\n", inf_values[inf_values > 0])

# # Check column data types
# print(training_data.dtypes)

# # Check for extremely large values
# for col in training_data.select_dtypes(include=['float64', 'float32']).columns:
#     if training_data[col].max() > 1e20:  # Adjust threshold as needed
#         print(f"Column {col} has extremely large values: {training_data[col].max()}")

Infinite values:
 Series([], dtype: int64)
Unnamed: 0                          int64
match_id                            int64
p_total_time_played_min_Team0       int64
p_total_time_played_min_Team1       int64
p_total_time_played_max_Team0       int64
                                   ...   
ph_win_rate_ratio_median_Team0    float64
ph_win_rate_ratio_median_Team1    float64
ph_win_rate_ratio_q75_Team0       float64
ph_win_rate_ratio_q75_Team1       float64
team_0_win                         object
Length: 133, dtype: object


In [None]:
# training_data['ph_kd_ratio_max_Team0']

In [10]:
# Prep data
y = training_data['team_0_win']
X = training_data.drop(columns=['team_0_win'])


# Split  data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [24]:
def train_random_forest(X_train, X_test, y_train, y_test, n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', random_state=42):

    # Initialize the model with these starting parameters
    rf_model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=random_state,
        n_jobs=-1                # Use all available cores
    )
    
    # Train the model
    rf_model.fit(X_train, y_train)

    # Make predictions
    y_pred = rf_model.predict(X_test)


    return rf_model, y_test, y_pred

def evaluate_model(model, y_test, y_pred):
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Feature importance
    feature_importance = pd.DataFrame(
        {'feature': X.columns, 'importance': model.feature_importances_}
    ).sort_values('importance', ascending=False)

    print("\nTop 25 important features:")
    print(feature_importance.head(25))

In [20]:
model, y_t, y_p = train_random_forest(X_train, X_test, y_train, y_test)


Accuracy: 0.6629

Classification Report:
              precision    recall  f1-score   support

           N       0.65      0.70      0.67        88
           Y       0.68      0.62      0.65        90

    accuracy                           0.66       178
   macro avg       0.66      0.66      0.66       178
weighted avg       0.66      0.66      0.66       178


Confusion Matrix:
[[62 26]
 [34 56]]

Top 25 important features:
                               feature  importance
123        ph_win_rate_ratio_min_Team1    0.022840
125        ph_win_rate_ratio_max_Team1    0.022370
126        ph_win_rate_ratio_q25_Team0    0.021401
122        ph_win_rate_ratio_min_Team0    0.019266
124        ph_win_rate_ratio_max_Team0    0.017883
128     ph_win_rate_ratio_median_Team0    0.016599
130        ph_win_rate_ratio_q75_Team0    0.014678
127        ph_win_rate_ratio_q25_Team1    0.014486
131        ph_win_rate_ratio_q75_Team1    0.012987
129     ph_win_rate_ratio_median_Team1    0.011821
55   

In [25]:
evaluate_model(model, y_t, y_p)

Accuracy: 0.6629

Classification Report:
              precision    recall  f1-score   support

           N       0.65      0.70      0.67        88
           Y       0.68      0.62      0.65        90

    accuracy                           0.66       178
   macro avg       0.66      0.66      0.66       178
weighted avg       0.66      0.66      0.66       178


Confusion Matrix:
[[62 26]
 [34 56]]

Top 25 important features:
                               feature  importance
123        ph_win_rate_ratio_min_Team1    0.022840
125        ph_win_rate_ratio_max_Team1    0.022370
126        ph_win_rate_ratio_q25_Team0    0.021401
122        ph_win_rate_ratio_min_Team0    0.019266
124        ph_win_rate_ratio_max_Team0    0.017883
128     ph_win_rate_ratio_median_Team0    0.016599
130        ph_win_rate_ratio_q75_Team0    0.014678
127        ph_win_rate_ratio_q25_Team1    0.014486
131        ph_win_rate_ratio_q75_Team1    0.012987
129     ph_win_rate_ratio_median_Team1    0.011821
55   