In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [1]:
# Date window that identifies which generated dataset folder to read from.
start_date, end_date = "2025-08-19", "2025-08-21"

# Folder that holds the training CSVs for the selected date window.
folder_name = "//".join(
    ["v2_data", "pred_data", f"test_pred_v2_{start_date}_{end_date}", "training"]
)

In [2]:
# CSV locations of the standard and the differential training sets,
# both stored directly inside folder_name.
training_path, diff_training_path = (
    f"{folder_name}//{csv_name}"
    for csv_name in ("training_data.csv", "differential_training_data.csv")
)

In [5]:
# Read both training sets and round their numeric values to two decimals.
training_data = pd.read_csv(training_path).round(2)
diff_training_data = pd.read_csv(diff_training_path).round(2)

In [6]:
def check_data_issues(df, name="dataframe"):
    """Print a summary of missing/invalid values in ``df`` and export the offending rows.

    Three kinds of problems are reported:
      * NaN/None cells in any column,
      * +/- infinity in float/int columns,
      * the literal strings 'inf', '-inf', 'nan', 'None' or '' in object columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    name : str, optional
        Label used to build the export file name ``"{name}_data_errors.csv"``.
        Pass a distinct label per frame so that successive calls do not
        overwrite each other's export.

    Returns
    -------
    None
        Results are printed. When at least one row has a problem, those rows
        are written (in their original order, without the index) to
        ``"{name}_data_errors.csv"`` in the current working directory.
    """
    print("--- Data Issues Summary ---")

    # Check for NaN values
    nan_mask = df.isna()
    nan_counts = nan_mask.sum()
    if nan_counts.any():
        print("NaN values found:")
        print(nan_counts[nan_counts > 0])
    else:
        print("No NaN values found.")

    # Check for infinite values (computed once and reused for the row mask below)
    inf_mask = np.isinf(df.select_dtypes(include=[float, int]))
    inf_counts = inf_mask.sum()
    if inf_counts.any():
        print("Infinite values found:")
        print(inf_counts[inf_counts > 0])
    else:
        print("No infinite values found.")

    # Check for string 'inf', '-inf', 'nan', 'None', or empty string
    error_strings = ['inf', '-inf', 'nan', 'None', '']
    string_error_rows = np.zeros(len(df), dtype=bool)
    for col in df.select_dtypes(include=[object]).columns:
        for err in error_strings:
            mask = (df[col] == err)
            count = mask.sum()
            if count > 0:
                print(f"Column '{col}' has {count} occurrences of '{err}'")
                string_error_rows |= mask.to_numpy(dtype=bool, na_value=False)

    # Combine all problems into one positional row mask. Working with positions
    # rather than index labels keeps this correct for non-unique indexes.
    error_row_mask = (
        nan_mask.any(axis=1).to_numpy()
        | inf_mask.any(axis=1).to_numpy()
        | string_error_rows
    )
    n_error_rows = int(error_row_mask.sum())

    print("--- End of Data Issues Summary ---")
    if n_error_rows:
        # The file name is built from the caller-supplied label; interpolating
        # the DataFrame itself would put its whole repr into the file name.
        out_path = f"{name}_data_errors.csv"
        print(f"\nDetailed view of rows with data errors ({n_error_rows} rows) saved to .csv:")
        print(out_path)
        df.loc[error_row_mask].to_csv(out_path, index=False)
    else:
        print("No rows with data errors found.")

In [7]:
# Run the same integrity checks on both training sets.
for frame in (training_data, diff_training_data):
    check_data_issues(frame)

--- Data Issues Summary ---
No NaN values found.
No infinite values found.
--- End of Data Issues Summary ---
No rows with data errors found.
--- Data Issues Summary ---
No NaN values found.
No infinite values found.
--- End of Data Issues Summary ---
No rows with data errors found.


Create the training split. Currently 25% of the data is held out for testing; the prediction target is team_0_win.

In [9]:
def prep_training_data(train_data, target_col='team_0_win', test_size=0.25, random_state=42):
    """Split training data into train/test features and targets.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame containing the feature columns and the target column.
    target_col : str, optional
        Name of the column to predict. Defaults to ``'team_0_win'``.
    test_size : float, optional
        Fraction of rows held out for testing. Defaults to 0.25.
    random_state : int, optional
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``train_data``.
    """
    # Use the frame that was passed in rather than a notebook-level global,
    # so calling this with a different dataset actually changes the split.
    y = train_data[target_col]
    X = train_data.drop(columns=[target_col])
    # NOTE(review): every non-target column is kept as a feature. The feature
    # importance output in this notebook lists `match_id` among the top
    # features, which looks like an identifier -- confirm it should be a feature.

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [10]:
# Build the train/test split from the standard training set.
# Alternative input (differential features), kept for quick switching:
# X_train, X_test, y_train, y_test = prep_training_data(diff_training_data)
X_train, X_test, y_train, y_test = prep_training_data(train_data=training_data)

In [17]:
def train_random_forest(X_train, X_test, y_train, y_test, params, model_type):
    """Fit a RandomForestClassifier and predict on the test features.

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test
        Features the fitted model predicts on.
    y_test
        Accepted for signature symmetry; not used inside this function.
    params : dict
        Hyperparameters. Missing keys fall back to: n_estimators=100,
        max_depth=None, min_samples_split=2, min_samples_leaf=1,
        max_features="sqrt", random_state=42.
    model_type : str
        Prefix of the generated model identifier.

    Returns
    -------
    tuple
        ``(fitted_model, model_id, y_pred)`` where ``model_id`` has the form
        ``{model_type}_{YYYYmmdd_HHMMSS}_{random_state}``.
    """
    from datetime import datetime

    # Unique model identifier: {model_type}_{date_time}_{random_state}
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    seed = params.get("random_state", 42)
    model_id = f"{model_type}_{timestamp}_{seed}"

    # Resolve every hyperparameter up front, falling back to the defaults.
    hyperparams = {
        "n_estimators": params.get("n_estimators", 100),
        "max_depth": params.get("max_depth", None),
        "min_samples_split": params.get("min_samples_split", 2),
        "min_samples_leaf": params.get("min_samples_leaf", 1),
        "max_features": params.get("max_features", "sqrt"),
        "random_state": seed,
    }
    # n_jobs=-1: use all available cores
    rf_model = RandomForestClassifier(n_jobs=-1, **hyperparams)

    # Announce the run and echo the parameters exactly as they were passed in.
    print(f"Starting Training, Modelid = {model_id}")
    print("Training Random Forest with parameters:")
    for param_name in params:
        print(f"{param_name}: {params[param_name]}")

    # Train, then predict on the held-out features.
    rf_model.fit(X_train, y_train)
    predictions = rf_model.predict(X_test)

    return rf_model, model_id, predictions

def evaluate_model(model, y_test, y_pred, X) -> dict:
    """Print an evaluation of the model's predictions and return it as a dict.

    Parameters
    ----------
    model
        Fitted estimator exposing ``feature_importances_``.
    y_test, y_pred
        True and predicted labels for the test rows.
    X
        Feature names, in the same order as ``model.feature_importances_``.

    Returns
    -------
    dict
        Keys: ``'accuracy'``, ``'classification_report'`` (dict form),
        ``'confusion_matrix'`` (nested list) and ``'feature_importance'``
        (list of ``{'feature', 'importance'}`` records, most important first).
    """
    # Gather all metrics before printing anything.
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    matrix = confusion_matrix(y_test, y_pred)
    importance_table = pd.DataFrame(
        {'feature': X, 'importance': model.feature_importances_},
    )
    ranked_features = importance_table.sort_values('importance', ascending=False)
    accuracy = accuracy_score(y_test, y_pred)

    # Human-readable summary.
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(matrix)

    print("\nTop 25 important features:")
    print(ranked_features.head(25))

    # Plain-Python structures so the result can be serialised later.
    summary = {}
    summary['accuracy'] = accuracy
    summary['classification_report'] = report_dict
    summary['confusion_matrix'] = matrix.tolist()
    summary['feature_importance'] = ranked_features.to_dict(orient='records')
    return summary


In [12]:
# Random-forest hyperparameters for this run.
params = dict(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    random_state=42,
)

# Change per run! Set model ID and Folder to save data to

In [13]:
# Label used as the prefix of the generated model ID.
model_type = "rf_std_v2"

# Output directory for this run. The date segment is hard-coded, so change it
# before every run; files saved under fixed names in this folder are otherwise
# overwritten.
model_folder = "//".join(("models", "8.24.25", model_type))

In [14]:
# Fit the forest on the training split and predict on the held-out rows.
model, model_id, y_pred = train_random_forest(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    params=params,
    model_type=model_type,
)


Starting Training, Modelid = rf_std_v2_20250824_124805_42
Training Random Forest with parameters:
n_estimators: 100
max_depth: None
min_samples_split: 2
min_samples_leaf: 1
max_features: sqrt
random_state: 42


In [16]:
# Feature names in the column order of X_train; reused below for the
# feature-importance table and for the saved feature list.
X = X_train.columns

In [18]:
# Print the evaluation and keep the metrics dict for saving later.
report = evaluate_model(model=model, y_test=y_test, y_pred=y_pred, X=X)

Accuracy: 0.5872

Classification Report:
              precision    recall  f1-score   support

           N       0.58      0.50      0.54       105
           Y       0.59      0.66      0.62       113

    accuracy                           0.59       218
   macro avg       0.59      0.58      0.58       218
weighted avg       0.59      0.59      0.58       218


Confusion Matrix:
[[53 52]
 [38 75]]

Top 25 important features:
                               feature  importance
153        ph_win_rate_ratio_min_Team0    0.016596
157       ph_win_rate_ratio_mean_Team0    0.015258
158       ph_win_rate_ratio_mean_Team1    0.013649
142             ph_win_rate_mean_Team1    0.013378
137              ph_win_rate_min_Team0    0.013116
86              ph_kd_ratio_mean_Team1    0.011736
0                             match_id    0.011588
101     ph_avg_match_length_mean_Team0    0.011248
141             ph_win_rate_mean_Team0    0.011101
105  ph_avg_damage_per_match_min_Team0    0.011092
33   

Save model, parameters, training data, etc.

In [20]:
def save_model(model, params, folder, model_id, feature_names=None):
    """Persist a fitted model together with its parameters and run metadata.

    Files written inside ``folder`` (created if missing, along with a
    ``samples`` subfolder):
      * ``model.joblib`` -- the model, serialised with joblib,
      * ``params.json``  -- ``params`` as JSON,
      * ``meta.json``    -- run id, Python version, feature names, random state.

    Parameters
    ----------
    model
        Fitted estimator to serialise.
    params : dict
        Hyperparameters; must be JSON-serialisable.
    folder : str
        Destination directory.
    model_id : str
        Identifier stored as ``run_id`` in ``meta.json``.
    feature_names : sequence of str, optional
        Names of the features the model was trained on. Lists, pandas Index
        objects and numpy arrays are all accepted. When omitted, the model's
        ``feature_names_in_`` attribute is used if present, otherwise ``null``
        is stored.

    Returns
    -------
    None
    """
    import joblib, json, platform, os

    # create subfolders (this also creates `folder` itself)
    os.makedirs(f"{folder}/samples", exist_ok=True)

    joblib.dump(model, f"{folder}/model.joblib")

    # Resolve the feature names at call time instead of binding a notebook
    # global as the default, then convert to a plain list: pandas Index and
    # numpy arrays are not JSON-serialisable.
    if feature_names is None:
        feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is not None:
        if hasattr(feature_names, "tolist"):
            feature_names = feature_names.tolist()
        else:
            feature_names = list(feature_names)

    # Context managers guarantee the files are flushed and closed.
    with open(f"{folder}/params.json", "w") as params_file:
        json.dump(params, params_file)

    meta = {
        "run_id": model_id,
        "python": platform.python_version(),
        "feature_names": feature_names,
        "random_state": params.get("random_state", 42),
    }
    with open(f"{folder}/meta.json", "w") as meta_file:
        json.dump(meta, meta_file)

    # Report success only after every artifact has been written.
    print(f"Model saved to {folder}")

def save_report(training_data, model_id, model_folder, results):
    """Write the training data and a plain-text evaluation report to disk.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Saved (index included) as ``{model_folder}/{model_id}_training_data.csv``.
    model_id : str
        Prefix of both output file names.
    model_folder : str
        Existing directory to write into.
    results : dict
        Must contain ``'accuracy'``, ``'classification_report'``,
        ``'confusion_matrix'`` and ``'feature_importance'`` (a list of dicts
        with ``'feature'`` and ``'importance'`` keys). Only the first 25
        feature entries are written.

    Returns
    -------
    None
    """
    import json

    file_prefix = f"{model_folder}/{model_id}"
    training_data.to_csv(f"{file_prefix}_training_data.csv")

    with open(f"{file_prefix}_results.txt", "w") as report_file:
        # Fixed sections first, in the order they appear in the report.
        sections = [
            f"Accuracy: {results['accuracy']}\n\n",
            "Classification Report:\n",
            json.dumps(results['classification_report'], indent=2),
            "\n\nConfusion Matrix:\n",
            str(results['confusion_matrix']),
            "\n\nTop Features:\n",
        ]
        # Then one line per feature, limited to the first 25 entries.
        sections.extend(
            f"{entry['feature']}: {entry['importance']}\n"
            for entry in results['feature_importance'][:25]
        )
        report_file.writelines(sections)

Save model and training information

In [22]:

# Plain list of feature names so it can be stored as JSON metadata.
features = X.tolist()

# Persist the fitted model, its metadata and the evaluation report.
save_model(model, params, model_folder, model_id, feature_names=features)
save_report(training_data, model_id, model_folder, results=report)

# Keep the exact train/test rows (features plus target) next to the model.
train = X_train.assign(target=y_train)
train.to_csv(f"{model_folder}/samples/Xy_train.csv", index=False)

test = X_test.assign(target=y_test)
test.to_csv(f"{model_folder}/samples/Xy_test.csv", index=False)

Model saved to models//8.24.25//rf_std_v2
