In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [46]:
training_path = "v2_data/training_data_2025-08-17.csv"

In [50]:
# load data

training_data = pd.read_csv(training_path)
training_data = training_data.round(2)

In [61]:
def check_data_issues(df):
    """Check for common missing/null/error values in the DataFrame and print summary. Also print rows with errors."""
    import numpy as np
    print("--- Data Issues Summary ---")
    # Check for NaN values
    nan_counts = df.isna().sum()
    if nan_counts.any():
        print("NaN values found:")
        print(nan_counts[nan_counts > 0])
    else:
        print("No NaN values found.")

    # Check for infinite values
    inf_counts = np.isinf(df.select_dtypes(include=[float, int])).sum()
    if inf_counts.any():
        print("Infinite values found:")
        print(inf_counts[inf_counts > 0])
    else:
        print("No infinite values found.")

    # Check for string 'inf', '-inf', 'nan', 'None', or empty string
    error_strings = ['inf', '-inf', 'nan', 'None', '']
    error_rows = set()
    for col in df.select_dtypes(include=[object]).columns:
        for err in error_strings:
            mask = (df[col] == err)
            count = mask.sum()
            if count > 0:
                print(f"Column '{col}' has {count} occurrences of '{err}'")
                error_rows.update(df[mask].index.tolist())

    # Collect all error rows (NaN, inf, error strings)
    nan_rows = set(df[df.isna().any(axis=1)].index.tolist())
    inf_rows = set(df[np.isinf(df.select_dtypes(include=[float, int])).any(axis=1)].index.tolist())
    all_error_rows = nan_rows.union(inf_rows).union(error_rows)

    print("--- End of Data Issues Summary ---")
    if all_error_rows:
        print(f"\nDetailed view of rows with data errors ({len(all_error_rows)} rows) saved to .csv:")
        df.loc[sorted(all_error_rows)].to_csv("data_errors.csv", index=False)
    else:
        print("No rows with data errors found.")

In [62]:
check_data_issues(training_data)

--- Data Issues Summary ---
NaN values found:
ph_kd_ratio_std_Team0    3
dtype: int64
Infinite values found:
ph_kd_ratio_max_Team0     3
ph_kd_ratio_mean_Team0    3
dtype: int64
--- End of Data Issues Summary ---

Detailed view of rows with data errors (3 rows) saved to .csv:


In [None]:
# needed for v1
training_data['ph_kd_ratio_max_Team0'] = training_data['ph_kd_ratio_max_Team0'].replace('inf', 0,inplace=True)

In [8]:
training_data.to_csv("quick_Test.csv")

PermissionError: [Errno 13] Permission denied: 'quick_Test.csv'

In [None]:
# This was resolved earlier in the pipeline for future datasets
# # Check for infinite values
# inf_values = np.isinf(training_data.select_dtypes(include=['float64', 'float32'])).sum()
# print("Infinite values:\n", inf_values[inf_values > 0])

# # Check column data types
# print(training_data.dtypes)

# # Check for extremely large values
# for col in training_data.select_dtypes(include=['float64', 'float32']).columns:
#     if training_data[col].max() > 1e20:  # Adjust threshold as needed
#         print(f"Column {col} has extremely large values: {training_data[col].max()}")

Infinite values:
 Series([], dtype: int64)
Unnamed: 0                          int64
match_id                            int64
p_total_time_played_min_Team0       int64
p_total_time_played_min_Team1       int64
p_total_time_played_max_Team0       int64
                                   ...   
ph_win_rate_ratio_median_Team0    float64
ph_win_rate_ratio_median_Team1    float64
ph_win_rate_ratio_q75_Team0       float64
ph_win_rate_ratio_q75_Team1       float64
team_0_win                         object
Length: 133, dtype: object


In [None]:
# training_data['ph_kd_ratio_max_Team0']

Create training split. Currently 25% test, team_o_win prediction

In [51]:
# Prep data
y = training_data['team_0_win']
X = training_data.drop(columns=['team_0_win'])

# Split  data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [56]:
print(X_train)
print(f"\n\ny_train")

     Unnamed: 0  match_id  p_total_time_played_min_Team0  \
571         571  38721421                         490949   
707         707  38735600                         385277   
548         548  38719367                         828878   
176         176  38673339                         781368   
291         291  38690827                        1147443   
..          ...       ...                            ...   
71           71  38660058                         658377   
106         106  38664946                         909952   
270         270  38688592                         303190   
435         435  38706622                          77608   
102         102  38664450                         955101   

     p_total_time_played_min_Team1  p_total_time_played_max_Team0  \
571                         146121                        1863994   
707                         231665                        5080198   
548                         231304                        3227511   
176

In [52]:
def train_random_forest(X_train, X_test, y_train, y_test, params):

    # Create unique model identifier
    ## {model_type} {date_time} {random_state}
    from datetime import datetime

    model_type = "rf_std"
    date_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_id = f"{model_type}_{date_time}_{params.get('random_state', 42)}"

    # Initialize the model with these starting parameters
    rf_model = RandomForestClassifier(
        n_estimators=params.get("n_estimators", 100),
        max_depth=params.get("max_depth", None),
        min_samples_split=params.get("min_samples_split", 2),
        min_samples_leaf=params.get("min_samples_leaf", 1),
        max_features=params.get("max_features", "sqrt"),
        random_state=params.get("random_state", 42),
        n_jobs=-1                # Use all available cores
    )
    
    # Print Model params and start
    print(f"Starting Training, Modelid = {model_id}")
    print("Training Random Forest with parameters:")
    for key, value in params.items():
        print(f"{key}: {value}")

    # Train the model
    rf_model.fit(X_train, y_train)

    # Make predictions
    y_pred = rf_model.predict(X_test)


    return rf_model, model_id, y_pred

def evaluate_model(model, y_test, y_pred)-> dict:
    """Create a report evaluating model passed, returns dict of report data"""

    report = classification_report(y_test, y_pred, output_dict=True)
    conf_matrix = confusion_matrix(y_test, y_pred).tolist()
    feature_importance = pd.DataFrame(
        {'feature': X.columns, 'importance': model.feature_importances_},
    ).sort_values('importance', ascending=False)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nTop 25 important features:")
    print(feature_importance.head(25))

    return {
        'accuracy': accuracy,
        'classification_report': report,
        'confusion_matrix': conf_matrix,
        'feature_importance': feature_importance.to_dict(orient='records')
    }


In [53]:
# Create Params

params = {
    "n_estimators": 100,
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_features": "sqrt",
    "random_state": 42
}

In [54]:
model, model_id, y_pred = train_random_forest(X_train, X_test, y_train, y_test, params)


Starting Training, Modelid = rf_std_20250817_125003_42
Training Random Forest with parameters:
n_estimators: 100
max_depth: None
min_samples_split: 2
min_samples_leaf: 1
max_features: sqrt
random_state: 42


ValueError: Input X contains infinity or a value too large for dtype('float32').

In [32]:
report = evaluate_model(model, y_test, y_pred)

Accuracy: 0.6629

Classification Report:
              precision    recall  f1-score   support

           N       0.65      0.70      0.67        88
           Y       0.68      0.62      0.65        90

    accuracy                           0.66       178
   macro avg       0.66      0.66      0.66       178
weighted avg       0.66      0.66      0.66       178


Confusion Matrix:
[[62 26]
 [34 56]]

Top 25 important features:
                               feature  importance
123        ph_win_rate_ratio_min_Team1    0.022840
125        ph_win_rate_ratio_max_Team1    0.022370
126        ph_win_rate_ratio_q25_Team0    0.021401
122        ph_win_rate_ratio_min_Team0    0.019266
124        ph_win_rate_ratio_max_Team0    0.017883
128     ph_win_rate_ratio_median_Team0    0.016599
130        ph_win_rate_ratio_q75_Team0    0.014678
127        ph_win_rate_ratio_q25_Team1    0.014486
131        ph_win_rate_ratio_q75_Team1    0.012987
129     ph_win_rate_ratio_median_Team1    0.011821
55   

Save Model, parameters, trianing data, etc.

In [1]:
# create a folder structure for this model adjust per model
# Currently manual

In [42]:
def save_model(model, params, folder, model_id,feature_names = X.columns):
    from sklearn.pipeline import Pipeline
    import joblib, json, platform, os

    # create subfolders
    os.makedirs(f"{folder}/samples", exist_ok=True)

    joblib.dump(model, f"{folder}/model.joblib")

    print(f"Model saved to {folder}")

    json.dump(params, open(f"{folder}/params.json","w"))

    json.dump({
    "run_id": model_id, "python": platform.python_version(),
    "feature_names": feature_names,"random_state": params.get("random_state", 42),
    }, open(f"{folder}/meta.json","w"))

def save_report(results):
    import json

    with open("results.txt", "w") as f:
        f.write(f"Accuracy: {results['accuracy']}\n\n")
        f.write("Classification Report:\n")
        f.write(json.dumps(results['classification_report'], indent=2))
        f.write("\n\nConfusion Matrix:\n")
        f.write(str(results['confusion_matrix']))
        f.write("\n\nTop Features:\n")
        for feat in results['feature_importance'][:25]:
            f.write(f"{feat['feature']}: {feat['importance']}\n")

Save Model and trianing information

In [43]:
model_folder = "models/8.14.25_rf_1_quantiles/"
features = X.columns.tolist()

save_model(model, params, model_folder, model_id, features)
save_report(report)

train = X_train.copy()
train["target"] = y_train
train.to_csv(f"{model_folder}/samples/Xy_train.csv", index=False)

test = X_test.copy()
test["target"] = y_test
test.to_csv(f"{model_folder}/samples/Xy_test.csv", index=False)

Model saved to models/8.14.25_rf_1_quantiles/
