In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.stats.proportion import proportions_ztest
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import joblib
import m2cgen as m2c

# Path of the CSV that later cells load with pd.read_csv(file_path).
# Exactly one assignment should be active; the commented-out lines are
# alternative dataset paths (presumably earlier experiments — confirm).
# file_path = "New_Combined_MarchMadness_Data.csv"
# file_path = "all_march_madness_data.csv"
file_path = "data_sets/least_variables.csv"
# file_path = "no_rank_variable.csv"

In [None]:
# ---------------------------------------------------------
# 1. LOAD DATA, CREATE TARGET + KEEP ROUND 64 ONLY
# ---------------------------------------------------------
# Load the data in this cell so it runs on a fresh kernel; previously `df`
# was read here before any cell had defined it.
df = pd.read_csv(file_path)

# Target: 0 when ROUND == 64, 1 otherwise.
df['first_round_win'] = df['ROUND'].apply(lambda x: 0 if x == 64 else 1)

# NOTE(review): the filter below keeps only rows with ROUND == 64, which are
# exactly the rows whose target is 0, so `y` is constant 0 as written.
# Presumably the row filter should use a different column than the one the
# target is derived from — confirm against the dataset before trusting scores.
df_first_round = df[df['ROUND'] == 64].copy()
df_first_round = df_first_round.dropna(subset=['first_round_win'])

y = df_first_round['first_round_win']
X = df_first_round.drop(columns=['first_round_win', 'ROUND'], errors='ignore')

# Columns removed from the feature matrix; missing ones are ignored.
cols_to_drop = [
    'CURRENT ROUND',
    'team',
    'SEED',  # drop duplicates
]

X = X.drop(columns=cols_to_drop, errors='ignore')

# ---------------------------------------------------------
# 2. NUMERIC VIEW OF THE FEATURES
# ---------------------------------------------------------
# Computed only after X exists (it used to run before X was defined).
# The model below still trains on X; X_numeric is kept as a convenience view.
X_numeric = X.select_dtypes(include='number')


# ---------------------------------------------------------
# 3. TRAIN/TEST SPLIT
# ---------------------------------------------------------
# 70/30 split with a fixed seed, stratified on the target.
split_options = dict(test_size=0.30, random_state=92, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ---------------------------------------------------------
# 4. FIT RANDOM FOREST
# ---------------------------------------------------------
# 500 unlimited-depth trees with balanced class weights, trained on every
# feature column of X_train.
forest_options = {
    "n_estimators": 500,
    "max_depth": None,
    "random_state": 92,
    "class_weight": "balanced",
}
rf = RandomForestClassifier(**forest_options)
rf.fit(X_train, y_train)

# Accuracy of the all-features model on the held-out 30%.
y_pred = rf.predict(X_test)
base_acc = accuracy_score(y_test, y_pred)
print("Base Accuracy (all features):", base_acc)

# ---------------------------------------------------------
# 5. FEATURE IMPORTANCE PLOT
# ---------------------------------------------------------
# One row per training column, ordered from most to least important
# according to the fitted forest `rf`.
importances = rf.feature_importances_
feat_df = pd.DataFrame(
    {"feature": X_train.columns, "importance": importances}
)
feat_df = feat_df.sort_values(by="importance", ascending=False)

# Horizontal bars; the y axis is flipped so the top-ranked feature is on top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feat_df['feature'], feat_df['importance'])
ax.set_xlabel("Importance")
ax.set_title("Random Forest Feature Importance")
ax.invert_yaxis()
plt.show()

print("\nTop features sorted by importance:\n")
print(feat_df)

# ---------------------------------------------------------
# 6. SEARCH BEST NUMBER OF TOP FEATURES
# ---------------------------------------------------------
# For k = 1..n, refit a forest on the k highest-ranked features and keep the
# first k whose test accuracy strictly beats every smaller k.
# NOTE(review): k is chosen by accuracy on X_test/y_test, so `best_acc` is an
# optimistic estimate rather than an independent hold-out score.
sorted_features = feat_df['feature'].tolist()
best_acc, best_k, best_features = 0, None, None

for k, _ in enumerate(sorted_features, start=1):
    selected = sorted_features[:k]

    rf_k = RandomForestClassifier(
        n_estimators=500, random_state=92, class_weight='balanced'
    )
    rf_k.fit(X_train[selected], y_train)
    acc_k = accuracy_score(y_test, rf_k.predict(X_test[selected]))

    # Ties keep the earlier (smaller) feature set.
    if not acc_k > best_acc:
        continue
    best_acc, best_k, best_features = acc_k, k, selected

banner = "==================================================="
print("\n" + banner)
print(f"BEST ACCURACY FOUND: {best_acc:.4f} using top {best_k} features")
print(banner)
print("Best feature set:")
print(best_features)

# ---------------------------------------------------------
# 7. PRINT THEIR IMPORTANCES
# ---------------------------------------------------------
# Importances are those of the all-features forest, filtered to the winners.
in_best_set = feat_df['feature'].isin(best_features)
best_imp_df = feat_df[in_best_set]

print("\nFeature importances for BEST model:\n")
print(best_imp_df)

In [None]:
# Load data
df = pd.read_csv(file_path)

# Create first_round_win: 0 if ROUND == 64, else 1
df['first_round_win'] = df['ROUND'].apply(lambda x: 0 if x == 64 else 1)

# Keep only the rows with ROUND == 64.
# NOTE(review): those are exactly the rows whose target is 0, so `y` is
# constant 0 as written. Presumably the row filter should use a different
# column than the one the target is derived from — confirm against the data.
df_first_round = df[df['ROUND'] == 64].copy()

# Drop any missing targets
df_first_round = df_first_round.dropna(subset=['first_round_win'])

# Now target and features
y = df_first_round['first_round_win']
X = df_first_round.drop(columns=['first_round_win', 'ROUND'], errors='ignore')

# --- Create difference features for opponent stats ---
OPP_PREFIX = "Opp"
opponent_cols = [col for col in X.columns if col.startswith(OPP_PREFIX)]

# Team columns that have an opponent counterpart; collected once so the drop
# below does not rebuild the stripped-name list for every column.
paired_team_cols = []

for opp_col in opponent_cols:
    # Strip only the leading prefix; str.replace would also remove any later
    # occurrence of "Opp" inside the column name.
    team_col = opp_col[len(OPP_PREFIX):]

    if team_col in X.columns:
        X[team_col + "_diff"] = X[team_col] - X[opp_col]
        paired_team_cols.append(team_col)

# Drop the original team and opponent columns to avoid redundancy
cols_to_drop = opponent_cols + paired_team_cols
X = X.drop(columns=cols_to_drop, errors='ignore')

# Add additional interaction features.
# These read AdjEM, AdjTempo, AvgHeight, Experience and SEED, so those columns
# must still be present after the drop above (i.e. have no "Opp" counterpart).
X["AdjEM_AdjTempo_ratio"] = X["AdjEM"] / (X["AdjTempo"] + 1e-6)  # avoid divide by zero
X["Height_Experience_interaction"] = X["AvgHeight"] * X["Experience"]
X["Seed_AdjEM_interaction"] = X["SEED"] * X["AdjEM"]

print("Features used by the model:", X.columns.tolist())

In [None]:
# TEST AND TRAIN
from sklearn.preprocessing import LabelEncoder

# Work on a copy so X keeps its raw values; every object-dtype column is
# replaced by integer codes and its fitted encoder is stored by column name.
X_encoded = X.copy()
label_encoders = {}

object_cols = [name for name in X_encoded.columns if X_encoded[name].dtype == 'object']
for col in object_cols:
    encoder = LabelEncoder()
    X_encoded[col] = encoder.fit_transform(X_encoded[col].astype(str))
    label_encoders[col] = encoder

# 70/30 split of the encoded features, stratified on the target, fixed seed.
split_options = dict(test_size=0.3, random_state=92, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)

print("Training set:", X_train.shape, y_train.shape)
print("Testing set:", X_test.shape, y_test.shape)

In [None]:
# --- Train Random Forest without hyperparameter tuning ---
# Default forest settings apart from a fixed seed and balanced class weights.
rf_base = RandomForestClassifier(
    random_state=42,
    class_weight='balanced'  # optional
)

# Fit the model
rf_base.fit(X_train, y_train)

# Make predictions
y_pred = rf_base.predict(X_test)

# Label importances with the columns the model was actually fitted on
# (X_train) rather than X, so the labels cannot drift from the model if X is
# rebuilt by another cell.
importances = rf_base.feature_importances_
feat_df = pd.DataFrame({
    "feature": X_train.columns,
    "importance": importances
}).sort_values("importance", ascending=False)

# Horizontal bars, most important feature at the top.
plt.figure(figsize=(10,6))
plt.barh(feat_df['feature'], feat_df['importance'])
plt.xlabel("Importance")
plt.title("Random Forest Feature Importance")
plt.gca().invert_yaxis()
plt.show()



In [None]:
# Fit on all features to obtain the importance ranking used below
# (500 trees, unlimited depth, fixed seed, no class weighting).
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    random_state=92
)
rf.fit(X_train, y_train)

# ------------------------------------------------------
# 4. FEATURE IMPORTANCE PLOT
# ------------------------------------------------------
# Feature names come from X_train, the frame `rf` was fitted on, so the
# labels always line up with `feature_importances_`.
importances = rf.feature_importances_
feat_df = pd.DataFrame({
    "feature": X_train.columns,
    "importance": importances
}).sort_values("importance", ascending=False)

plt.figure(figsize=(10,6))
plt.barh(feat_df['feature'], feat_df['importance'])
plt.xlabel("Importance")
plt.title("Random Forest Feature Importance")
plt.gca().invert_yaxis()
plt.show()

# ------------------------------------------------------
# 5. LOOP: TEST ACCURACY USING TOP k FEATURES
# ------------------------------------------------------
# NOTE(review): k is selected by accuracy on the test split, so the best
# accuracy reported below is optimistic rather than an independent estimate.
accuracies = []

for k in range(1, len(feat_df)+1):
    # Plain list of column names (not a Series) for column selection.
    top_k_features = feat_df['feature'].iloc[:k].tolist()

    # Train RF using only top k variables
    rf_k = RandomForestClassifier(
        n_estimators=500,
        random_state=92
    )
    rf_k.fit(X_train[top_k_features], y_train)

    y_pred_k = rf_k.predict(X_test[top_k_features])
    acc_k = accuracy_score(y_test, y_pred_k)

    accuracies.append((k, acc_k))

# Find best number of variables (first k reaching the maximum accuracy)
best_k, best_acc = max(accuracies, key=lambda x: x[1])

print("\n-------------------------------")
print(f"Best accuracy achieved with k = {best_k} variables")
print(f"Accuracy: {best_acc:.4f}")
print("-------------------------------\n")

# ------------------------------------------------------
# 6. PRINT BEST FEATURES + IMPORTANCE VALUES
# ------------------------------------------------------
# Here `best_features` is a DataFrame (feature + importance rows), not a list
# of column names.
best_features = feat_df.iloc[:best_k]
print("Top features producing highest accuracy:\n")
print(best_features)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import numpy as np

# --- Untuned Random Forest: fit, score, and rank features ---
base_options = {"random_state": 42, "class_weight": "balanced"}  # class_weight is optional
rf_base = RandomForestClassifier(**base_options)
rf_base.fit(X_train, y_train)

# Predictions on the held-out split
y_pred = rf_base.predict(X_test)

# --- Evaluate ---
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# --- Feature Importance ---
# `indices` orders the columns from most to least important.
importances = rf_base.feature_importances_
features = X_train.columns
indices = np.argsort(importances)[::-1]

bar_positions = range(len(importances))
bar_labels = [features[i] for i in indices]

fig, ax = plt.subplots(figsize=(10,6))
ax.set_title("Feature Importances (Random Forest)")
ax.bar(bar_positions, importances[indices], align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(bar_labels, rotation=45)
ax.set_ylabel("Importance")
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# --- Step 1: Determine feature ranking ---
# Ranking comes from `rf_base`, which must already be fitted on X_train.
importances = rf_base.feature_importances_
indices = np.argsort(importances)[::-1]  # descending order
features = X_train.columns

# Number of top-ranked features used by the final (exported) model.
N_FINAL_FEATURES = 7

# --- Step 2: Track best accuracy for subsets ---
# NOTE(review): subsets are compared by accuracy on the test split, so
# `best_acc` is an optimistic estimate rather than an independent one.
best_acc = 0
best_n = 0
best_features = []
accuracy_results = []

rf_params = {
    'random_state': 42,
    'class_weight': 'balanced',
}

for n in range(1, len(features) + 1):
    top_features = features[indices[:n]]

    rf_model = RandomForestClassifier(**rf_params)
    rf_model.fit(X_train[top_features], y_train)

    y_pred = rf_model.predict(X_test[top_features])
    acc = accuracy_score(y_test, y_pred)
    accuracy_results.append((n, acc))

    # The arrow was mis-encoded ("â†’") in the original message.
    print(f"Top {n} features → Accuracy = {acc:.4f}")

    # Strict comparison: ties keep the smaller feature set.
    if acc > best_acc:
        best_acc = acc
        best_n = n
        best_features = top_features

print("\n=== Best Result ===")
print(f"Number of top features: {best_n}")
print(f"Features: {list(best_features)}")
print(f"Highest accuracy: {best_acc:.4f}")

# --- Step 3: Build final model using the top N_FINAL_FEATURES features ---
# This is a fixed count; it is not derived from `best_n` above.
top_7_features = features[indices[:N_FINAL_FEATURES]]
final_model = RandomForestClassifier(**rf_params)
final_model.fit(X_train[top_7_features], y_train)

# --- Step 4: Evaluate final model ---
y_pred_final = final_model.predict(X_test[top_7_features])
print(f"\n=== Final Model Evaluation (Top {N_FINAL_FEATURES} Features) ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_final):.4f}")

In [None]:
# Convert the fitted `final_model` into JavaScript source with m2cgen.
# `final_model` was fitted on X_train[top_7_features], so the generated code
# presumably expects its inputs in that column order — confirm before use.
js_code = m2c.export_to_javascript(final_model)

# Write the generated source to model.js in the working directory
# (overwrites any existing file).
with open("model.js", mode="w") as js_file:
    js_file.write(js_code)

In [None]:
year_to_test = 2024

# Predict from X_encoded, the frame the model's training data was split from;
# raw X may still hold un-encoded object columns.
# Build the mask on the same index as X_encoded/y: a mask taken from the full
# `df` has a different index and forces pandas to re-align it.
year_mask = df.loc[X_encoded.index, 'year'] == year_to_test
X_year = X_encoded[year_mask]
y_year = y[year_mask]

# Select top 7 features
X_year_top7 = X_year[top_7_features]

# Predict
# NOTE(review): these rows can include rows the model was trained on, so this
# is not an out-of-sample check.
y_pred_year = final_model.predict(X_year_top7)
y_prob_year = final_model.predict_proba(X_year_top7)  # optional: probabilities

# Create a results DataFrame
results_year = X_year_top7.copy()
results_year['Actual'] = y_year
results_year['Predicted'] = y_pred_year
# predict_proba columns follow final_model.classes_: column 0 is the
# probability of the first class, column 1 of the second.
results_year['Prob_A'] = y_prob_year[:,0]
results_year['Prob_B'] = y_prob_year[:,1]

# Print the results
print(results_year)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

year_to_test = 2019

# Rows of the chosen year with ROUND below 64, printed for inspection.
# Uses year_to_test so the printed rows match the year evaluated below.
df_tourney = df[(df['year'] == year_to_test) & (df['ROUND'] < 64)]
print(df_tourney)

# Predict from X_encoded, the frame the model's training data was split from;
# raw X may still hold un-encoded object columns.
# Build the mask on the same index as X_encoded/y: a mask taken from the full
# `df` has a different index and forces pandas to re-align it.
year_mask = df.loc[X_encoded.index, 'year'] == year_to_test
X_year = X_encoded[year_mask]
y_year = y[year_mask]

# --- Select top 7 features ---
X_year_top7 = X_year[top_7_features]

# --- Predict ---
# NOTE(review): these rows can include rows the model was trained on, so the
# accuracy below is not an out-of-sample estimate.
y_pred_year = final_model.predict(X_year_top7)
y_prob_year = final_model.predict_proba(X_year_top7)  # gives probabilities for each class

# --- Evaluate ---
print(f"\n=== Final Model Evaluation on {year_to_test} ===")
print(f"Accuracy: {accuracy_score(y_year, y_pred_year):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_year, y_pred_year))

# --- Optional: Inspect individual predictions ---
results_year = X_year_top7.copy()
results_year['Actual'] = y_year
results_year['Predicted'] = y_pred_year
# predict_proba columns follow final_model.classes_ order.
results_year['Prob_A'] = y_prob_year[:,0]
results_year['Prob_B'] = y_prob_year[:,1]

print(results_year)

In [None]:
# Show the feature selection left over from an earlier cell and the columns
# available for training (both must already exist in the kernel).
print(top_features)
print(X_train.columns.tolist())

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import numpy as np

# --- Feature subset used for the search ---
top_features = [
    'AdjEM_AdjTempo_ratio', 'AdjEM', 'SEED',
    'Seed_AdjEM_interaction', 'FG2Pct_diff', 'ORPct', 'TOPct'
]

# --- Base estimator: tree count held fixed at 100 ---
rf_base = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# --- Grid: sweep max_depth over 2, 4, ..., 30; other parameters pinned ---
depth_candidates = list(range(2, 32, 2))
param_grid = {
    'max_depth': depth_candidates,
    'min_samples_split': [60],
    'min_samples_leaf': [7],
}

# --- 3-fold cross-validated search scored on accuracy, using all cores ---
search_options = dict(cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search = GridSearchCV(rf_base, param_grid, **search_options)
grid_search.fit(X_train[top_features], y_train)

# --- Best model ---
best_rf = grid_search.best_estimator_
print("Best hyperparameters:", grid_search.best_params_)

In [None]:
# Observed proportion (as a fraction)
p_obs = 86.76 / 100

# Hypothesized proportion
p0 = 78.125 / 100

# Sample size
n = 100  # replace with your actual sample size

# Number of "successes" in the sample.
# round() rather than int(): int() truncates, so a product such as 58.9968
# would be counted as 58 successes instead of 59.
count = round(p_obs * n)

# Perform one-proportion z-test (two-sided, the statsmodels default)
stat, pval = proportions_ztest(count, n, value=p0)
print(f"Z-statistic: {stat:.4f}")
print(f"P-value: {pval:.4f}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import numpy as np

# --- Base estimator for the search ---
rf_base = RandomForestClassifier(random_state=42, class_weight='balanced')

# --- Hyperparameter grid: a single candidate per parameter ---
param_grid = {
    'n_estimators': [100],
    'max_depth': [20],
    'min_samples_split': [60],
    'min_samples_leaf': [7],
}

# --- Grid search with 3-fold cross-validation, scored on accuracy ---
search_options = dict(cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search = GridSearchCV(rf_base, param_grid, **search_options)

# Restrict training to the hand-picked feature subset
top_features = ['AdjEM_AdjTempo_ratio', 'AdjEM', 'SEED',
                'Seed_AdjEM_interaction', 'FG2Pct_diff', 'ORPct', 'TOPct']

grid_search.fit(X_train[top_features], y_train)

# --- Best model ---
best_rf = grid_search.best_estimator_
print("Best hyperparameters:", grid_search.best_params_)

# --- Evaluate on test set ---
y_pred = best_rf.predict(X_test[top_features])
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# --- Feature Importance (bars ordered from most to least important) ---
importances = best_rf.feature_importances_
indices = np.argsort(importances)[::-1]

bar_positions = range(len(importances))
fig, ax = plt.subplots(figsize=(10,6))
ax.set_title("Feature Importances (Tuned Random Forest)")
ax.bar(bar_positions, importances[indices], align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels([top_features[i] for i in indices], rotation=45)
ax.set_ylabel("Importance")
plt.show()

# --- Mean CV accuracy as a function of min_samples_leaf ---
results = grid_search.cv_results_
mean_test_scores = results['mean_test_score']
leaf_values = param_grid['min_samples_leaf']

# For each min_samples_leaf value, average the mean CV score over every grid
# point that used it.
avg_scores = []
for leaf in leaf_values:
    uses_leaf = np.array(
        [params['min_samples_leaf'] == leaf for params in results['params']],
        dtype=bool,
    )
    avg_scores.append(np.mean(mean_test_scores[uses_leaf]))

fig, ax = plt.subplots(figsize=(8,5))
ax.plot(leaf_values, avg_scores, marker='o')
ax.set_title("Mean CV Accuracy vs min_samples_leaf")
ax.set_xlabel("param_min_samples_leaf")
ax.set_ylabel("Mean CV Accuracy")
ax.grid(True)
plt.show()

In [None]:
# Tabular view of the most recent grid search results.
results = pd.DataFrame(grid_search.cv_results_)

# Function to plot accuracy vs parameter
def plot_param_vs_accuracy(param_name, cv_results=None):
    """Plot mean cross-validated accuracy against one searched hyperparameter.

    Parameters
    ----------
    param_name : str
        Column of the results table to plot on the x axis, e.g.
        ``'param_max_depth'``.
    cv_results : pandas.DataFrame, optional
        Table with ``param_name`` and ``'mean_test_score'`` columns. Defaults
        to the notebook-level ``results`` frame.

    Object-dtype columns, or columns with fewer than 10 distinct values, are
    averaged per value and drawn as a line; anything else is drawn as a
    scatter of the raw scores.
    """
    table = results if cv_results is None else cv_results
    param_values = table[param_name]
    mean_scores = table['mean_test_score']

    # For discrete parameters, aggregate by value
    if param_values.dtype == object or len(np.unique(param_values)) < 10:
        agg_scores = table.groupby(param_name)['mean_test_score'].mean()
        plt.figure(figsize=(8,5))
        plt.plot(agg_scores.index, agg_scores.values, marker='o')
    else:
        plt.figure(figsize=(8,5))
        plt.scatter(param_values, mean_scores)

    plt.title(f"Mean CV Accuracy vs {param_name}")
    plt.xlabel(param_name)
    plt.ylabel("Mean CV Accuracy")
    plt.grid(True)
    plt.show()

# Plot for each hyperparameter
for param in ['param_n_estimators', 'param_max_depth',
              'param_min_samples_split', 'param_min_samples_leaf']:
    # cv_results_ only has a param_* column for parameters that were in the
    # searched grid; skip the others instead of raising KeyError.
    if param not in results.columns:
        print(f"Skipping {param}: not part of the searched grid")
        continue
    plot_param_vs_accuracy(param)

In [None]:
# `accuracy_results` holds (n_features, accuracy) pairs from the subset
# search; unzip them into two parallel sequences for plotting.
n_features, accs = zip(*accuracy_results)

fig, ax = plt.subplots(figsize=(10,6))
ax.plot(n_features, accs, marker='o')
ax.set_xlabel("Number of Top Features")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy vs Number of Top Features")
plt.show()

In [None]:
# One-sided test: is the best test accuracy larger than a fixed null proportion?
n_total = len(y_test)                    # total predictions
# round() rather than int(): accuracy * n can land just below a whole number
# in floating point, and int() would then drop one correct prediction.
n_correct = round(best_acc * n_total)    # number of correct predictions
p_null = .75                             # null-hypothesis proportion
# NOTE(review): `best_acc` was selected on this same test split, so the
# p-value is optimistic.
stat, p_value = proportions_ztest(count=n_correct, nobs=n_total, value=p_null, alternative='larger')
# Report the null actually tested (the message previously said 0.5).
print(f"\nProportion test against {p_null}:")
print(f"Z-statistic: {stat:.4f}")
print(f"P-value: {p_value:.4f}")

In [None]:
# Seed-only baseline, assuming a smaller seed number means a stronger team:
# predict a win (1) for rows whose SEED is at or below the median SEED of the
# test split, otherwise a loss (0).
# NOTE(review): this thresholds on the test-set median; it does not compare
# the two seeds within a matchup.
# (An earlier `SEED <= 8` assignment was immediately overwritten and has been
# removed.)
seed_cutoff = X_test['SEED'].median()
y_pred_seed = (X_test['SEED'] <= seed_cutoff).astype(int)

# Compute accuracy of seed-based prediction
baseline_acc = accuracy_score(y_test, y_pred_seed)
print(f"Seed-based baseline accuracy: {baseline_acc:.4f}")

In [None]:
# One-sided test: is the best test accuracy larger than the seed-only rate?
n_total = len(y_test)                    # total predictions
# round() rather than int(): accuracy * n can land just below a whole number
# in floating point, and int() would then drop one correct prediction.
n_correct = round(best_acc * n_total)    # number of correct predictions
# Hard-coded null; presumably the seed-based baseline accuracy — confirm it
# matches `baseline_acc` for the current split.
p_null = 0.78125                         # null hypothesis using just seed

stat, p_value = proportions_ztest(count=n_correct, nobs=n_total, value=p_null, alternative='larger')
# Report the null actually tested (the message previously said 0.5).
print(f"\nProportion test against {p_null}:")
print(f"Z-statistic: {stat:.4f}")
print(f"P-value: {p_value:.4f}")

In [None]:
from sklearn.metrics import confusion_matrix

# Score only the rows the model has features for. X_encoded covers a subset of
# `df`, so writing its predictions into the full `df` fails on a length
# mismatch; a separate frame also keeps `df` free of prediction columns.
eval_df = df.loc[X_encoded.index].copy()

# `final_model` was fitted on `top_7_features` of the encoded frame, so predict
# with exactly those columns (not `best_features`, which may differ in size).
# NOTE(review): X_encoded includes the rows used for training, so the error
# counts below are not an out-of-sample estimate.
eval_df['predicted'] = final_model.predict(X_encoded[top_7_features])
eval_df['correct'] = eval_df['predicted'] == eval_df['first_round_win']
wrong_cases = eval_df[~eval_df['correct']]
print(wrong_cases.head())
print(f"Total wrong: {len(wrong_cases)} out of {len(eval_df)}")

print(confusion_matrix(eval_df['first_round_win'], eval_df['predicted']))

year_to_check = 2019

# Filter for that year
df_year = eval_df[eval_df['year'] == year_to_check]

# Find wrong predictions in that year
wrong_year = df_year[df_year['predicted'] != df_year['first_round_win']]

# Show the first few wrong cases
print(wrong_year.head())

# Total wrong in that year
print(f"Total wrong in {year_to_check}: {len(wrong_year)} out of {len(df_year)}")