Load needed libraries

In [26]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV


Load Data

In [27]:
# Workbook locations and the name of the main data sheet inside each one.
TRAIN_WORKBOOK = "NCAA_Seed_Training_Set2.0.xlsx"
TRAIN_SHEET = "NCAA_Seed_Training_Set2.0"
TEST_WORKBOOK = "NCAA_Seed_Test_Set2.0.xlsx"
TEST_SHEET = "NCAA_Seed_Test_Set2.0"

# sheet_name=None returns a dict {sheet name -> DataFrame}, so each workbook
# is parsed once and the main sheet is taken from that dict.
training_sheets = pd.read_excel(TRAIN_WORKBOOK, sheet_name=None)
# .copy() so columns added to `training` later do not leak into the sheet dict.
training = training_sheets[TRAIN_SHEET].copy()

# Read the *test* workbook here; the original re-read the training workbook
# into `testing_sheets`.
testing_sheets = pd.read_excel(TEST_WORKBOOK, sheet_name=None)
testing = testing_sheets[TEST_SHEET].copy()


Add outside data (NET conference ranking) as a model feature

In [28]:
def get_conf_rank(row, sheets):
    """Look up the rank of a row's conference in that row's season sheet.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys "Season" and "Conference".
    sheets : dict
        Maps a season name (string) to a DataFrame that has "Conference"
        and "Rank" columns.

    Returns
    -------
    The "Rank" value of the first row in the season sheet whose
    "Conference" equals the row's conference, or None when the season has
    no sheet or the conference is not listed in it.
    """
    season = row["Season"]
    conf = row["Conference"]

    # A season stored as a float (e.g. 2023.0, which happens when the column
    # contains missing values) would stringify to "2023.0" and never match a
    # sheet named "2023"; collapse integral floats to int first.
    if isinstance(season, float) and season.is_integer():
        season = int(season)

    # Sheet names are strings, so compare on the string form of the season.
    season = str(season)

    if season not in sheets:
        return None  # no sheet for this season

    season_df = sheets[season]
    match = season_df.loc[season_df["Conference"] == conf, "Rank"]

    if match.empty:
        return None  # conference not listed for this season

    return match.iloc[0]

# Attach each team's conference rank, looked up row by row from the
# per-season sheets.
training["Conf Rank"] = training.apply(get_conf_rank, axis=1, args=(training_sheets,))

# The test feature must be computed from the *test* rows. The original applied
# the lookup to `training`, which assigned training rows' ranks to test rows
# by index alignment.
# NOTE(review): the lookup still uses `training_sheets`, as the original did;
# switch to `testing_sheets` if the test workbook carries its own per-season
# ranking sheets.
testing["Conf Rank"] = testing.apply(get_conf_rank, axis=1, args=(training_sheets,))

In [None]:
# Predictor columns used by the regression model, in model input order.
ind_vars1 = [
    "Q1 Win", "Q1 Loss",
    "Q2 Win", "Q2 Loss",
    "Q3 Win", "Q3 Loss",
    "Q4 Win", "Q4 Loss",
    "NET Rank",
    "Conf Rank",
]

# Keep only the rows that have a NET Rank value.
training_filt = training[training["NET Rank"].notna()]
testing_filt = testing[testing["NET Rank"].notna()]

In [30]:
# Regression sample: rows that have a Bid Type value.
seed_train = training_filt[training_filt["Bid Type"].notna()]
X_reg1 = seed_train.loc[:, ind_vars1]
y_reg = seed_train.loc[:, "Overall Seed"]

# Hold out 20% of the sample for evaluation; fixed seed for repeatability.
XTrain, XTest, yTrain, yTest = train_test_split(X_reg1, y_reg, test_size=0.2, random_state=0)

In [31]:
# Gradient boosting regressor with fixed hyperparameters and seed.
gbr_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 3,
    "random_state": 0,
}
gbr = GradientBoostingRegressor(**gbr_params)

# Train on the training split.
gbr.fit(XTrain, yTrain)

# Score the held-out split.
yPred = gbr.predict(XTest)
test_r2 = metrics.r2_score(yTest, yPred)
print('Test R²:', round(test_r2, 3))

# 5-fold cross-validated R² over the full regression sample.
scores = cross_val_score(gbr, X_reg1, y_reg, cv=5, scoring='r2')

print("Cross-validated R² scores:", scores)
for label, value in (("Mean CV R²:", scores.mean()), ("Std Dev:", scores.std())):
    print(label, round(value, 3))

# Feature importances of the fitted model, largest first.
importance = pd.Series(gbr.feature_importances_, index=ind_vars1)
importance = importance.sort_values(ascending=False)

print("\nFeature Importances:")
print(importance)

Test R²: 0.935
Cross-validated R² scores: [0.86229868 0.95594984 0.88826479 0.94293319 0.9332869 ]
Mean CV R²: 0.917
Std Dev: 0.035

Feature Importances:
NET Rank     0.757055
Q1 Win       0.118567
NETSOS       0.039140
Conf Rank    0.038234
Q1 Loss      0.013096
Q2 Win       0.010841
Q2 Loss      0.009822
Q3 Win       0.004513
Q4 Win       0.004410
Q4 Loss      0.003387
Q3 Loss      0.000934
dtype: float64


Generate predictions for submission

In [32]:
# Start from a copy of the test set with an empty raw-prediction column.
final_pred = testing.copy()
final_pred["Initial Seed"] = np.nan

# Only rows whose Bid Type is "AL" or "AQ" get a model prediction.
filtered_pred = testing_filt[testing_filt["Bid Type"].isin(["AL", "AQ"])]
X_reg1_test = filtered_pred[ind_vars1]
predictions = gbr.predict(X_reg1_test)

# Write the raw predictions back by index, then round, clip to [1, 68],
# and set rows without a prediction to 0.
final_pred.loc[filtered_pred.index, "Initial Seed"] = predictions
final_pred["Overall Seed"] = (
    final_pred["Initial Seed"]
    .round(0)
    .clip(lower=1, upper=68)
    .fillna(0)
    .astype(int)
)

# The submission file keeps only the record id and the predicted seed.
final_pred = final_pred[["RecordID", "Overall Seed"]]
final_pred.to_csv("predictions.csv", index=False)