In [85]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingRegressor

In [86]:
training = pd.DataFrame()
training = pd.read_csv("NCAA_Seed_Training_Set2.0.csv", dtype=str)
training_sheets = pd.read_excel("NCAA_Seed_Training_Set2.0.xlsx", sheet_name=None)
testing = pd.DataFrame()
testing = pd.read_csv("NCAA_Seed_Test_Set_2026_20260208.csv", dtype=str)
conf_ranks = pd.read_excel("2026 Conference Ranks.xlsx")


In [87]:
testing = testing.merge(
    conf_ranks[["Conference", "Rank"]],
    on="Conference",
    how="left"
)

# Rename column
testing.rename(columns={"Rank": "Conf Rank"}, inplace=True)

def get_conf_rank_train(row, sheets):
    season = str(row["Season"])
    conf = row["Conference"]
    
    if season in sheets:
        season_df = sheets[season]
        
        match = season_df.loc[
            season_df["Conference"] == conf,
            "Rank"
        ]
        
        if not match.empty:
            return match.iloc[0]
    
    return np.nan

training["Conf Rank"] = training.apply(
    get_conf_rank_train,
    axis=1,
    args=(training_sheets,)
)

In [88]:
num_neighbors = 50
ind_vars1 = ["Quadrant1_Win", 
            "Quadrant1_Loss", 
            "Quadrant2_Win", 
            "Quadrant2_Loss", 
            "Quadrant3_Win", 
            "Quadrant3_Loss", 
            "Quadrant4_Win", 
            "Quadrant4_Loss",
            "NET Rank",
            "Conf Rank"]

model = KNeighborsClassifier(n_neighbors = num_neighbors)

training_filt = training.dropna(subset = ["NET Rank"])

In [89]:
# Map month abbreviations to numbers
month_map = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
    'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
    'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

cols_to_split = [
    "Quadrant1",
    "Quadrant2",
    "Quadrant3",
    "Quadrant4",
    "WL",
    "Conf.Record",
    "Non-ConferenceRecord",
    "RoadWL"
]

for col in cols_to_split:
    
    # ---------- TRAINING ----------
    split_train = training_filt[col].astype(str).str.split("-", expand=True)

    if split_train.shape[1] == 2:
        left_train = split_train[0].replace(month_map)
        right_train = split_train[1].replace(month_map)

        training_filt[f"{col}_Win"] = pd.to_numeric(left_train, errors="coerce").fillna(0)
        training_filt[f"{col}_Loss"] = pd.to_numeric(right_train, errors="coerce").fillna(0)
    else:
        training_filt[f"{col}_Win"] = 0
        training_filt[f"{col}_Loss"] = 0


    # ---------- TESTING ----------
    split_test = testing[col].astype(str).str.split("-", expand=True)

    if split_test.shape[1] == 2:
        left_test = split_test[0].replace(month_map)
        right_test = split_test[1].replace(month_map)

        testing[f"{col}_Win"] = pd.to_numeric(left_test, errors="coerce").fillna(0)
        testing[f"{col}_Loss"] = pd.to_numeric(right_test, errors="coerce").fillna(0)
    else:
        testing[f"{col}_Win"] = 0
        testing[f"{col}_Loss"] = 0

training_filt.drop(columns=cols_to_split, inplace=True)
testing.drop(columns=cols_to_split, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_filt[f"{col}_Win"] = pd.to_numeric(left_train, errors="coerce").fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_filt[f"{col}_Loss"] = pd.to_numeric(right_train, errors="coerce").fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_filt[f"{col}_Win"] = pd.to_n

In [90]:
# Create Non-AQ column
training_filt["Non-AQ"] = ""

# Convert Overall Seed to numeric (invalid values become NaN)
training_filt["Overall Seed"] = pd.to_numeric(
    training_filt["Overall Seed"], 
    errors="coerce"
)

# Condition:
# 1) Bid Type == "AL"
# OR
# 2) Bid Type == "AQ" AND Overall Seed <= 45

condition = (
    (training_filt["Bid Type"] == "AL") |
    ((training_filt["Bid Type"] == "AQ") & (training_filt["Overall Seed"] <= 45))
)

training_filt.loc[condition, "Non-AQ"] = "X"

X_cls = training_filt[ind_vars1]
y_cls = training_filt['Non-AQ']

XTrain, XTest, yTrain, yTest = train_test_split(X_cls, y_cls, random_state=0, test_size=0.2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_filt["Non-AQ"] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_filt["Overall Seed"] = pd.to_numeric(


In [91]:
model.fit(X = XTrain, y = yTrain)
y_pred = model.predict(XTest)

print('Confusion Matrix:')
print(metrics.confusion_matrix(y_true = yTest, y_pred = y_pred))
print('*****************')
print("Accuracy Train:", metrics.accuracy_score(y_true=yTrain, y_pred=model.predict(XTrain)))
print("Accuracy Test:", metrics.accuracy_score(y_true=yTest, y_pred=y_pred))
print('*****************')
scores = cross_val_score(model, XTrain, yTrain, cv=5, scoring='accuracy')
print("Cross-validated accuracy scores:", scores)
print("Mean Cross-validated accuracy scores:", round(scores.mean(), 3))
print("Standard deviation:", round(scores.std(), 3))

Confusion Matrix:
[[232   4]
 [  1  33]]
*****************
Accuracy Train: 0.9721706864564007
Accuracy Test: 0.9814814814814815
*****************
Cross-validated accuracy scores: [0.98148148 0.97685185 0.97222222 0.97209302 0.9627907 ]
Mean Cross-validated accuracy scores: 0.973
Standard deviation: 0.006


In [92]:
n_neighbors = [1, 5, 10, 20, 35, 50, 75, 100, 150, 200]
for neighbor in n_neighbors:
    model = KNeighborsClassifier(neighbor)
    print('Number of Neighbors = ', neighbor)
    model.fit(XTrain, yTrain)
    y_pred_train = model.predict(XTrain)
    print('Accuracy on training data = ', metrics.accuracy_score(y_true = yTrain, y_pred = y_pred_train))
    y_pred = model.predict(XTest)
    print('Accuracy on test data = ', metrics.accuracy_score(y_true = yTest, y_pred = y_pred))
    print('*****************')

Number of Neighbors =  1
Accuracy on training data =  1.0
Accuracy on test data =  0.9740740740740741
*****************
Number of Neighbors =  5
Accuracy on training data =  0.9786641929499073
Accuracy on test data =  0.9777777777777777
*****************
Number of Neighbors =  10
Accuracy on training data =  0.9786641929499073
Accuracy on test data =  0.9740740740740741
*****************
Number of Neighbors =  20
Accuracy on training data =  0.9758812615955473
Accuracy on test data =  0.9814814814814815
*****************
Number of Neighbors =  35
Accuracy on training data =  0.9758812615955473
Accuracy on test data =  0.9777777777777777
*****************
Number of Neighbors =  50
Accuracy on training data =  0.9721706864564007
Accuracy on test data =  0.9814814814814815
*****************
Number of Neighbors =  75
Accuracy on training data =  0.9712430426716141
Accuracy on test data =  0.9814814814814815
*****************
Number of Neighbors =  100
Accuracy on training data =  0.9730983

In [93]:
# Apply model to actual test data
testing["Prob_Non_AQ"] = model.predict_proba(testing[ind_vars1])[:, 1]

In [94]:
# Get index of best NET Rank in each conference
best_idx = testing.groupby("Conference")["NET Rank"].idxmin()

# Assign AQ to those teams
testing.loc[best_idx, "Bid Type"] = "AQ"

# Use AQs plus probabilities to get to 68
# Separate AQs and non-AQs
aq_df = testing[testing["Bid Type"] == "AQ"].copy()
non_aq_df = testing[testing["Bid Type"] != "AQ"].copy()

num_needed = 68 - len(aq_df)

# 3Select top non-AQs by probability (highest first)
top_non_aq = (
    non_aq_df
    .sort_values("Prob_Non_AQ", ascending=False)
    .head(num_needed)
)

# Combine them
final_68 = pd.concat([aq_df, top_non_aq]) 

# Get the TeamIDs of selected non-AQ teams
selected_non_aq_ids = final_68.loc[
    final_68["Bid Type"] != "AQ", "RecordID"
]

# Update original testing dataframe
testing.loc[
    testing["RecordID"].isin(selected_non_aq_ids),
    "Bid Type"
] = "AL"

In [95]:
seed_train = training_filt.dropna(subset=["Bid Type"])
X_reg1 = seed_train[ind_vars1]
y_reg = seed_train["Overall Seed"]

XTrain, XTest, yTrain, yTest = train_test_split(
    X_reg1, y_reg, random_state=0, test_size=0.2
)

# Gradient Boosting Model
gbr = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=0
)

# Fit model
gbr.fit(XTrain, yTrain)

# Evaluate
yPred = gbr.predict(XTest)
print('Test R²:', round(metrics.r2_score(yTest, yPred), 3))

# Cross-validation
scores = cross_val_score(gbr, X_reg1, y_reg, cv=5, scoring='r2')

print("Cross-validated R² scores:", scores)
print("Mean CV R²:", round(scores.mean(), 3))
print("Std Dev:", round(scores.std(), 3))

importance = pd.Series(
    gbr.feature_importances_,
    index=ind_vars1
).sort_values(ascending=False)

print("\nFeature Importances:")
print(importance)

Test R²: 0.905
Cross-validated R² scores: [0.83974476 0.92698761 0.84883041 0.93437998 0.90529452]
Mean CV R²: 0.891
Std Dev: 0.039

Feature Importances:
NET Rank          0.810217
Quadrant1_Loss    0.075626
Conf Rank         0.068355
Quadrant1_Win     0.019098
Quadrant2_Win     0.007397
Quadrant2_Loss    0.006908
Quadrant4_Win     0.003969
Quadrant3_Loss    0.003466
Quadrant3_Win     0.002936
Quadrant4_Loss    0.002029
dtype: float64


In [96]:
final_pred = testing

#Intialize Columns 
final_pred["Initial Seed"] = np.nan
final_pred["Overall Seed"] = 0

# Filter to only selected teams
filtered_pred = final_pred[
    (testing["Bid Type"] == "AL") |
    (testing["Bid Type"] == "AQ")
]

# Prepare regression input
X_reg1_test = filtered_pred[ind_vars1]
X_reg1_test = X_reg1_test.apply(pd.to_numeric, errors="coerce").fillna(0)

# Get regression predictions
predictions = gbr.predict(X_reg1_test)

# Store predictions
filtered_pred["Initial Seed"] = predictions

# Sort by Initial Seed ascending (lowest = best)
filtered_pred = filtered_pred.sort_values("Initial Seed").reset_index()
# Assign preliminary rank
filtered_pred["Overall Seed"] = np.arange(1, len(filtered_pred) + 1)

# Merge ranks back into original dataframe
final_pred.loc[
    filtered_pred["index"],
    "Overall Seed"
] = filtered_pred["Overall Seed"].values

# Ensure proper type
final_pred["Overall Seed"] = final_pred["Overall Seed"].fillna(0).astype(int)

# Export only required columns
output = final_pred[["RecordID", "Overall Seed"]]

output.to_csv("final_predictions.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_pred["Initial Seed"] = predictions
