# Method
We pulled data from [here](https://www.kaggle.com/andrewsundberg/college-basketball-dataset/version/4) which has team stats and their success. From this we'll build a model to predict whether a team is in the elite eight by their stats. Finally we'll pull Duke's stats from this year and see what chance they're in the elite eight.

In [28]:
import pandas as pd
import os

df = pd.read_csv(
    os.path.join(
        os.path.abspath(""), "cbb.csv"
    )
)


In [29]:
df.columns

Index(['TEAM', 'CONF', 'G', 'W', 'ADJOE', 'ADJDE', 'BARTHAG', 'EFG_O', 'EFG_D',
       'TOR', 'TORD', 'ORB', 'DRB', 'FTR', 'FTRD', '2P_O', '2P_D', '3P_O',
       '3P_D', 'ADJ_T', 'WAB', 'POSTSEASON', 'SEED', 'YEAR'],
      dtype='object')

In [30]:
# df = df[df["POSTSEASON"].isin(["R68", "R64", "R32", "S16", "E8", "F4", "2ND", "Champion"])]



In [31]:
df.count()

TEAM          2455
CONF          2455
G             2455
W             2455
ADJOE         2455
ADJDE         2455
BARTHAG       2455
EFG_O         2455
EFG_D         2455
TOR           2455
TORD          2455
ORB           2455
DRB           2455
FTR           2455
FTRD          2455
2P_O          2455
2P_D          2455
3P_O          2455
3P_D          2455
ADJ_T         2455
WAB           2455
POSTSEASON     476
SEED           476
YEAR          2455
dtype: int64

In [32]:
targets = df["POSTSEASON"].isin(["Champions", "E8", "2ND", "F4"])

In [33]:
# Should equal 7 seasons * 8 teams each season
targets.value_counts()

False    2399
True       56
Name: POSTSEASON, dtype: int64

In [47]:
def get_inputs(df):
    df["WINRATIO"] = df["W"] / df["G"]
    means = df.groupby("YEAR").transform(lambda x: x.mean())

    fields_to_standardize = ["ADJOE", "ADJDE", "BARTHAG", "EFG_O", "EFG_D", "TOR", "TORD", "ORB", "DRB", "FTR", "FTRD", "2P_O", "2P_D", "3P_O", "3P_D", "ADJ_T"]
    for field in fields_to_standardize:
        df[field] = df[field] / means[field]

    return df[["WINRATIO", "ADJOE", "ADJDE", "BARTHAG", "EFG_O", "EFG_D", "TOR", "TORD", "ORB", "DRB", "FTR", "FTRD", "2P_O", "2P_D", "3P_O", "3P_D", "ADJ_T", "WAB"]]
    # return df[["WINRATIO", "BARTHAG","WAB"]]


In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, recall_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
import random
import numpy as np

# def get_inputs(df):
#     df["WINRATIO"] = df["W"] / df["G"]
#     return df[["WINRATIO", "ADJOE", "ADJDE", "BARTHAG", "EFG_O", "EFG_D", "TOR", "TORD", "ORB", "DRB", "FTR", "FTRD", "2P_O", "2P_D", "3P_O", "3P_D", "ADJ_T", "WAB"]]
#     # return df[["WINRATIO", "BARTHAG","WAB"]]

def preprocess(inputs):
    scaler = preprocessing.StandardScaler().fit(inputs)
    return scaler.transform(inputs)

def build_and_test_model(test_validation_ratio=0.75):
    inputs = get_inputs(df)


    scaled_inputs = preprocess(inputs)

    X_train, X_test, y_train, y_test = train_test_split(scaled_inputs, targets, test_size=1 - test_validation_ratio, random_state=42)

    w = [{0:1000,1:100},{0:1000,1:10}, {0:1000,1:1.0}, 
        {0:500,1:1.0}, {0:400,1:1.0}, {0:300,1:1.0}, {0:200,1:1.0}, 
        {0:150,1:1.0}, {0:100,1:1.0}, {0:99,1:1.0}, {0:10,1:1.0}, 
        {0:0.01,1:1.0}, {0:0.01,1:10}, {0:0.01,1:100}, 
        {0:0.001,1:1.0}, {0:0.005,1:1.0}, {0:1.0,1:1.0}, 
        {0:1.0,1:0.1}, {0:10,1:0.1}, {0:100,1:0.1}, 
        {0:10,1:0.01}, {0:1.0,1:0.01}, {0:1.0,1:0.001}, {0:1.0,1:0.005}, 
        {0:1.0,1:10}, {0:1.0,1:99}, {0:1.0,1:100}, {0:1.0,1:150}, 
        {0:1.0,1:200}, {0:1.0,1:300},{0:1.0,1:400},{0:1.0,1:500}, 
        {0:1.0,1:1000}, {0:10,1:1000},{0:100,1:1000} ]
    crange = np.arange(1, 20.0, 2)
    hyperparam_grid = {"class_weight": w
                    ,"penalty": [None, "l1", "l2"]
                    ,"C": crange
                    ,"fit_intercept": [True, False]  }

    log_reg = LogisticRegression(max_iter=1000)
    grid = GridSearchCV(log_reg,hyperparam_grid,scoring="roc_auc", cv=None, n_jobs=-1, refit=True)
    grid.fit(scaled_inputs, targets)
    print(f'Best score: {grid.best_score_} with param: {grid.best_params_}')
    # if test_validation_ratio < 1:
    #     print(f"Accuracy: {accuracy_score(y_test, log_reg.predict(X_test))}")
    #     print(f"Confusion Matrix: {confusion_matrix(y_test, log_reg.predict(X_test))}")
    #     print(f"ROC-AUC: {roc_auc_score(y_test, log_reg.predict(X_test))}")
    #     print(f"Recall: {recall_score(y_test, log_reg.predict(X_test))}")
    #     print(f"Random Accuracy: {accuracy_score(y_test, [random.randint(1, 8) == 1 for row in y_test])}")

    best_reg = LogisticRegression(max_iter=1000, **grid.best_params_)
    best_reg.fit(scaled_inputs, targets)
    return best_reg


In [49]:
regression = build_and_test_model()

  means = df.groupby("YEAR").transform(lambda x: x.mean())


Best score: 0.9766032453976088 with param: {'C': 3.0, 'class_weight': {0: 0.001, 1: 1.0}, 'fit_intercept': False, 'penalty': 'l2'}


7000 fits failed out of a total of 10500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3500 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mark/.cache/pypoetry/virtualenvs/predictingtheorb-I8IAuHj--py3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mark/.cache/pypoetry/virtualenvs/predictingtheorb-I8IAuHj--py3.10/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/mark/.cache/pypoetry/virtualenvs/predictingtheorb-I8IAuHj--py3.10/lib/python3.10/site-packages/sklearn/linear

In [53]:
duke_df = pd.read_csv(
    os.path.join(
        os.path.abspath(""), "duke.csv"
    )
)

inputs = get_inputs(duke_df)


scaled_inputs = preprocess(inputs)

probabilities = regression.predict_proba(scaled_inputs)

for i, p in enumerate(probabilities):
    print(f"{duke_df.loc[i, 'TEAM']} {duke_df.loc[i, 'YEAR']} -- Elite 8 chance: {round(p[1]*100, 2)}%")

Duke 2022 -- Elite 8 chance: 33.49%
Louisville 2013 -- Elite 8 chance: 52.2%
Connecticut 2014 -- Elite 8 chance: 17.75%
Duke 2015 -- Elite 8 chance: 66.72%
Villanova 2016 -- Elite 8 chance: 51.46%
North Carolina 2017 -- Elite 8 chance: 44.42%
Villanova 2018 -- Elite 8 chance: 66.32%
Virginia 2019 -- Elite 8 chance: 71.58%


  means = df.groupby("YEAR").transform(lambda x: x.mean())


# Analysis
According to our model, we predict the following:

Duke 2022 -- Elite 8 chance: 33.49%

We also computed previous champions elite 8 chances to sanity check our model.

I've copied 538's model results for Elite 8 chances for each of these teams as well for comparison

| Team | Year | Predicted Chance | 538 Chance |
| --- | ----------- | --- | ----------- |
| Louisville | 2013 | 52.2% | NA - No model this year |
| Connecticut | 2014 | 17.75% | 14% |
| Duke | 2015 | 66.72% | 55% |
| Villanova | 2016 | 51.46% | 47% |
| North Carolina | 2017 | 44.42% | 58% |
| Villanova | 2018 | 66.32% | 67% |
| Virginia | 2019 | 71.58% | 73% |

Our percent chances agree very closely with 538's. Given that I think we should use our numbers with slight adjustments.


