<a href="https://colab.research.google.com/github/milesfking/NBA-Champion-Model/blob/main/Champion%20Model%20LOOCV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries

In [None]:
current_season_df = pd.read_csv('https://raw.githubusercontent.com/milesfking/NBA-Champion-Model/main/data/2023_advanced_data.csv')
current_season_df.head()

Unnamed: 0,Year,Team,Age,W,L,PW,PL,MOV,SOS,SRS,...,DRB%,DFT/FGA,Arena,Attend.,Playoffs,W/L%,Losing_season,Champion,won_last,won_last_3
0,2023.0,Atlanta Hawks,24.9,41.0,41.0,42.0,40.0,0.29,0.02,0.32,...,75.8,0.206,State Farm Arena,719787.0,Y,0.5,N,N,N,N
1,2023.0,Boston Celtics,27.4,57.0,25.0,57.0,25.0,6.52,-0.15,6.38,...,78.5,0.18,TD Garden,766240.0,Y,0.695122,N,N,N,N
2,2023.0,Brooklyn Nets,28.0,45.0,37.0,43.0,39.0,0.85,0.18,1.03,...,73.7,0.212,Barclays Center,724439.0,Y,0.54878,N,N,N,N
3,2023.0,Charlotte Hornets,25.3,27.0,55.0,26.0,56.0,-6.24,0.35,-5.89,...,75.5,0.211,Spectrum Center,702052.0,N,0.329268,Y,N,N,N
4,2023.0,Chicago Bulls,27.5,40.0,42.0,44.0,38.0,1.29,0.07,1.37,...,77.8,0.197,United Center,841632.0,Y,0.487805,Y,N,N,N


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.tree import DecisionTreeClassifier

## Data Cleaning

In [2]:
# Load in dataset of previous seasons, include only teams who made the playoffs
past_seasons_df = pd.read_csv("https://raw.githubusercontent.com/milesfking/NBA-Champion-Model/main/data/nba_team_advanced_data.csv")
past_seasons_df = past_seasons_df[past_seasons_df["Playoffs"] == "Y"]

past_seasons_df.head()

Unnamed: 0,Year,Team,Age,W,L,PW,PL,MOV,SOS,SRS,...,DRB%,DFT/FGA,Arena,Attend.,Playoffs,W/L%,Losing_season,Champion,won_last,won_last_3
1,1990.0,Boston Celtics,30.2,52.0,30.0,51.0,31.0,3.99,-0.76,3.23,...,72.1,0.223,Boston Garden,611537.0,Y,0.634146,N,N,N,N
3,1990.0,Chicago Bulls,26.1,55.0,27.0,50.0,32.0,3.26,-0.51,2.74,...,68.1,0.262,Chicago Stadium,752564.0,Y,0.670732,N,N,N,N
4,1990.0,Cleveland Cavaliers,26.1,42.0,40.0,40.0,42.0,-0.3,-0.31,-0.62,...,67.7,0.202,Coliseum at Richfield,695710.0,Y,0.512195,N,N,N,N
5,1990.0,Dallas Mavericks,29.2,47.0,35.0,41.0,41.0,0.07,0.35,0.42,...,67.9,0.232,Reunion Arena,691490.0,Y,0.573171,N,N,N,N
6,1990.0,Denver Nuggets,29.5,43.0,39.0,45.0,37.0,1.41,0.15,1.56,...,71.3,0.264,McNichols Sports Arena,484288.0,Y,0.52439,N,N,N,N


In [3]:
# Drop irrelevant columns
past_seasons_df = past_seasons_df.drop(columns=['Playoffs', 'Losing_season', 'Arena', 'L', 'W', 'PW', 'PL', 'Attend.'])

# Split into predictor and response variables
X = past_seasons_df.iloc[:, ~ past_seasons_df.columns.isin(['Year', 'Champion', 'Team'])]
y = past_seasons_df.loc[:, past_seasons_df.columns == 'Champion'].values
y = np.array([0 if val == "N" else 1 for val in y])

# Transform data
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), ['won_last', 'won_last_3'])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

# Scale data
sc = StandardScaler()
X = sc.fit_transform(X)

## Leave-One-Out Logistic Regression

In [20]:
# Get the predictor variable columns
predictor_cols = past_seasons_df.columns[~past_seasons_df.columns.isin(['Year', 'Champion', 'Team', 'won_last', 'won_last_3'])]

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse=False)

# Fit the OneHotEncoder on the predictor columns
_ = encoder.fit(past_seasons_df[['won_last', 'won_last_3']])

# Get the transformed feature names
feature_names = encoder.get_feature_names_out(input_features=['won_last', 'won_last_3'])

# Create empty DataFrame for coefficients
coefficients_df = pd.DataFrame(columns=np.concatenate([predictor_cols,feature_names]))
coefficients_df.head()



Unnamed: 0,Age,MOV,SOS,SRS,ORtg,DRtg,NRtg,Pace,FTr,3PAr,...,OFT/FGA,DeFG%,DTOV%,DRB%,DFT/FGA,W/L%,won_last_N,won_last_Y,won_last_3_N,won_last_3_Y


In [21]:
favorite = []
champion_probs = []

# Get a list of unique years in the dataset
years = past_seasons_df['Year'].unique()

# Get a list of champions in the dataset
champions = past_seasons_df[past_seasons_df['Champion'] == 'Y'][['Team', 'Year']]

# Iterate over the years
for year in years:
    print(f"Predicting for year {year}...")

    # Load test data for the current year, make all other years train data
    test_df = past_seasons_df[past_seasons_df['Year'] == year]
    train_df = past_seasons_df[past_seasons_df['Year'] != year]

    # Split into predictor and response variables
    X_train = train_df.iloc[:, ~train_df.columns.isin(['Year', 'Champion', 'Team'])]
    y_train = train_df.loc[:, train_df.columns == 'Champion'].values
    y_train = np.array([0 if val == "N" else 1 for val in y_train])

    X_test = test_df.iloc[:, ~test_df.columns.isin(['Year', 'Champion', 'Team'])]
    y_test = test_df.loc[:, test_df.columns == 'Champion'].values
    y_test = np.array([0 if val == "N" else 1 for val in y_test])

    # Transform data
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), ['won_last', 'won_last_3'])], remainder='passthrough')
    X_train = np.array(ct.fit_transform(X_train))
    X_test = np.array(ct.transform(X_test))

    # Scale data
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Create logistic regression classifier
    classifier = LogisticRegression(solver='lbfgs', random_state=0)
    classifier.fit(X_train, y_train)

    # Predict probabilities for test data
    y_proba = classifier.predict_proba(X_test)

    # Scale probabilities so they sum to 1
    normalizing_const = sum([elem[1] for elem in y_proba])
    y_proba = y_proba / normalizing_const

    # Store predicted probabilities in dataframe with team and year
    team_names = test_df['Team'].values
    year_vec = test_df['Year'].values
    predictions = pd.DataFrame(data=y_proba, columns=['Lose Probability', 'Win Probability'])
    predictions.insert(loc=0, column='Team', value=team_names)
    predictions.insert(loc=1, column='Year', value=year_vec)

    # Output dataframe with team, year, and predicted probabilities
    print("Model output:")
    sorted_probs = predictions.sort_values(by=['Win Probability'], ascending=False)[['Team', 'Win Probability']]
    print(sorted_probs.head())

    # Predicted versus true champion
    predicted_champion = sorted_probs['Team'].iloc[0]
    true_champion = champions[champions['Year'] == year]['Team'].iloc[0]
    print("Predicted Champion:", predicted_champion)
    print("True Champion:", true_champion)
    print()

    # Append coefficients to dataframe
    coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)

    # Append champion probab ilities
    champion_probs.append(predictions[predictions['Team'] == true_champion]['Win Probability'].iloc[0])

    # Get number of correct predictions
    if sorted_probs['Team'].iloc[0] == champions[champions['Year'] == year]['Team'].iloc[0]:
      favorite.append(1)
    else:
      favorite.append(0)

Predicting for year 1990.0...
Model output:
                      Team  Win Probability
5          Detroit Pistons         0.434010
8       Los Angeles Lakers         0.182904
13  Portland Trail Blazers         0.109709
14       San Antonio Spurs         0.046682
15               Utah Jazz         0.045731
Predicted Champion: Detroit Pistons
True Champion: Detroit Pistons

Predicting for year 1991.0...


  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)


Model output:
                      Team  Win Probability
12  Portland Trail Blazers         0.259600
2            Chicago Bulls         0.231335
1           Boston Celtics         0.128783
3          Detroit Pistons         0.123218
7       Los Angeles Lakers         0.094291
Predicted Champion: Portland Trail Blazers
True Champion: Chicago Bulls

Predicting for year 1992.0...
Model output:
                      Team  Win Probability
1            Chicago Bulls         0.676695
12  Portland Trail Blazers         0.098609
0           Boston Celtics         0.036095
10         New York Knicks         0.033682
15               Utah Jazz         0.031640
Predicted Champion: Chicago Bulls
True Champion: Chicago Bulls

Predicting for year 1993.0...


  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)


Model output:
                   Team  Win Probability
3         Chicago Bulls         0.374523
11         Phoenix Suns         0.184084
10      New York Knicks         0.143915
14  Seattle SuperSonics         0.090424
4   Cleveland Cavaliers         0.074340
Predicted Champion: Chicago Bulls
True Champion: Chicago Bulls

Predicting for year 1994.0...
Model output:
                   Team  Win Probability
14  Seattle SuperSonics         0.232266
1         Chicago Bulls         0.210733
9       New York Knicks         0.173840
0         Atlanta Hawks         0.129623
11         Phoenix Suns         0.073033
Predicted Champion: Seattle SuperSonics
True Champion: Houston Rockets

Predicting for year 1995.0...


  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)


Model output:
                      Team  Win Probability
10           Orlando Magic         0.222595
13       San Antonio Spurs         0.198433
15               Utah Jazz         0.133313
14     Seattle SuperSonics         0.127191
12  Portland Trail Blazers         0.062026
Predicted Champion: Orlando Magic
True Champion: Houston Rockets

Predicting for year 1996.0...
Model output:
                   Team  Win Probability
1         Chicago Bulls         0.567916
14  Seattle SuperSonics         0.169919
13    San Antonio Spurs         0.082597
9         Orlando Magic         0.064577
15            Utah Jazz         0.024138
Predicted Champion: Chicago Bulls
True Champion: Chicago Bulls

Predicting for year 1997.0...


  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)


Model output:
                   Team  Win Probability
2         Chicago Bulls         0.499989
4       Houston Rockets         0.096231
13  Seattle SuperSonics         0.084228
14            Utah Jazz         0.076883
7            Miami Heat         0.068653
Predicted Champion: Chicago Bulls
True Champion: Chicago Bulls

Predicting for year 1998.0...
Model output:
                   Team  Win Probability
2         Chicago Bulls         0.445474
14  Seattle SuperSonics         0.238034
13    San Antonio Spurs         0.066266
11         Phoenix Suns         0.059265
6    Los Angeles Lakers         0.052729
Predicted Champion: Chicago Bulls
True Champion: Chicago Bulls

Predicting for year 1999.0...
Model output:
                      Team  Win Probability
14       San Antonio Spurs         0.428039
9            Orlando Magic         0.121880
5               Miami Heat         0.096536
12  Portland Trail Blazers         0.090848
15               Utah Jazz         0.076063
Predicted Cham

  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)


Model output:
                      Team  Win Probability
5       Los Angeles Lakers         0.420511
11        Sacramento Kings         0.219155
12       San Antonio Spurs         0.127119
10  Portland Trail Blazers         0.047596
7          New Jersey Nets         0.047378
Predicted Champion: Los Angeles Lakers
True Champion: Los Angeles Lakers

Predicting for year 2003.0...
Model output:
                      Team  Win Probability
13        Sacramento Kings         0.399278
1         Dallas Mavericks         0.172881
14       San Antonio Spurs         0.152021
4       Los Angeles Lakers         0.071600
12  Portland Trail Blazers         0.049185
Predicted Champion: Sacramento Kings
True Champion: San Antonio Spurs

Predicting for year 2004.0...
Model output:
                      Team  Win Probability
15       San Antonio Spurs         0.592358
10  Minnesota Timberwolves         0.163764
6       Los Angeles Lakers         0.073708
14        Sacramento Kings         0.042255
5    

  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)


Model output:
                 Team  Win Probability
14  San Antonio Spurs         0.680170
4     Detroit Pistons         0.189782
12       Phoenix Suns         0.084556
2    Dallas Mavericks         0.013455
9          Miami Heat         0.012486
Predicted Champion: San Antonio Spurs
True Champion: Miami Heat

Predicting for year 2007.0...
Model output:
                 Team  Win Probability
12  San Antonio Spurs         0.310265
11       Phoenix Suns         0.265261
2    Dallas Mavericks         0.205069
6     Houston Rockets         0.068842
8          Miami Heat         0.038593
Predicted Champion: San Antonio Spurs
True Champion: San Antonio Spurs

Predicting for year 2008.0...


  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)


Model output:
                   Team  Win Probability
12    San Antonio Spurs         0.405321
1        Boston Celtics         0.194273
6       Houston Rockets         0.098105
11         Phoenix Suns         0.082912
8   New Orleans Hornets         0.067184
Predicted Champion: San Antonio Spurs
True Champion: Boston Celtics

Predicting for year 2009.0...
Model output:
                   Team  Win Probability
1        Boston Celtics         0.506015
3   Cleveland Cavaliers         0.192585
8    Los Angeles Lakers         0.130100
11        Orlando Magic         0.076186
14    San Antonio Spurs         0.046591
Predicted Champion: Boston Celtics
True Champion: Los Angeles Lakers

Predicting for year 2010.0...
Model output:
                   Team  Win Probability
11        Orlando Magic         0.245020
7    Los Angeles Lakers         0.228427
4   Cleveland Cavaliers         0.199184
12         Phoenix Suns         0.081404
14    San Antonio Spurs         0.072945
Predicted Champion: O

  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)


Model output:
                 Team  Win Probability
2       Chicago Bulls         0.495214
14  San Antonio Spurs         0.231100
9          Miami Heat         0.061448
3    Dallas Mavericks         0.057948
0       Atlanta Hawks         0.025719
Predicted Champion: Chicago Bulls
True Champion: Miami Heat

Predicting for year 2013.0...
Model output:
                     Team  Win Probability
11             Miami Heat         0.473417
8    Los Angeles Clippers         0.155857
15      San Antonio Spurs         0.117539
4          Denver Nuggets         0.071944
14  Oklahoma City Thunder         0.046410
Predicted Champion: Miami Heat
True Champion: Miami Heat

Predicting for year 2014.0...
Model output:
                     Team  Win Probability
13      San Antonio Spurs         0.354377
10             Miami Heat         0.336835
8    Los Angeles Clippers         0.068224
5   Golden State Warriors         0.045869
11  Oklahoma City Thunder         0.041509
Predicted Champion: San Anton

  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)


Model output:
                     Team  Win Probability
6   Golden State Warriors         0.370575
13      San Antonio Spurs         0.316832
0           Atlanta Hawks         0.075003
8    Los Angeles Clippers         0.060191
7         Houston Rockets         0.040064
Predicted Champion: Golden State Warriors
True Champion: Golden State Warriors

Predicting for year 2016.0...
Model output:
                     Team  Win Probability
6   Golden State Warriors         0.467932
14      San Antonio Spurs         0.418991
3     Cleveland Cavaliers         0.035234
12  Oklahoma City Thunder         0.025898
9    Los Angeles Clippers         0.020320
Predicted Champion: Golden State Warriors
True Champion: Cleveland Cavaliers

Predicting for year 2017.0...
Model output:
                     Team  Win Probability
4   Golden State Warriors         0.587353
12      San Antonio Spurs         0.199781
3     Cleveland Cavaliers         0.098318
5         Houston Rockets         0.037611
7    Los 

  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)


Model output:
                     Team  Win Probability
2   Golden State Warriors         0.391051
3         Houston Rockets         0.340879
13        Toronto Raptors         0.073458
10     Philadelphia 76ers         0.037238
12      San Antonio Spurs         0.031645
Predicted Champion: Golden State Warriors
True Champion: Golden State Warriors

Predicting for year 2019.0...
Model output:
                     Team  Win Probability
4   Golden State Warriors         0.528490
8         Milwaukee Bucks         0.210587
14        Toronto Raptors         0.075345
15              Utah Jazz         0.045494
5         Houston Rockets         0.025398
Predicted Champion: Golden State Warriors
True Champion: Toronto Raptors

Predicting for year 2020.0...
Model output:
                    Team  Win Probability
9        Milwaukee Bucks         0.512597
14       Toronto Raptors         0.228893
7     Los Angeles Lakers         0.154867
6   Los Angeles Clippers         0.028436
0         Boston C

  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)
  coefficients_df = coefficients_df.append(pd.Series(classifier.coef_.flatten(), index=coefficients_df.columns), ignore_index=True)


In [28]:
# Print average coefficients
averaged_coefficients = coefficients_df.mean(axis=0)
averaged_coefficients

Age            -0.164117
MOV             0.164117
SOS             0.012735
SRS            -0.012735
ORtg            0.514711
DRtg            0.074402
NRtg            0.024473
Pace            0.072692
FTr             0.098794
3PAr           -0.170815
TS%             0.328265
OeFG%           0.194734
OTOV%           0.414978
ORB%           -0.076401
OFT/FGA         0.012491
DeFG%           0.553980
DTOV%           0.066250
DRB%            0.459356
DFT/FGA        -0.885823
W/L%           -0.491697
won_last_N      0.267633
won_last_Y      0.192121
won_last_3_N   -0.309253
won_last_3_Y    0.644533
dtype: float64