<a href="https://colab.research.google.com/github/milesfking/NBA-Champion-Model/blob/main/Model%20v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.tree import DecisionTreeClassifier

## Data Cleaning

In [13]:
# Load in dataset of previous seasons, include only teams who made the playoffs
historic_dataset = pd.read_csv("https://raw.githubusercontent.com/milesfking/NBA-Champion-Model/main/data/nba_team_advanced_data.csv")
historic_dataset = historic_dataset[historic_dataset["Playoffs"] == "Y"]

# Load in 2023 dataset, include only teams who made the playoffs
current_dataset = pd.read_csv('https://raw.githubusercontent.com/milesfking/NBA-Champion-Model/main/data/2023_advanced_data.csv')
current_dataset = current_dataset[current_dataset["Playoffs"] == "Y"]

# Drop irrelevant columns
historic_dataset = historic_dataset.drop(columns=['Playoffs', 'Losing_season', 'Arena', 'L', 'W', 'PW', 'PL', 'Attend.'])
current_dataset = current_dataset.drop(columns=['Playoffs', 'Losing_season', 'Arena', 'L', 'W', 'PW', 'PL', 'Attend.'])

# Split into predictor and response variables
X_historic = historic_dataset.iloc[:, ~ historic_dataset.columns.isin(['Year', 'Champion', 'Team'])]
y_historic = historic_dataset.loc[:, historic_dataset.columns == 'Champion'].values
y_historic = np.array([0 if val == "N" else 1 for val in y_historic])

X_current = current_dataset.iloc[:, ~ current_dataset.columns.isin(['Year', 'Champion', 'Team'])]
y_current = current_dataset.loc[:, current_dataset.columns == 'Champion'].values
y_current = np.array([0 if val == "N" else 1 for val in y_current])

# Transform data
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), ['won_last', 'won_last_3'])], remainder='passthrough')
X_historic = np.array(ct.fit_transform(X_historic))
X_current = np.array(ct.transform(X_current))

# Scale data
sc = StandardScaler()
X_historic = sc.fit_transform(X_historic)
X_current = sc.transform(X_current)

## Logistic Regression

In [14]:
# Create logistic regression classifier
classifier = LogisticRegression(solver='lbfgs', random_state=0)
classifier.fit(X_historic, y_historic)

### Predict on Current Season

In [17]:
# Predict probabilities for test data
y_current_proba = classifier.predict_proba(X_current)

# Scale probabilities so they sum to 1
normalizing_const = sum([elem[1] for elem in y_current_proba])
y_current_proba_norm = y_current_proba / normalizing_const

# Store predicted probabilities in DataFrame with team and year
team_names = current_dataset['Team'].values
year = current_dataset['Year'].values
current_predictions = pd.DataFrame(data=y_current_proba_norm[:, 1], columns=['norm_pred'])
current_predictions.insert(loc=0, column='Team', value=team_names)
current_predictions.insert(loc=1, column='Year', value=year)

# Output DataFrame with team, year, and predicted probabilities
print("Model output:")
print(current_predictions.sort_values(by=['norm_pred'], ascending=False))

# Get the feature names after one-hot encoding
feature_names = ct.get_feature_names_out()

# Create a DataFrame to store coefficients and feature names
coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': classifier.coef_.flatten() / normalizing_const})

# Print the coefficients
print()
print("Model coefficients:")
print(coefficients_df)

Model output:
                      Team    Year  norm_pred
11         Milwaukee Bucks  2023.0   0.410194
1           Boston Celtics  2023.0   0.181999
6    Golden State Warriors  2023.0   0.121528
5           Denver Nuggets  2023.0   0.076651
4      Cleveland Cavaliers  2023.0   0.044012
9        Memphis Grizzlies  2023.0   0.040511
16      Philadelphia 76ers  2023.0   0.027361
17            Phoenix Suns  2023.0   0.021619
7     Los Angeles Clippers  2023.0   0.014242
2            Brooklyn Nets  2023.0   0.010615
14         New York Knicks  2023.0   0.007825
3            Chicago Bulls  2023.0   0.007798
18        Sacramento Kings  2023.0   0.007623
8       Los Angeles Lakers  2023.0   0.006692
13    New Orleans Pelicans  2023.0   0.006010
12  Minnesota Timberwolves  2023.0   0.005202
10              Miami Heat  2023.0   0.003365
19         Toronto Raptors  2023.0   0.003310
0            Atlanta Hawks  2023.0   0.002354
15   Oklahoma City Thunder  2023.0   0.001087

Model coefficients:

### Predict on Past Seasons

In [20]:
# Predict probabilities for test data
y_past_proba = classifier.predict_proba(X_historic)

# Store predicted probabilities in DataFrame with team and year
team_names = historic_dataset['Team'].values
year = historic_dataset['Year'].values
historic_predictions = pd.DataFrame(data=y_past_proba[:, 1], columns=['pred'])
historic_predictions.insert(loc=0, column='Team', value=team_names)
historic_predictions.insert(loc=1, column='Year', value=year)

# Output DataFrame with team, year, and predicted probabilities
print("Model output:")
print(historic_predictions)

historic_predictions.head()

Model output:
                     Team    Year      pred
0          Boston Celtics  1990.0  0.027894
1           Chicago Bulls  1990.0  0.011422
2     Cleveland Cavaliers  1990.0  0.001883
3        Dallas Mavericks  1990.0  0.004468
4          Denver Nuggets  1990.0  0.006137
..                    ...     ...       ...
523  New Orleans Pelicans  2022.0  0.000475
524    Philadelphia 76ers  2022.0  0.002753
525          Phoenix Suns  2022.0  0.233746
526       Toronto Raptors  2022.0  0.004773
527             Utah Jazz  2022.0  0.062737

[528 rows x 3 columns]


Unnamed: 0,Team,Year,pred
0,Boston Celtics,1990.0,0.027894
1,Chicago Bulls,1990.0,0.011422
2,Cleveland Cavaliers,1990.0,0.001883
3,Dallas Mavericks,1990.0,0.004468
4,Denver Nuggets,1990.0,0.006137


### Combine and Tidy the Data

In [21]:
# Get datasets for current and past seasons with predictions
current_merged_df = pd.merge(current_predictions, current_dataset, on=['Team', 'Year'])
historic_merged_df = pd.merge(historic_predictions, historic_dataset, on=['Team', 'Year'])

# Merge all years
full_df = pd.concat([historic_merged_df, current_merged_df]).sort_values(by=['Year', 'Team'], ascending=True)

Create a column for normalized predictions

In [22]:
# Group the data by year
grouped = full_df.groupby('Year')

# Normalize the championship probabilities within each year
full_df['norm_pred'] = grouped['pred'].apply(lambda x: x / x.sum())

# Print the updated DataFrame
full_df.head()

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  full_df['norm_pred'] = grouped['pred'].apply(lambda x: x / x.sum())


Unnamed: 0,Team,Year,pred,Age,MOV,SOS,SRS,ORtg,DRtg,NRtg,...,OFT/FGA,DeFG%,DTOV%,DRB%,DFT/FGA,W/L%,Champion,won_last,won_last_3,norm_pred
0,Boston Celtics,1990.0,0.027894,30.2,3.99,-0.76,3.23,112.0,107.9,4.1,...,0.251,0.477,10.8,72.1,0.223,0.634146,N,N,N,0.043689
1,Chicago Bulls,1990.0,0.011422,26.1,3.26,-0.51,2.74,112.3,109.0,3.3,...,0.235,0.508,15.2,68.1,0.262,0.670732,N,N,N,0.017889
2,Cleveland Cavaliers,1990.0,0.001883,26.1,-0.3,-0.31,-0.62,106.9,107.2,-0.3,...,0.235,0.49,14.0,67.7,0.202,0.512195,N,N,N,0.002949
3,Dallas Mavericks,1990.0,0.004468,29.2,0.07,0.35,0.42,107.2,107.2,0.0,...,0.254,0.481,13.4,67.9,0.232,0.573171,N,N,N,0.006998
4,Denver Nuggets,1990.0,0.006137,29.5,1.41,0.15,1.56,108.0,106.7,1.3,...,0.217,0.501,15.2,71.3,0.264,0.52439,N,N,N,0.009612


### Export the Model and Data

In [9]:
# Save the dataframe as a CSV
full_df.to_csv('20230627_NBA_championship_probs.csv')

In [24]:
import joblib

In [27]:
# Save the classifier object using joblib
joblib.dump(classifier, 'model.pkl')

['model.pkl']

In [28]:
# Save the scaler object using joblib
joblib.dump(sc, 'sc.pkl')

['sc.pkl']