<a href="https://colab.research.google.com/github/milesfking/NBA-Champion-Model/blob/main/New%20Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

## Data Cleaning

In [2]:
# Load in dataset of previous seasons, include only teams who made the playoffs
dataset = pd.read_csv('nba_team_advanced_data.csv')
dataset = dataset[dataset["Playoffs"] == "Y"]

# Load in dataset of previous seasons, include only teams who made the playoffs
test_dataset = pd.read_csv('2023_advanced_data.csv')
test_dataset = test_dataset[test_dataset["Playoffs"] == "Y"]

# Drop irrelevant columns
dataset = dataset.drop(columns=['Playoffs', 'Losing_season', 'Arena', 'L', 'W', 'PW', 'PL', 'Attend.'])
test_dataset = test_dataset.drop(columns=['Playoffs', 'Losing_season', 'Arena', 'L', 'W', 'PW', 'PL', 'Attend.'])

# Split into predictor and response variables
X = dataset.iloc[:, ~ dataset.columns.isin(['Year', 'Champion', 'Team'])]
y = dataset.loc[:, dataset.columns == 'Champion'].values
y = np.array([0 if val == "N" else 1 for val in y])

X_test= test_dataset.iloc[:, ~ test_dataset.columns.isin(['Year', 'Champion', 'Team'])]
y_test = test_dataset.loc[:, test_dataset.columns == 'Champion'].values
y_test = np.array([0 if val == "N" else 1 for val in y_test])

# Transform data
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), ['won_last', 'won_last_3'])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
X_test = np.array(ct.transform(X_test))

# Scale data
sc = StandardScaler()
X = sc.fit_transform(X)
X_test = sc.transform(X_test)

## Decision Tree

In [12]:
# Create decision tree classifier
classifier = DecisionTreeClassifier()
classifier.fit(X, y)

param_grid= {'ccp_alpha': [0.1, 0.01, 0.001],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 6, 7, 8, 9],
                         'max_features': ['auto', 'sqrt', 'log2']}

#Evaluate through K Fold (cv=10)
#N_jobs optional (how to run processors), -1 means to use all processors- optimizes
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = param_grid,
                           scoring = 'f1',
                           cv = 10,
                           n_jobs = -1)

grid_search.fit(X, y)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best F1: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

scoring = {'accuracy': 'accuracy',
           'precision': 'precision',
           'recall': 'recall',
           'f1': 'f1'}

optimized_classifier = DecisionTreeClassifier(criterion = 'gini',
                                              max_depth = 7,
                                              max_features = 'sqrt',
                                              ccp_alpha = 0.001,
                                              random_state = 0)

accuracies = cross_validate(estimator = optimized_classifier, X = X, y = y, cv = 10, scoring = scoring)

#https://stackoverflow.com/questions/51315083/convert-python-dict-of-arrays-into-a-dataframe
pd.DataFrame([k, *v] for k, v in accuracies.items())

y_pred = classifier.predict(X_test)
test_arrays = np.column_stack([test_dataset["Year"], test_dataset["Team"], y_pred.flatten()])
test_df = pd.DataFrame(test_arrays, columns = ['Year', 'Team', 'Prediction'])

# Return test dataframe
test_df

Best F1: 33.06 %
Best Parameters: {'ccp_alpha': 0.001, 'criterion': 'gini', 'max_depth': 8, 'max_features': 'sqrt'}


Unnamed: 0,Year,Team,Prediction
0,2023.0,Atlanta Hawks,0
1,2023.0,Boston Celtics,0
2,2023.0,Brooklyn Nets,0
3,2023.0,Chicago Bulls,0
4,2023.0,Cleveland Cavaliers,1
5,2023.0,Denver Nuggets,0
6,2023.0,Golden State Warriors,0
7,2023.0,Los Angeles Clippers,0
8,2023.0,Los Angeles Lakers,0
9,2023.0,Memphis Grizzlies,0


## Logistic Regression

In [6]:
# Create logistic regression classifier
classifier = LogisticRegression(solver='lbfgs', random_state=0)
classifier.fit(X, y)

# Predict probabilities for test data
y_proba = classifier.predict_proba(X_test)

# Store predicted probabilities in DataFrame with team and year
team_names = test_dataset['Team'].values
year = test_dataset['Year'].values
predictions = pd.DataFrame(data=y_proba, columns=['Lose Probability', 'Win Probability'])
predictions.insert(loc=0, column='Team', value=team_names)
predictions.insert(loc=1, column='Year', value=year)

# Output DataFrame with team, year, and predicted probabilities
print(predictions.sort_values(by=['Win Probability'], ascending=False)[['Team', 'Win Probability']])

                      Team  Win Probability
11         Milwaukee Bucks         0.130980
1           Boston Celtics         0.058115
6    Golden State Warriors         0.038805
5           Denver Nuggets         0.024476
4      Cleveland Cavaliers         0.014054
9        Memphis Grizzlies         0.012936
16      Philadelphia 76ers         0.008737
17            Phoenix Suns         0.006903
7     Los Angeles Clippers         0.004548
2            Brooklyn Nets         0.003390
14         New York Knicks         0.002499
3            Chicago Bulls         0.002490
18        Sacramento Kings         0.002434
8       Los Angeles Lakers         0.002137
13    New Orleans Pelicans         0.001919
12  Minnesota Timberwolves         0.001661
10              Miami Heat         0.001074
19         Toronto Raptors         0.001057
0            Atlanta Hawks         0.000752
15   Oklahoma City Thunder         0.000347


## Linear Regression and Softmax

In [11]:
# Create linear regression model
linear_regression = LinearRegression()
linear_regression.fit(X, y)

# Predict probabilities for test data
y_pred = linear_regression.predict(X_test)
y_prob = np.apply_along_axis(lambda x: np.exp(x) / np.sum(np.exp(x)), 0, y_pred)

# Store predicted probabilities in DataFrame with team and year
team_names = test_dataset['Team'].values
predictions = pd.DataFrame(data=y_prob, columns=['Win Probability'])

predictions.insert(loc=0, column='Team', value=team_names)

# Output DataFrame with team, year, and predicted probabilities
print(predictions.sort_values(by=['Win Probability'], ascending=False))

                      Team  Win Probability
6    Golden State Warriors         0.064749
11         Milwaukee Bucks         0.056696
5           Denver Nuggets         0.054738
9        Memphis Grizzlies         0.054631
1           Boston Celtics         0.052027
17            Phoenix Suns         0.051623
4      Cleveland Cavaliers         0.051021
2            Brooklyn Nets         0.049259
8       Los Angeles Lakers         0.049067
3            Chicago Bulls         0.048306
14         New York Knicks         0.048117
16      Philadelphia 76ers         0.048117
7     Los Angeles Clippers         0.047930
0            Atlanta Hawks         0.047743
19         Toronto Raptors         0.047557
12  Minnesota Timberwolves         0.047371
18        Sacramento Kings         0.046274
13    New Orleans Pelicans         0.046094
15   Oklahoma City Thunder         0.045379
10              Miami Heat         0.043301
