<a href="https://colab.research.google.com/github/milesfking/NBA-Champion-Model/blob/main/New%20Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Decision Tree Classification

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing and viewing the dataset

In [None]:
# Load the training table from a CSV expected in the working directory.
dataset = pd.read_csv('nba_team_advanced_data.csv')
# Show the first rows as a quick sanity check of the parsed columns.
dataset.head()

Unnamed: 0,Year,Team,Age,W,L,PW,PL,MOV,SOS,SRS,...,DRB%,DFT/FGA,Arena,Attend.,Playoffs,W/L%,Losing_season,Champion,won_last,won_last_3
0,1990.0,Atlanta Hawks,28.6,41.0,41.0,44.0,38.0,1.02,-0.39,0.64,...,65.5,0.254,Omni Coliseum,573711.0,N,0.5,N,N,N,N
1,1990.0,Boston Celtics,30.2,52.0,30.0,51.0,31.0,3.99,-0.76,3.23,...,72.1,0.223,Boston Garden,611537.0,Y,0.634146,N,N,N,N
2,1990.0,Charlotte Hornets,25.6,19.0,63.0,21.0,61.0,-7.82,0.81,-7.0,...,67.5,0.269,Charlotte Coliseum,979941.0,N,0.231707,Y,N,N,N
3,1990.0,Chicago Bulls,26.1,55.0,27.0,50.0,32.0,3.26,-0.51,2.74,...,68.1,0.262,Chicago Stadium,752564.0,Y,0.670732,N,N,N,N
4,1990.0,Cleveland Cavaliers,26.1,42.0,40.0,40.0,42.0,-0.3,-0.31,-0.62,...,67.7,0.202,Coliseum at Richfield,695710.0,Y,0.512195,N,N,N,N


In [None]:
# Keep only the rows whose "Playoffs" flag is "Y".
dataset = dataset.loc[dataset["Playoffs"].eq("Y")]

In [None]:
# Load the 2022 table (same column layout as the training CSV, per the preview)
# that the trained model will be applied to.
test_dataset = pd.read_csv('2022_advanced_data.csv')
# Show the first rows as a quick sanity check of the parsed columns.
test_dataset.head()

Unnamed: 0,Year,Team,Age,W,L,PW,PL,MOV,SOS,SRS,...,DRB%,DFT/FGA,Arena,Attend.,Playoffs,W/L%,Losing_season,Champion,won_last,won_last_3
0,2022.0,Atlanta Hawks,26.1,43.0,39.0,45.0,37.0,1.56,-0.01,1.55,...,76.9,0.177,State Farm Arena,672742.0,Y,0.52439,N,N,N,N
1,2022.0,Boston Celtics,26.1,51.0,31.0,59.0,23.0,7.28,-0.26,7.02,...,77.3,0.183,TD Garden,727928.0,Y,0.621951,N,N,N,N
2,2022.0,Brooklyn Nets,29.1,44.0,38.0,43.0,39.0,0.78,0.04,0.82,...,75.1,0.201,Barclays Center,711539.0,Y,0.536585,N,N,N,N
3,2022.0,Charlotte Hornets,25.5,43.0,39.0,42.0,40.0,0.44,0.09,0.53,...,74.8,0.187,Spectrum Center,700755.0,N,0.52439,N,N,N,N
4,2022.0,Chicago Bulls,26.3,46.0,36.0,40.0,42.0,-0.39,0.02,-0.38,...,78.3,0.199,United Center,856148.0,Y,0.560976,N,N,N,N


In [None]:
# Keep only the rows whose "Playoffs" flag is "Y", mirroring the training filter.
test_dataset = test_dataset.loc[test_dataset["Playoffs"].eq("Y")]

In [None]:
# Inspect the available column names before choosing which to drop or use as features.
dataset.columns

Index(['Year', 'Team', 'Age', 'W', 'L', 'PW', 'PL', 'MOV', 'SOS', 'SRS',
       'ORtg', 'DRtg', 'NRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'OeFG%', 'OTOV%',
       'ORB%', 'OFT/FGA', 'DeFG%', 'DTOV%', 'DRB%', 'DFT/FGA', 'Arena',
       'Attend.', 'Playoffs', 'W/L%', 'Losing_season', 'Champion', 'won_last',
       'won_last_3'],
      dtype='object')

In [None]:
# Columns excluded from modelling. The same list is applied to both frames so
# the training and 2022 tables keep an identical schema.
columns_to_drop = ['Playoffs', 'Losing_season', 'Arena', 'L', 'W', 'PW', 'PL', 'Attend.']
dataset = dataset.drop(columns=columns_to_drop)
test_dataset = test_dataset.drop(columns=columns_to_drop)

In [None]:
# Feature frame: every column except the identifiers (Year, Team) and the label.
non_feature_columns = ['Year', 'Champion', 'Team']
X = dataset.loc[:, ~dataset.columns.isin(non_feature_columns)]
# Binary label as a 1-D integer array: 0 where Champion is "N", 1 for anything else.
y = np.where(dataset['Champion'].to_numpy() == "N", 0, 1)

In [None]:
# Feature frame for the 2022 data: drop the identifiers (Year, Team) and the label.
non_feature_columns = ['Year', 'Champion', 'Team']
X_test = test_dataset.loc[:, ~test_dataset.columns.isin(non_feature_columns)]
# Binary label as a 1-D integer array: 0 where Champion is "N", 1 for anything else.
y_test = np.where(test_dataset['Champion'].to_numpy() == "N", 0, 1)

## Identifying Class Imbalance in Training Set

In [None]:
# Count rows per label. Unpacking into two names requires that y contains
# exactly the labels 0 and 1.
neg, pos = np.bincount(y)
total = neg + pos
positive_share = 100 * pos / total
summary_template = 'Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'
print(summary_template.format(total, pos, positive_share))

Examples:
    Total: 512
    Positive: 32 (6.25% of total)



## Encode Categorical Values

In [None]:
# Inspect column dtypes to find the non-numeric (object) columns that need encoding.
dataset.dtypes

Year          float64
Team           object
Age           float64
MOV           float64
SOS           float64
SRS           float64
ORtg          float64
DRtg          float64
NRtg          float64
Pace          float64
FTr           float64
3PAr          float64
TS%           float64
OeFG%         float64
OTOV%         float64
ORB%          float64
OFT/FGA       float64
DeFG%         float64
DTOV%         float64
DRB%          float64
DFT/FGA       float64
W/L%          float64
Champion       object
won_last       object
won_last_3     object
dtype: object

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two object-typed flag columns; every other column is
# passed through unchanged.
categorical_columns = ['won_last', 'won_last_3']
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_columns)],
    remainder='passthrough',
)
# Fit the encoder on the training features only, then reuse the fitted
# transformer for the 2022 features so both matrices share the same columns.
X = np.array(ct.fit_transform(X))
X_test = np.array(ct.transform(X_test))

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and standard deviation from the training matrix
# only, then apply that same transform to both matrices.
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)
X_test = sc.transform(X_test)

## Training the Decision Tree Classification model on the Training set

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline tree with default hyperparameters. The seed is fixed because the
# tree builder permutes features when choosing splits; without it, this model
# (and the grid search that clones it as its estimator) can change from run to
# run. The value 0 matches the seed used for the cross-validated tree below.
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X, y)

DecisionTreeClassifier()

## Optimizing Hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space for the decision tree.
# 'auto' is deliberately not listed under max_features: for a classification
# tree it was only an alias of 'sqrt' (so it duplicated work), and it has been
# deprecated and then removed in newer scikit-learn releases.
param_grid = {'ccp_alpha': [0.1, 0.01, 0.001],
              'criterion': ['gini', 'entropy'],
              'max_depth': [5, 6, 7, 8, 9],
              'max_features': ['sqrt', 'log2']}

# Each combination is scored by F1 under 10-fold cross-validation (cv=10).
# n_jobs=-1 runs the fits on all available processors.
grid_search = GridSearchCV(estimator=classifier,
                           param_grid=param_grid,
                           scoring='f1',
                           cv=10,
                           n_jobs=-1)

grid_search.fit(X, y)

# best_score_ is the mean cross-validated F1 of the winning combination
# (the metric is F1 because scoring='f1'), so it is named accordingly.
best_f1 = grid_search.best_score_
best_accuracy = best_f1  # previous (misleading) name, kept as an alias for compatibility
best_parameters = grid_search.best_params_
print("Best F1: {:.2f} %".format(best_f1*100))
print("Best Parameters:", best_parameters)

Best F1: 27.95 %
Best Parameters: {'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': 9, 'max_features': 'log2'}


## Performing Cross Validation on the Tree

In [None]:
from sklearn.model_selection import cross_validate

# Metrics reported per fold; keys become the 'test_<name>' rows of the result.
scoring = {'accuracy': 'accuracy',
           'precision': 'precision',
           'recall': 'recall',
           'f1': 'f1'}

# Build the tuned tree from the grid-search winner instead of hand-copied
# values, so this cell cannot drift out of sync with the search above (the
# previously hard-coded settings did not match the printed best parameters and
# used max_features='auto', which newer scikit-learn releases reject).
# best_params_ only contains the searched keys, so random_state is set here.
optimized_classifier = DecisionTreeClassifier(**grid_search.best_params_,
                                              random_state=0)

# 10-fold cross-validation of the tuned tree on the training data. The result
# is a dict of per-fold arrays (timings plus one 'test_*' entry per metric).
# Folds in which the tree predicts no positives yield precision/F1 of 0 and an
# "ill-defined" metric warning; with so few positive rows this is expected.
accuracies = cross_validate(estimator=optimized_classifier, X=X, y=y, cv=10, scoring=scoring)

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Adapted from https://stackoverflow.com/questions/51315083/convert-python-dict-of-arrays-into-a-dataframe
# One table row per entry of the cross_validate result: the key first,
# followed by its per-fold values.
fold_rows = [[metric_name, *fold_values] for metric_name, fold_values in accuracies.items()]
pd.DataFrame(fold_rows)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,fit_time,0.002222,0.001696,0.001648,0.001691,0.00163,0.001584,0.001644,0.00152,0.00156,0.00158
1,score_time,0.002562,0.0023,0.002262,0.002256,0.00222,0.002488,0.004577,0.002354,0.002272,0.002302
2,test_accuracy,0.923077,0.961538,0.980392,0.921569,0.921569,0.921569,0.941176,0.921569,0.901961,0.921569
3,test_precision,0.5,1.0,1.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0
4,test_recall,0.5,0.5,0.666667,0.0,0.0,0.333333,0.0,0.0,0.0,0.0
5,test_f1,0.5,0.666667,0.8,0.0,0.0,0.333333,0.0,0.0,0.0,0.0


## Predicting the Test set results

In [None]:
# Predict with the tuned tree: GridSearchCV refits the best parameter
# combination on the full training set (refit=True is its default) and exposes
# it as best_estimator_. The baseline `classifier` was fitted with default
# hyperparameters, so predicting with it would ignore the search entirely.
y_pred = grid_search.best_estimator_.predict(X_test)

# Assemble the result table column by column so Year and Prediction keep their
# numeric dtypes (stacking mixed columns into one array makes them all object).
# .to_numpy() drops the filtered frame's index so rows are numbered from 0.
test_df = pd.DataFrame({
    'Year': test_dataset['Year'].to_numpy(),
    'Team': test_dataset['Team'].to_numpy(),
    'Prediction': y_pred,
})

In [None]:
# Display the per-team predictions (1 = predicted champion, 0 = not).
test_df

Unnamed: 0,Year,Team,Prediction
0,2022.0,Atlanta Hawks,0
1,2022.0,Boston Celtics,0
2,2022.0,Brooklyn Nets,0
3,2022.0,Chicago Bulls,0
4,2022.0,Dallas Mavericks,0
5,2022.0,Denver Nuggets,0
6,2022.0,Golden State Warriors,0
7,2022.0,Memphis Grizzlies,0
8,2022.0,Miami Heat,0
9,2022.0,Milwaukee Bucks,0


The model predicts the Phoenix Suns to win the 2022 championship!