# Section 1
## Importing The Data
Here we will import our data set and take a peak at the first few rows

In [10]:
import pandas as pd
 # The url may change once I change how the data is uploaded
url = 'games.csv'
df = pd.read_csv(url)
# Taking a peak at the first few rows of data
df.head() 

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022-12-22,22200477,Final,1610612740,1610612759,2022,1610612740,126.0,0.484,0.926,...,25.0,46.0,1610612759,117.0,0.478,0.815,0.321,23.0,44.0,1
1,2022-12-22,22200478,Final,1610612762,1610612764,2022,1610612762,120.0,0.488,0.952,...,16.0,40.0,1610612764,112.0,0.561,0.765,0.333,20.0,37.0,1
2,2022-12-21,22200466,Final,1610612739,1610612749,2022,1610612739,114.0,0.482,0.786,...,22.0,37.0,1610612749,106.0,0.47,0.682,0.433,20.0,46.0,1
3,2022-12-21,22200467,Final,1610612755,1610612765,2022,1610612755,113.0,0.441,0.909,...,27.0,49.0,1610612765,93.0,0.392,0.735,0.261,15.0,46.0,1
4,2022-12-21,22200468,Final,1610612737,1610612741,2022,1610612737,108.0,0.429,1.0,...,22.0,47.0,1610612741,110.0,0.5,0.773,0.292,20.0,47.0,0


In [11]:
df.value_counts('HOME_TEAM_WINS')

HOME_TEAM_WINS
1    15645
0    11006
dtype: int64

# Section 2
## Data Transformation
Here we will drop some unessasry rows and do some basic data transformations

In [12]:
# These rows either have no relavance to the problem, or would make things too easy for our model. We will simply remove them
drop_columns = ['GAME_ID', 'GAME_STATUS_TEXT', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'TEAM_ID_home', 'TEAM_ID_away', 'GAME_DATE_EST', 'PTS_home', 'PTS_away']
df = df.drop(drop_columns, axis=1)
# Take a peak at the dataset with those rows removed
df.head()

Unnamed: 0,SEASON,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022,0.484,0.926,0.382,25.0,46.0,0.478,0.815,0.321,23.0,44.0,1
1,2022,0.488,0.952,0.457,16.0,40.0,0.561,0.765,0.333,20.0,37.0,1
2,2022,0.482,0.786,0.313,22.0,37.0,0.47,0.682,0.433,20.0,46.0,1
3,2022,0.441,0.909,0.297,27.0,49.0,0.392,0.735,0.261,15.0,46.0,1
4,2022,0.429,1.0,0.378,22.0,47.0,0.5,0.773,0.292,20.0,47.0,0


In [13]:
# We have a couple of Na values. Since there are so few, we will just drop any row containing a na value
df.dropna(inplace=True)
# Making sure no na values remain
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26552 entries, 0 to 26650
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SEASON          26552 non-null  int64  
 1   FG_PCT_home     26552 non-null  float64
 2   FT_PCT_home     26552 non-null  float64
 3   FG3_PCT_home    26552 non-null  float64
 4   AST_home        26552 non-null  float64
 5   REB_home        26552 non-null  float64
 6   FG_PCT_away     26552 non-null  float64
 7   FT_PCT_away     26552 non-null  float64
 8   FG3_PCT_away    26552 non-null  float64
 9   AST_away        26552 non-null  float64
 10  REB_away        26552 non-null  float64
 11  HOME_TEAM_WINS  26552 non-null  int64  
dtypes: float64(10), int64(2)
memory usage: 2.6 MB


In [14]:
import sklearn.preprocessing as sklp

# We are going to scale some of our columns to imporve the accuracy of the model.
scaler = sklp.MinMaxScaler()

cols = [col for col in df.columns if col not in ['HOME_TEAM_WINS', 'SEASON']]

# Use a standard scaler and look at the first few rows
df[cols] = scaler.fit_transform(df[cols])
df.head()

Unnamed: 0,SEASON,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022,0.539171,0.913652,0.382,0.431818,0.54386,0.528217,0.784131,0.321,0.452381,0.403226,1
1,2022,0.548387,0.943991,0.457,0.227273,0.438596,0.715576,0.725788,0.333,0.380952,0.290323,1
2,2022,0.534562,0.750292,0.313,0.363636,0.385965,0.510158,0.628938,0.433,0.380952,0.435484,1
3,2022,0.440092,0.893816,0.297,0.477273,0.596491,0.334086,0.690782,0.261,0.261905,0.435484,1
4,2022,0.412442,1.0,0.378,0.363636,0.561404,0.577878,0.735123,0.292,0.380952,0.451613,0


In [15]:
from imblearn.over_sampling import RandomOverSampler

# assuming 'df' is your original dataframe
# separate the dataframe into features and target variable
X = df.drop(columns=['HOME_TEAM_WINS'])
y = df['HOME_TEAM_WINS']

# create a RandomOverSampler instance
ros = RandomOverSampler()

# fit and apply the oversampling to the dataset
X_resampled, y_resampled = ros.fit_resample(X, y)

# combine the resampled features and target variable into a new dataframe
df = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled)], axis=1)

df.value_counts('HOME_TEAM_WINS')

HOME_TEAM_WINS
0    15645
1    15645
dtype: int64

# Section 3
## Building a Random Forest Model
Here we will build a random forest model that will predict whether or not the home team won the game.

In [29]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Drop our target to create X and isolate the target in y
X = df.drop('HOME_TEAM_WINS', axis=1)
y = df['HOME_TEAM_WINS']

# Train test split using a 80/20 train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Define a range of hyperparameters to search over
param_grid = {
    'n_estimators': [500],
    'max_depth': [20],
    'min_samples_split': [5],
    'max_features': ['sqrt']
}

# Create a Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Create a grid search object and fit it to the data
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best set of hyperparameters and the corresponding cross-validation score
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best hyperparameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 1000}
Best cross-validation score: 0.8703661218680583


In [42]:
# Create and fit our random forest classifier with some hyperparameters included
clf = RandomForestClassifier(n_estimators=500, max_depth=25, random_state=42) 
clf = clf.fit(X_train, y_train)

In [43]:
from sklearn import metrics, datasets, tree

# Check some metrics on how our model performed
y_pred = clf.predict(X_test)
print('Model Accuracy')
print(clf.score(X_test, y_test))
print('\nModel Balanced Accuracy')
print(metrics.balanced_accuracy_score(y_test, y_pred))
print('\nConfusion Matrix')
print(metrics.confusion_matrix(y_test, y_pred))
print('\nClassification Report')
print(metrics.classification_report(y_test, y_pred))

Model Accuracy
0.8759987216363055

Model Balanced Accuracy
0.8760485720095965

Confusion Matrix
[[2795  326]
 [ 450 2687]]

Classification Report
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      3121
           1       0.89      0.86      0.87      3137

    accuracy                           0.88      6258
   macro avg       0.88      0.88      0.88      6258
weighted avg       0.88      0.88      0.88      6258



## Build A AdaBoost Model

In [18]:
from sklearn.ensemble import AdaBoostClassifier

# Create and fit our random forest classifier with some hyperparameters included
clf = AdaBoostClassifier(n_estimators = 500, random_state=42) 
clf = clf.fit(X_train, y_train)

In [19]:
from sklearn import metrics, datasets, tree

# Check some metrics on how our model performed
y_pred = clf.predict(X_test)
print('Model Accuracy')
print(clf.score(X_test, y_test))
print('\nModel Balanced Accuracy')
print(metrics.balanced_accuracy_score(y_test, y_pred))
print('\nConfusion Matrix')
print(metrics.confusion_matrix(y_test, y_pred))
print('\nClassification Report')
print(metrics.classification_report(y_test, y_pred))

Model Accuracy
0.8363694471077021

Model Balanced Accuracy
0.8363651090226858

Confusion Matrix
[[2605  516]
 [ 508 2629]]

Classification Report
              precision    recall  f1-score   support

           0       0.84      0.83      0.84      3121
           1       0.84      0.84      0.84      3137

    accuracy                           0.84      6258
   macro avg       0.84      0.84      0.84      6258
weighted avg       0.84      0.84      0.84      6258

