## Predicting NBA Games and Gathering Data with Machine Learning and Python

In [None]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

#### Read In NBA Data

In [None]:
# load the game data, order it chronologically by date, and renumber the rows
nba_df = (
    pd.read_csv("nba_games.csv", index_col=0)
    .sort_values("date")
    .reset_index(drop=True)
)

#### Delete Extraneous Columns

In [None]:
# remove the columns that are not needed for the rest of the analysis
nba_df.drop(columns=["mp.1", "mp_opp.1", "index_opp"], inplace=True)

#### Create a Function which will Add a Target Column

In [None]:
def target_add(team):
    """
    Add a 'target' column holding the result of the team's next game.

    The input frame is left untouched: functions passed to ``groupby(...).apply``
    should not mutate the group they receive, so a new frame is returned instead.

    :param team: dataframe of games for the current team, in chronological order
    :return: copy of ``team`` with a 'target' column; the last row's target is NaN
             because there is no following game
    """

    # shift 'won' up by one row so each game carries the outcome of the next one
    return team.assign(target=team['won'].shift(-1))

# build the target team by team so a label always comes from the same team's next game
games_by_team = nba_df.groupby("team", group_keys=False)
nba_df = games_by_team.apply(target_add)

#### Begin Fixing Dataframe for Training Models

In [None]:
# rows without a following game have a NaN target; mark them with the placeholder 2.
# A single .loc assignment is used because chained indexing (df[col][mask] = ...)
# writes to a temporary object and is a no-op under pandas Copy-on-Write.
nba_df.loc[nba_df['target'].isnull(), 'target'] = 2

# turn False values into 0s and True values into 1s for target column
nba_df['target'] = nba_df['target'].astype(int, errors="ignore")

#### Get Rid of NaN Values

In [None]:
# count the missing values in every column
null = nba_df.isna()
nulls = null.sum()

# keep only the counts of columns that actually contain missing values
nulls = nulls[nulls > 0]

# columns that never appear in the list above are safe to keep
has_no_nulls = ~nba_df.columns.isin(nulls.index)
valid_columns = nba_df.columns[has_no_nulls]

# work on an independent copy restricted to the complete columns
nba_df = nba_df[valid_columns].copy()

#### Begin Machine Learning

In [None]:
# ridge classifier used as the estimator for feature selection and the backtest
r = RidgeClassifier(alpha=1)

# time-ordered cross-validation with three splits
split = TimeSeriesSplit(n_splits=3)

# forward selection that keeps 30 features, scored with the splitter above
sfs = SequentialFeatureSelector(
    r,
    cv=split,
    direction="forward",
    n_features_to_select=30,
)

#### Define Columns to Remove and Scale the Remaining Columns

In [None]:
# identifier and label columns that must not be scaled
remove_columns = ["season", "date", "won", "target", "team", "team_opp"]

# every other column is a candidate feature
is_feature = ~nba_df.columns.isin(remove_columns)
preferred_columns = nba_df.columns[is_feature]

# rescale each candidate feature with a min-max scaler fitted on the same data
scaler = MinMaxScaler()
scaler.fit(nba_df[preferred_columns])
nba_df[preferred_columns] = scaler.transform(nba_df[preferred_columns])

#### Fit the Selector

In [None]:
# run the feature selector to get the 30 best features from the dataframe.
# Rows whose target is the placeholder 2 have no next game to predict, so they are
# left out; otherwise the selector would treat the placeholder as a real third class.
has_next_game = nba_df['target'] != 2
sfs.fit(nba_df.loc[has_next_game, preferred_columns], nba_df.loc[has_next_game, 'target'])

#### Get Back Predictor Columns that We should be Using

In [None]:
# names of the columns the selector kept, as a plain list
best_predictors = preferred_columns[sfs.get_support()].tolist()

#### Create Backtest Function

In [None]:
def b_test(data, model, predictors, start=2, step=1):
    """
    Backtest a model season by season.

    For each season from position ``start`` onward (in sorted order, every ``step``-th
    season), the model is fitted on all rows from earlier seasons and used to predict
    the rows of that season.

    :param data: dataframe with a 'season' column, a 'target' column and the predictors
    :param model: estimator exposing ``fit(X, y)`` and ``predict(X)``
    :param predictors: list of column names used as features
    :param start: index (into the sorted seasons) of the first season to predict
    :param step: number of seasons to advance between backtest rounds
    :return: dataframe indexed like the predicted rows with columns 'actual' and
             'predicted'; empty (with those two columns) when there are not more
             than ``start`` seasons to work with
    """

    # one dataframe of predictions per tested season
    predictions = []

    # seasons in chronological order
    seasons = sorted(data['season'].unique())

    for i in range(start, len(seasons), step):
        season = seasons[i]

        # train on everything strictly before the season being predicted
        train = data[data['season'] < season]
        test = data[data['season'] == season]

        model.fit(train[predictors], train['target'])

        # wrap the raw predictions so they align with the test rows by index
        pred = pd.Series(model.predict(test[predictors]), index=test.index)

        combined = pd.concat([test['target'], pred], axis=1)
        combined.columns = ['actual', 'predicted']
        predictions.append(combined)

    # pd.concat raises on an empty list, so return an empty result explicitly
    if not predictions:
        return pd.DataFrame(columns=['actual', 'predicted'])

    return pd.concat(predictions)


# rows with the placeholder target 2 have no next game; drop them before the backtest
# so the model is neither trained on nor evaluated against the placeholder class
games_with_next = nba_df[nba_df['target'] != 2]

# call function
nba_predictions = b_test(games_with_next, r, best_predictors)

#### Get Accuracy Score for Predictions

In [None]:
# fraction of backtested games whose predicted outcome matches the actual one
accuracy_score(y_true=nba_predictions['actual'], y_pred=nba_predictions['predicted'])

An alright score, not too bad. 

#### Use NBA Team Home Advantage Knowledge to Improve Model

In [None]:
# share of games won within each value of the home column
nba_df.groupby("home").apply(lambda games: (games['won'] == 1).mean())

There is a modest advantage to playing at home, as seen by the 57% winning percentage of home teams.

#### Creating a Dataframe for Rolling Averages

In [None]:
# feature columns plus the result, team and season columns used for the rolling averages
nba_df_rolling = nba_df[[*preferred_columns, "won", "team", "season"]]

#### Write a Function that Gets Rolling Team Averages

In [None]:
def team_averages(team):
    """
    Compute 10-game rolling means over one team's games.

    Only numeric and boolean columns are averaged. Text columns (for example the
    team name) are dropped first, because pandas 2.0+ raises on them instead of
    silently skipping them as older versions did.

    :param team: dataframe of games for the current team
    :return: dataframe of rolling means with the same index as ``team``; rows that
             do not yet have 10 games in the window are NaN
    """

    # restrict to columns a mean can be computed for
    numeric = team.select_dtypes(include=["number", "bool"])
    return numeric.rolling(10).mean()

# compute the rolling averages separately for every (team, season) pair
team_seasons = nba_df_rolling.groupby(["team", "season"], group_keys=False)
nba_df_rolling = team_seasons.apply(team_averages)

#### Remove NaN Values from the Rolling Dataframe

In [None]:
# tag every rolling column with a "_10" suffix and remember the new names
nba_df_rolling = nba_df_rolling.add_suffix("_10")
rolling_cols = list(nba_df_rolling.columns)

# put the rolling columns next to the originals, then discard rows with missing values
nba_df = pd.concat([nba_df, nba_df_rolling], axis=1).dropna()

#### Add Opponent Information

#### Create Shifting Column Function and Adding Column Function

In [None]:
def shift_col(team, col_name):
    """
    :param team: dataframe of games for one team
    :param col_name: name of the column to shift
    :return: that column moved up by one row, so each row holds the value from the
             following row (the last row becomes NaN)
    """

    return team[col_name].shift(periods=-1)


def add_col(dataframe, col_name):
    """
    :param dataframe: dataframe containing a 'team' column and ``col_name``
    :param col_name: name of the column to look ahead in
    :return: the result of applying shift_col to each team's rows for ``col_name``
    """

    def _next_value(team_games):
        # look one row ahead within a single team's games
        return shift_col(team_games, col_name)

    games_by_team = dataframe.groupby("team", group_keys=False)
    return games_by_team.apply(_next_value)

# for every row, record the home flag, opponent and date of the same team's next row
next_game_columns = {
    "home_next": "home",
    "team_opp_next": "team_opp",
    "date_next": "date",
}
for new_col, source_col in next_game_columns.items():
    nba_df[new_col] = add_col(nba_df, source_col)

# make a copy for good measure
nba_df = nba_df.copy()

#### Get Information for the Next Opponent's Previous 10 Games

In [None]:
# rolling columns plus the keys needed to line each row up with its counterpart
opponent_rolling = nba_df[rolling_cols + ["team_opp_next", "date_next", "team"]]

# pair every row with the rows whose next opponent is this row's team on the same next date
full = nba_df.merge(
    opponent_rolling,
    left_on=["team", "date_next"],
    right_on=["team_opp_next", "date_next"],
)

#### Find Out Our New Preferred Columns

In [None]:
# object-typed columns cannot be used as features; exclude them with the earlier list
text_columns = list(full.columns[full.dtypes == "object"])
removed_columns = text_columns + remove_columns

# everything that is left is a candidate feature
is_candidate = ~full.columns.isin(removed_columns)
preferred_columns = full.columns[is_candidate]

# rerun the feature selection on the merged dataframe
sfs.fit(full[preferred_columns], full['target'])

#### Get Final Predictions

In [None]:
# columns kept by the refitted selector
best_predictors = preferred_columns[sfs.get_support()].tolist()

# backtest on the merged dataframe using those columns
nba_predictions = b_test(full, r, best_predictors)

# compare the predictions with the actual outcomes and report the accuracy
score = accuracy_score(
    y_true=nba_predictions['actual'],
    y_pred=nba_predictions['predicted'],
)
print(score)