In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime


In [2]:
#Loading previously cleansed data data:
shot_data = pd.read_csv('NBA_2023_Shots.csv')

#Head?
shot_data.head()
shot_data.columns

Index(['SEASON_1', 'SEASON_2', 'TEAM_ID', 'TEAM_NAME', 'PLAYER_ID',
       'PLAYER_NAME', 'POSITION_GROUP', 'POSITION', 'GAME_DATE', 'GAME_ID',
       'HOME_TEAM', 'AWAY_TEAM', 'EVENT_TYPE', 'SHOT_MADE', 'ACTION_TYPE',
       'SHOT_TYPE', 'BASIC_ZONE', 'ZONE_NAME', 'ZONE_ABB', 'ZONE_RANGE',
       'LOC_X', 'LOC_Y', 'SHOT_DISTANCE', 'QUARTER', 'MINS_LEFT', 'SECS_LEFT'],
      dtype='object')

In [3]:
shot_data['EVENT_TYPE'] = shot_data['EVENT_TYPE'].map({'Made Shot': 1, 'Missed Shot': 0})

In [4]:
shot_data.drop(columns=['SEASON_1', 'SEASON_2', 'TEAM_NAME', 'PLAYER_NAME', 'HOME_TEAM', 'AWAY_TEAM', 'TEAM_ID', 'GAME_DATE', 'SHOT_MADE', 'LOC_X', 'LOC_Y', 'GAME_ID', 'QUARTER', 'MINS_LEFT', 'SECS_LEFT', 'ZONE_ABB'], inplace=True)

shot_data.columns

Index(['PLAYER_ID', 'POSITION_GROUP', 'POSITION', 'EVENT_TYPE', 'ACTION_TYPE',
       'SHOT_TYPE', 'BASIC_ZONE', 'ZONE_NAME', 'ZONE_RANGE', 'SHOT_DISTANCE'],
      dtype='object')

In [5]:
#Dummy feature creation
def dumdum(df, features_to_exclude):
    dfo = df.select_dtypes(include=['object'])
    du = pd.get_dummies(df.drop(features_to_exclude,axis =1))
    dfn = df.select_dtypes(exclude=['object'])
    df = dfn.merge(du,left_index=True,right_index=True)
    return df  

shot_data = dumdum(shot_data, ['EVENT_TYPE'])

In [6]:
shot_data.head()

Unnamed: 0,PLAYER_ID_x,EVENT_TYPE,SHOT_DISTANCE_x,PLAYER_ID_y,SHOT_DISTANCE_y,POSITION_GROUP_C,POSITION_GROUP_F,POSITION_GROUP_G,POSITION_C,POSITION_PF,...,ZONE_NAME_Center,ZONE_NAME_Left Side,ZONE_NAME_Left Side Center,ZONE_NAME_Right Side,ZONE_NAME_Right Side Center,ZONE_RANGE_16-24 ft.,ZONE_RANGE_24+ ft.,ZONE_RANGE_8-16 ft.,ZONE_RANGE_Back Court Shot,ZONE_RANGE_Less Than 8 ft.
0,203078,1,24,203078,24,False,False,True,False,False,...,False,True,False,False,False,False,True,False,False,False
1,204001,0,26,204001,26,True,False,False,True,False,...,True,False,False,False,False,False,True,False,False,False
2,1628420,1,1,1628420,1,False,False,True,False,False,...,True,False,False,False,False,False,False,False,False,True
3,204001,1,2,204001,2,True,False,False,True,False,...,True,False,False,False,False,False,False,False,False,True
4,1630166,1,2,1630166,2,False,True,False,False,False,...,True,False,False,False,False,False,False,False,False,True


In [7]:
# Making a Scaler object
scaler = StandardScaler()
# Fitting data to the scaler object
scaled_df = scaler.fit_transform(shot_data)
scaled_df = pd.DataFrame(scaled_df)

In [8]:
#time to split them

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(shot_data.drop(columns='EVENT_TYPE'), 
                                                    shot_data.EVENT_TYPE, test_size=0.2, 
                                                    random_state=47)