In [133]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import datetime
from sklearn.model_selection import train_test_split
from feature_engineering import feature_engineering
# from category_encoders import TargetEncoder

In [202]:
df = pd.read_csv('../data/data2.csv')


In [203]:
df.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,period,playoffs,...,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id,time_remaining
0,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,1,0,...,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,away,POR,2,622
1,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,1,0,...,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,away,POR,3,465
2,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,1,0,...,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,away,POR,4,412
3,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,2,0,...,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,away,POR,5,379
4,Jump Shot,Jump Shot,244,20000012,34.0553,-145,-11,-118.4148,3,0,...,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,away,POR,6,572


***
## Removing Redundant Columns
There are some columns that have very high corelations with some other columns and some columns that

Removing columns:
- team_name, team_id - These have only a single value, so it doesn't add any value to the prediction
- matchup - It has 1 to 1 corelation with the column "opponent"

In [193]:
def remove_columns(df):
    columns_to_drop = ['game_event_id', 'lat', 'lon', 'team_name', 'team_id', 'game_id', 'shot_id']
    df = df.drop(labels=columns_to_drop, axis='columns')
    print(f"Total columns removed: {len(columns_to_drop)}")
    return df

# Preparing for modeling
From the analyzed and preprocessed data, we continue the pre-processing in order to prepare it for the modeling phase

In [137]:
def target_encoding(df, col_to_encode, target_col):
    """
    Performs target-encoding on the specified column. That is to convert categorical column
    to a numerical column.

    Args:
        df (DataFrame):
        col_to_encode (str): Column to be converted to numerical one.
        target_col (str): Target column of the data.

    Returns:
        DataFrame: DataFrame containing the new <col_to_encode>_te column.
    """

    encoding = df.groupby(col_to_encode)[target_col].mean().reset_index()
    encoding = encoding.rename(columns={target_col: f"{col_to_encode}_te"})
    df = df.merge(right=encoding, how='left', on=col_to_encode)
    df.drop(col_to_encode, axis=1, inplace=True)

    return df

In [138]:
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return res

In [139]:
def one_hot_encoding(df):
    df_new = df.copy(deep=True)
    features_to_encode = ['combined_shot_type', 'season', 'shot_type', 'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'opponent', 'action_type', 'matchup']
    for feature in features_to_encode:
        df_new = encode_and_bind(df_new, feature)

    return df_new

# Splitting the dataset

In [140]:
def split_x_y(df, target):
    y = df.loc[:, [target]]
    x = df.drop([target], axis=1)
    return x, y

## Random

In [204]:
def split_random(df, train=None, test=None, validation=None):
    # 80-20
    train_data, rest_data = train_test_split(df, train_size=0.8, shuffle=False)
    # test 10 val 10
    validation_data, test_data = train_test_split(rest_data, test_size=0.5, shuffle=False)

    print(f"Size of training dataset {len(train_data)}" )
    print(f"Size of validation dataset {len(validation_data)}" )
    print(f"Size of test dataset {len(test_data)}" )

    # print(f"Size of train vs test ratio {len(df_train) / (len(df_train) + len(df_test)) }" )
    return train_data, validation_data, test_data

## Regular (train), playoff (test)
### Do we need train, test, validation here?

In [205]:
def split_regular_playoff(df, season, train=None, test=None, validation=None, n_way_split=False):

    # if(train + test + validation != 1):
    #     print("Error dataset split")
    #     return None

    df_validaiton = None

    df_season = df[df['season'] == season]
    isPlayoffs = df_season.playoffs == 1

    df_train = df_season[~isPlayoffs]
    df_test  = df_season[isPlayoffs]

    print(f"Size of training dataset {len(df_train)}" )
    print(f"Size of test dataset {len(df_test)}" )
    print(f"Size of train vs test ratio {len(df_train) / (len(df_train) + len(df_test)) }" )

    if (n_way_split == True):
        train = split_x_y(df_train, 'shot_made_flag')
        test = split_x_y(df_test, 'shot_made_flag')
        return train[0], train[1], test[0], test[1]
    else:
        return df_train, df_test


In [206]:
train, test = split_regular_playoff(df, '2009-10')

Size of training dataset 1344
Size of test dataset 428
Size of train vs test ratio 0.7584650112866818


In [207]:
for column in ['action_type', 'combined_shot_type', 'season', 'shot_type', 'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'opponent']:
    print(f"{column}: {len(train[column].unique())}")

action_type: 36
combined_shot_type: 6
season: 1
shot_type: 2
shot_zone_area: 6
shot_zone_basic: 7
shot_zone_range: 5
opponent: 29


In [208]:
train = feature_engineering(train)
train = remove_columns(train)
# train.drop(['game_date'], axis=1, inplace=True)


test = feature_engineering(test)
test = remove_columns(test)
# test.drop(['game_date'], axis=1, inplace=True)


Total columns removed: 7
Total columns removed: 7


In [209]:
# drop game_date, season, playoffs
# ordinal shot-zone-range, shot zone basic
# one hot: shot_zone_area
# opponent-t

In [200]:
train.head()

Unnamed: 0,action_type,combined_shot_type,loc_x,loc_y,period,playoffs,season,shot_distance,shot_made_flag,shot_type,...,shot_zone_basic,shot_zone_range,matchup,opponent,time_remaining,last_5_games_avg,streak_before_shot,points_before_shot,fgp_before_shot,month
12441,Layup Shot,Layup,0,0,1,0,2009-10,0,0.0,2PT Field Goal,...,Restricted Area,Less Than 8 ft.,home,LAC,660,0,0,0,0.0,10
12460,Driving Finger Roll Layup Shot,Layup,0,0,4,0,2009-10,0,1.0,2PT Field Goal,...,Restricted Area,Less Than 8 ft.,home,LAC,113,0,0,16,0.42,10
12459,Jump Shot,Jump Shot,143,28,4,0,2009-10,14,0.0,2PT Field Goal,...,Mid-Range,8-16 ft.,home,LAC,216,0,0,16,0.44,10
12458,Jump Shot,Jump Shot,-56,279,4,0,2009-10,28,0.0,3PT Field Goal,...,Above the Break 3,24+ ft.,home,LAC,339,0,0,16,0.47,10
12457,Jump Shot,Jump Shot,34,82,4,0,2009-10,8,0.0,2PT Field Goal,...,In The Paint (Non-RA),8-16 ft.,home,LAC,596,0,0,16,0.5,10


In [170]:
train = target_encoding(train, 'action_type', 'shot_made_flag')
train = target_encoding(train, 'opponent', 'shot_made_flag')

KeyError: 'action_type'

In [169]:
train

Unnamed: 0,combined_shot_type,loc_x,loc_y,period,playoffs,season,shot_distance,shot_made_flag,shot_type,shot_zone_area,...,shot_zone_range,matchup,opponent,time_remaining,last_5_games_avg,streak_before_shot,points_before_shot,fgp_before_shot,month,action_type_te
0,Layup,0,0,1,0,2009-10,0,0.0,2PT Field Goal,Center(C),...,Less Than 8 ft.,home,LAC,660,0,0,0,0.0,10,0.357664
1,Layup,0,0,4,0,2009-10,0,1.0,2PT Field Goal,Center(C),...,Less Than 8 ft.,home,LAC,113,0,0,16,0.42,10,0.727273
2,Jump Shot,143,28,4,0,2009-10,14,0.0,2PT Field Goal,Right Side(R),...,8-16 ft.,home,LAC,216,0,0,16,0.44,10,0.291492
3,Jump Shot,-56,279,4,0,2009-10,28,0.0,3PT Field Goal,Center(C),...,24+ ft.,home,LAC,339,0,0,16,0.47,10,0.291492
4,Jump Shot,34,82,4,0,2009-10,8,0.0,2PT Field Goal,Center(C),...,8-16 ft.,home,LAC,596,0,0,16,0.5,10,0.291492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1339,Layup,0,0,1,0,2009-10,0,1.0,2PT Field Goal,Center(C),...,Less Than 8 ft.,home,POR,386,0.43,0,0,0.0,4,0.666667
1340,Jump Shot,-109,110,1,0,2009-10,15,0.0,2PT Field Goal,Left Side(L),...,8-16 ft.,home,POR,641,0.43,0,0,0.0,4,0.291492
1341,Jump Shot,-87,290,4,0,2009-10,30,1.0,3PT Field Goal,Center(C),...,24+ ft.,home,POR,49,0.43,0,12,0.31,4,0.291492
1342,Jump Shot,70,132,3,0,2009-10,14,0.0,2PT Field Goal,Center(C),...,8-16 ft.,home,POR,622,0.43,0,7,0.43,4,0.291492


KeyError: 'action_type'

In [124]:
train.season.unique()

array(['2009-10'], dtype=object)

## Pick one team, train seasons before, test seasons after
### Should we train on multiple seasons?


In [69]:
def split_team_season(df, opponent,  season_train, season_test, train=None, test=None, validation=None):

    # if(train + test + validation != 1):
    #     print("Error dataset split")
    #     return None

    df_train = None
    df_test = None
    df_validaiton = None

    if season_train == [] or season_test == []:
        print("Empty seasons")
        return None

    df_train = df[df['season'].isin(season_train)  & (df['opponent'] == opponent) ]
    df_test = df[(df['season'].isin(season_test)) & (df['opponent'] == opponent) ]
    print(f"Testing vs {opponent}\ntraining: {season_train}\ntesting: {season_test}" )
    print(f"Size of training dataset {len(df_train)}" )
    print(f"Size of test dataset {len(df_test)}" )
    print(f"Size of train vs test ratio {len(df_train) / (len(df_train) + len(df_test)) }" )

    train = split_x_y(df_train, 'shot_made_flag')
    test = split_x_y(df_test, 'shot_made_flag')
    return train[0], train[1], test[0], test[1]

## Pipeline for data modeling

In [70]:
# df = feature_engineering(df)

In [72]:
df.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,period,playoffs,...,game_date,matchup,opponent,shot_id,time_remaining,last_5_games_avg,streak_before_shot,points_before_shot,fgp_before_shot,month
0,Jump Shot,Jump Shot,102,29600027,33.9283,-140,116,-118.4098,1,0,...,1996-11-03,home,MIN,22902,42,0.0,0,0,0.0,11
1,Jump Shot,Jump Shot,127,29600031,33.9473,-131,97,-118.4008,2,0,...,1996-11-05,away,NYK,22903,608,0.0,0,0,0.0,11
2,Jump Shot,Jump Shot,124,29600044,33.8633,-142,181,-118.4118,2,0,...,1996-11-06,away,CHA,22904,517,0.0,0,0,0.0,11
3,Jump Shot,Jump Shot,144,29600044,34.0443,0,0,-118.2698,2,0,...,1996-11-06,away,CHA,22905,394,0.0,1,3,1.0,11
4,Jump Shot,Jump Shot,151,29600044,33.9063,-10,138,-118.2798,2,0,...,1996-11-06,away,CHA,22906,327,0.0,0,3,0.5,11
