In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import datetime
from sklearn.model_selection import train_test_split
from feature_engineering import feature_engineering
# from category_encoders import TargetEncoder

In [2]:
df = pd.read_csv('../data/data2.csv')

SEASON = '2009-10'

TRAIN_TEAM_SEASONS = ['2000-01', '2001-02', '2002-03']
TEST_TEAM_SEASONS = ['2003-04']
TEAM = 'PHX'

In [3]:
df.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,period,playoffs,...,game_date,matchup,opponent,shot_id,time_remaining,last_5_games_avg,streak_before_shot,points_before_shot,fgp_before_shot,month
0,Jump Shot,Jump Shot,102,29600027,33.9283,-140,116,-118.4098,1,0,...,1996-11-03,home,MIN,22902,42,0.0,0,0,0.0,11
1,Jump Shot,Jump Shot,127,29600031,33.9473,-131,97,-118.4008,2,0,...,1996-11-05,away,NYK,22903,608,0.0,0,0,0.0,11
2,Jump Shot,Jump Shot,124,29600044,33.8633,-142,181,-118.4118,2,0,...,1996-11-06,away,CHA,22904,517,0.0,0,0,0.0,11
3,Jump Shot,Jump Shot,144,29600044,34.0443,0,0,-118.2698,2,0,...,1996-11-06,away,CHA,22905,394,0.0,1,3,1.0,11
4,Jump Shot,Jump Shot,151,29600044,33.9063,-10,138,-118.2798,2,0,...,1996-11-06,away,CHA,22906,327,0.0,0,3,0.5,11


***
## Removing Redundant Columns
There are some columns that have very high corelations with some other columns and some columns that

Removing columns:
- team_name, team_id - These have only a single value, so it doesn't add any value to the prediction
- matchup - It has 1 to 1 corelation with the column "opponent"

In [4]:
def remove_columns(df, remove_season=False):
    columns_to_drop = ['game_event_id', 'lat', 'lon', 'team_name', 'team_id', 'game_id', 'shot_id', 'game_date',
                       'season', 'loc_x', 'loc_y']
    df = df.drop(labels=columns_to_drop, axis='columns')
    if(type == True):
        train.drop(['month', 'playoffs'], axis=1, inplace=True)
    print(f"Total columns removed: {len(columns_to_drop)}")
    return df

#### Helper function

In [5]:
def remove_na_rows(df: pd.DataFrame):
    """
    :param pd.DataFrame df:
    :return pd.DataFrame
    """
    num_rows_before = len(df)
    print(f'Number of rows: {num_rows_before}')
    df = df.dropna()
    num_rows_after = len(df)
    print(f'Number of rows after dropping missing values: {num_rows_after}')
    removed_rows = num_rows_before - num_rows_after
    print(f'Percentage of data removed: {100 * (removed_rows / num_rows_before)} %')
    return df

# Preparing for modeling
From the analyzed and preprocessed data, we continue the pre-processing in order to prepare it for the modeling phase

In [6]:
def target_encoding(df, col_to_encode, target_col, is_trainset=True, encoder=None):
    """
    For trainset:
        Performs target-encoding on the specified column. That is to convert categorical column
        to a numerical column. Drops the col_to_encode from the returned DataFrame.
    For test-set:
        Performs target-encoding on the specified column using the encoding given from the trainset.
        Drops the col_to_encode from the returned DataFrame.

    Args:
        df (DataFrame):
        col_to_encode (str): Column to be converted to numerical one.
        target_col (str): Target column of the data.
        is_trainset (bool): Tells if the df is training-set or testing-set
        encoding (dict): encoder that tells how to map the categorical values.

    Returns: (pd.DataFrame, dict)
        DataFrame: DataFrame containing the new <col_to_encode>_te column.
        dict: is the encoder.
    """
    if is_trainset:
        encoder = df.groupby(col_to_encode)[target_col].mean().reset_index()
        encoder = encoder.rename(columns={target_col: f"{col_to_encode}_te"})
        df = df.merge(right=encoder, how='left', on=col_to_encode)
        df.drop(col_to_encode, axis=1, inplace=True)
        
    elif is_trainset == False:
        assert encoder is not None
        
        df = df.merge(right=encoder, how='left', on=col_to_encode)
        df.drop(col_to_encode, axis=1, inplace=True)
        
    return df, encoder

In [7]:
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return res

def one_hot_encoding(df):
#     df_new = df.copy(deep=True)
#     features_to_encode = ['combined_shot_type', 'season', 'shot_type', 'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'opponent', 'action_type', 'matchup']
    features_to_encode = ['combined_shot_type', 'shot_type', 'shot_zone_area', 'matchup']
    for feature in features_to_encode:
        df = encode_and_bind(df, feature)

    return df

In [8]:
def ordinal_encoder(df: pd.DataFrame, col_to_encode, mapper: dict) -> pd.DataFrame:
    """
    Performs ordinal encoding using the given mapper. It replaces the col_to_encode's values in
    DataFrame.
    Args:
        df (DataFrame):
        col_to_encode (str): column that needs to be encoded
        mapper (dict): contains the mapping for the column in df. 
            e.g. "col_val" -> 1

    Returns:
        DataFrame: returns the DataFrame with mapped values in col: col_to_encode
    """
    df[col_to_encode] = df[col_to_encode].apply(lambda x: mapper[x])
    return df

### Encoding

In [9]:
shot_zone_basic_mapper = {    
    "Restricted Area": 0,
    "In The Paint (Non-RA)": 1,
    "Mid-Range": 2,
    "Right Corner 3": 3,
    "Above the Break 3": 4,
    "Left Corner 3": 5,
    "Backcourt": 6
}
shot_zone_range_mapper = {
    "Less Than 8 ft.": 0,
    "8-16 ft.": 1,
    "16-24 ft.": 2,
    "24+ ft.": 3,
    "Back Court Shot": 4
}

#### Splitting features & label (target)

In [10]:
def split_x_y(df, target):
    y = df.loc[:, [target]]
    x = df.drop([target], axis=1)
    return x, y

# Regular (train), playoff (test)

In [84]:
def split_regular_playoff(df, season, train=None, test=None, validation=None, n_way_split=False):

    # if(train + test + validation != 1):
    #     print("Error dataset split")
    #     return None

    df_validaiton = None

    df_season = df[df['season'] == season]
    isPlayoffs = df_season.playoffs == 1

    df_train = df_season[~isPlayoffs]
    df_test  = df_season[isPlayoffs]

    print(f"Size of training dataset {len(df_train)}" )
    print(f"Size of test dataset {len(df_test)}" )
    print(f"Size of train vs test ratio {len(df_train) / (len(df_train) + len(df_test)) }" )

    if (n_way_split == True):
        train = split_x_y(df_train, 'shot_made_flag')
        test = split_x_y(df_test, 'shot_made_flag')
        return train[0], train[1], test[0], test[1]
    else:
        return df_train, df_test

In [85]:
train, test = split_regular_playoff(df, SEASON)

Size of training dataset 1344
Size of test dataset 428
Size of train vs test ratio 0.7584650112866818


In [86]:
train = feature_engineering(train)
train = remove_columns(train, remove_season=True)

Total columns removed: 11


In [87]:
test = feature_engineering(test)
test = remove_columns(test, remove_season=True)

Total columns removed: 11


In [None]:
train, action_type_encoder = target_encoding(train, 'action_type', 'shot_made_flag')
train, opponent_type_encoder = target_encoding(train, 'opponent', 'shot_made_flag')
train = one_hot_encoding(train)
train = ordinal_encoder(train, "shot_zone_basic", shot_zone_basic_mapper)
train = ordinal_encoder(train, "shot_zone_range", shot_zone_range_mapper)

In [15]:
test, _ = target_encoding(test, 'action_type', 'shot_made_flag', is_trainset=False, encoder=action_type_encoder)
test, _ = target_encoding(test, 'opponent', 'shot_made_flag', is_trainset=False, encoder=opponent_type_encoder)
test = one_hot_encoding(test)
test = ordinal_encoder(test, "shot_zone_basic", shot_zone_basic_mapper)
test = ordinal_encoder(test, "shot_zone_range", shot_zone_range_mapper)

test = remove_na_rows(test)

Number of rows: 428
Number of rows after dropping missing values: 428
Percentage of data removed: 0.0 %


In [16]:
train.action_type_te.unique()
test.action_type_te.unique()

array([0.29149233, 0.6779661 , 0.81176471, 1.        , 0.6       ,
       0.9375    , 0.35766423, 0.67241379, 0.5974026 , 0.625     ,
       0.79069767, 0.64285714, 0.75      , 0.66666667, 0.72727273,
       0.45454545, 0.        , 0.5       ])

In [17]:
train.to_csv('../data/train_season.csv', index=False)
test.to_csv('../data/test_season.csv', index=False)

# Pick one team, train seasons before, test seasons after

In this version of dataset we get all the matches played against a single team in one season e.g. season `2008-09` 
and then test the model against all the matches played against same team in the next season e.g. season `2009-10`

In [88]:
def split_team_season(df, opponent,  season_train: list, season_test: list,
                      train=None, test=None, validation=None):

    # if(train + test + validation != 1):
    #     print("Error dataset split")
    #     return None

    df_train = None
    df_test = None
    df_validaiton = None

    if season_train == [] or season_test == []:
        print("Empty seasons")
        return None

    df_train = df[(df['season'].isin(season_train))  & (df['opponent'] == opponent) ]
    df_test = df[(df['season'].isin(season_test)) & (df['opponent'] == opponent) ]
    print(f"Testing vs {opponent}\ntraining: {season_train}\ntesting: {season_test}" )
    print(f"Size of training dataset {len(df_train)}" )
    print(f"Size of test dataset {len(df_test)}" )
    print(f"Size of train vs test ratio {len(df_train) / (len(df_train) + len(df_test)) }" )

    return df_train, df_test

In [89]:
train2, test2 = split_team_season(df, opponent=TEAM, season_train=TRAIN_TEAM_SEASONS, season_test=TEST_TEAM_SEASONS)

Testing vs PHX
training: ['2000-01', '2001-02', '2002-03']
testing: ['2003-04']
Size of training dataset 198
Size of test dataset 41
Size of train vs test ratio 0.8284518828451883


In [90]:
train2 = feature_engineering(train2)
train2 = remove_columns(train2, remove_season=False)

Total columns removed: 11


In [91]:
test2 = feature_engineering(test2)
test2 = remove_columns(test2, remove_season=False)

Total columns removed: 11


In [92]:
train2, action_type_encoder = target_encoding(train2, 'action_type', 'shot_made_flag')
train2, opponent_type_encoder = target_encoding(train2, 'opponent', 'shot_made_flag')
train2 = one_hot_encoding(train2)
train2 = ordinal_encoder(train2, "shot_zone_basic", shot_zone_basic_mapper)
train2 = ordinal_encoder(train2, "shot_zone_range", shot_zone_range_mapper)

In [93]:
test2, _ = target_encoding(test2, 'action_type', 'shot_made_flag', is_trainset=False, encoder=action_type_encoder)
test2, _ = target_encoding(test2, 'opponent', 'shot_made_flag', is_trainset=False, encoder=opponent_type_encoder)
test2 = one_hot_encoding(test2)
test2 = ordinal_encoder(test2, "shot_zone_basic", shot_zone_basic_mapper)
test2 = ordinal_encoder(test2, "shot_zone_range", shot_zone_range_mapper)

test2 = remove_na_rows(test2)

Number of rows: 41
Number of rows after dropping missing values: 40
Percentage of data removed: 2.4390243902439024 %


In [94]:
print(f"Rows in training-set: {len(train2)}")
print(f"Rows in test-set: {len(test2)}")

Rows in training-set: 198
Rows in test-set: 40


## missing features in train2 and test2

In [23]:
print(set(train2.columns.values) == (set(test2.columns.values)))

False


In [24]:
print(set(test2.columns.values).difference(set(train2.columns.values)))
print(set(train2.columns.values).difference(set(test2.columns.values)))

for train_features in set(test2.columns.values).difference(set(train2.columns.values)):
    train2[train_features] = 0

for test_features in set(train2.columns.values).difference(set(test2.columns.values)):
    test2[test_features] = 0

set()
{'combined_shot_type_Tip Shot', 'shot_zone_area_Back Court(BC)'}


In [25]:
print(set(train2.columns.values) == (set(test2.columns.values)))

True


In [26]:
train2.columns.values == 'combined_shot_type_Tip Shot'

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False])

In [28]:
# train2.head()
df.action_type.unique()

array(['Jump Shot', 'Driving Layup Shot', 'Layup Shot', 'Dunk Shot',
       'Driving Dunk Shot', 'Running Jump Shot', 'Slam Dunk Shot',
       'Tip Shot', 'Hook Shot', 'Reverse Dunk Shot',
       'Turnaround Jump Shot', 'Reverse Layup Shot', 'Running Hook Shot',
       'Alley Oop Dunk Shot', 'Alley Oop Layup shot', 'Running Dunk Shot',
       'Driving Finger Roll Shot', 'Running Layup Shot',
       'Finger Roll Shot', 'Driving Hook Shot', 'Fadeaway Jump Shot',
       'Follow Up Dunk Shot', 'Turnaround Hook Shot', 'Jump Hook Shot',
       'Running Finger Roll Shot', 'Jump Bank Shot',
       'Turnaround Finger Roll Shot', 'Hook Bank Shot',
       'Running Tip Shot', 'Running Reverse Layup Shot',
       'Driving Finger Roll Layup Shot', 'Fadeaway Bank shot',
       'Pullup Jump shot', 'Finger Roll Layup Shot',
       'Turnaround Fadeaway shot', 'Driving Reverse Layup Shot',
       'Driving Slam Dunk Shot', 'Step Back Jump shot',
       'Turnaround Bank shot', 'Reverse Slam Dunk Shot',
   

In [95]:
train2.to_csv('../data/train_team.csv', index=False)
test2.to_csv('../data/test_team.csv', index=False)