In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import datetime
from sklearn.model_selection import train_test_split
from feature_engineering import feature_engineering
# from category_encoders import TargetEncoder

In [2]:
# Season used for the regular-season (train) vs. playoffs (test) split.
SEASON = '2009-10'

# Opponent and seasons used for the per-team split (train on earlier seasons, test on a later one).
TEAM = 'PHX'
TRAIN_TEAM_SEASONS = ['2000-01', '2001-02']
TEST_TEAM_SEASONS = ['2002-03']

# Raw data; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/data2.csv')

In [3]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,period,playoffs,...,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id,time_remaining
0,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,1,0,...,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,away,POR,2,622
1,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,1,0,...,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,away,POR,3,465
2,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,1,0,...,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,away,POR,4,412
3,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,2,0,...,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,away,POR,5,379
4,Jump Shot,Jump Shot,244,20000012,34.0553,-145,-11,-118.4148,3,0,...,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,away,POR,6,572


***
## Removing Redundant Columns
Some columns are highly correlated with other columns, and some columns hold only a single value, so they add nothing to the prediction.

Removing columns:
- team_name, team_id - These have only a single value, so they don't add any value to the prediction
- matchup - It has a one-to-one correlation with the column "opponent"

In [4]:
def remove_columns(df, remove_season=False):
    """
    Drops the identifier, raw-coordinate, date and season columns from df.

    Args:
        df (DataFrame): Data to clean. It is not modified; a new DataFrame is returned.
        remove_season (bool): Accepted for backward compatibility only. It has no
            effect: 'season' is always part of the dropped columns.

    Returns:
        DataFrame: A new DataFrame without the dropped columns.

    Raises:
        KeyError: If any of the columns to drop is missing from df.
    """
    columns_to_drop = ['game_event_id', 'lat', 'lon', 'team_name', 'team_id', 'game_id', 'shot_id', 'game_date',
                       'season', 'loc_x', 'loc_y']
    # An earlier version had an extra branch guarded by `type == True`. That compared the
    # builtin `type` with True (always False), and its body mutated a global `train`
    # instead of `df`. It could never run, so it was removed rather than "fixed", which
    # keeps the produced columns exactly as before.
    cleaned = df.drop(labels=columns_to_drop, axis='columns')
    print(f"Total columns removed: {len(columns_to_drop)}")
    return cleaned

# Preparing for modeling
From the analyzed and preprocessed data, we continue the pre-processing in order to prepare it for the modeling phase

In [5]:
def target_encoding(df, col_to_encode, target_col):
    """
    Target-encodes a categorical column: every category is replaced by the mean of
    target_col over the rows of df that share that category.

    Args:
        df (DataFrame): Data holding both col_to_encode and target_col. Not modified.
        col_to_encode (str): Categorical column to convert to a numerical one.
        target_col (str): Target column whose per-category mean is used.

    Returns:
        DataFrame: New DataFrame with a <col_to_encode>_te column appended and
        col_to_encode removed.
    """
    encoded_name = f"{col_to_encode}_te"

    # One row per category: [col_to_encode, <col_to_encode>_te].
    category_means = (
        df.groupby(col_to_encode)[target_col]
        .mean()
        .rename(encoded_name)
        .reset_index()
    )

    # A left merge keeps every row of df and attaches its category mean.
    merged = df.merge(right=category_means, how='left', on=col_to_encode)
    return merged.drop(columns=col_to_encode)

In [6]:
def encode_and_bind(original_dataframe, feature_to_encode):
    """
    One-hot encodes a single column and appends the indicator columns to the frame.

    Args:
        original_dataframe (DataFrame): Source data. Not modified.
        feature_to_encode (str): Column to replace by its indicator columns.

    Returns:
        DataFrame: New DataFrame without feature_to_encode and with the indicator
        columns produced by pd.get_dummies appended on the right.
    """
    indicator_columns = pd.get_dummies(original_dataframe[[feature_to_encode]])
    combined = pd.concat([original_dataframe, indicator_columns], axis=1)
    return combined.drop([feature_to_encode], axis=1)

def one_hot_encoding(df, features_to_encode=None):
    """
    One-hot encodes several categorical columns, one after another.

    Args:
        df (DataFrame): Data containing the columns to encode.
        features_to_encode (list[str] | None): Columns to encode. When None, the
            notebook's default set is used: 'combined_shot_type', 'shot_type',
            'shot_zone_area' and 'matchup'.

    Returns:
        DataFrame: DataFrame in which each listed column is replaced by its
        indicator columns (see encode_and_bind).
    """
    if features_to_encode is None:
        features_to_encode = ['combined_shot_type', 'shot_type', 'shot_zone_area', 'matchup']

    for feature in features_to_encode:
        df = encode_and_bind(df, feature)

    return df

In [7]:
def ordinal_encoder(df: pd.DataFrame, col_to_encode, mapper: dict) -> pd.DataFrame:
    """
    Ordinal-encodes one column by looking every value up in mapper.

    The column is overwritten on the DataFrame that was passed in, and that same
    object is returned.

    Args:
        df (DataFrame): Data containing col_to_encode. Modified in place.
        col_to_encode (str): Column whose values are replaced.
        mapper (dict): Maps each value of the column to its code,
            e.g. "col_val" -> 1

    Returns:
        DataFrame: df, with the mapped values in col_to_encode.

    Raises:
        KeyError: If the column holds a value that is not a key of mapper.
    """
    def to_code(value):
        # Direct indexing (not .get) so unknown categories fail loudly.
        return mapper[value]

    encoded_values = df[col_to_encode].apply(to_code)
    df[col_to_encode] = encoded_values
    return df

### Encoding

In [8]:
# Ordinal codes for `shot_zone_basic`: the code of a zone is its position in the list.
shot_zone_basic_mapper = {
    zone: code
    for code, zone in enumerate([
        "Restricted Area",
        "In The Paint (Non-RA)",
        "Mid-Range",
        "Right Corner 3",
        "Above the Break 3",
        "Left Corner 3",
        "Backcourt",
    ])
}

# Ordinal codes for `shot_zone_range`: the code of a range is its position in the list.
shot_zone_range_mapper = {
    distance_range: code
    for code, distance_range in enumerate([
        "Less Than 8 ft.",
        "8-16 ft.",
        "16-24 ft.",
        "24+ ft.",
        "Back Court Shot",
    ])
}

#### Splitting features & label (target)

In [9]:
def split_x_y(df, target):
    """
    Separates the label column from the feature columns.

    Args:
        df (DataFrame): Data containing the features and the target column.
        target (str): Name of the label column.

    Returns:
        tuple: (x, y) where x is df without the target column and y is a
        single-column DataFrame holding only the target.
    """
    labels = df[[target]]
    features = df.drop(columns=[target])
    return features, labels

## Random

In [10]:
def split_random(df, train=None, test=None, validation=None):
    """
    Splits df into 80% training, 10% validation and 10% test partitions.

    Both splits are made with shuffle=False, so rows are not shuffled: the
    partitions are consecutive slices of df in its existing row order.

    Args:
        df (DataFrame): Data to split.
        train, test, validation: Accepted but not used; the proportions are fixed.

    Returns:
        tuple: (train_data, validation_data, test_data)
    """
    # First cut: 80% for training, 20% held out.
    train_part, held_out = train_test_split(df, train_size=0.8, shuffle=False)
    # Second cut: the held-out part is halved into validation and test.
    validation_part, test_part = train_test_split(held_out, test_size=0.5, shuffle=False)

    print(f"Size of training dataset {len(train_part)}" )
    print(f"Size of validation dataset {len(validation_part)}" )
    print(f"Size of test dataset {len(test_part)}" )

    return train_part, validation_part, test_part

# Regular (train), playoff (test)

In [11]:
def split_regular_playoff(df, season, train=None, test=None, validation=None, n_way_split=False):
    """
    Splits one season into regular-season rows (train) and playoff rows (test).

    Args:
        df (DataFrame): Data with 'season' and 'playoffs' columns.
        season (str): Value of the 'season' column to keep, e.g. '2009-10'.
        train, test, validation: Accepted but not used.
        n_way_split (bool): When true, features and the 'shot_made_flag' label are
            additionally separated with split_x_y.

    Returns:
        tuple: (df_train, df_test) by default, or
        (x_train, y_train, x_test, y_test) when n_way_split is true.
        Both partitions are empty when df has no rows for the season.
    """
    season_rows = df[df['season'] == season]
    is_playoffs = season_rows['playoffs'] == 1

    df_train = season_rows[~is_playoffs]
    df_test = season_rows[is_playoffs]

    print(f"Size of training dataset {len(df_train)}" )
    print(f"Size of test dataset {len(df_test)}" )

    # Guard the ratio: a season with no rows used to raise ZeroDivisionError here.
    total_rows = len(df_train) + len(df_test)
    if total_rows:
        print(f"Size of train vs test ratio {len(df_train) / total_rows}" )
    else:
        print(f"Size of train vs test ratio n/a (no rows for season {season})" )

    if n_way_split:
        x_train, y_train = split_x_y(df_train, 'shot_made_flag')
        x_test, y_test = split_x_y(df_test, 'shot_made_flag')
        return x_train, y_train, x_test, y_test

    return df_train, df_test


In [12]:
# Regular-season rows of SEASON become `train`, playoff rows become `test`.
train, test = split_regular_playoff(df, SEASON)

Size of training dataset 1344
Size of test dataset 428
Size of train vs test ratio 0.7584650112866818


In [13]:
# Apply the project's feature_engineering step to each partition separately, then
# drop the unused columns. remove_season=True is passed through to remove_columns.
train = remove_columns(feature_engineering(train), remove_season=True)
test = remove_columns(feature_engineering(test), remove_season=True)


Total columns removed: 11
Total columns removed: 11


In [14]:
# Target encoding must be learned from the TRAINING rows only. Encoding `test` with the
# means of its own `shot_made_flag` (as this cell used to do) leaks the test labels into
# the test features and makes any evaluation on `test` optimistic.
target_col = 'shot_made_flag'
# Fallback for categories that occur in test but never in train.
train_target_mean = train[target_col].mean()

for col_to_encode in ['action_type', 'opponent']:
    # Per-category mean of the target, computed on train before the column is dropped.
    train_category_means = train.groupby(col_to_encode)[target_col].mean()
    test = (
        test
        .assign(**{f"{col_to_encode}_te": test[col_to_encode].map(train_category_means).fillna(train_target_mean)})
        .drop(columns=col_to_encode)
        .reset_index(drop=True)  # target_encoding also returns a fresh 0..n-1 index
    )
    train = target_encoding(train, col_to_encode, target_col)

train = one_hot_encoding(train)
train = ordinal_encoder(train, "shot_zone_basic", shot_zone_basic_mapper)
train = ordinal_encoder(train, "shot_zone_range", shot_zone_range_mapper)

test = one_hot_encoding(test)
test = ordinal_encoder(test, "shot_zone_basic", shot_zone_basic_mapper)
test = ordinal_encoder(test, "shot_zone_range", shot_zone_range_mapper)

In [15]:
# Inspect the encoded training frame.
train.head()

Unnamed: 0,period,playoffs,shot_distance,shot_made_flag,shot_zone_basic,shot_zone_range,time_remaining,last_5_games_avg,streak_before_shot,points_before_shot,...,shot_type_2PT Field Goal,shot_type_3PT Field Goal,shot_zone_area_Back Court(BC),shot_zone_area_Center(C),shot_zone_area_Left Side Center(LC),shot_zone_area_Left Side(L),shot_zone_area_Right Side Center(RC),shot_zone_area_Right Side(R),matchup_away,matchup_home
0,1,0,0,0.0,0,0,660,0,0,0,...,1,0,0,1,0,0,0,0,0,1
1,4,0,0,1.0,0,0,113,0,0,16,...,1,0,0,1,0,0,0,0,0,1
2,4,0,14,0.0,2,1,216,0,0,16,...,1,0,0,0,0,0,0,1,0,1
3,4,0,28,0.0,4,3,339,0,0,16,...,0,1,0,1,0,0,0,0,0,1
4,4,0,8,0.0,1,1,596,0,0,16,...,1,0,0,1,0,0,0,0,0,1


In [16]:
# Persist the season split; index=False keeps the DataFrame index out of the CSV files.
train.to_csv('../data/train_season.csv', index=False)
test.to_csv('../data/test_season.csv', index=False)

# Pick one team, train seasons before, test seasons after

In this version of the dataset, we train on all the matches played against a single team in one season (e.g. season `2008-09`)
and then test the model against all the matches played against the same team in the next season (e.g. season `2009-10`).

In [17]:
def split_team_season(df, opponent,  season_train: list, season_test: list,
                      train=None, test=None, validation=None):
    """
    Splits the rows played against one opponent into train and test by season.

    Args:
        df (DataFrame): Data with 'season' and 'opponent' columns.
        opponent (str): Value of the 'opponent' column to keep, e.g. 'PHX'.
        season_train (list): Seasons whose rows go into the training partition.
        season_test (list): Seasons whose rows go into the test partition.
        train, test, validation: Accepted but not used.

    Returns:
        tuple | None: (df_train, df_test), or None when either season list is empty.
        Both partitions are empty when no row matches the opponent and seasons.
    """
    if season_train == [] or season_test == []:
        print("Empty seasons")
        return None

    against_opponent = df['opponent'] == opponent
    df_train = df[df['season'].isin(season_train) & against_opponent]
    df_test = df[df['season'].isin(season_test) & against_opponent]

    print(f"Testing vs {opponent}\ntraining: {season_train}\ntesting: {season_test}" )
    print(f"Size of training dataset {len(df_train)}" )
    print(f"Size of test dataset {len(df_test)}" )

    # Guard the ratio: an opponent/season combination with no rows used to raise
    # ZeroDivisionError here.
    total_rows = len(df_train) + len(df_test)
    if total_rows:
        print(f"Size of train vs test ratio {len(df_train) / total_rows}" )
    else:
        print(f"Size of train vs test ratio n/a (no rows vs {opponent})" )

    return df_train, df_test

In [18]:
# Rows against TEAM: TRAIN_TEAM_SEASONS become `train2`, TEST_TEAM_SEASONS become `test2`.
train2, test2 = split_team_season(df, opponent=TEAM, season_train=TRAIN_TEAM_SEASONS, season_test=TEST_TEAM_SEASONS)

Testing vs PHX
training: ['2000-01', '2001-02']
testing: ['2002-03']
Size of training dataset 104
Size of test dataset 94
Size of train vs test ratio 0.5252525252525253


In [19]:
# Apply the project's feature_engineering step to each partition separately, then
# drop the unused columns. remove_season=False is passed through to remove_columns.
train2 = remove_columns(feature_engineering(train2), remove_season=False)
test2 = remove_columns(feature_engineering(test2), remove_season=False)


Total columns removed: 11
Total columns removed: 11


In [20]:
# Target encoding must be learned from the TRAINING rows only. Encoding `test2` with the
# means of its own `shot_made_flag` (as this cell used to do) leaks the test labels into
# the test features and makes any evaluation on `test2` optimistic.
target_col = 'shot_made_flag'
# Fallback for categories that occur in test2 but never in train2.
train2_target_mean = train2[target_col].mean()

for col_to_encode in ['action_type', 'opponent']:
    # Per-category mean of the target, computed on train2 before the column is dropped.
    train2_category_means = train2.groupby(col_to_encode)[target_col].mean()
    test2 = (
        test2
        .assign(**{f"{col_to_encode}_te": test2[col_to_encode].map(train2_category_means).fillna(train2_target_mean)})
        .drop(columns=col_to_encode)
        .reset_index(drop=True)  # target_encoding also returns a fresh 0..n-1 index
    )
    train2 = target_encoding(train2, col_to_encode, target_col)

train2 = one_hot_encoding(train2)
train2 = ordinal_encoder(train2, "shot_zone_basic", shot_zone_basic_mapper)
train2 = ordinal_encoder(train2, "shot_zone_range", shot_zone_range_mapper)

test2 = one_hot_encoding(test2)
test2 = ordinal_encoder(test2, "shot_zone_basic", shot_zone_basic_mapper)
test2 = ordinal_encoder(test2, "shot_zone_range", shot_zone_range_mapper)

## missing features in train2 and test2

In [21]:
# Do train2 and test2 have the same set of column names? (Set comparison ignores order.)
print(set(train2.columns.values) == (set(test2.columns.values)))

False


In [22]:
# One-hot encoding train2 and test2 separately only creates indicator columns for the
# categories present in each frame, so the two column sets can differ.
missing_in_train = set(test2.columns.values).difference(set(train2.columns.values))
missing_in_test = set(train2.columns.values).difference(set(test2.columns.values))
print(missing_in_train)
print(missing_in_test)

# A category absent from a frame means "never observed", i.e. an all-zero indicator.
# sorted() makes the order of the added columns reproducible between runs
# (iteration order of a set of strings is not).
for train_features in sorted(missing_in_train):
    train2[train_features] = 0

for test_features in sorted(missing_in_test):
    test2[test_features] = 0

# Same names is not enough: give test2 the same column ORDER as train2 so that
# position-based consumers see the features aligned.
test2 = test2[train2.columns]

set()
{'shot_zone_area_Back Court(BC)', 'combined_shot_type_Tip Shot'}


In [23]:
# Re-check after alignment: both frames should now have the same set of column names.
print(set(train2.columns.values) == (set(test2.columns.values)))


True


In [24]:
# Boolean mask over train2's columns: True at the position of 'combined_shot_type_Tip Shot'.
train2.columns.values == 'combined_shot_type_Tip Shot'

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False])

In [25]:
# List every distinct action_type present in the full, unsplit data.
# train2.head()
df.action_type.unique()

array(['Jump Shot', 'Driving Dunk Shot', 'Layup Shot',
       'Running Jump Shot', 'Reverse Dunk Shot', 'Slam Dunk Shot',
       'Driving Layup Shot', 'Turnaround Jump Shot', 'Reverse Layup Shot',
       'Tip Shot', 'Running Hook Shot', 'Alley Oop Dunk Shot',
       'Dunk Shot', 'Alley Oop Layup shot', 'Running Dunk Shot',
       'Driving Finger Roll Shot', 'Running Layup Shot',
       'Finger Roll Shot', 'Fadeaway Jump Shot', 'Follow Up Dunk Shot',
       'Hook Shot', 'Turnaround Hook Shot', 'Jump Hook Shot',
       'Running Finger Roll Shot', 'Jump Bank Shot',
       'Turnaround Finger Roll Shot', 'Hook Bank Shot',
       'Driving Hook Shot', 'Running Tip Shot',
       'Running Reverse Layup Shot', 'Driving Finger Roll Layup Shot',
       'Fadeaway Bank shot', 'Pullup Jump shot', 'Finger Roll Layup Shot',
       'Turnaround Fadeaway shot', 'Driving Reverse Layup Shot',
       'Driving Slam Dunk Shot', 'Step Back Jump shot',
       'Turnaround Bank shot', 'Reverse Slam Dunk Shot',
   

In [26]:
# Persist the per-team split. The file names were previously swapped (train2 was written
# to test_team.csv and test2 to train_team.csv); they now match the partition they hold,
# in line with the train_season.csv / test_season.csv export.
train2.to_csv('../data/train_team.csv', index=False)
test2.to_csv('../data/test_team.csv', index=False)