In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [12]:
def target_encoding(df, col_to_encode, target_col):
    """
    Performs target-encoding on the specified column. That is to convert categorical column 
    to a numerical column.

    Args:
        df (DataFrame):
        col_to_encode (str): Column to be converted to numerical one.
        target_col (str): Target column of the data.

    Returns:
        DataFrame: DataFrame containing the new <col_to_encode>_te column.
    """

    encoding = df.groupby(col_to_encode)[target_col].mean().reset_index()
    encoding = encoding.rename(columns={target_col: f"{col_to_encode}_te"})
    df = df.merge(right=encoding, how='left', on=col_to_encode)
    return df

In [3]:
df = pd.read_csv("../data/data2.csv")

In [9]:
target_column = "shot_made_flag"  # y_column_name

In [20]:
def split_regular_playoff(df, season, train=None, test=None, validation=None):

    # if(train + test + validation != 1):
    #     print("Error dataset split")
    #     return None

    def split_x_y(df, target):
        y = df.loc[:, [target]]
        x = df.drop([target], axis=1)
        return x, y

    df_validaiton = None

    df_season = df[df['season'] == season]
    isPlayoffs = df_season.playoffs == 1

    df_train = df_season[~isPlayoffs]
    df_test  = df_season[isPlayoffs]

    print(f"Size of training dataset {len(df_train)}" )
    print(f"Size of test dataset {len(df_test)}" )
    print(f"Size of train vs test ratio {len(df_train) / (len(df_train) + len(df_test)) }" )

    return df_train, df_test
#     train = split_x_y(df_train, 'shot_made_flag')
#     test = split_x_y(df_test, 'shot_made_flag')
#     return train[0], train[1], test[0], test[1]
#     return train, test

# x_train, y_train, x_test, y_test = split_regular_playoff(df, '2010-11')
train, test = split_regular_playoff(df, '2010-11')

Size of training dataset 1360
Size of test dataset 161
Size of train vs test ratio 0.8941485864562788


In [5]:
print(df.keys())
df.head()

Index(['action_type', 'combined_shot_type', 'game_event_id', 'game_id', 'lat',
       'loc_x', 'loc_y', 'lon', 'period', 'playoffs', 'season',
       'shot_distance', 'shot_made_flag', 'shot_type', 'shot_zone_area',
       'shot_zone_basic', 'shot_zone_range', 'team_id', 'team_name',
       'game_date', 'matchup', 'opponent', 'shot_id', 'time_remaining',
       'last_5_games_avg', 'streak_before_shot', 'points_before_shot',
       'fgp_before_shot', 'month'],
      dtype='object')


Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,period,playoffs,...,game_date,matchup,opponent,shot_id,time_remaining,last_5_games_avg,streak_before_shot,points_before_shot,fgp_before_shot,month
0,Jump Shot,Jump Shot,102,29600027,33.9283,-140,116,-118.4098,1,0,...,1996-11-03,home,MIN,22902,42,0.0,0,0,0.0,11
1,Jump Shot,Jump Shot,127,29600031,33.9473,-131,97,-118.4008,2,0,...,1996-11-05,away,NYK,22903,608,0.0,0,0,0.0,11
2,Jump Shot,Jump Shot,124,29600044,33.8633,-142,181,-118.4118,2,0,...,1996-11-06,away,CHA,22904,517,0.0,0,0,0.0,11
3,Jump Shot,Jump Shot,144,29600044,34.0443,0,0,-118.2698,2,0,...,1996-11-06,away,CHA,22905,394,0.0,1,3,1.0,11
4,Jump Shot,Jump Shot,151,29600044,33.9063,-10,138,-118.2798,2,0,...,1996-11-06,away,CHA,22906,327,0.0,0,3,0.5,11


In [21]:
df.combined_shot_type.value_counts()

Jump Shot    19710
Layup         4532
Dunk          1056
Tip Shot       152
Hook Shot      127
Bank Shot      120
Name: combined_shot_type, dtype: int64

In [None]:
def convert_cat_to_num(df):
    
    

In [22]:
train

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,period,playoffs,...,game_date,matchup,opponent,shot_id,time_remaining,last_5_games_avg,streak_before_shot,points_before_shot,fgp_before_shot,month
19848,Driving Layup Shot,Layup,584,21000003,34.0213,18,23,-118.2518,4,0,...,2010-10-26,home,HOU,16453,105,0.39,0,10,0.36,10
19849,Jump Shot,Jump Shot,538,21000003,33.9173,-79,127,-118.3488,4,0,...,2010-10-26,home,HOU,16452,259,0.39,0,10,0.38,10
19850,Jump Shot,Jump Shot,350,21000003,33.9363,-180,108,-118.4498,3,0,...,2010-10-26,home,HOU,16450,400,0.39,2,10,0.42,10
19851,Jump Shot,Jump Shot,332,21000003,33.8983,-127,146,-118.3968,3,0,...,2010-10-26,home,HOU,16449,493,0.39,1,8,0.36,10
19852,Jump Shot,Jump Shot,322,21000003,33.9623,-106,82,-118.3758,3,0,...,2010-10-26,home,HOU,16448,546,0.39,0,6,0.30,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21203,Jump Shot,Jump Shot,468,21001230,33.9943,-98,50,-118.3678,4,0,...,2011-04-13,away,SAC,18067,89,0.42,0,22,0.69,4
21204,Jump Shot,Jump Shot,475,21001230,33.9553,144,89,-118.1258,4,0,...,2011-04-13,away,SAC,18068,57,0.42,0,22,0.65,4
21205,Jump Shot,Jump Shot,480,21001230,33.9963,-119,48,-118.3888,4,0,...,2011-04-13,away,SAC,18069,15,0.42,0,22,0.61,4
21206,Jump Shot,Jump Shot,506,21001230,33.9153,92,129,-118.1778,5,0,...,2011-04-13,away,SAC,18071,244,0.42,1,25,0.60,4


In [23]:
target_encoding(train, "action_type", target_column)

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,period,playoffs,...,matchup,opponent,shot_id,time_remaining,last_5_games_avg,streak_before_shot,points_before_shot,fgp_before_shot,month,action_type_te
0,Driving Layup Shot,Layup,584,21000003,34.0213,18,23,-118.2518,4,0,...,home,HOU,16453,105,0.39,0,10,0.36,10,0.706897
1,Jump Shot,Jump Shot,538,21000003,33.9173,-79,127,-118.3488,4,0,...,home,HOU,16452,259,0.39,0,10,0.38,10,0.326633
2,Jump Shot,Jump Shot,350,21000003,33.9363,-180,108,-118.4498,3,0,...,home,HOU,16450,400,0.39,2,10,0.42,10,0.326633
3,Jump Shot,Jump Shot,332,21000003,33.8983,-127,146,-118.3968,3,0,...,home,HOU,16449,493,0.39,1,8,0.36,10,0.326633
4,Jump Shot,Jump Shot,322,21000003,33.9623,-106,82,-118.3758,3,0,...,home,HOU,16448,546,0.39,0,6,0.30,10,0.326633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1355,Jump Shot,Jump Shot,468,21001230,33.9943,-98,50,-118.3678,4,0,...,away,SAC,18067,89,0.42,0,22,0.69,4,0.326633
1356,Jump Shot,Jump Shot,475,21001230,33.9553,144,89,-118.1258,4,0,...,away,SAC,18068,57,0.42,0,22,0.65,4,0.326633
1357,Jump Shot,Jump Shot,480,21001230,33.9963,-119,48,-118.3888,4,0,...,away,SAC,18069,15,0.42,0,22,0.61,4,0.326633
1358,Jump Shot,Jump Shot,506,21001230,33.9153,92,129,-118.1778,5,0,...,away,SAC,18071,244,0.42,1,25,0.60,4,0.326633


## Fit Linear Regression

In [13]:
reg = LinearRegression()
reg.fit(x_train, y_train)

ValueError: could not convert string to float: 'Driving Layup Shot'