# Feature Engineering Players

In [13]:
from functions import Schedule, PlayerStats, OpeningDayRoster
import pandas as pd
import numpy as np

In [14]:
players = pd.read_csv("data/PlayerStats_withODTm_1979-2023.csv")
players['OpeningDayTm'].unique()

array(['BOS', 'NJN', 'NYK', 'PHI', 'WSB', 'ATL', 'CLE', 'DET', 'HOU',
       'NOJ', 'SAS', 'CHI', 'DEN', 'IND', 'KCK', 'MIL', 'GSW', 'LAL',
       'PHO', 'POR', 'SDC', 'SEA', 'UTA', 'DAL', 'LAC', 'SAC', 'CHH',
       'MIA', 'ORL', 'MIN', 'TOR', 'VAN', 'WAS', 'MEM', 'NOH', 'CHA',
       'OKC', 'BRK', 'NOP', 'CHO'], dtype=object)

In [15]:
players.columns

Index(['PlayerODR', 'Year', 'OpeningDayTm', 'clean_name', 'Player', 'href',
       'Pos', 'Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
       '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PER', 'TS%', '3PAr', 'FTr',
       'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS',
       'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'],
      dtype='object')

In [16]:
years = players.Year.unique()
teams = players.OpeningDayTm.unique()

### Get Points, Assist, Reb

In [17]:
window = 5
def get_stats_players(year, team):
    assert len(players[(players.Year == year ) & (players.OpeningDayTm == team)]) > 0, print("Team doesn't exist for this certain Year")
    res = [team, year]
    roster = list(players[(players.Year == year ) & (players.OpeningDayTm == team)].Player)
    # Points for a window of three years
    injury = players[(players.Player.isin(roster)) & (players.Year <= year - 1)][["Player", "G"]]
    games_played = injury.groupby("Player").sum()
    total_games = injury.groupby("Player").count().rename(columns = {"G" : "Total"})
    injury_index = games_played.merge(total_games, how = 'left', on = "Player").reset_index()
    injury_index["InjuryIndex"] = injury_index.apply(lambda x : x.G/(x.Total*82), axis = 1)

    injury_index = injury_index.loc[:, ["Player", "InjuryIndex"]]

    for y in range(window):
        tamp = players[(players.Player.isin(roster)) & (players.Year == year - 1 - y)][["Player", "G", "MP", "PTS", "VORP", "WS", "PER"]]
        tamp = tamp.merge(injury_index, how = "left", on  = "Player")
        tamp["TMP"] = tamp.MP * tamp.G
        tamp = tamp[(tamp.TMP >= 100)]
        total_mp = tamp.MP.sum()
        tamp["weightedPTS"] = tamp.apply(lambda x : (x.MP*x.PTS*x.InjuryIndex), axis = 1)/total_mp
        tamp["weightedVORP"] = tamp.apply(lambda x : (x.MP*x.VORP*x.InjuryIndex), axis = 1)/total_mp
        tamp["weightedPER"] = tamp.apply(lambda x : (x.MP*x.PER*x.InjuryIndex), axis = 1)/total_mp
        tamp["weightedWS"] = tamp.apply(lambda x : (x.MP*x.WS*x.InjuryIndex), axis = 1)/total_mp
        res.append(round(tamp.weightedPTS.sum(), 3))
        res.append(round(tamp.weightedVORP.mean(), 3))
        res.append(round(tamp.weightedPER.mean(), 3))
        res.append(round(tamp.weightedWS.mean(), 3))

    
    return res
    

In [18]:
player_features = []
for y in [i for i in range(1984, 2024)]:
    for t in teams: 
        try :
            player_features.append(get_stats_players(y, t))
        except AssertionError:
            pass
col = ["Team", "Year"]
for i in range(window):
    col += [f"MeanPTSWeighted_{i+1}", f"MeanVORP_{i+1}", f"MeanPER_{i+1}", f"MeanWS_{i+1}"]
player_features = pd.DataFrame(player_features, columns = col)

Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exi

In [19]:
player_features

Unnamed: 0,Team,Year,MeanPTSWeighted_1,MeanVORP_1,MeanPER_1,MeanWS_1,MeanPTSWeighted_2,MeanVORP_2,MeanPER_2,MeanWS_2,...,MeanPER_3,MeanWS_3,MeanPTSWeighted_4,MeanVORP_4,MeanPER_4,MeanWS_4,MeanPTSWeighted_5,MeanVORP_5,MeanPER_5,MeanWS_5
0,BOS,1984,13.240,0.223,1.566,0.652,14.649,0.269,1.759,0.752,...,1.783,0.765,15.229,0.298,2.035,0.868,15.170,0.395,2.626,1.176
1,NJN,1984,10.958,0.100,1.096,0.361,11.357,0.088,1.123,0.317,...,1.479,0.426,12.540,0.390,2.773,1.056,11.632,0.333,3.323,1.018
2,NYK,1984,10.499,0.089,1.160,0.359,13.041,0.125,1.278,0.457,...,1.630,0.663,13.150,0.142,1.736,0.601,13.985,0.160,2.364,0.644
3,PHI,1984,13.423,0.247,1.649,0.727,16.980,0.419,2.417,1.066,...,2.662,1.218,16.338,0.513,3.080,1.375,16.460,0.583,3.554,1.653
4,WSB,1984,12.765,0.277,2.222,0.907,11.995,0.183,1.932,0.688,...,4.490,1.403,12.871,0.349,4.745,1.327,11.897,0.252,4.561,1.236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1126,MEM,2023,9.551,0.094,1.024,0.290,8.416,0.051,1.103,0.228,...,1.509,0.348,8.514,0.204,2.274,0.804,8.315,0.178,2.939,1.002
1127,OKC,2023,8.059,0.023,0.808,0.097,9.006,0.002,1.222,0.082,...,1.607,0.370,5.527,0.093,2.559,0.540,4.773,0.377,9.044,1.507
1128,BRK,2023,9.912,0.061,0.770,0.180,10.155,0.084,0.916,0.244,...,1.028,0.260,10.874,0.145,1.075,0.371,12.270,0.239,1.690,0.585
1129,NOP,2023,9.536,0.061,0.908,0.206,10.709,0.089,1.111,0.263,...,1.434,0.320,10.267,0.085,1.709,0.368,10.630,0.155,2.118,0.589


In [20]:
col = list(player_features.columns)[1:]
mean_dataframe = player_features[col].groupby("Year").mean().reset_index()
std_dataframe = player_features[col].groupby("Year").std().reset_index()

In [21]:
player_features

Unnamed: 0,Team,Year,MeanPTSWeighted_1,MeanVORP_1,MeanPER_1,MeanWS_1,MeanPTSWeighted_2,MeanVORP_2,MeanPER_2,MeanWS_2,...,MeanPER_3,MeanWS_3,MeanPTSWeighted_4,MeanVORP_4,MeanPER_4,MeanWS_4,MeanPTSWeighted_5,MeanVORP_5,MeanPER_5,MeanWS_5
0,BOS,1984,13.240,0.223,1.566,0.652,14.649,0.269,1.759,0.752,...,1.783,0.765,15.229,0.298,2.035,0.868,15.170,0.395,2.626,1.176
1,NJN,1984,10.958,0.100,1.096,0.361,11.357,0.088,1.123,0.317,...,1.479,0.426,12.540,0.390,2.773,1.056,11.632,0.333,3.323,1.018
2,NYK,1984,10.499,0.089,1.160,0.359,13.041,0.125,1.278,0.457,...,1.630,0.663,13.150,0.142,1.736,0.601,13.985,0.160,2.364,0.644
3,PHI,1984,13.423,0.247,1.649,0.727,16.980,0.419,2.417,1.066,...,2.662,1.218,16.338,0.513,3.080,1.375,16.460,0.583,3.554,1.653
4,WSB,1984,12.765,0.277,2.222,0.907,11.995,0.183,1.932,0.688,...,4.490,1.403,12.871,0.349,4.745,1.327,11.897,0.252,4.561,1.236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1126,MEM,2023,9.551,0.094,1.024,0.290,8.416,0.051,1.103,0.228,...,1.509,0.348,8.514,0.204,2.274,0.804,8.315,0.178,2.939,1.002
1127,OKC,2023,8.059,0.023,0.808,0.097,9.006,0.002,1.222,0.082,...,1.607,0.370,5.527,0.093,2.559,0.540,4.773,0.377,9.044,1.507
1128,BRK,2023,9.912,0.061,0.770,0.180,10.155,0.084,0.916,0.244,...,1.028,0.260,10.874,0.145,1.075,0.371,12.270,0.239,1.690,0.585
1129,NOP,2023,9.536,0.061,0.908,0.206,10.709,0.089,1.111,0.263,...,1.434,0.320,10.267,0.085,1.709,0.368,10.630,0.155,2.118,0.589


In [22]:
def normalizing(year, value, col):
    return (value - mean_dataframe[[col, "Year"]][(mean_dataframe.Year == year)].values[0][0])/(std_dataframe[[col, "Year"]][(mean_dataframe.Year == year)].values[0][0])

for feat in col[1:]:
    player_features[feat] = player_features.apply(lambda x : normalizing(x.Year, x[feat], feat), axis = 1)
player_features

Unnamed: 0,Team,Year,MeanPTSWeighted_1,MeanVORP_1,MeanPER_1,MeanWS_1,MeanPTSWeighted_2,MeanVORP_2,MeanPER_2,MeanWS_2,...,MeanPER_3,MeanWS_3,MeanPTSWeighted_4,MeanVORP_4,MeanPER_4,MeanWS_4,MeanPTSWeighted_5,MeanVORP_5,MeanPER_5,MeanWS_5
0,BOS,1984,0.691282,1.306881,0.415070,1.050208,0.931111,1.223837,0.082140,0.793457,...,-0.598941,-0.282896,0.676630,0.123337,-0.740864,-0.222974,0.664872,0.262073,-0.735670,-0.010462
1,NJN,1984,-0.704902,-0.408742,-1.024888,-0.549483,-0.370323,-0.862499,-1.121805,-1.075314,...,-0.796610,-0.892554,-0.102480,0.687010,-0.132179,0.162991,-0.265026,-0.021045,-0.268896,-0.296304
2,NYK,1984,-0.985729,-0.562171,-0.828808,-0.560477,0.295417,-0.436011,-0.828391,-0.473870,...,-0.698426,-0.466333,0.074262,-0.832456,-0.987473,-0.771125,0.353417,-0.811036,-0.911129,-0.972920
3,PHI,1984,0.803246,1.641637,0.669361,1.462499,1.852632,2.952845,1.327730,2.142409,...,-0.027394,0.531779,0.997952,1.440616,0.121027,0.817898,1.003925,1.120561,-0.114197,0.852495
4,WSB,1984,0.400665,2.060082,2.424884,2.451998,-0.118101,0.232539,0.409628,0.518512,...,1.161216,0.864483,-0.006576,0.435808,1.494280,0.719354,-0.195375,-0.390926,0.560182,0.098086
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1126,MEM,2023,0.029700,0.788586,1.130905,0.968684,-1.070200,-0.527428,0.347908,-0.031597,...,1.236977,0.842472,-0.817234,0.814641,0.774663,1.877754,-0.682453,-0.181242,0.348617,0.962643
1127,OKC,2023,-1.345019,-1.285102,-0.719667,-1.695773,-0.637248,-1.811068,1.053207,-2.128521,...,1.717071,1.102178,-2.221336,-0.685776,1.175132,0.380692,-1.910064,1.068198,4.354969,2.408899
1128,BRK,2023,0.362323,-0.175241,-1.045230,-0.549918,0.205909,0.337065,-0.760419,0.198202,...,-1.119403,-0.196354,0.292133,0.017122,-0.910120,-0.577654,0.688298,0.201752,-0.471028,-0.231592
1129,NOP,2023,0.015879,-0.175241,0.137079,-0.190976,0.612445,0.468048,0.395323,0.471090,...,0.869558,0.511936,0.006800,-0.793914,-0.019251,-0.594666,0.119896,-0.325650,-0.190157,-0.220136


In [23]:
player_features.to_csv("data/player_features.csv", index = None)