# Feature Engineering Players

In [1]:
from functions import Schedule, PlayerStats, OpeningDayRoster
import pandas as pd
import numpy as np

In [2]:
players = pd.read_csv("data/PlayersStats_1979-2023.csv")
players

Unnamed: 0,Player,href,Year,Pos,Age,Tm,G,GS,MP,FG,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,Kareem Abdul-Jabbar,abdulka01,1979,C,31,LAL,80,,39.5,9.7,...,15.3,23.3,8.8,5.6,14.4,0.219,4.6,3.0,7.6,7.7
1,Tom Abernethy,abernto01,1979,PF,24,GSW,70,,17.4,2.5,...,7.7,13.8,2.3,1.3,3.7,0.144,0.2,0.7,0.9,0.9
2,Alvan Adams,adamsal01,1979,C,24,PHO,77,,30.7,7.4,...,18.9,24.1,3.9,3.7,7.6,0.154,2.3,1.2,3.4,3.2
3,Lucius Allen,allenlu01,1979,PG,31,KCK,31,,13.3,2.2,...,13.7,20.3,-0.4,0.5,0.1,0.007,-3.7,0.4,-3.3,-0.1
4,Kim Anderson,anderki01,1979,SF,23,POR,21,,10.7,1.1,...,19.8,19.6,-0.6,0.2,-0.4,-0.078,-6.1,-1.3,-7.5,-0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19102,Thaddeus Young,youngth01,2023,PF,34,TOR,54,9.0,14.7,2.0,...,16.7,13.5,0.7,1.1,1.8,0.109,-1.8,1.9,0.1,0.4
19103,Trae Young,youngtr01,2023,PG,24,ATL,73,73.0,34.8,8.2,...,15.2,32.6,5.3,1.4,6.7,0.126,5.3,-2.0,3.3,3.4
19104,Omer Yurtseven,yurtsom01,2023,C,24,MIA,9,0.0,9.2,1.8,...,11.9,18.0,0.2,0.1,0.3,0.159,-2.5,-1.5,-3.9,0.0
19105,Cody Zeller,zelleco01,2023,C,30,MIA,15,2.0,14.5,2.5,...,15.8,18.1,0.4,0.3,0.7,0.147,-2.0,-0.7,-2.8,0.0


In [3]:
players.columns

Index(['Player', 'href', 'Year', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG',
       'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT',
       'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%',
       'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM',
       'DBPM', 'BPM', 'VORP'],
      dtype='object')

In [4]:
years = players.Year.unique()
teams = players.Tm.unique()

In [5]:
opening_roster = pd.read_csv("data/OpeningDayRosters_1979-2023.csv")
opening_roster

Unnamed: 0,Player,Year,OpeningDayTm
0,Tiny Archibald,1979,BOS
1,Dennis Awtrey,1979,BOS
2,Marvin Barnes,1979,BOS
3,Don Chaney,1979,BOS
4,Dave Cowens,1979,BOS
...,...,...,...
17878,Josh Richardson,2023,SAS
17879,Isaiah Roby,2023,SAS
17880,Jeremy Sochan,2023,SAS
17881,Devin Vassell,2023,SAS


In [6]:
def clean_players(x):
    return x.replace(" ", "").upper()

In [7]:
opening_roster.Player = opening_roster.Player.apply(clean_players)
players.Player = players.Player.apply(clean_players)

### Get Points, Assist, Reb

In [68]:
window = 3
def get_stats_players(year, team):
    assert len(opening_roster[(opening_roster.Year == year ) & (opening_roster.OpeningDayTm == team)]) > 0, print("Team doesn't exist for this certain Year")
    res = [team, year]
    roster = list(opening_roster[(opening_roster.Year == year ) & (opening_roster.OpeningDayTm == team)].Player)
    # Points for a window of three years
    injury = players[(players.Player.isin(roster)) & (players.Year <= year - 1)][["Player", "G"]]
    games_played = injury.groupby("Player").sum()
    total_games = injury.groupby("Player").count().rename(columns = {"G" : "Total"})
    injury_index = games_played.merge(total_games, how = 'left', on = "Player").reset_index()
    injury_index["InjuryIndex"] = injury_index.apply(lambda x : x.G/(x.Total*82), axis = 1)

    injury_index = injury_index.loc[:, ["Player", "InjuryIndex"]]

    for y in range(window):
        tamp = players[(players.Player.isin(roster)) & (players.Year == year - 1 - y)][["Player", "G", "MP", "PTS", "VORP", "WS", "PER"]]
        tamp = tamp.merge(injury_index, how = "left", on  = "Player")
        tamp["TMP"] = tamp.MP * tamp.G
        tamp = tamp[(tamp.TMP >= 100)]
        total_mp = tamp.MP.sum()
        tamp["weightedPTS"] = tamp.apply(lambda x : (x.MP*x.PTS*x.InjuryIndex), axis = 1)/total_mp
        tamp["weightedVORP"] = tamp.apply(lambda x : (x.MP*x.VORP*x.InjuryIndex), axis = 1)/total_mp
        tamp["weightedPER"] = tamp.apply(lambda x : (x.MP*x.PER*x.InjuryIndex), axis = 1)/total_mp
        tamp["weightedWS"] = tamp.apply(lambda x : (x.MP*x.WS*x.InjuryIndex), axis = 1)/total_mp
        res.append(round(tamp.weightedPTS.sum(), 3))
        res.append(round(tamp.weightedVORP.mean(), 3))
        res.append(round(tamp.weightedPER.mean(), 3))
        res.append(round(tamp.weightedWS.mean(), 3))

    
    return res
    

In [69]:
player_features = []
for y in [i for i in range(1982, 2024)]:
    for t in teams: 
        try :
            player_features.append(get_stats_players(y, t))
        except AssertionError:
            pass
col = ["Team", "Year"]
for i in range(window):
    if i == 0:
        col += ["MeanPTS", "MeanVORP", "MeanPER", "MeanWS"]
    else : 
        col += [f"MeanPTSWeighted_{i}", f"MeanVORP_{i}", f"MeanPER_{i}", f"MeanWS_{i}"]
player_features = pd.DataFrame(player_features, columns = col)

Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exist for this certain Year
Team doesn't exi

In [70]:
player_features


Unnamed: 0,Team,Year,MeanPTS,MeanVORP,MeanPER,MeanWS,MeanPTSWeighted_1,MeanVORP_1,MeanPER_1,MeanWS_1,MeanPTSWeighted_2,MeanVORP_2,MeanPER_2,MeanWS_2
0,LAL,1982,14.405,0.213,1.718,0.654,14.197,0.269,1.701,0.762,14.709,0.359,2.352,0.963
1,GSW,1982,12.521,0.114,1.376,0.451,14.032,0.257,2.429,0.669,14.715,0.312,2.349,0.884
2,PHO,1982,11.995,0.171,1.459,0.569,12.813,0.176,1.657,0.567,15.655,0.301,2.402,0.913
3,KCK,1982,11.474,0.109,1.368,0.400,10.568,0.178,1.683,0.576,11.599,0.240,2.303,0.776
4,POR,1982,11.462,0.131,1.504,0.465,10.706,0.170,2.181,0.668,9.543,0.292,3.190,0.979
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,MEM,2023,9.809,0.123,1.256,0.362,8.643,0.070,1.376,0.312,9.055,0.106,1.760,0.409
1131,OKC,2023,7.671,0.020,0.710,0.084,8.961,0.002,1.212,0.081,6.700,0.098,1.595,0.369
1132,BRK,2023,10.498,0.078,0.935,0.222,10.369,0.095,1.000,0.272,9.978,0.097,1.124,0.291
1133,NOP,2023,8.339,0.078,1.507,0.293,10.630,0.154,1.854,0.456,11.974,0.164,2.359,0.487


In [71]:
col = list(player_features.columns)[1:]
mean_dataframe = player_features[col].groupby("Year").mean().reset_index()
std_dataframe = player_features[col].groupby("Year").std().reset_index()

In [72]:
player_features

Unnamed: 0,Team,Year,MeanPTS,MeanVORP,MeanPER,MeanWS,MeanPTSWeighted_1,MeanVORP_1,MeanPER_1,MeanWS_1,MeanPTSWeighted_2,MeanVORP_2,MeanPER_2,MeanWS_2
0,LAL,1982,14.405,0.213,1.718,0.654,14.197,0.269,1.701,0.762,14.709,0.359,2.352,0.963
1,GSW,1982,12.521,0.114,1.376,0.451,14.032,0.257,2.429,0.669,14.715,0.312,2.349,0.884
2,PHO,1982,11.995,0.171,1.459,0.569,12.813,0.176,1.657,0.567,15.655,0.301,2.402,0.913
3,KCK,1982,11.474,0.109,1.368,0.400,10.568,0.178,1.683,0.576,11.599,0.240,2.303,0.776
4,POR,1982,11.462,0.131,1.504,0.465,10.706,0.170,2.181,0.668,9.543,0.292,3.190,0.979
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,MEM,2023,9.809,0.123,1.256,0.362,8.643,0.070,1.376,0.312,9.055,0.106,1.760,0.409
1131,OKC,2023,7.671,0.020,0.710,0.084,8.961,0.002,1.212,0.081,6.700,0.098,1.595,0.369
1132,BRK,2023,10.498,0.078,0.935,0.222,10.369,0.095,1.000,0.272,9.978,0.097,1.124,0.291
1133,NOP,2023,8.339,0.078,1.507,0.293,10.630,0.154,1.854,0.456,11.974,0.164,2.359,0.487


In [73]:
def normalizing(year, value, col):
    return (value - mean_dataframe[[col, "Year"]][(mean_dataframe.Year == year)].values[0][0])/(std_dataframe[[col, "Year"]][(mean_dataframe.Year == year)].values[0][0])

for feat in col[1:]:
    player_features[feat] = player_features.apply(lambda x : normalizing(x.Year, x[feat], feat), axis = 1)
player_features

Unnamed: 0,Team,Year,MeanPTS,MeanVORP,MeanPER,MeanWS,MeanPTSWeighted_1,MeanVORP_1,MeanPER_1,MeanWS_1,MeanPTSWeighted_2,MeanVORP_2,MeanPER_2,MeanWS_2
0,LAL,1982,1.892143,1.917886,1.238739,1.590509,0.950722,1.095151,-0.201652,0.684053,0.789726,1.692870,0.492909,1.128304
1,GSW,1982,0.638538,-0.056195,0.012871,0.142647,0.871121,0.963971,0.960932,0.309636,0.791977,1.168319,0.486313,0.774909
2,PHO,1982,0.288540,1.080397,0.310377,0.984261,0.283035,0.078509,-0.271918,-0.101016,1.144611,1.045552,0.602845,0.904636
3,KCK,1982,-0.058131,-0.155896,-0.015804,-0.221102,-0.800028,0.100372,-0.230397,-0.064782,-0.376967,0.364751,0.385173,0.291785
4,POR,1982,-0.066116,0.282789,0.471676,0.242499,-0.733452,0.012919,0.564887,0.305610,-1.148260,0.945106,2.335423,1.199878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,MEM,2023,0.478867,1.585110,1.428775,1.850001,-0.712801,-0.189722,0.463157,0.493607,-0.333054,0.303561,0.817836,0.882456
1131,OKC,2023,-1.147486,-1.193484,-1.181798,-1.910641,-0.503675,-1.698319,0.005009,-1.866617,-1.570052,0.153399,0.387230,0.519073
1132,BRK,2023,1.002982,0.371161,-0.106012,-0.043848,0.422266,0.364909,-0.587231,0.084910,0.151765,0.134628,-0.841954,-0.189523
1133,NOP,2023,-0.639346,0.371161,2.628874,0.916604,0.593907,1.673838,1.798491,1.964916,1.200194,1.392240,2.381066,1.591052
