In [1]:
import pandas as pd
import numpy as np

import datetime

def str2datetime(date):
    """Convert a dash-separated date string into a ``datetime.datetime``.

    The string is split on '-' and its first three pieces are read, in order,
    as integer year, month and day. Any further pieces are ignored. The
    returned datetime has no time-of-day component set (midnight).
    """
    pieces = date.split('-')
    year, month, day = int(pieces[0]), int(pieces[1]), int(pieces[2])
    return datetime.datetime(year, month, day)

In [2]:
# Load the per-game table, keep only rows whose Age is non-zero
# (presumably Age == 0 marks placeholder rows -- TODO confirm), and drop the
# index column that was saved into the CSV.
inj_df = (
    pd.read_csv('nba_games_2010_20.csv')
    .loc[lambda games: games['Age'] != 0]
    .drop(columns='Unnamed: 0')
)

In [3]:
# Show the loaded frame as this cell's output for a quick visual check.
inj_df

Unnamed: 0,Name,Height [cm],Weight [kg],Position,Pos3,Shoots,Date,Age,Opp,MP,...,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,Was_Injured?,Notes
0,Carlos Boozer,206,117,Center and Power Forward,Big,Right,2011-01-15,29.153320,MIA,35.500000,...,3,0,1,2,2,12,8.8,0.0,1.0,sprained left ankle (DNP)
1,Carlos Boozer,206,117,Center and Power Forward,Big,Right,2011-03-09,29.298426,CHA,26.466667,...,1,0,1,3,4,10,4.6,14.0,1.0,placed on IL with sprained left ankle
2,Carlos Boozer,206,117,Center and Power Forward,Big,Right,2013-01-28,31.188912,CHA,26.816667,...,1,0,0,1,4,13,4.8,-12.0,1.0,strained right hamstring (DNP)
3,Carlos Boozer,206,117,Center and Power Forward,Big,Right,2014-01-02,32.117728,BOS,29.833333,...,0,1,0,1,3,16,8.9,4.0,1.0,sore/bruised right knee (DNP)
4,Carlos Boozer,206,117,Center and Power Forward,Big,Right,2014-01-20,32.167009,LAL,28.016667,...,4,0,0,0,2,11,5.6,5.0,1.0,left calf injury (DNP)
5,Carlos Boozer,206,117,Center and Power Forward,Big,Right,2014-02-04,32.208077,PHO,31.800000,...,1,1,1,1,2,19,13.3,5.0,1.0,strained left calf (DNP)
6,Carlos Boozer,206,117,Center and Power Forward,Big,Right,2014-11-21,33.002738,DAL,19.783333,...,2,0,0,0,3,12,6.4,-3.0,1.0,strained left shoulder (DTD)
7,Jonas Jerebko,208,104,Power Forward,Big,Right,2016-03-15,29.035592,IND,26.283333,...,2,0,0,1,2,17,11.9,-8.0,1.0,placed on IL with sore left ankle
8,Troy Murphy,211,111,Power Forward,Big,Left,2010-11-09,30.522930,CLE,24.733333,...,1,0,0,1,2,5,4.9,2.0,1.0,placed on IL with sore right foot
9,Troy Murphy,211,111,Power Forward,Big,Left,2010-11-13,30.533881,ORL,8.433333,...,0,0,0,0,2,2,1.1,12.0,1.0,placed on IL with sore right foot


### Sort values by Name and Date for Windowing

In [4]:
# Order rows by player and then by date, and renumber them 0..n-1. The window
# cells below walk backwards through adjacent rows, so each player's games must
# be contiguous and in date order.
# NOTE(review): 'Date' is sorted as a string; this is chronological only if all
# dates are zero-padded YYYY-MM-DD, as in the sample rows -- confirm.
inj_df = inj_df.sort_values(by=['Name','Date']).reset_index(drop=True)

#### Start with 7 day window

In [5]:
time_window = datetime.timedelta(days=7) #Changes the time window

# For every game (row), average the box-score columns 'MP'..'+/-' over the same
# player's earlier games that lie within `time_window` of the game date.
# Relies on inj_df being sorted by Name then Date, so those games sit directly
# above the current row. Each result row ends with the number of games averaged.
window_arr = []

for idx in range(inj_df.shape[0]):

    idx_end = idx
    idx_start = idx

    current_row = inj_df.iloc[idx]
    date_game = str2datetime(current_row['Date'])

    if idx%10000==0:
        print("On index {}, player is {}".format(idx,current_row['Name']))

    # Run backwards through the dataframe, finding the first game that's out of
    # the window or is a different player. Stopping at row 0 keeps
    # iloc[idx_end-1] from wrapping around to the last row of the frame.
    while idx_end > 0:
        row = inj_df.iloc[idx_end-1]
        in_window = (current_row['Name'] == row['Name']) and (date_game - str2datetime(row['Date']) <= time_window)
        if not in_window:
            break
        idx_end -= 1

    # Rows idx_end .. idx_start-1: the current game itself is not included.
    games_window = inj_df.iloc[idx_end:idx_start].loc[:,'MP':'+/-']

    window_mean = games_window.mean()

    # Shooting percentages over the window are total makes / total attempts,
    # not the mean of the per-game percentages. A window without attempts
    # (including an empty window) divides 0 by 0 and is expected to give NaN.
    window_mean['FG%'] = games_window['FG'].sum()/games_window['FGA'].sum()
    window_mean['3P%'] = games_window['3P'].sum()/games_window['3PA'].sum()
    # Fixed: the denominator is free-throw attempts ('FTA'), not the 'FT%' column.
    window_mean['FT%'] = games_window['FT'].sum()/games_window['FTA'].sum()

    window_mean = np.concatenate((window_mean,[len(games_window)])) #Add column of N_games in time window

    window_arr.append(window_mean)

# One row per game; columns follow 'MP'..'+/-' order, then the game count.
window_arr = np.array(window_arr)

On index 0, player is (William) Tony Parker




On index 10000, player is Andre Drummond
On index 20000, player is Ben McLemore
On index 30000, player is Buddy Hield
On index 40000, player is Chris Wright (b)
On index 50000, player is Dante Cunningham
On index 60000, player is Dennis Smith Jr.
On index 70000, player is Dwayne Bacon
On index 80000, player is Evan Fournier
On index 90000, player is Greg Monroe
On index 100000, player is J.R. Smith
On index 110000, player is Jarrett Allen
On index 120000, player is Jimmy Butler
On index 130000, player is Jordan Hill
On index 140000, player is Kemba Walker
On index 150000, player is Kurt Thomas
On index 160000, player is Louis Williams
On index 170000, player is Mario Chalmers
On index 180000, player is Mike Conley Jr.
On index 190000, player is Nikola Mirotic
On index 200000, player is Paul Millsap
On index 210000, player is Robert Covington
On index 220000, player is Serge Ibaka
On index 230000, player is T.J. McConnell
On index 240000, player is Tony Allen
On index 250000, player is 

#### Add this on to our existing dataframe, and save the result

In [6]:
inj_window = inj_df.copy()

# Select the averaged stats with the same 'MP'..'+/-' label slice that the
# window loop used, instead of a hard-coded positional range, so the names
# cannot drift out of step with the columns of window_arr.
averaged_columns = inj_df.loc[:,'MP':'+/-'].columns

# window_arr holds one column per averaged stat plus a trailing game count.
assert window_arr.shape[1] == len(averaged_columns) + 1, "window_arr width does not match averaged columns"

for i,col in enumerate(averaged_columns):
    col_tag = col + "_Av7"
    inj_window[col_tag] = window_arr[:,i]

# Last column of window_arr: number of games found in the 7-day window.
inj_window['N_games_7'] = window_arr[:,-1]

In [7]:
# Save the 7-day frame. The row index is written as well (index=False is not
# passed); it comes back as the 'Unnamed: 0' column that is dropped when this
# file is re-read in the combine step.
inj_window.to_csv('injs_avg7.csv')

### 14 day window

In [8]:
time_window = datetime.timedelta(days=14) #Changes the time window

# For every game (row), average the box-score columns 'MP'..'+/-' over the same
# player's earlier games that lie within `time_window` of the game date.
# Relies on inj_df being sorted by Name then Date, so those games sit directly
# above the current row. Each result row ends with the number of games averaged.
window_arr = []

for idx in range(inj_df.shape[0]):

    idx_end = idx
    idx_start = idx

    current_row = inj_df.iloc[idx]
    date_game = str2datetime(current_row['Date'])

    if idx%10000==0:
        print("On index {}, player is {}".format(idx,current_row['Name']))

    # Run backwards through the dataframe, finding the first game that's out of
    # the window or is a different player. Stopping at row 0 keeps
    # iloc[idx_end-1] from wrapping around to the last row of the frame.
    while idx_end > 0:
        row = inj_df.iloc[idx_end-1]
        in_window = (current_row['Name'] == row['Name']) and (date_game - str2datetime(row['Date']) <= time_window)
        if not in_window:
            break
        idx_end -= 1

    # Rows idx_end .. idx_start-1: the current game itself is not included.
    games_window = inj_df.iloc[idx_end:idx_start].loc[:,'MP':'+/-']

    window_mean = games_window.mean()

    # Shooting percentages over the window are total makes / total attempts,
    # not the mean of the per-game percentages. A window without attempts
    # (including an empty window) divides 0 by 0 and is expected to give NaN.
    window_mean['FG%'] = games_window['FG'].sum()/games_window['FGA'].sum()
    window_mean['3P%'] = games_window['3P'].sum()/games_window['3PA'].sum()
    # Fixed: the denominator is free-throw attempts ('FTA'), not the 'FT%' column.
    window_mean['FT%'] = games_window['FT'].sum()/games_window['FTA'].sum()

    window_mean = np.concatenate((window_mean,[len(games_window)])) #Add column of N_games in time window

    window_arr.append(window_mean)

# One row per game; columns follow 'MP'..'+/-' order, then the game count.
window_arr = np.array(window_arr)

On index 0, player is (William) Tony Parker




On index 10000, player is Andre Drummond
On index 20000, player is Ben McLemore
On index 30000, player is Buddy Hield
On index 40000, player is Chris Wright (b)
On index 50000, player is Dante Cunningham
On index 60000, player is Dennis Smith Jr.
On index 70000, player is Dwayne Bacon
On index 80000, player is Evan Fournier
On index 90000, player is Greg Monroe
On index 100000, player is J.R. Smith
On index 110000, player is Jarrett Allen
On index 120000, player is Jimmy Butler
On index 130000, player is Jordan Hill
On index 140000, player is Kemba Walker
On index 150000, player is Kurt Thomas
On index 160000, player is Louis Williams
On index 170000, player is Mario Chalmers
On index 180000, player is Mike Conley Jr.
On index 190000, player is Nikola Mirotic
On index 200000, player is Paul Millsap
On index 210000, player is Robert Covington
On index 220000, player is Serge Ibaka
On index 230000, player is T.J. McConnell
On index 240000, player is Tony Allen
On index 250000, player is 

#### Add this on to our existing dataframe, and save the result

In [9]:
inj_window = inj_df.copy()

# Select the averaged stats with the same 'MP'..'+/-' label slice that the
# window loop used, instead of a hard-coded positional range, so the names
# cannot drift out of step with the columns of window_arr.
averaged_columns = inj_df.loc[:,'MP':'+/-'].columns

# window_arr holds one column per averaged stat plus a trailing game count.
assert window_arr.shape[1] == len(averaged_columns) + 1, "window_arr width does not match averaged columns"

for i,col in enumerate(averaged_columns):
    col_tag = col + "_Av14"
    inj_window[col_tag] = window_arr[:,i]

# Last column of window_arr: number of games found in the 14-day window.
inj_window['N_games_14'] = window_arr[:,-1]

In [10]:
# Save the 14-day frame. The row index is written as well (index=False is not
# passed); it comes back as the 'Unnamed: 0' column that is dropped when this
# file is re-read in the combine step.
inj_window.to_csv('injs_avg14.csv')

### 21 day window

In [11]:
time_window = datetime.timedelta(days=21) #Changes the time window

# For every game (row), average the box-score columns 'MP'..'+/-' over the same
# player's earlier games that lie within `time_window` of the game date.
# Relies on inj_df being sorted by Name then Date, so those games sit directly
# above the current row. Each result row ends with the number of games averaged.
window_arr = []

for idx in range(inj_df.shape[0]):

    idx_end = idx
    idx_start = idx

    current_row = inj_df.iloc[idx]
    date_game = str2datetime(current_row['Date'])

    if idx%10000==0:
        print("On index {}, player is {}".format(idx,current_row['Name']))

    # Run backwards through the dataframe, finding the first game that's out of
    # the window or is a different player. Stopping at row 0 keeps
    # iloc[idx_end-1] from wrapping around to the last row of the frame.
    while idx_end > 0:
        row = inj_df.iloc[idx_end-1]
        in_window = (current_row['Name'] == row['Name']) and (date_game - str2datetime(row['Date']) <= time_window)
        if not in_window:
            break
        idx_end -= 1

    # Rows idx_end .. idx_start-1: the current game itself is not included.
    games_window = inj_df.iloc[idx_end:idx_start].loc[:,'MP':'+/-']

    window_mean = games_window.mean()

    # Shooting percentages over the window are total makes / total attempts,
    # not the mean of the per-game percentages. A window without attempts
    # (including an empty window) divides 0 by 0 and is expected to give NaN.
    window_mean['FG%'] = games_window['FG'].sum()/games_window['FGA'].sum()
    window_mean['3P%'] = games_window['3P'].sum()/games_window['3PA'].sum()
    # Fixed: the denominator is free-throw attempts ('FTA'), not the 'FT%' column.
    window_mean['FT%'] = games_window['FT'].sum()/games_window['FTA'].sum()

    window_mean = np.concatenate((window_mean,[len(games_window)])) #Add column of N_games in time window

    window_arr.append(window_mean)

# One row per game; columns follow 'MP'..'+/-' order, then the game count.
window_arr = np.array(window_arr)

On index 0, player is (William) Tony Parker




On index 10000, player is Andre Drummond
On index 20000, player is Ben McLemore
On index 30000, player is Buddy Hield
On index 40000, player is Chris Wright (b)
On index 50000, player is Dante Cunningham
On index 60000, player is Dennis Smith Jr.
On index 70000, player is Dwayne Bacon
On index 80000, player is Evan Fournier
On index 90000, player is Greg Monroe
On index 100000, player is J.R. Smith
On index 110000, player is Jarrett Allen
On index 120000, player is Jimmy Butler
On index 130000, player is Jordan Hill
On index 140000, player is Kemba Walker
On index 150000, player is Kurt Thomas
On index 160000, player is Louis Williams
On index 170000, player is Mario Chalmers
On index 180000, player is Mike Conley Jr.
On index 190000, player is Nikola Mirotic
On index 200000, player is Paul Millsap
On index 210000, player is Robert Covington
On index 220000, player is Serge Ibaka
On index 230000, player is T.J. McConnell
On index 240000, player is Tony Allen
On index 250000, player is 

#### Add this on to our existing dataframe, and save the result

In [12]:
inj_window = inj_df.copy()

# Select the averaged stats with the same 'MP'..'+/-' label slice that the
# window loop used, instead of a hard-coded positional range, so the names
# cannot drift out of step with the columns of window_arr.
averaged_columns = inj_df.loc[:,'MP':'+/-'].columns

# window_arr holds one column per averaged stat plus a trailing game count.
assert window_arr.shape[1] == len(averaged_columns) + 1, "window_arr width does not match averaged columns"

for i,col in enumerate(averaged_columns):
    col_tag = col + "_Av21"
    inj_window[col_tag] = window_arr[:,i]

# Last column of window_arr: number of games found in the 21-day window.
inj_window['N_games_21'] = window_arr[:,-1]

In [13]:
# Save the 21-day frame. The row index is written as well (index=False is not
# passed); it comes back as the 'Unnamed: 0' column that is dropped when this
# file is re-read in the combine step.
inj_window.to_csv('injs_avg21.csv')

### Back to Back Games

In [14]:
time_window = datetime.timedelta(days=1) #Changes the time window

# For every game (row), average the box-score columns 'MP'..'+/-' over the same
# player's earlier games that lie within `time_window` (one day) of the game
# date, i.e. the first half of a back-to-back. Relies on inj_df being sorted by
# Name then Date, so those games sit directly above the current row. Each
# result row ends with the number of games averaged.
window_arr = []

for idx in range(inj_df.shape[0]):

    idx_end = idx
    idx_start = idx

    current_row = inj_df.iloc[idx]
    date_game = str2datetime(current_row['Date'])

    if idx%10000==0:
        print("On index {}, player is {}".format(idx,current_row['Name']))

    # Run backwards through the dataframe, finding the first game that's out of
    # the window or is a different player. Stopping at row 0 keeps
    # iloc[idx_end-1] from wrapping around to the last row of the frame.
    while idx_end > 0:
        row = inj_df.iloc[idx_end-1]
        in_window = (current_row['Name'] == row['Name']) and (date_game - str2datetime(row['Date']) <= time_window)
        if not in_window:
            break
        idx_end -= 1

    # Rows idx_end .. idx_start-1: the current game itself is not included.
    games_window = inj_df.iloc[idx_end:idx_start].loc[:,'MP':'+/-']

    window_mean = games_window.mean()

    # Shooting percentages over the window are total makes / total attempts,
    # not the mean of the per-game percentages. A window without attempts
    # (including an empty window) divides 0 by 0 and is expected to give NaN.
    window_mean['FG%'] = games_window['FG'].sum()/games_window['FGA'].sum()
    window_mean['3P%'] = games_window['3P'].sum()/games_window['3PA'].sum()
    # Fixed: the denominator is free-throw attempts ('FTA'), not the 'FT%' column.
    window_mean['FT%'] = games_window['FT'].sum()/games_window['FTA'].sum()

    window_mean = np.concatenate((window_mean,[len(games_window)])) #Add column of N_games in time window

    window_arr.append(window_mean)

# One row per game; columns follow 'MP'..'+/-' order, then the game count.
window_arr = np.array(window_arr)

On index 0, player is (William) Tony Parker




On index 10000, player is Andre Drummond
On index 20000, player is Ben McLemore
On index 30000, player is Buddy Hield
On index 40000, player is Chris Wright (b)
On index 50000, player is Dante Cunningham
On index 60000, player is Dennis Smith Jr.
On index 70000, player is Dwayne Bacon
On index 80000, player is Evan Fournier
On index 90000, player is Greg Monroe
On index 100000, player is J.R. Smith
On index 110000, player is Jarrett Allen
On index 120000, player is Jimmy Butler
On index 130000, player is Jordan Hill
On index 140000, player is Kemba Walker
On index 150000, player is Kurt Thomas
On index 160000, player is Louis Williams
On index 170000, player is Mario Chalmers
On index 180000, player is Mike Conley Jr.
On index 190000, player is Nikola Mirotic
On index 200000, player is Paul Millsap
On index 210000, player is Robert Covington
On index 220000, player is Serge Ibaka
On index 230000, player is T.J. McConnell
On index 240000, player is Tony Allen
On index 250000, player is 

#### Add this on to our existing dataframe, and save the result

In [15]:
inj_window = inj_df.copy()

# Select the averaged stats with the same 'MP'..'+/-' label slice that the
# window loop used, instead of a hard-coded positional range, so the names
# cannot drift out of step with the columns of window_arr.
averaged_columns = inj_df.loc[:,'MP':'+/-'].columns

# window_arr holds one column per averaged stat plus a trailing game count.
assert window_arr.shape[1] == len(averaged_columns) + 1, "window_arr width does not match averaged columns"

for i,col in enumerate(averaged_columns):
    col_tag = col + "_b2b"
    inj_window[col_tag] = window_arr[:,i]

# Last column of window_arr: number of games found in the one-day window
# (non-zero when the game follows another game within a day).
inj_window['b2b'] = window_arr[:,-1]

In [16]:
# Save the back-to-back frame. The row index is written as well (index=False is
# not passed); it comes back as the 'Unnamed: 0' column that is dropped when
# this file is re-read in the combine step.
inj_window.to_csv('injs_b2b.csv')

## Combine all the dataframes together

In [17]:
# Re-load each saved window file and discard the index column that to_csv
# stored alongside the data.
inj_df1 = pd.read_csv('injs_avg7.csv').drop(columns='Unnamed: 0')   # 7-day averages
inj_df2 = pd.read_csv('injs_avg14.csv').drop(columns='Unnamed: 0')  # 14-day averages
inj_df3 = pd.read_csv('injs_avg21.csv').drop(columns='Unnamed: 0')  # 21-day averages
inj_df4 = pd.read_csv('injs_b2b.csv').drop(columns='Unnamed: 0')    # back-to-back stats

In [18]:
# The first 32 columns are the original per-game columns (before any window
# columns were appended); they serve as the join keys below.
same_cols = list(inj_df1.columns[:32])

In [23]:
# Join the four window frames on the shared per-game columns, one merge at a
# time, so each row ends up carrying every set of window columns.
# NOTE(review): an inner merge on these keys multiplies rows if any key
# combination occurs more than once -- consider validate='one_to_one' or a
# row-count check against inj_df1.
merge_kwargs = dict(how='inner', left_on=same_cols, right_on=same_cols)

tmp = inj_df1.merge(inj_df2, **merge_kwargs)
tmp2 = tmp.merge(inj_df3, **merge_kwargs)
full_df = tmp2.merge(inj_df4, **merge_kwargs)

In [24]:
# Save the combined frame. The row index is written as an unnamed first column
# because index=False is not passed.
full_df.to_csv('inj_all_windows.csv')

In [25]:
# List every column of the combined frame, one label per line.
column_labels = full_df.columns
for label in column_labels:
    print(label)

Name
Height [cm]
Weight [kg]
Position
Pos3
Shoots
Date
Age
Opp
MP
FG
FGA
FG%
3P
3PA
3P%
FT
FTA
FT%
ORB
DRB
TRB
AST
STL
BLK
TOV
PF
PTS
GmSc
+/-
Was_Injured?
Notes
MP_Av7
FG_Av7
FGA_Av7
FG%_Av7
3P_Av7
3PA_Av7
3P%_Av7
FT_Av7
FTA_Av7
FT%_Av7
ORB_Av7
DRB_Av7
TRB_Av7
AST_Av7
STL_Av7
BLK_Av7
TOV_Av7
PF_Av7
PTS_Av7
GmSc_Av7
+/-_Av7
N_games_7
MP_Av14
FG_Av14
FGA_Av14
FG%_Av14
3P_Av14
3PA_Av14
3P%_Av14
FT_Av14
FTA_Av14
FT%_Av14
ORB_Av14
DRB_Av14
TRB_Av14
AST_Av14
STL_Av14
BLK_Av14
TOV_Av14
PF_Av14
PTS_Av14
GmSc_Av14
+/-_Av14
N_games_14
MP_Av21
FG_Av21
FGA_Av21
FG%_Av21
3P_Av21
3PA_Av21
3P%_Av21
FT_Av21
FTA_Av21
FT%_Av21
ORB_Av21
DRB_Av21
TRB_Av21
AST_Av21
STL_Av21
BLK_Av21
TOV_Av21
PF_Av21
PTS_Av21
GmSc_Av21
+/-_Av21
N_games_21
MP_b2b
FG_b2b
FGA_b2b
FG%_b2b
3P_b2b
3PA_b2b
3P%_b2b
FT_b2b
FTA_b2b
FT%_b2b
ORB_b2b
DRB_b2b
TRB_b2b
AST_b2b
STL_b2b
BLK_b2b
TOV_b2b
PF_b2b
PTS_b2b
GmSc_b2b
+/-_b2b
b2b
