In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Use the fully qualified option key: the short form 'max_columns' matches more
# than one registered option in recent pandas releases and is rejected there.
pd.set_option('display.max_columns', 300)

In [100]:
def make_scores(data):
    """Rebuild the running score of both teams for every play-by-play event.

    Parameters
    ----------
    data : pandas.DataFrame
        Event log with at least the columns ``EventType``, ``DayNum``,
        ``WTeamID``, ``LTeamID``, ``WFinalScore``, ``LFinalScore``,
        ``ElapsedSeconds`` and ``EventTeamID``. Existing ``WCurrentScore`` /
        ``LCurrentScore`` columns are discarded and recomputed; they may also
        be absent. The input frame is not modified.

        NOTE: games are keyed by ``DayNum``/``WTeamID``/``LTeamID`` only (no
        season), so pass the events of one season at a time.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` sorted by ``DayNum``, ``WTeamID`` and
        ``ElapsedSeconds`` (original index labels are kept) with the columns
        ``Final_difference``, ``WCurrentScore``, ``LCurrentScore`` and
        ``Current_difference`` appended. The score on a row is the score
        *before* the points of that row's own event are counted.
    """
    df = data.copy()

    # Points contributed by each event type; every other event scores nothing.
    points_per_event = {'made1': 1, 'made2': 2, 'made3': 3}
    df['points_made'] = 0
    for event_type, value in points_per_event.items():
        df.loc[df['EventType'] == event_type, 'points_made'] = value

    df['tmp_gameID'] = (
        df['DayNum'].astype(str) + '_' + df['WTeamID'].astype(str) + '_' + df['LTeamID'].astype(str)
    )
    df['Final_difference'] = df['WFinalScore'] - df['LFinalScore']

    df = df.sort_values(by=['DayNum', 'WTeamID', 'ElapsedSeconds'])

    # Cumulative points of the acting team, excluding the current event itself.
    df['points'] = df.groupby(['tmp_gameID', 'EventTeamID'])['points_made'].cumsum() - df['points_made']

    # The provided score columns are recomputed from scratch; tolerate inputs
    # that do not carry them at all.
    df = df.drop(columns=['WCurrentScore', 'LCurrentScore'], errors='ignore')

    # Each row only knows the score of the team that produced the event ...
    df['WCurrentScore'] = df['points'].where(df['WTeamID'] == df['EventTeamID'])
    df['LCurrentScore'] = df['points'].where(df['LTeamID'] == df['EventTeamID'])

    # ... so carry the last known score forward within the game; rows before a
    # team's first event start at 0.
    for column in ('WCurrentScore', 'LCurrentScore'):
        df[column] = df.groupby('tmp_gameID')[column].ffill().fillna(0)

    df['Current_difference'] = df['WCurrentScore'] - df['LCurrentScore']

    # Remove the helper columns used only for the computation.
    return df.drop(columns=['points', 'points_made', 'tmp_gameID'])


def quarter_score(data):
    """Tag every event with its period and the game's number of overtimes.

    Parameters
    ----------
    data : pandas.DataFrame
        Event log with at least the columns ``ElapsedSeconds``, ``DayNum``,
        ``WTeamID`` and ``LTeamID``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` (with a fresh 0..n-1 index, produced by the merge)
        and three extra columns:

        * ``period`` - 1 up to and including 20:00, 2 up to and including
          40:00, 3 for everything after that.
        * ``tmp_gameID`` - ``"<DayNum>_<WTeamID>_<LTeamID>"``; kept in the
          result so callers can group by game.
        * ``n_OT`` - number of 5-minute overtime periods, derived from the
          latest event of the game and rounded up to a whole period
          (0.0 for games whose last event is at or before 40:00).
    """
    df = data.copy()

    half_seconds = 20 * 60
    regulation_seconds = 40 * 60
    overtime_seconds = 5 * 60

    df['period'] = 1
    df.loc[df['ElapsedSeconds'] > half_seconds, 'period'] = 2
    df.loc[df['ElapsedSeconds'] > regulation_seconds, 'period'] = 3

    df['tmp_gameID'] = (
        df['DayNum'].astype(str) + '_' + df['WTeamID'].astype(str) + '_' + df['LTeamID'].astype(str)
    )

    # Time played beyond regulation, based on the last recorded event per game.
    last_event = df.groupby('tmp_gameID')['ElapsedSeconds'].max()
    extra_seconds = (last_event - regulation_seconds).clip(lower=0)

    # The last event rarely falls exactly on the final buzzer, so a started
    # overtime counts as a full one (e.g. 2997 s -> 2 overtimes, not 1.99).
    ot = np.ceil(extra_seconds / overtime_seconds).rename('n_OT').reset_index()
    df = pd.merge(df, ot[['tmp_gameID', 'n_OT']], on='tmp_gameID')

    return df

In [101]:
# Load the 2019 event file and run it through both feature helpers in one pass,
# so re-running this cell always starts again from the raw CSV.
df_2019 = quarter_score(
    make_scores(
        pd.read_csv('data/raw_men/MEvents2019.csv')
    )
)

df_2019.head()

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WFinalScore,LFinalScore,ElapsedSeconds,EventTeamID,EventPlayerID,EventType,EventSubType,X,Y,Area,Final_difference,WCurrentScore,LCurrentScore,Current_difference,period,tmp_gameID,n_OT
0,10442717,2019,1,1104,1380,82,62,18,1380,10314,made2,jump,0,0,0,20,0.0,0.0,0.0,1,1_1104_1380,0.0
1,10442718,2019,1,1104,1380,82,62,18,1380,10307,assist,,0,0,0,20,0.0,2.0,-2.0,1,1_1104_1380,0.0
2,10442719,2019,1,1104,1380,82,62,42,1104,129,made2,jump,0,0,0,20,0.0,2.0,-2.0,1,1_1104_1380,0.0
3,10442720,2019,1,1104,1380,82,62,58,1380,10303,turnover,unk,0,0,0,20,0.0,2.0,-2.0,1,1_1104_1380,0.0
4,10442721,2019,1,1104,1380,82,62,63,1104,143,made3,unk,0,0,0,20,2.0,2.0,0.0,1,1_1104_1380,0.0


In [104]:
# Highest running score of each side within every (game, period) pair
(
    df_2019
    .groupby(['tmp_gameID', 'period'])[['WCurrentScore', 'LCurrentScore']]
    .max()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,WCurrentScore,LCurrentScore
tmp_gameID,period,Unnamed: 2_level_1,Unnamed: 3_level_1
100_1101_1311,1,30.0,16.0
100_1101_1311,2,63.0,48.0
100_1119_1131,1,32.0,25.0
100_1119_1131,2,71.0,61.0
100_1133_1260,1,20.0,26.0
...,...,...,...
9_1440_1367,2,78.0,72.0
9_1460_1405,1,40.0,32.0
9_1460_1405,2,84.0,74.0
9_1461_1212,1,52.0,42.0


In [105]:
# Seasons to load. This list drives the loop below, so the set of years is
# defined in exactly one place.
files = ['2015', '2016', '2017', '2018', '2019']
season_frames = []

for year in files:
    df = pd.read_csv(f'data/raw_men/MEvents{year}.csv')
    df = make_scores(df)
    df = quarter_score(df)
    season_frames.append(df)

# The per-season frames live in their own list, so `all_events` is only ever a
# DataFrame, never the intermediate list.
all_events = pd.concat(season_frames, ignore_index=True)

all_events.head()

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WFinalScore,LFinalScore,ElapsedSeconds,EventTeamID,EventPlayerID,EventType,EventSubType,X,Y,Area,Final_difference,WCurrentScore,LCurrentScore,Current_difference,period,tmp_gameID,n_OT
0,1,2015,11,1103,1420,74,57,19,1103,100,miss3,unk,0,0,0,17,0.0,0.0,0.0,1,11_1103_1420,0.0
1,2,2015,11,1103,1420,74,57,19,1420,11784,reb,def,0,0,0,17,0.0,0.0,0.0,1,11_1103_1420,0.0
2,3,2015,11,1103,1420,74,57,27,1420,11789,made2,dunk,0,0,0,17,0.0,0.0,0.0,1,11_1103_1420,0.0
3,4,2015,11,1103,1420,74,57,27,1420,11803,assist,,0,0,0,17,0.0,2.0,-2.0,1,11_1103_1420,0.0
4,5,2015,11,1103,1420,74,57,59,1103,87,made2,jump,0,0,0,17,0.0,2.0,-2.0,1,11_1103_1420,0.0


In [69]:
# Frequency of every event type, missing values included
all_events['EventType'].value_counts(dropna=False)

sub         3356811
reb         2099080
foul        1032476
miss2       1019422
made2        985522
made1        778146
miss3        751246
assist       725993
turnover     707561
made3        399533
timeout      358201
steal        337776
miss1        328187
block        186528
fouled        74308
jumpb          8894
Name: EventType, dtype: int64

In [70]:
# Latest event timestamp across all games, expressed in minutes
all_events['ElapsedSeconds'].max() / 60

60.0

In [17]:
# Score margins: final result and the margin recorded on each event row
df_2019['Final_difference'] = df_2019['WFinalScore'].sub(df_2019['LFinalScore'])
df_2019['Current_difference'] = df_2019['WCurrentScore'].sub(df_2019['LCurrentScore'])

# Points produced by each event: 1/2/3 for made shots, 0 for everything else
df_2019['points_made'] = (
    df_2019['EventType']
    .map({'made1': 1, 'made2': 2, 'made3': 3})
    .fillna(0)
    .astype('int64')
)

# Game key of the form "<DayNum>_<WTeamID>_<LTeamID>"
df_2019['tmp_gameID'] = df_2019['DayNum'].astype(str).str.cat(
    [df_2019['WTeamID'].astype(str), df_2019['LTeamID'].astype(str)],
    sep='_',
)

In [21]:
# Preview the first five rows after adding the helper columns
df_2019.head(n=5)

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WFinalScore,LFinalScore,WCurrentScore,LCurrentScore,ElapsedSeconds,EventTeamID,EventPlayerID,EventType,EventSubType,X,Y,Area,Final_difference,Current_difference,points_made,tmp_gameID
0,10442717,2019,1,1104,1380,82,62,0,0,18,1380,10314,made2,jump,0,0,0,20,0,2,1_1104_1380
1,10442718,2019,1,1104,1380,82,62,0,0,18,1380,10307,assist,,0,0,0,20,0,0,1_1104_1380
2,10442719,2019,1,1104,1380,82,62,0,0,42,1104,129,made2,jump,0,0,0,20,0,2,1_1104_1380
3,10442720,2019,1,1104,1380,82,62,0,0,58,1380,10303,turnover,unk,0,0,0,20,0,0,1_1104_1380
4,10442721,2019,1,1104,1380,82,62,0,0,63,1104,143,made3,unk,0,0,0,20,0,3,1_1104_1380


In [56]:
# Work on a narrow, chronologically ordered copy of the 2019 events
test = (
    df_2019[[
        'DayNum', 'WTeamID', 'LTeamID', 'tmp_gameID',
        'ElapsedSeconds', 'EventTeamID', 'EventType', 'points_made',
    ]]
    .copy()
    .sort_values(by=['DayNum', 'WTeamID', 'ElapsedSeconds'])
)

# Points the acting team had scored in this game before the current event
test['points'] = (
    test.groupby(['tmp_gameID', 'EventTeamID'])['points_made']
    .cumsum()
    .sub(test['points_made'])
)

test.head(10)

Unnamed: 0,DayNum,WTeamID,LTeamID,tmp_gameID,ElapsedSeconds,EventTeamID,EventType,points_made,points
0,1,1104,1380,1_1104_1380,18,1380,made2,2,0
1,1,1104,1380,1_1104_1380,18,1380,assist,0,2
2,1,1104,1380,1_1104_1380,42,1104,made2,2,0
3,1,1104,1380,1_1104_1380,58,1380,turnover,0,2
4,1,1104,1380,1_1104_1380,63,1104,made3,3,2
5,1,1104,1380,1_1104_1380,63,1104,assist,0,5
6,1,1104,1380,1_1104_1380,71,1380,foul,0,2
7,1,1104,1380,1_1104_1380,71,1380,turnover,0,2
8,1,1104,1380,1_1104_1380,79,1380,foul,0,2
9,1,1104,1380,1_1104_1380,94,1104,miss2,0,5


In [57]:
# A row only carries the score of the team that produced the event; the other
# side is left missing here. Building the columns with `where` (instead of
# writing into a masked slice) also makes the cell safe to re-run.
test['WCurrent_score'] = test['points'].where(test['WTeamID'] == test['EventTeamID'])
test['LCurrent_score'] = test['points'].where(test['LTeamID'] == test['EventTeamID'])

# Carry the last known score forward within each game; rows before a team's
# first event start at 0. `GroupBy.ffill()` replaces the deprecated
# `GroupBy.fillna(method='ffill')`.
test['WCurrent_score'] = test.groupby('tmp_gameID')['WCurrent_score'].ffill().fillna(0)
test['LCurrent_score'] = test.groupby('tmp_gameID')['LCurrent_score'].ffill().fillna(0)

test.head()

Unnamed: 0,DayNum,WTeamID,LTeamID,tmp_gameID,ElapsedSeconds,EventTeamID,EventType,points_made,points,WCurrent_score,LCurrent_score
0,1,1104,1380,1_1104_1380,18,1380,made2,2,0,0.0,0.0
1,1,1104,1380,1_1104_1380,18,1380,assist,0,2,0.0,2.0
2,1,1104,1380,1_1104_1380,42,1104,made2,2,0,0.0,2.0
3,1,1104,1380,1_1104_1380,58,1380,turnover,0,2,0.0,2.0
4,1,1104,1380,1_1104_1380,63,1104,made3,3,2,2.0,2.0


In [58]:
# Last events of a single game, to compare against its final score
test.loc[test['tmp_gameID'] == '1_1104_1380'].tail()

Unnamed: 0,DayNum,WTeamID,LTeamID,tmp_gameID,ElapsedSeconds,EventTeamID,EventType,points_made,points,WCurrent_score,LCurrent_score
502,1,1104,1380,1_1104_1380,2368,1104,sub,0,82,82.0,60.0
503,1,1104,1380,1_1104_1380,2368,1104,sub,0,82,82.0,60.0
504,1,1104,1380,1_1104_1380,2395,1104,miss2,0,82,82.0,60.0
505,1,1104,1380,1_1104_1380,2395,1380,reb,0,60,82.0,60.0
506,1,1104,1380,1_1104_1380,2399,1380,made2,2,60,82.0,60.0


In [59]:
# All events of one game
test.loc[test['tmp_gameID'] == '1_1113_1168']

Unnamed: 0,DayNum,WTeamID,LTeamID,tmp_gameID,ElapsedSeconds,EventTeamID,EventType,points_made,points,WCurrent_score,LCurrent_score
507,1,1113,1168,1_1113_1168,18,1168,foul,0,0,0.0,0.0
508,1,1113,1168,1_1113_1168,18,1113,miss2,0,0,0.0,0.0
509,1,1113,1168,1_1113_1168,18,1168,reb,0,0,0.0,0.0
510,1,1113,1168,1_1113_1168,18,1168,sub,0,0,0.0,0.0
511,1,1113,1168,1_1113_1168,34,1168,turnover,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1137,1,1113,1168,1_1113_1168,2992,1168,foul,0,94,100.0,94.0
1138,1,1113,1168,1_1113_1168,2992,1113,made1,1,100,100.0,94.0
1139,1,1113,1168,1_1113_1168,2992,1113,made1,1,101,101.0,94.0
1140,1,1113,1168,1_1113_1168,2997,1168,miss2,0,94,101.0,94.0


In [60]:
# Rows whose WCurrentScore is greater than zero
df_2019.loc[df_2019['WCurrentScore'].gt(0)]

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WFinalScore,LFinalScore,WCurrentScore,LCurrentScore,ElapsedSeconds,EventTeamID,EventPlayerID,EventType,EventSubType,X,Y,Area,Final_difference,Current_difference,points_made,tmp_gameID
2211,10444928,2019,1,1123,1232,86,69,10,7,366,1232,4708,sub,out,0,0,0,17,3,0,1_1123_1232
2213,10444930,2019,1,1123,1232,86,69,10,7,366,1232,4709,sub,out,0,0,0,17,3,0,1_1123_1232
2215,10444932,2019,1,1123,1232,86,69,10,7,373,1123,7175,miss2,jump,7,23,6,17,3,0,1_1123_1232
2216,10444933,2019,1,1123,1232,86,69,10,7,373,1232,4718,reb,def,0,0,0,17,3,0,1_1123_1232
2217,10444934,2019,1,1123,1232,86,69,10,7,373,1123,791,foul,pers,6,62,13,17,3,0,1_1123_1232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2706961,13149678,2019,152,1438,1120,63,62,10,11,421,1438,12422,foul,pers,20,84,13,1,-1,0,152_1438_1120
2706962,13149679,2019,152,1438,1120,63,62,10,11,421,1120,701,fouled,,0,0,0,1,-1,0,152_1438_1120
2706963,13149680,2019,152,1438,1120,63,62,10,11,421,1438,12422,sub,out,0,0,0,1,-1,0,152_1438_1120
2706965,13149682,2019,152,1438,1120,63,62,10,11,421,1120,695,sub,out,0,0,0,1,-1,0,152_1438_1120


In [61]:
# Events of one game at a single timestamp
test.loc[
    test['tmp_gameID'].eq('1_1123_1232') & test['ElapsedSeconds'].eq(366)
].head(10)

Unnamed: 0,DayNum,WTeamID,LTeamID,tmp_gameID,ElapsedSeconds,EventTeamID,EventType,points_made,points,WCurrent_score,LCurrent_score
2211,1,1123,1232,1_1123_1232,366,1232,sub,0,7,10.0,7.0
2212,1,1123,1232,1_1123_1232,366,1232,sub,0,7,10.0,7.0
2213,1,1123,1232,1_1123_1232,366,1232,sub,0,7,10.0,7.0
2214,1,1123,1232,1_1123_1232,366,1232,sub,0,7,10.0,7.0
2776,1,1123,1232,1_1123_1232,366,1232,foul,0,7,10.0,7.0
2777,1,1123,1232,1_1123_1232,366,1123,fouled,0,10,10.0,7.0
2778,1,1123,1232,1_1123_1232,366,1123,sub,0,10,10.0,7.0
2779,1,1123,1232,1_1123_1232,366,1123,sub,0,10,10.0,7.0
