In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

pd.set_option('max_columns', 300)

In [35]:
def make_scores(data):
    df = data.copy()
    df['points_made'] = 0
    df.loc[df.EventType == 'made1', 'points_made'] = 1
    df.loc[df.EventType == 'made2', 'points_made'] = 2
    df.loc[df.EventType == 'made3', 'points_made'] = 3
    df['tmp_gameID'] = df['DayNum'].astype(str) + '_' + df['WTeamID'].astype(str) + '_' + df['LTeamID'].astype(str)
    df['Final_difference'] = df['WFinalScore'] - df['LFinalScore']
    
    df = df.sort_values(by=['DayNum', 'WTeamID', 'ElapsedSeconds'])
    
    df['points'] = df.groupby(['tmp_gameID', 'EventTeamID']).points_made.cumsum() - df.points_made
    
    del df['WCurrentScore']
    del df['LCurrentScore']
    
    df.loc[df.WTeamID == df.EventTeamID, 'WCurrentScore'] = df.points
    df.loc[df.LTeamID == df.EventTeamID, 'LCurrentScore'] = df.points

    df['WCurrentScore'] = df.groupby('tmp_gameID')['WCurrentScore'].fillna(method='ffill').fillna(0)
    df['LCurrentScore'] = df.groupby('tmp_gameID')['LCurrentScore'].fillna(method='ffill').fillna(0)
    
    df['Current_difference'] = df['WCurrentScore'] - df['LCurrentScore']
    
    del df['points']
    del df['points_made']
    del df['tmp_gameID']
    
    return df


def quarter_score(data):
    df = data.copy()
    
    df['period'] = 1
    df.loc[df.ElapsedSeconds > 20 * 60, 'period'] = 2
    df.loc[df.ElapsedSeconds > 40 * 60, 'period'] = 3
    
    df['crunch'] = 0
    df.loc[(df.ElapsedSeconds > 37 * 60) & (df.ElapsedSeconds <= 40 * 60), 'crunch'] = 1
    
    df['minutes'] = df['ElapsedSeconds'] / 60
    df['tmp_gameID'] = df['DayNum'].astype(str) + '_' + df['WTeamID'].astype(str) + '_' + df['LTeamID'].astype(str)
    
    ot = ((df.groupby('tmp_gameID').minutes.max() - 40) / 5).reset_index()
    ot['n_OT'] = np.where(ot.minutes > 0, np.ceil(ot.minutes), 0)    
    half = df[df.period==1].groupby(['tmp_gameID'], as_index=False)[['WCurrentScore', 'LCurrentScore']].max()
    half['Halftime_difference'] = half['WCurrentScore'] - half['LCurrentScore']
    half.drop(['WCurrentScore', 'LCurrentScore'], axis=1, inplace=True)
    crunchtime = df[df.crunch==0].groupby(['tmp_gameID'], as_index=False)[['WCurrentScore', 'LCurrentScore']].max()
    crunchtime['3mins_difference'] = crunchtime['WCurrentScore'] - crunchtime['LCurrentScore']
    crunchtime.drop(['WCurrentScore', 'LCurrentScore'], axis=1, inplace=True)
    
    add_ons = pd.merge(ot[['tmp_gameID', 'n_OT']], half, on='tmp_gameID')
    add_ons = pd.merge(add_ons, crunchtime, on='tmp_gameID')
    
    df = pd.merge(df, add_ons, on='tmp_gameID')
    
    del df['tmp_gameID']
    del df['minutes']
    
    if data.shape[0] != df.shape[0]:
        raise KeyError('Some merge went wrong')
    
    return df


def lead_changes(data):
    df = data.copy()
    df['tmp_gameID'] = df['DayNum'].astype(str) + '_' + df['WTeamID'].astype(str) + '_' + df['LTeamID'].astype(str)
    
    changes = df.groupby('tmp_gameID').Current_difference.apply(lambda x: len(np.where(np.diff(np.sign(x)))[0])).reset_index()
    changes.rename(columns={'Current_difference': 'game_lc'}, inplace=True)
    changes_2 = df[df.period==2].groupby('tmp_gameID').Current_difference.apply(lambda x: len(np.where(np.diff(np.sign(x)))[0])).reset_index()
    changes_2.rename(columns={'Current_difference': 'half2_lc'}, inplace=True)
    changes_3 = df[df.crunch==1].groupby('tmp_gameID').Current_difference.apply(lambda x: len(np.where(np.diff(np.sign(x)))[0])).reset_index()
    changes_3.rename(columns={'Current_difference': 'crunchtime_lc'}, inplace=True)
    
    add_ons = pd.merge(changes, changes_2, on='tmp_gameID')
    add_ons = pd.merge(add_ons, changes_3, on='tmp_gameID')
    
    df = pd.merge(df, add_ons, on='tmp_gameID')
    
    del df['tmp_gameID']
    
    if data.shape[0] != df.shape[0]:
        raise KeyError('Some merge went wrong')
        
    return df


def event_count(data):
    df = data.copy()
    
    # points made in each block
    # % of scores with assists
    # turnover
    # rebounds
    # % scores
    # turnover
    # steals
    # blocs
    
    del df['tmp_gameID']
    
    if data.shape[0] != df.shape[0]:
        raise KeyError('Some merge went wrong')
    
    return df

In [36]:
df_2019 = pd.read_csv('data/raw_men/MEvents2019.csv')

df_2019 = make_scores(df_2019)
df_2019 = quarter_score(df_2019)
df_2019 = lead_changes(df_2019)

df_2019.head()

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WFinalScore,LFinalScore,ElapsedSeconds,EventTeamID,EventPlayerID,EventType,EventSubType,X,Y,Area,Final_difference,WCurrentScore,LCurrentScore,Current_difference,period,crunch,n_OT,Halftime_difference,3mins_difference,game_lc,half2_lc,crunchime_lc
0,10442717,2019,1,1104,1380,82,62,18,1380,10314,made2,jump,0,0,0,20,0.0,0.0,0.0,1,0,0.0,19.0,21.0,3,0,0
1,10442718,2019,1,1104,1380,82,62,18,1380,10307,assist,,0,0,0,20,0.0,2.0,-2.0,1,0,0.0,19.0,21.0,3,0,0
2,10442719,2019,1,1104,1380,82,62,42,1104,129,made2,jump,0,0,0,20,0.0,2.0,-2.0,1,0,0.0,19.0,21.0,3,0,0
3,10442720,2019,1,1104,1380,82,62,58,1380,10303,turnover,unk,0,0,0,20,0.0,2.0,-2.0,1,0,0.0,19.0,21.0,3,0,0
4,10442721,2019,1,1104,1380,82,62,63,1104,143,made3,unk,0,0,0,20,2.0,2.0,0.0,1,0,0.0,19.0,21.0,3,0,0


In [46]:
test = df_2019[['DayNum', 'WTeamID', 'LTeamID', 'WFinalScore', 'LFinalScore', 'Final_difference', 
                'n_OT', 'Halftime_difference', '3mins_difference', 'game_lc', 'half2_lc', 'crunchime_lc']].drop_duplicates()

print(test.shape)

test.head()

(5597, 12)


Unnamed: 0,DayNum,WTeamID,LTeamID,WFinalScore,LFinalScore,Final_difference,n_OT,Halftime_difference,3mins_difference,game_lc,half2_lc,crunchime_lc
0,1,1104,1380,82,62,20,0.0,19.0,21.0,3,0,0
507,1,1113,1168,102,94,8,2.0,3.0,8.0,34,20,6
1142,1,1119,1265,73,69,4,0.0,6.0,-1.0,15,10,4
1706,1,1120,1375,101,58,43,0.0,19.0,48.0,1,0,0
2211,1,1123,1232,86,69,17,0.0,0.0,13.0,17,1,0


In [105]:
files = ['2015', '2016', '2017', '2018', '2019']
all_events = []

for year in np.arange(2015, 2020):
    df = pd.read_csv(f'data/raw_men/MEvents{year}.csv')
    df = make_scores(df)
    df = quarter_score(df)
    all_events.append(df)

all_events = pd.concat(all_events, ignore_index=True)

all_events.head()

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WFinalScore,LFinalScore,ElapsedSeconds,EventTeamID,EventPlayerID,EventType,EventSubType,X,Y,Area,Final_difference,WCurrentScore,LCurrentScore,Current_difference,period,tmp_gameID,n_OT
0,1,2015,11,1103,1420,74,57,19,1103,100,miss3,unk,0,0,0,17,0.0,0.0,0.0,1,11_1103_1420,0.0
1,2,2015,11,1103,1420,74,57,19,1420,11784,reb,def,0,0,0,17,0.0,0.0,0.0,1,11_1103_1420,0.0
2,3,2015,11,1103,1420,74,57,27,1420,11789,made2,dunk,0,0,0,17,0.0,0.0,0.0,1,11_1103_1420,0.0
3,4,2015,11,1103,1420,74,57,27,1420,11803,assist,,0,0,0,17,0.0,2.0,-2.0,1,11_1103_1420,0.0
4,5,2015,11,1103,1420,74,57,59,1103,87,made2,jump,0,0,0,17,0.0,2.0,-2.0,1,11_1103_1420,0.0


In [69]:
all_events.EventType.value_counts(dropna=False)

sub         3356811
reb         2099080
foul        1032476
miss2       1019422
made2        985522
made1        778146
miss3        751246
assist       725993
turnover     707561
made3        399533
timeout      358201
steal        337776
miss1        328187
block        186528
fouled        74308
jumpb          8894
Name: EventType, dtype: int64

In [70]:
all_events.ElapsedSeconds.max() / 60

60.0