In [1]:
import pandas as pd
import nba_scraper.nba_scraper as ns
import os
import requests
from sqlalchemy import create_engine
import numpy as np
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
#engine = create_engine(os.environ['NBA_CONNECT_DEV'])


In [2]:

def calc_possessions(game_df):
    '''
    funciton to calculate possesion numbers for both team and players
    and insert into possesion tables

    Inputs:
    game_df  - dataframe of nba play by play
    engine   - sql alchemy engine

    Outputs:
    None
    '''
    #calculating made shot possessions
    game_df['home_possession'] = np.where((game_df.event_team == game_df.home_team_abbrev) &
                                         (game_df.event_type_de == 'shot'), 1, 0)
#calculating turnover possessions
    game_df['home_possession'] = np.where((game_df.event_team == game_df.home_team_abbrev) &
                                         (game_df.event_type_de == 'turnover'), 1, game_df['home_possession'])
#calculating defensive rebound possessions
    game_df['home_possession'] = np.where(((game_df.event_team == game_df.away_team_abbrev) &
                                         (game_df.is_d_rebound == 1)) |
                                          ((game_df.event_type_de == 'rebound') &
                                           (game_df.is_d_rebound == 0) &
                                           (game_df.is_o_rebound == 0) &
                                           (game_df.event_team == game_df.away_team_abbrev) &
                                           (game_df.event_type_de.shift(1) != 'free-throw')),
                                          1, game_df['home_possession'])
#calculating final free throw possessions
    game_df['home_possession'] = np.where((game_df.event_team == game_df.home_team_abbrev) &
                                         ((game_df.homedescription.str.contains('Free Throw 2 of 2')) |
                                           (game_df.homedescription.str.contains('Free Throw 3 of 3'))),
                                         1, game_df['home_possession'])
#calculating made shot possessions
    game_df['away_possession'] = np.where((game_df.event_team == game_df.away_team_abbrev) &
                                         (game_df.event_type_de == 'shot'), 1, 0)
#calculating turnover possessions
    game_df['away_possession'] = np.where((game_df.event_team == game_df.away_team_abbrev) &
                                         (game_df.event_type_de == 'turnover'), 1, game_df['away_possession'])
#calculating defensive rebound possessions
    game_df['away_possession'] = np.where(((game_df.event_team == game_df.home_team_abbrev) &
                                         (game_df.is_d_rebound == 1)) |
                                          ((game_df.event_type_de == 'rebound') &
                                           (game_df.is_d_rebound == 0) &
                                           (game_df.is_o_rebound == 0) &
                                           (game_df.event_team == game_df.home_team_abbrev) &
                                           (game_df.event_type_de.shift(1) != 'free-throw')),
                                          1, game_df['away_possession'])
#calculating final free throw possessions
    game_df['away_possession'] = np.where((game_df.event_team == game_df.away_team_abbrev) &
                                         ((game_df.visitordescription.str.contains('Free Throw 2 of 2')) |
                                           (game_df.visitordescription.str.contains('Free Throw 3 of 3'))),
                                         1, game_df['away_possession'])
    return game_df


In [3]:
pbp_df = pd.read_csv('/Users/mbarlowe/Downloads/2019_pbp.csv')\
    .sort_values(by=['game_id', 'seconds_elapsed', 'eventnum'])
pbp_df['is_d_rebound'] = np.where(pbp_df['is_d_rebound'] == 't', 1, 0)
pbp_df['is_o_rebound'] = np.where(pbp_df['is_o_rebound'] == 't', 1, 0)
pbp_df['is_turnover'] = np.where(pbp_df['is_turnover'] == 't', 1, 0)
pbp_df['is_steal']  = np.where(pbp_df['is_steal'] == 't', 1, 0)                         
pbp_df['is_putback']  = np.where(pbp_df['is_putback'] == 't', 1, 0)
pbp_df['is_block'] = np.where(pbp_df['is_block'] == 't', 1, 0)
pbp_df['is_three']  = np.where(pbp_df['is_three'] == 't', 1, 0)
pbp_df['shot_made'] = np.where(pbp_df['shot_made'] == 't', 1, 0)


points_by_second = pbp_df.groupby(['game_id', 'seconds_elapsed'])['points_made'].sum().reset_index()

In [4]:
# this is run on each individual game to get the possessions then i need to insert it into the table
def parse_possessions(poss_list):
    '''
    a function to parse each possession and create one row for offense team
    and defense team
    
    Inputs:
    poss_list   - list of dataframes each one representing one possession
    
    Outputs:
    parsed_list  - list of lists where each list inside represents the players on 
                   off and def and points score for each possession
    '''
    parsed_list = []
    
    for df in poss_list:
        if df.loc[df.index[-1], 'event_type_de'] in ['rebound', 'turnover']:
            if df.loc[df.index[-1], 'event_type_de'] == 'turnover':
                if df.loc[df.index[-1], 'event_team'] == df.loc[df.index[-1], 'home_team_abbrev']:
                    row_df = pd.concat([df.loc[df.index[-1], 'home_player_1':'away_player_5_id'],
                                        df.loc[df.index[-1], ['points_made_y', 'home_team_abbrev', 
                                                              'event_team', 'away_team_abbrev']]])

                    parsed_list.append(pd.DataFrame([list(row_df)], columns=['off_player_1', 'off_player_1_id',
                                                                           'off_player_2', 'off_player_2_id',
                                                                           'off_player_3', 'off_player_3_id',
                                                                           'off_player_4', 'off_player_4_id',
                                                                           'off_player_5', 'off_player_5_id',
                                                                           'def_player_1', 'def_player_1_id',
                                                                           'def_player_2', 'def_player_2_id',
                                                                           'def_player_3', 'def_player_3_id',
                                                                           'def_player_4', 'def_player_4_id',
                                                                           'def_player_5', 'def_player_5_id',
                                                                           'points_made', 'home_team', 
                                                                           'event_team', 'away_team']))
                elif df.loc[df.index[-1], 'event_team'] == df.loc[df.index[-1], 'away_team_abbrev']:
                    row_df = pd.concat([df.loc[df.index[-1], 'home_player_1':'away_player_5_id'],
                                        df.loc[df.index[-1], ['points_made_y', 'home_team_abbrev', 
                                                              'event_team', 'away_team_abbrev']]])

                    parsed_list.append(pd.DataFrame([list(row_df)], columns=['def_player_1', 'def_player_1_id',
                                                                           'def_player_2', 'def_player_2_id',
                                                                           'def_player_3', 'def_player_3_id',
                                                                           'def_player_4', 'def_player_4_id',
                                                                           'def_player_5', 'def_player_5_id',
                                                                           'off_player_1', 'off_player_1_id',
                                                                           'off_player_2', 'off_player_2_id',
                                                                           'off_player_3', 'off_player_3_id',
                                                                           'off_player_4', 'off_player_4_id',
                                                                           'off_player_5', 'off_player_5_id',
                                                                           'points_made', 'home_team', 
                                                                           'event_team', 'away_team']))
            if df.loc[df.index[-1], 'event_type_de'] == 'rebound':
                if df.loc[df.index[-1], 'event_team'] == df.loc[df.index[-1], 'away_team_abbrev']:
                    row_df = pd.concat([df.loc[df.index[-1], 'home_player_1':'away_player_5_id'],
                                        df.loc[df.index[-1], ['points_made_y', 'home_team_abbrev', 
                                                              'event_team', 'away_team_abbrev']]])

                    parsed_list.append(pd.DataFrame([list(row_df)], columns=['off_player_1', 'off_player_1_id',
                                                                           'off_player_2', 'off_player_2_id',
                                                                           'off_player_3', 'off_player_3_id',
                                                                           'off_player_4', 'off_player_4_id',
                                                                           'off_player_5', 'off_player_5_id',
                                                                           'def_player_1', 'def_player_1_id',
                                                                           'def_player_2', 'def_player_2_id',
                                                                           'def_player_3', 'def_player_3_id',
                                                                           'def_player_4', 'def_player_4_id',
                                                                           'def_player_5', 'def_player_5_id',
                                                                           'points_made', 'home_team', 
                                                                           'event_team', 'away_team']))
                elif df.loc[df.index[-1], 'event_team'] == df.loc[df.index[-1], 'home_team_abbrev']:
                    row_df = pd.concat([df.loc[df.index[-1], 'home_player_1':'away_player_5_id'],
                                        df.loc[df.index[-1], ['points_made_y', 'home_team_abbrev', 
                                                              'event_team', 'away_team_abbrev']]])

                    parsed_list.append(pd.DataFrame([list(row_df)], columns=['def_player_1', 'def_player_1_id',
                                                                           'def_player_2', 'def_player_2_id',
                                                                           'def_player_3', 'def_player_3_id',
                                                                           'def_player_4', 'def_player_4_id',
                                                                           'def_player_5', 'def_player_5_id',
                                                                           'off_player_1', 'off_player_1_id',
                                                                           'off_player_2', 'off_player_2_id',
                                                                           'off_player_3', 'off_player_3_id',
                                                                           'off_player_4', 'off_player_4_id',
                                                                           'off_player_5', 'off_player_5_id',
                                                                           'points_made', 'home_team', 
                                                                           'event_team', 'away_team']))
                                        
        elif df.loc[df.index[-1], 'event_type_de'] in ['shot', 'free-throw']:
            if df.loc[df.index[-1], 'event_team'] == df.loc[df.index[-1], 'home_team_abbrev']:
                row_df = pd.concat([df.loc[df.index[-1], 'home_player_1':'away_player_5_id'],
                                    df.loc[df.index[-1], ['points_made_y', 'home_team_abbrev', 
                                                          'event_team', 'away_team_abbrev']]])

                parsed_list.append(pd.DataFrame([list(row_df)], columns=['off_player_1', 'off_player_1_id',
                                                                       'off_player_2', 'off_player_2_id',
                                                                       'off_player_3', 'off_player_3_id',
                                                                       'off_player_4', 'off_player_4_id',
                                                                       'off_player_5', 'off_player_5_id',
                                                                       'def_player_1', 'def_player_1_id',
                                                                       'def_player_2', 'def_player_2_id',
                                                                       'def_player_3', 'def_player_3_id',
                                                                       'def_player_4', 'def_player_4_id',
                                                                       'def_player_5', 'def_player_5_id',
                                                                       'points_made', 'home_team', 
                                                                       'event_team', 'away_team']))
            elif df.loc[df.index[-1], 'event_team'] == df.loc[df.index[-1], 'away_team_abbrev']:
                row_df = pd.concat([df.loc[df.index[-1], 'home_player_1':'away_player_5_id'],
                                    df.loc[df.index[-1], ['points_made_y', 'home_team_abbrev', 
                                                          'event_team', 'away_team_abbrev']]])

                parsed_list.append(pd.DataFrame([list(row_df)], columns=['def_player_1', 'def_player_1_id',
                                                                       'def_player_2', 'def_player_2_id',
                                                                       'def_player_3', 'def_player_3_id',
                                                                       'def_player_4', 'def_player_4_id',
                                                                       'def_player_5', 'def_player_5_id',
                                                                       'off_player_1', 'off_player_1_id',
                                                                       'off_player_2', 'off_player_2_id',
                                                                       'off_player_3', 'off_player_3_id',
                                                                       'off_player_4', 'off_player_4_id',
                                                                       'off_player_5', 'off_player_5_id',
                                                                       'points_made', 'home_team', 
                                                                       'event_team', 'away_team']))
                    
    return parsed_list
    

In [7]:
#this is the part that actually runs out side the possessions

pbp_df = calc_possessions(pbp_df)
poss_df = []
for game in range(21800001, 21801231):
    print(f'Processing game {game}')
    game_df = pbp_df[pbp_df.game_id == game].reset_index()
    poss_index = game_df[(game_df.home_possession == 1) | (game_df.away_possession == 1)].index
    shift_dfs = []
    past_index = 0
    for i in poss_index:
        shift_dfs.append(game_df.iloc[past_index+1: i+1, :].reset_index())
        past_index = i
    possession = [x.merge(points_by_second, on=['game_id', 'seconds_elapsed']) for x in shift_dfs]
    poss_df.extend(parse_possessions(possession))

Processing game 21800001
Processing game 21800002
Processing game 21800003
Processing game 21800004
Processing game 21800005
Processing game 21800006
Processing game 21800007
Processing game 21800008
Processing game 21800009
Processing game 21800010
Processing game 21800011
Processing game 21800012
Processing game 21800013
Processing game 21800014
Processing game 21800015
Processing game 21800016
Processing game 21800017
Processing game 21800018
Processing game 21800019
Processing game 21800020
Processing game 21800021
Processing game 21800022
Processing game 21800023
Processing game 21800024
Processing game 21800025
Processing game 21800026
Processing game 21800027
Processing game 21800028
Processing game 21800029
Processing game 21800030
Processing game 21800031
Processing game 21800032
Processing game 21800033
Processing game 21800034
Processing game 21800035
Processing game 21800036
Processing game 21800037
Processing game 21800038
Processing game 21800039
Processing game 21800040


Processing game 21800329
Processing game 21800330
Processing game 21800331
Processing game 21800332
Processing game 21800333
Processing game 21800334
Processing game 21800335
Processing game 21800336
Processing game 21800337
Processing game 21800338
Processing game 21800339
Processing game 21800340
Processing game 21800341
Processing game 21800342
Processing game 21800343
Processing game 21800344
Processing game 21800345
Processing game 21800346
Processing game 21800347
Processing game 21800348
Processing game 21800349
Processing game 21800350
Processing game 21800351
Processing game 21800352
Processing game 21800353
Processing game 21800354
Processing game 21800355
Processing game 21800356
Processing game 21800357
Processing game 21800358
Processing game 21800359
Processing game 21800360
Processing game 21800361
Processing game 21800362
Processing game 21800363
Processing game 21800364
Processing game 21800365
Processing game 21800366
Processing game 21800367
Processing game 21800368


Processing game 21800657
Processing game 21800658
Processing game 21800659
Processing game 21800660
Processing game 21800661
Processing game 21800662
Processing game 21800663
Processing game 21800664
Processing game 21800665
Processing game 21800666
Processing game 21800667
Processing game 21800668
Processing game 21800669
Processing game 21800670
Processing game 21800671
Processing game 21800672
Processing game 21800673
Processing game 21800674
Processing game 21800675
Processing game 21800676
Processing game 21800677
Processing game 21800678
Processing game 21800679
Processing game 21800680
Processing game 21800681
Processing game 21800682
Processing game 21800683
Processing game 21800684
Processing game 21800685
Processing game 21800686
Processing game 21800687
Processing game 21800688
Processing game 21800689
Processing game 21800690
Processing game 21800691
Processing game 21800692
Processing game 21800693
Processing game 21800694
Processing game 21800695
Processing game 21800696


Processing game 21800985
Processing game 21800986
Processing game 21800987
Processing game 21800988
Processing game 21800989
Processing game 21800990
Processing game 21800991
Processing game 21800992
Processing game 21800993
Processing game 21800994
Processing game 21800995
Processing game 21800996
Processing game 21800997
Processing game 21800998
Processing game 21800999
Processing game 21801000
Processing game 21801001
Processing game 21801002
Processing game 21801003
Processing game 21801004
Processing game 21801005
Processing game 21801006
Processing game 21801007
Processing game 21801008
Processing game 21801009
Processing game 21801010
Processing game 21801011
Processing game 21801012
Processing game 21801013
Processing game 21801014
Processing game 21801015
Processing game 21801016
Processing game 21801017
Processing game 21801018
Processing game 21801019
Processing game 21801020
Processing game 21801021
Processing game 21801022
Processing game 21801023
Processing game 21801024


In [12]:
poss_df[25].head()

Unnamed: 0,off_player_1,off_player_1_id,off_player_2,off_player_2_id,off_player_3,off_player_3_id,off_player_4,off_player_4_id,off_player_5,off_player_5_id,def_player_1,def_player_1_id,def_player_2,def_player_2_id,def_player_3,def_player_3_id,def_player_4,def_player_4_id,def_player_5,def_player_5_id,points_made,home_team,event_team,away_team
0,Al Horford,201143,Jayson Tatum,1628369,Jaylen Brown,1627759,Gordon Hayward,202330,Kyrie Irving,202681,Robert Covington,203496,Ben Simmons,1627732,Markelle Fultz,1628365,Amir Johnson,101161,JJ Redick,200755,2,BOS,BOS,PHI


In [13]:
test_df = pd.concat(poss_df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [16]:
test_df.to_csv('possesions_data.csv', index=False)

In [15]:
test_df.head()

Unnamed: 0,away_team,def_player_1,def_player_1_id,def_player_2,def_player_2_id,def_player_3,def_player_3_id,def_player_4,def_player_4_id,def_player_5,def_player_5_id,event_team,home_team,off_player_1,off_player_1_id,off_player_2,off_player_2_id,off_player_3,off_player_3_id,off_player_4,off_player_4_id,off_player_5,off_player_5_id,points_made
0,PHI,Al Horford,201143,Jayson Tatum,1628369,Jaylen Brown,1627759,Gordon Hayward,202330,Kyrie Irving,202681,BOS,BOS,Robert Covington,203496,Dario Saric,203967,Ben Simmons,1627732,Joel Embiid,203954,Markelle Fultz,1628365,0
0,PHI,Robert Covington,203496,Dario Saric,203967,Ben Simmons,1627732,Joel Embiid,203954,Markelle Fultz,1628365,PHI,BOS,Al Horford,201143,Jayson Tatum,1628369,Jaylen Brown,1627759,Gordon Hayward,202330,Kyrie Irving,202681,0
0,PHI,Al Horford,201143,Jayson Tatum,1628369,Jaylen Brown,1627759,Gordon Hayward,202330,Kyrie Irving,202681,PHI,BOS,Robert Covington,203496,Dario Saric,203967,Ben Simmons,1627732,Joel Embiid,203954,Markelle Fultz,1628365,0
0,PHI,Robert Covington,203496,Dario Saric,203967,Ben Simmons,1627732,Joel Embiid,203954,Markelle Fultz,1628365,PHI,BOS,Al Horford,201143,Jayson Tatum,1628369,Jaylen Brown,1627759,Gordon Hayward,202330,Kyrie Irving,202681,0
0,PHI,Al Horford,201143,Jayson Tatum,1628369,Jaylen Brown,1627759,Gordon Hayward,202330,Kyrie Irving,202681,PHI,BOS,Robert Covington,203496,Dario Saric,203967,Ben Simmons,1627732,Joel Embiid,203954,Markelle Fultz,1628365,2


In [7]:
def lambda_to_alpha(lambda_value, samples):
    return (lambda_value * samples) / 2.0

def map_players(row_in, players):
    p1 = row_in[0]
    p2 = row_in[1]
    p3 = row_in[2]
    p4 = row_in[3]
    p5 = row_in[4]
    p6 = row_in[5]
    p7 = row_in[6]
    p8 = row_in[7]
    p9 = row_in[8]
    p10 = row_in[9]

    rowOut = np.zeros([len(players) * 2])

    rowOut[players.index(p1)] = 1
    rowOut[players.index(p2)] = 1
    rowOut[players.index(p3)] = 1
    rowOut[players.index(p4)] = 1
    rowOut[players.index(p5)] = 1

    rowOut[players.index(p6) + len(players)] = -1
    rowOut[players.index(p7) + len(players)] = -1
    rowOut[players.index(p8) + len(players)] = -1
    rowOut[players.index(p9) + len(players)] = -1
    rowOut[players.index(p10) + len(players)] = -1

    return rowOut


In [9]:
sa_engine = create_engine(os.environ['NBA_CONNECT_DEV'])

shifts_df = pd.read_sql_query(f'select * from nba.rapm_shifts where season = {season};', sa_engine)

shifts_df = shifts_df[shifts_df.possessions != 0]
shifts_df.to_csv('shifts.csv')

# pull out unique player ids
players = list(set(list(shifts_df['off_player_1_id'].unique()) +
                   list(shifts_df['off_player_2_id'].unique()) +
                   list(shifts_df['off_player_3_id'].unique()) +
                   list(shifts_df['off_player_4_id'].unique()) +
                   list(shifts_df['off_player_5_id'].unique()) +
                   list(shifts_df['def_player_1_id'].unique()) +
                   list(shifts_df['def_player_2_id'].unique()) +
                   list(shifts_df['def_player_3_id'].unique()) +
                   list(shifts_df['def_player_4_id'].unique()) +
                   list(shifts_df['def_player_5_id'].unique())))

players.sort()
train_x = shifts_df.as_matrix(columns=['off_player_1_id', 'off_player_2_id',
                                             'off_player_3_id', 'off_player_4_id', 'off_player_5_id',
                                             'def_player_1_id', 'def_player_2_id',
                                             'def_player_3_id', 'def_player_4_id', 'def_player_5_id'])

train_x = np.apply_along_axis(map_players, 1, train_x, players)




In [17]:
np.set_printoptions(threshold=2000)
print([train_x[0].index(x) for x in train_x[0] if x == -1])

AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [54]:
query = '''
    select sum(time_together) 
    from (
    select sum(event_length) time_together from nba.pbp 
    where season = 2019 
    and (home_player_1_id in (202331) or 
            home_player_2_id in (202331) or 
            home_player_3_id in (202331) or 
            home_player_4_id in (202331) or 
            home_player_5_id in (202331))
    and (home_player_1_id not in (201566) and 
            home_player_2_id not in (201566) and
            home_player_3_id not in (201566) and 
            home_player_4_id not in (201566) and 
            home_player_5_id not in (201566))
    union all 
        select sum(event_length) time_together from nba.pbp 
    where season = 2019 
    and (away_player_1_id in (202331) or 
            away_player_2_id in (202331) or 
            away_player_3_id in (202331) or 
            away_player_4_id in (202331) or 
            away_player_5_id in (202331))
    and (away_player_1_id not in (201566) and 
            away_player_2_id not in (201566) and 
            away_player_3_id not in (201566) and 
            away_player_4_id not in (201566) and
            away_player_5_id not in (201566))) tt
'''
pbp_df = pd.read_sql_query(query, engine)
pbp_df



Unnamed: 0,sum
0,51665.0


In [None]:
train_y = shifts_df.as_matrix(['points_made'])
possessions = shifts_df['possessions']

lambdas_rapm = [.01, .05, .1]
alphas = [lambda_to_alpha(l, train_x.shape[0]) for l in lambdas_rapm]
clf = RidgeCV(alphas=alphas, cv=5, fit_intercept=True, normalize=False)
model = clf.fit(train_x, train_y, sample_weight=possessions)
player_arr = np.transpose(np.array(players).reshape(1, len(players)))

# extract our coefficients into the offensive and defensive parts
coef_offensive_array = np.transpose(model.coef_[:, 0:len(players)])
coef_defensive_array = np.transpose(model.coef_[:, len(players):])

# concatenate the offensive and defensive values with the playey ids into a mx3 matrix
player_id_with_coef = np.concatenate([player_arr, coef_offensive_array, coef_defensive_array], axis=1)
# build a dataframe from our matrix
players_coef = pd.DataFrame(player_id_with_coef)
intercept = model.intercept_
name = 'rapm'
# apply new column names
players_coef.columns = ['player_id', '{0}__Off'.format(name), '{0}__Def'.format(name)]

print(f'This is the intercept of the model: {intercept}')
print(players_coef.head())
player_df = pd.read_sql_query(f'select * from nba.player_details;', sa_engine)

results_df = players_coef.merge(player_df[['player_id', 'display_first_last']], on='player_id')
results_df.to_csv('rapm_results.csv')


In [None]:
    and (away_player_1_id in (202331) or 
            away_player_2_id in (202331) or 
            away_player_3_id in (202331) or 
            away_player_4_id in (202331) or 
            away_player_5_id in (202331))
    and (away_player_1_id not in (201566) or 
            away_player_2_id not in (201566) or 
            away_player_3_id not in (201566) or 
            away_player_4_id not in (201566) or 
            away_player_5_id not in (201566))) tt
'''