# 1. Pull Data

Pull data from Basketball Reference at the schedule, team-game, and player level, reusing the previously written scraping helpers in `src.Scraping`.

In [1]:
import os

import numpy as np
import pandas as pd

## Schedule

In [2]:
from src.Scraping import get_nba_schedule

In [3]:
# Load the 2020 schedule from the local CSV cache. The scraper is only invoked
# when the cache file is missing, so a fresh "Restart & Run All" works without
# un-commenting code, and ordinary runs never trigger the scraper.
SCHEDULE_CSV = 'data/schedule_2020.csv'
if not os.path.exists(SCHEDULE_CSV):
    # Same call (and arguments) that originally produced the cache file.
    scraped_schedule = get_nba_schedule(2020, pd.to_datetime('2020-11-01'), bubble_months=True)
    scraped_schedule.to_csv(SCHEDULE_CSV, index=False)
# Always read back from disk so column dtypes are the same whether or not the
# scrape just ran.
schedule = pd.read_csv(SCHEDULE_CSV)

In [4]:
# Preview the first five rows of the loaded schedule.
schedule.head()

Unnamed: 0,Date,Time,Visitor Team,Visitor Points,Home Team,Home Points,Box Score,OT,Notes,bubble,playoff
0,2019-10-22,800,New Orleans Pelicans,122,Toronto Raptors,130,www.basketball-reference.com/boxscores/2019102...,OT,,0,0
1,2019-10-22,1030,Los Angeles Lakers,102,Los Angeles Clippers,112,www.basketball-reference.com/boxscores/2019102...,,,0,0
2,2019-10-23,700,Chicago Bulls,125,Charlotte Hornets,126,www.basketball-reference.com/boxscores/2019102...,,,0,0
3,2019-10-23,700,Detroit Pistons,119,Indiana Pacers,110,www.basketball-reference.com/boxscores/2019102...,,,0,0
4,2019-10-23,700,Cleveland Cavaliers,85,Orlando Magic,94,www.basketball-reference.com/boxscores/2019102...,,,0,0


#### Manipulations: add fields that flag bubble games and playoff games

In [5]:
# Parse the Date column (read from CSV as text) into datetimes so it can be
# compared against cutoff timestamps in the cells below.
schedule['Date'] = pd.to_datetime(schedule['Date'])

In [6]:
# Flag bubble games: 1 for every game dated strictly after the cutoff, else 0.
# The cutoff is parsed once and compared against the whole Date column at once,
# rather than re-parsing the constant for each row inside a Python loop.
bubble_cutoff = pd.Timestamp('2020-04-01')
schedule['bubble'] = (schedule['Date'] > bubble_cutoff).astype('int64')
# Show how many games fall on each side of the cutoff.
schedule['bubble'].value_counts()

0    971
1    136
Name: bubble, dtype: int64

In [7]:
# Flag playoff games: 1 for every game dated strictly after the cutoff, else 0.
# The cutoff is parsed once and compared against the whole Date column at once,
# rather than re-parsing the constant for each row inside a Python loop.
playoff_cutoff = pd.Timestamp('2020-08-14')
schedule['playoff'] = (schedule['Date'] > playoff_cutoff).astype('int64')
# Show how many games fall on each side of the cutoff.
schedule['playoff'].value_counts()

0    1059
1      48
Name: playoff, dtype: int64

In [8]:
# Sanity-check the two flags together: for each (bubble, playoff) combination,
# count the non-null values in every remaining column.
schedule.groupby(['bubble', 'playoff']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,Time,Visitor Team,Visitor Points,Home Team,Home Points,Box Score,OT,Notes
bubble,playoff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,971,971,971,971,971,971,971,61,3
1,0,88,88,88,88,88,88,88,8,0
1,1,48,48,48,48,48,48,48,3,0


## Team Game Performance

In [9]:
from src.Scraping import update_team_perf

In [10]:
# load in template for team performance
team_perf = pd.read_csv('data/team_perf_2020.csv')
team_perf.head()

Unnamed: 0,Minutes,FG,FGA,FGP,3P,3PA,3PP,FT,FTA,FTP,...,ASTP,STLP,BLKP,TOVP,USGP,ORTG,DRTG,Home,Opp Team,Date
0,265,43,102,0.422,19,45,0.422,17,20,0.85,...,69.8,3.5,14.3,14.6,100.0,108.1,115.2,Home,Toronto Raptors,2019-10-22
1,265,42,103,0.408,14,40,0.35,32,38,0.842,...,54.8,6.2,5.3,11.8,100.0,115.2,108.1,Away,New Orleans Pelicans,2019-10-22
2,240,37,85,0.435,13,33,0.394,15,21,0.714,...,54.1,4.2,14.0,12.9,100.0,107.0,117.5,Home,Los Angeles Clippers,2019-10-22
3,240,42,81,0.519,11,31,0.355,17,24,0.708,...,57.1,8.4,9.6,13.3,100.0,117.5,107.0,Away,Los Angeles Lakers,2019-10-22
4,240,49,105,0.467,9,30,0.3,18,22,0.818,...,51.0,10.8,9.1,8.0,100.0,123.0,124.0,Home,Charlotte Hornets,2019-10-23


In [11]:
# List every column name, since the head() preview elides the middle columns.
team_perf.columns

Index(['Minutes', 'FG', 'FGA', 'FGP', '3P', '3PA', '3PP', 'FT', 'FTA', 'FTP',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PM',
       'Team', 'TSP', 'EFGP', '3PAR', 'FTR', 'ORBP', 'DRBP', 'TRBP', 'ASTP',
       'STLP', 'BLKP', 'TOVP', 'USGP', 'ORTG', 'DRTG', 'Home', 'Opp Team',
       'Date'],
      dtype='object')

In [12]:
# Disabled by default. Presumably scrapes team stats for the schedule rows from
# position 1000 onward and writes them to the team CSV -- TODO confirm against
# src.Scraping.update_team_perf before re-enabling.
# update_team_perf(schedule.iloc[1000:], 'data/team_perf_2020.csv', 'NBA')

## Player Performance

In [13]:
from src.Scraping import update_player_perf

In [14]:
# Read the saved player-level performance CSV and preview its first rows.
player_perf_csv = 'data/player_perf_2020.csv'
player_perf = pd.read_csv(player_perf_csv)
player_perf.head()

Unnamed: 0,3P,3PA,3PAR,3PP,AST,ASTP,BLK,BLKP,DRB,DRBP,...,STL,STLP,TOV,TOVP,TRB,TRBP,TSP,Team,Time,USGP


In [22]:
# Disabled by default. Presumably scrapes player stats for the schedule rows
# from position 1100 onward and writes them to the player CSV -- TODO confirm
# against src.Scraping.update_player_perf before re-enabling.
# NOTE(review): the table displayed under this cell appears to be left over from
# an earlier run made while the call was enabled; it is not produced by the
# cell as it stands.
# update_player_perf(schedule.iloc[1100:], 'data/player_perf_2020.csv', 'NBA')

Unnamed: 0,Player,Minutes,FG,FGA,FGP,3P,3PA,3PP,FT,FTA,...,DRBP,TRBP,ASTP,STLP,BLKP,TOVP,USGP,ORTG,DRTG,Date
0,Eric Gordon,32.266667,9,17,0.529,0,3,0.000,2,2,...,2.5,5.3,15.1,2.9,0.0,5.3,24.0,118.0,84.0,2020-08-29
1,Robert Covington,31.283333,8,14,0.571,6,11,0.545,0,0,...,15.3,9.5,15.0,4.4,0.0,6.7,19.6,142.0,73.0,2020-08-29
2,James Harden,28.400000,11,15,0.733,4,8,0.500,5,6,...,5.6,3.0,34.6,1.6,3.7,14.5,29.8,146.0,83.0,2020-08-29
3,Russell Westbrook,23.583333,3,13,0.231,0,2,0.000,1,2,...,13.6,10.8,38.6,2.0,0.0,0.0,24.1,89.0,79.0,2020-08-29
4,P.J. Tucker,21.216667,2,5,0.400,1,4,0.250,0,0,...,15.1,12.0,11.8,4.4,0.0,28.6,13.5,93.0,73.0,2020-08-29
5,Jeff Green,30.683333,3,11,0.273,3,7,0.429,0,0,...,26.1,13.8,4.1,0.0,3.4,8.3,16.0,78.0,76.0,2020-08-29
6,Danuel House,27.616667,2,10,0.200,2,6,0.333,1,2,...,20.3,12.3,8.8,1.7,0.0,15.5,19.1,68.0,77.0,2020-08-29
7,Austin Rivers,23.500000,2,7,0.286,1,3,0.333,0,0,...,13.6,9.0,10.5,3.9,4.4,0.0,12.2,94.0,73.0,2020-08-29
8,Ben McLemore,10.250000,1,4,0.250,1,3,0.333,0,0,...,7.8,4.1,12.2,0.0,0.0,0.0,16.0,89.0,87.0,2020-08-29
9,Michael Frazier,3.733333,1,3,0.333,1,1,1.000,0,0,...,42.9,22.8,42.7,0.0,0.0,25.0,43.9,83.0,69.0,2020-08-29


## Player Info

In [26]:
from src.Scraping import get_nba_player_info

In [27]:
# One-off scrape that writes data/players_2020.csv; left disabled so the
# notebook works from the saved CSV rather than re-running the scraper.
# player_info = get_nba_player_info(2020)
# player_info.to_csv('data/players_2020.csv', index=False)

  data.set_value(r, 'Player', tr[r+1].find('th').getText())
  data.set_value(r, 'year_min', c.getText())
  data.set_value(r, 'year_max', c.getText())
  data.set_value(r, 'Position', c.getText())
  data.set_value(r, 'Height', c.getText())
  data.set_value(r, 'Weight', c.getText())


In [32]:
# Read the saved player-info CSV and preview its first rows.
player_info_csv = 'data/players_2020.csv'
player_info = pd.read_csv(player_info_csv)
player_info.head()

Unnamed: 0,Player,year_min,year_max,Position,Height,Weight
0,Álex Abrines,2017,2019,G-F,6-6,200
1,Quincy Acy,2013,2019,F-C,6-7,240
2,Jaylen Adams,2019,2019,G,6-2,190
3,Steven Adams,2014,2020,C,6-11,265
4,Bam Adebayo,2018,2020,C-F,6-9,255
