In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import requests
import ipdb
from bs4 import BeautifulSoup

# Season Stats Scraper

## Structure

Example NBA: https://basketball.realgm.com/nba/stats/2020/Totals/All/points/All/asc/1/Regular_Season

Example NCAA: https://basketball.realgm.com/ncaa/stats/2003/Totals/All/All/Season/All/points/desc/1/

Glossary: https://basketball.realgm.com/info/glossary

In [31]:
universe = 'All' # 'Qualified'
seasons = range(2005, 2021)

In [4]:
stats_basic = ['Id', 'Player', 'Link', 'Competition', 'Team', 'Season', 'GP', 'MIN']

stats_dict = {'Totals': ['FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 
                         'FTM', 'FTA', 'FT%', 'TOV', 'PF', 'ORB', 
                         'DRB', 'REB', 'AST', 'STL', 'BLK', 'PTS'],
              
              'Misc_Stats': ['DDBL', 'TDBL', '40PTS', '20REB', '20AST',
                             '5STL', '5BLK', 'HIGHGAME', 'TECHS', 'HOB',
                             'AST/TO', 'STL/TO', 'FT/FGA', 'W-S', 'L-S',
                             'WIN%', 'OWS', 'DWS', 'WS'],
              
              'Advanced_Stats': ['TS%', 'EFG%', 'TOTALS%', 'ORB%', 'DRB%',
                                 'TRB%', 'AST%', 'TOV%', 'STL%', 'BLK%',
                                 'USG%', 'PPS', 'ORTG', 'DRTG', 'EDIFF',
                                 'FIC', 'PER']}

In [5]:
def create_dataframe(totals=False, misc_stats=False, advanced_stats=False):
    cols = stats_basic
    
    if totals:
        cols = cols + stats_dict['Totals']
        
    if misc_stats:
        cols = cols + stats_dict['Misc_Stats']
        
    if advanced_stats:
        cols = cols + stats_dict['Advanced_Stats']
        
    return pd.DataFrame(columns=cols)

## NBA

In [6]:
def get_totals(compet, season, info):
    link = info[1].find('a', href=True)['href']
    data = {'Id': int(link.split("/")[-1]),
            'Player': info[1].text,
            'Link': link,
            'Competition': compet.upper(),
            'Team': info[2].text, 
            'Season': season,
            'GP': info[3].text, 
            'MIN': info[4].text,
            'FGM': info[5].text, 
            'FGA': info[6].text, 
            'FG%': info[7].text, 
            '3PM': info[8].text, 
            '3PA': info[9].text, 
            '3P%': info[10].text,
            'FTM': info[11].text, 
            'FTA': info[12].text, 
            'FT%': info[13].text, 
            'TOV': info[14].text, 
            'PF': info[15].text,  
            'ORB': info[16].text, 
            'DRB': info[17].text,  
            'REB': info[18].text, 
            'AST': info[19].text, 
            'STL': info[20].text, 
            'BLK': info[21].text, 
            'PTS': info[22].text}
    return data


In [7]:
def get_misc_stats(info):
    data = {'DDBL': info[3].text,
            'TDBL': info[4].text,
            '40PTS': info[5].text,
            '20REB': info[6].text,
            '20AST': info[7].text,
            '5STL': info[8].text,
            '5BLK': info[9].text,
            'HIGHGAME': info[10].text,
            'TECHS': info[11].text,
            'HOB': info[12].text,
            'AST/TO': info[13].text, 
            'STL/TO': info[14].text, 
            'FT/FGA': info[15].text, 
            'W-S': info[16].text,
            'L-S': info[17].text, 
            'WIN%': info[18].text, 
            'OWS': info[19].text, 
            'DWS': info[20].text, 
            'WS': info[21].text}
    return data

In [8]:
def get_advanced_stats(info):
    data = {'TS%': info[3].text,
            'EFG%': info[4].text,
            'TOTALS%': info[5].text,
            'ORB%': info[6].text,
            'DRB%': info[7].text,
            'TRB%': info[8].text,
            'AST%': info[9].text,
            'TOV%': info[10].text,
            'STL%': info[11].text,
            'BLK%': info[12].text,
            'USG%': info[13].text, 
            'PPS': info[14].text, 
            'ORTG': info[15].text, 
            'DRTG': info[16].text,
            'EDIFF': info[17].text, 
            'FIC': info[18].text, 
            'PER': info[19].text}
    return data


In [9]:
def add_to_row(df, pid, season, competition, stype, info):
    new_df = df.copy()
    target_cols, target_info = list(info.keys()), list(info.values())
    mask = (new_df.Id == pid) & (new_df.Season == season) & (new_df.Competition == competition.upper())
    new_df.loc[mask, target_cols] = np.array(target_info)
    return new_df

In [33]:
def scrape_data(df, competition, season, stype):
    page = 1
    next_page = True

    while next_page:
        
        if competition == 'nba':
            url = f'https://basketball.realgm.com/nba/stats/{season}/{stype}/{universe}/points/All/desc/{page}/Regular_Season'
        elif competition == 'ncaa':
            url = f'https://basketball.realgm.com/ncaa/stats/{season}/{stype}/{universe}/All/Season/All/points/desc/{page}/'
              
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        rows = soup.select('table.tablesaw.compact > tbody > tr')

        if rows:
            for row in rows:
                info = row.find_all('td')
                player_id = int(info[1].find('a', href=True)['href'].split("/")[-1])

                if stype == 'Totals':
                    row_data = get_totals('nba', s, info)
                    df = df.append(row_data, ignore_index=True)

                elif stype == 'Misc_Stats':
                    row_data = get_misc_stats(info)
                    df = add_to_row(df, player_id, season, competition, stype, row_data)

                elif stype == 'Advanced_Stats':
                    row_data = get_advanced_stats(info)
                    df = add_to_row(df, player_id, season, competition, stype, row_data)
                    

            page += 1

        else:
            next_page = False
            
    return df

In [13]:
df = create_dataframe(totals=True, misc_stats=True, advanced_stats=True)
rows = 0

for s in seasons:
    print(f'Scraping season {s}!')
    
    for t in stats_dict.keys():
        print(f'> Getting {t}...')
        df = scrape_data(df, 'nba', s, t)
    
    new_rows = df.shape[0] - rows
    rows = df.shape[0]
    print(f'> New {new_rows} rows added, total of {rows}.')

Scraping season 2005!
> Getting Totals...
> Getting Misc_Stats...
> Getting Advanced_Stats...
> New 464 rows added, total of 464.
Scraping season 2006!
> Getting Totals...
> Getting Misc_Stats...
> Getting Advanced_Stats...
> New 458 rows added, total of 922.
Scraping season 2007!
> Getting Totals...
> Getting Misc_Stats...
> Getting Advanced_Stats...
> New 458 rows added, total of 1380.
Scraping season 2008!
> Getting Totals...
> Getting Misc_Stats...
> Getting Advanced_Stats...
> New 451 rows added, total of 1831.
Scraping season 2009!
> Getting Totals...
> Getting Misc_Stats...
> Getting Advanced_Stats...
> New 445 rows added, total of 2276.
Scraping season 2010!
> Getting Totals...
> Getting Misc_Stats...
> Getting Advanced_Stats...
> New 442 rows added, total of 2718.
Scraping season 2011!
> Getting Totals...
> Getting Misc_Stats...
> Getting Advanced_Stats...
> New 452 rows added, total of 3170.
Scraping season 2012!
> Getting Totals...
> Getting Misc_Stats...
> Getting Advanced_

In [20]:
df.isnull().sum().values

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0, 3974, 3974, 3974, 3974, 3974, 3974, 3974,
       3974, 3974, 3974, 3974, 3974, 3974, 3974, 3974, 3974, 3974, 3974,
       3974,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0])

In [24]:
df.sort_values(by=['Id', 'Season'], inplace=True)
df.reset_index(inplace=True)

In [25]:
df[df.Player == 'LeBron James']

Unnamed: 0,index,Id,Player,Link,Competition,Team,Season,GP,MIN,FGM,...,TOV%,STL%,BLK%,USG%,PPS,ORTG,DRTG,EDIFF,FIC,PER
1552,1,250,LeBron James,/player/LeBron-James/Summary/250,NBA,CLE,2005,80,3388.0,795,...,11.8,2.8,1.1,29.7,3.7,1.3,113.5,103.8,9.7,1613.2
1553,465,250,LeBron James,/player/LeBron-James/Summary/250,NBA,CLE,2006,79,3361.3,875,...,10.7,2.0,1.5,33.7,2.6,1.4,115.5,104.8,10.7,1600.8
1554,923,250,LeBron James,/player/LeBron-James/Summary/250,NBA,CLE,2007,78,3190.3,772,...,11.5,2.1,1.3,31.1,2.0,1.3,111.6,100.6,11.0,1383.1
1555,1381,250,LeBron James,/player/LeBron-James/Summary/250,NBA,CLE,2008,75,3026.8,794,...,11.4,2.4,2.1,33.4,3.5,1.4,116.7,103.8,12.9,1627.1
1556,1832,250,LeBron James,/player/LeBron-James/Summary/250,NBA,CLE,2009,81,3054.0,789,...,11.0,2.4,2.4,33.8,5.1,1.4,121.6,99.4,22.3,1801.2
1557,2277,250,LeBron James,/player/LeBron-James/Summary/250,NBA,CLE,2010,76,2965.6,768,...,12.3,2.2,2.0,33.5,5.9,1.5,121.3,102.1,19.2,1787.9
1558,2719,250,LeBron James,/player/LeBron-James/Summary/250,NBA,MIA,2011,79,3063.0,758,...,13.8,2.1,1.3,31.5,2.8,1.4,116.3,101.9,14.4,1574.4
1559,3171,250,LeBron James,/player/LeBron-James/Summary/250,NBA,MIA,2012,62,2326.2,621,...,13.3,2.6,1.8,31.9,1.9,1.4,118.2,98.3,19.9,1301.5
1560,3650,250,LeBron James,/player/LeBron-James/Summary/250,NBA,MIA,2013,76,2877.1,765,...,12.4,2.4,1.9,30.1,5.0,1.5,124.7,101.4,23.3,1767.6
1561,4119,250,LeBron James,/player/LeBron-James/Summary/250,NBA,MIA,2014,77,2901.9,767,...,14.4,2.2,0.8,31.0,2.0,1.5,120.9,105.0,15.9,1576.9


In [30]:
df.to_csv('../raw_data/nba.csv', index=False)

## NCAA

In [36]:
df_ncaa = create_dataframe(totals=True, misc_stats=True, advanced_stats=True)
rows = 0

for s in seasons:
    print(f'Scraping season {s}!')
    
    for t in stats_dict.keys():
        print(f'> Getting {t}...')
        df_ncaa = scrape_data(df_ncaa, 'ncaa', s, t)
    
    new_rows = df_ncaa.shape[0] - rows
    rows = df_ncaa.shape[0]
    print(f'> New {new_rows} rows added, total of {rows}.')

Scraping season 2005!
> Getting Totals...
> Getting Misc_Stats...


KeyboardInterrupt: 