In [1]:
import pandas as pd

In [2]:
# Exploratory fetch of one box-score page. pd.read_html returns a list of
# DataFrames (one per parsed <table>), so `df` is a list, not a single frame.
df = pd.read_html('https://www.basketball-reference.com/boxscores/202002040HOU.html')

## Getting Game IDs

In [29]:
# Maps a full team name to its three-letter handle. get_box_score_stats looks
# up the team names it reads from a box-score page here, and the handles are
# interpolated into basketball-reference.com URLs and element ids.
# NOTE(review): several handles ('CHO', 'BRK', 'PHO') look site-specific rather
# than the usual league abbreviations -- confirm against the site before editing.
team_handles_dict = {'Toronto Raptors': 'TOR',
                     'Boston Celtics': 'BOS',
                     'Philadelphia 76ers': 'PHI',
                     'Cleveland Cavaliers': 'CLE',
                     'Indiana Pacers': 'IND',
                     'Miami Heat': 'MIA',
                     'Milwaukee Bucks': 'MIL',
                     'Washington Wizards': 'WAS',
                     'Detroit Pistons': 'DET',
                     'Charlotte Hornets': 'CHO',
                     'New York Knicks': 'NYK',
                     'Brooklyn Nets': 'BRK',
                     'Chicago Bulls': 'CHI',
                     'Orlando Magic': 'ORL',
                     'Atlanta Hawks': 'ATL',
                     'Houston Rockets': 'HOU',
                     'Golden State Warriors': 'GSW',
                     'Portland Trail Blazers': 'POR',
                     'Oklahoma City Thunder': 'OKC',
                     'Utah Jazz': 'UTA',
                     'New Orleans Pelicans': 'NOP',
                     'San Antonio Spurs': 'SAS',
                     'Minnesota Timberwolves': 'MIN',
                     'Denver Nuggets': 'DEN',
                     'Los Angeles Clippers': 'LAC',
                     'Los Angeles Lakers': 'LAL',
                     'Sacramento Kings': 'SAC',
                     'Dallas Mavericks': 'DAL',
                     'Memphis Grizzlies': 'MEM',
                     'Phoenix Suns': 'PHO'}

In [30]:
# Maps a team's position in the list handed to get_team_stats (0 = first,
# 1 = second) to its 'away'/'home' label. Only positions 0 and 1 are defined.
home_away_dict = {0: 'away', 1: 'home'}

In [None]:
def get_game_id_for_team(team_handle, year):
    """Return the box-score ids of the home games on a team's season schedule.

    Downloads ``/teams/{team_handle}/{year}_games.html`` from
    basketball-reference.com and walks every ``<td>`` in the body of the table
    whose id is ``games``. Each ``date_game`` cell supplies a date key (its
    ``csk`` attribute with the dashes removed); the following ``game_location``
    cell marks that date as a home game when its text is empty and as an away
    game otherwise.

    Parameters
    ----------
    team_handle : str
        Team handle used in the URL and appended to every id.
    year : int or str
        Season identifier used in the URL.

    Returns
    -------
    list of str
        One ``<date key>0<team_handle>`` string per home game, in page order.
    """
    # NOTE(review): `requests` and `BeautifulSoup` are not imported in the
    # cells visible here -- confirm they are imported before this cell runs.
    schedule_url = f'https://www.basketball-reference.com/teams/{team_handle}/{year}_games.html'
    response = requests.get(schedule_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    schedule_body = soup.find('table', {'id': 'games'}).tbody

    location_by_date = {}
    for cell in schedule_body.find_all('td'):
        if cell['data-stat'] == 'date_game':
            # Remember the date; the location cell that follows is filed under it.
            game_date = cell['csk'].replace('-', '')
        if cell['data-stat'] == 'game_location':
            # An empty location cell is treated as a home game.
            location_by_date[game_date] = 'away' if cell.text else 'home'

    return [date_key + '0' + team_handle
            for date_key in location_by_date.keys()
            if location_by_date[date_key] == 'home']

In [None]:
def get_team_stats(teams, page, gid):
    """Collect one row of footer (totals) values per team from a box-score page.

    For every slug in ``teams`` the ``<td>`` cells of the ``<tfoot>`` inside
    the div with id ``all_box_{slug}_basic`` are read from ``page``.

    Parameters
    ----------
    teams : iterable of str
        Team slugs as they appear in the page's element ids. The iterable is
        consumed once; a slug's position selects its label in ``home_away_dict``.
    page : object
        Parsed page exposing ``find`` (a BeautifulSoup document in this notebook).
    gid : str
        Game id, copied into the first field of every row.

    Returns
    -------
    list of list
        One ``[gid, SLUG_UPPERCASE, 'away'|'home', <cell text>, ...]`` row per
        team; cells whose ``data-stat`` is ``plus_minus`` are left out.
    """
    footers_by_team = []
    for slug in teams:
        box_div = page.find('div', attrs={'id': f'all_box_{slug}_basic'})
        footer_cells = box_div.find('tfoot').find_all('td')
        footers_by_team.append((slug.upper(), footer_cells))

    game_stats = []
    for position, (team_code, footer_cells) in enumerate(footers_by_team):
        # Identifying fields first, then the stat cells in page order.
        stats_row = [gid, team_code, home_away_dict[position]]
        stats_row.extend(
            cell.text
            for cell in footer_cells
            if cell.attrs['data-stat'] != 'plus_minus'
        )
        game_stats.append(stats_row)
    return game_stats

In [None]:
def get_box_score_stats(gid):
    """Fetch a box-score page and return its per-team totals rows.

    Downloads ``/boxscores/{gid}.html`` from basketball-reference.com, reads
    the text of every ``<strong>`` element inside the div with class
    ``scorebox``, converts each to a lower-case handle through
    ``team_handles_dict`` and passes the handles, the parsed page and ``gid``
    to ``get_team_stats``.

    Parameters
    ----------
    gid : str
        Game id used in the URL (for example ``'202002040HOU'``).

    Returns
    -------
    list of list
        Whatever ``get_team_stats`` returns for the teams found on the page.

    Raises
    ------
    KeyError
        If a ``<strong>`` text (newlines removed) is not a key of
        ``team_handles_dict``.
    """
    # NOTE(review): `requests` and `BeautifulSoup` are not imported in the
    # cells visible here -- confirm they are imported before this cell runs.
    response = requests.get(f'https://www.basketball-reference.com/boxscores/{gid}.html')
    box_score_page = BeautifulSoup(response.text, 'html.parser')

    # `attrs` has to be a dict. The previous value {'class', 'scorebox'} was a
    # set literal, which only worked because BeautifulSoup treats a non-dict
    # `attrs` as a list of candidate CSS classes.
    scorebox = box_score_page.find('div', attrs={'class': 'scorebox'})

    bs_page_teams = []
    for item in scorebox.find_all('strong'):
        team_slug = team_handles_dict[item.text.replace('\n', '')]
        # Element ids on the page use the lower-case handle.
        bs_page_teams.append(team_slug.lower())
    return get_team_stats(bs_page_teams, box_score_page, gid)

## Clean Away Team Data

In [23]:
def clean_away_team(url):
    """Build one cleaned table from the tables at positions 0 and 7 of ``url``.

    Steps, in order:

    1. Parse every table at ``url`` with ``pd.read_html``.
    2. Merge table 0 with table 7 on the columns they share.
       NOTE(review): presumably these are the away team's basic and advanced
       box scores; the fixed positions assume a fixed number of tables per
       team -- confirm for games that add extra tables (e.g. overtime).
    3. Remove the row with index label 5 (presumably the repeated
       "Reserves" header row after five starters -- confirm).
    4. Drop the top level of the two-level column header.
    5. Keep only rows whose first ``MP`` column is not ``'Did Not Play'``.
    6. Replace missing values with 0.
    7. Drop, by label, the column at position 21.
    8. Renumber the rows from 0.

    Parameters
    ----------
    url : str
        Anything ``pd.read_html`` accepts.

    Returns
    -------
    pandas.DataFrame
        The cleaned table with a fresh ``RangeIndex``.
    """
    tables = pd.read_html(url)
    away_team = pd.DataFrame.merge(tables[0], tables[7])
    away_team = away_team[away_team.index != 5]
    away_team = away_team.droplevel(0, axis=1)

    # Flattening the header leaves two 'MP' columns, so away_team['MP'] is a
    # frame; the first of them decides whether the player appeared.
    played = (away_team['MP'] != 'Did Not Play').iloc[:, 0]
    away_team = away_team[played].fillna(0)

    # NOTE(review): drop() works by label, so every column that shares the
    # label found at position 21 is removed. If that label is 'MP', both 'MP'
    # columns disappear. Kept as-is to preserve the returned shape; confirm
    # that this is intended.
    away_team = away_team.drop(away_team.columns[21], axis=1)

    return away_team.reset_index(drop=True)

## Clean Home Team Data

In [4]:
def clean_home_team(url):
    """Build one cleaned table from the tables at positions 8 and 15 of ``url``.

    The tables are parsed with ``pd.read_html`` and merged on their shared
    columns. The row with index label 5 is then removed, the top level of the
    column header is dropped, rows whose first ``MP`` value is
    ``'Did Not Play'`` are filtered out, missing values become 0, the column
    at position 21 is dropped by label, and the rows are renumbered from 0.

    NOTE(review): positions 8 and 15 are presumably the home team's basic and
    advanced box scores -- confirm, especially for pages with extra tables.

    Parameters
    ----------
    url : str
        Anything ``pd.read_html`` accepts.

    Returns
    -------
    pandas.DataFrame
        The cleaned table with a fresh ``RangeIndex``.
    """
    tables = pd.read_html(url)
    merged = pd.DataFrame.merge(tables[8], tables[15])

    flat = merged[merged.index != 5].droplevel(0, axis=1)

    # Two 'MP' columns exist after flattening; the first one drives the filter.
    played = (flat['MP'] != 'Did Not Play').iloc[:, 0]
    cleaned = flat[played].fillna(0)

    # drop() is label-based, so every column sharing the label at position 21
    # is removed together.
    label_at_21 = cleaned.columns[21]
    return cleaned.drop(label_at_21, axis=1).reset_index(drop=True)

In [14]:
# Example run: cleaned table from clean_home_team for box score 202002060LAL.
h_df = clean_home_team('https://www.basketball-reference.com/boxscores/202002060LAL.html')
h_df

Unnamed: 0,Starters,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,...,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg
0,Anthony Davis,14,21,0.667,0,0,0.0,4,5,0.8,...,3.2,37.6,20.7,12.6,3.5,8.3,0.0,24.5,139.0,107.0
1,LeBron James,8,19,0.421,1,8,0.125,1,1,1.0,...,7.1,24.1,15.7,56.9,2.5,0.0,23.6,29.5,97.0,117.0
2,Danny Green,5,7,0.714,3,4,0.75,2,2,1.0,...,4.2,4.1,4.1,12.4,4.5,0.0,27.6,14.9,128.0,115.0
3,Avery Bradley,5,10,0.5,3,4,0.75,2,2,1.0,...,0.0,4.8,2.4,20.4,0.0,0.0,0.0,17.6,142.0,126.0
4,JaVale McGee,2,6,0.333,0,1,0.0,0,0,0.0,...,8.1,31.4,19.9,7.7,0.0,6.9,0.0,15.9,84.0,117.0
5,Alex Caruso,4,7,0.571,2,3,0.667,2,2,1.0,...,0.0,0.0,0.0,18.9,1.7,0.0,11.3,14.1,139.0,123.0
6,Kentavious Caldwell-Pope,2,6,0.333,0,4,0.0,0,0,0.0,...,0.0,5.2,2.7,4.9,0.0,0.0,14.3,12.4,63.0,126.0
7,Kyle Kuzma,2,8,0.25,0,4,0.0,0,1,0.0,...,0.0,38.3,19.4,6.0,2.4,0.0,0.0,18.1,55.0,116.0
8,Rajon Rondo,1,5,0.2,0,3,0.0,0,0,0.0,...,0.0,0.0,0.0,7.9,0.0,0.0,44.4,26.1,28.0,127.0
9,Dwight Howard,2,2,1.0,0,0,0.0,1,3,0.333,...,29.5,0.0,14.5,0.0,0.0,0.0,0.0,32.0,147.0,127.0


In [24]:
# Example run: cleaned table from clean_away_team for the same box score.
anew_df = clean_away_team('https://www.basketball-reference.com/boxscores/202002060LAL.html')
anew_df

Unnamed: 0,Starters,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,...,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg
0,Russell Westbrook,17,28,0.607,1,2,0.5,6,8,0.75,...,3.4,24.3,13.7,30.3,1.2,0.0,20.2,46.6,109.0,109.0
1,James Harden,3,10,0.3,1,6,0.167,7,7,1.0,...,0.0,25.8,12.7,24.6,2.7,0.0,18.7,20.2,112.0,106.0
2,P.J. Tucker,4,6,0.667,3,5,0.6,0,0,0.0,...,8.1,16.7,12.4,0.0,0.0,2.6,14.3,9.9,155.0,111.0
3,Eric Gordon,5,10,0.5,5,8,0.625,0,0,0.0,...,0.0,8.7,4.3,4.6,0.0,0.0,0.0,14.7,148.0,114.0
4,Danuel House,4,8,0.5,2,4,0.5,3,3,1.0,...,0.0,4.6,2.3,14.3,1.7,2.9,9.7,16.3,137.0,108.0
5,Robert Covington,5,9,0.556,4,7,0.571,0,0,0.0,...,8.5,26.3,17.3,18.6,3.2,5.4,25.0,17.9,129.0,100.0
6,Austin Rivers,3,7,0.429,2,5,0.4,0,0,0.0,...,0.0,6.4,3.2,0.0,2.3,0.0,0.0,15.3,112.0,109.0
7,Ben McLemore,2,6,0.333,1,4,0.25,0,0,0.0,...,0.0,7.9,3.9,0.0,0.0,0.0,0.0,16.1,84.0,114.0
8,Thabo Sefolosha,0,1,0.0,0,1,0.0,0,0,0.0,...,10.2,20.9,15.5,0.0,3.8,6.5,0.0,3.6,76.0,99.0
9,Team Totals,43,85,0.506,19,42,0.452,16,18,0.889,...,15.8,83.8,49.3,46.5,7.8,8.3,14.7,100.0,117.7,108.0
