### Import Frameworks + Libraries

In [1]:
import os
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup


### Pointer Logic

- Build the list of paths to the saved box-score .html files in the scores directory that will be parsed

In [2]:
SCORE_DIR = "data/scores"
# Substrings a file name must contain (any one of them) to be parsed.
SEASON_TAGS = ("2023", "2024")

# Keep only .html files whose name contains one of SEASON_TAGS.
# The year test is grouped inside any(...) on purpose: the previous
# `endswith(".html") and "2023" in f or "2024" in f` parsed as
# `(endswith(".html") and "2023" in f) or ("2024" in f)`, which let any
# directory entry containing "2024" through even if it was not an .html file.
box_scores = [
    os.path.join(SCORE_DIR, f)
    for f in os.listdir(SCORE_DIR)
    if f.endswith(".html") and any(tag in f for tag in SEASON_TAGS)
]

### Parse HTML Function

- Takes in a particular HTML file for a specific game, and removes excess table data that is not required in the df (s.decompose())

In [3]:
def parse_html(box_score):
    """Load a saved box-score page and strip its repeated table-header rows.

    Parameters
    ----------
    box_score : path of the HTML file to open.

    Returns
    -------
    The BeautifulSoup document built from the file's text, with every
    ``tr.over_header`` and ``tr.thead`` element removed.
    """
    with open(box_score) as page:
        markup = page.read()

    soup = BeautifulSoup(markup)

    # Drop the header rows in place so they are no longer part of the
    # document that is later handed to the table readers.
    for selector in ("tr.over_header", "tr.thead"):
        for row in soup.select(selector):
            row.decompose()

    return soup
    

### Parse Line Score

- Takes in a BeautifulSoup object to parse the total points/teams playing

In [4]:
def read_line_score(soup):
    """Read the ``line_score`` table and keep only its first and last columns.

    Parameters
    ----------
    soup : object whose ``str()`` is the page HTML (e.g. the document
        returned by ``parse_html``).

    Returns
    -------
    pd.DataFrame
        Two columns: ``team`` (the table's first column) and ``total``
        (the table's last column), one row per table row.
    """
    # Hand read_html a file-like object: passing the markup itself as a
    # string is deprecated in pandas >= 2.1.
    markup = StringIO(str(soup))
    line_score = pd.read_html(markup, attrs={"id": "line_score"})[0]

    # Only the first and last headers are renamed; the columns in between
    # are discarded by the selection below.
    cols = list(line_score.columns)
    cols[0] = "team"
    cols[-1] = "total"
    line_score.columns = cols

    return line_score[["team", "total"]]

### Read Stats

- Reads in the boxscore HTML table and converts it to a dataframe using Pandas

In [5]:
def read_stats(soup, team, stat) -> pd.DataFrame:
    """Read one team's box-score table and coerce every column to numbers.

    Parameters
    ----------
    soup : object whose ``str()`` is the page HTML.
    team : team label used in the table id.
    stat : table kind used in the table id (the notebook passes ``"basic"``
        and ``"advanced"``).

    Returns
    -------
    pd.DataFrame
        The table with id ``box-{team}-game-{stat}``, indexed by its first
        column. Cells that cannot be parsed as numbers become NaN.
    """
    # Hand read_html a file-like object: passing the markup itself as a
    # string is deprecated in pandas >= 2.1.
    markup = StringIO(str(soup))
    df = pd.read_html(markup, attrs={"id": f"box-{team}-game-{stat}"}, index_col=0)[0]

    # errors="coerce" turns non-numeric cells into NaN instead of raising.
    return df.apply(pd.to_numeric, errors="coerce")

### Read Season Info

- Returns the particular season info of the game we are currently parsing

In [6]:
def read_season_info(soup) -> str:
    """Return the season label found in the page's bottom navigation.

    Takes the first element matching ``#bottom_nav_container``, collects the
    ``href`` of every ``<a>`` inside it, and returns the part of the second
    link's file name that precedes the first underscore.

    Parameters
    ----------
    soup : parsed page exposing ``select``.

    Returns
    -------
    str
        The season label (a string, not a DataFrame as the previous
        annotation claimed).

    Raises
    ------
    IndexError
        If the container is missing or holds fewer than two links.
    """
    nav = soup.select("#bottom_nav_container")[0]
    hrefs = [a["href"] for a in nav.find_all("a")]
    # NOTE(review): relies on the second link being the season link;
    # confirm against the page layout.
    season = os.path.basename(hrefs[1]).split("_")[0]
    return season

In [7]:
# Collects one single-row DataFrame per parsed box score.
games = []

for box_score in box_scores:
    soup = parse_html(box_score)
    line_score = read_line_score(soup)

    # Team labels in the order they appear in the line-score table.
    teams = list(line_score["team"])
    summaries = []
    # Maps the positional key ("team0", "team1", ...) to the team label.
    team_mapping = {}

    for i, team in enumerate(teams):
        team_mapping[f"team{i}"] = team
        basic = read_stats(soup, team, "basic")
        advanced = read_stats(soup, team, "advanced")

        # only take the final row in the DataFrame and concatenate basic and advanced stats
        totals = pd.concat([basic.iloc[-1,:], advanced.iloc[-1,:]])

        # Lower-case each stat name and tag it with the team's position, e.g. "pts_team0".
        totals.index = totals.index.str.lower() + f"_team{i}"

        # Added after the rename above so this key stays exactly "team{i}".
        totals[f"team{i}"] = team
        summaries.append(totals)

    # One long Series holding every team's suffixed stats for this game.
    game = pd.concat(summaries, axis=0)
    # team0 wins only on strictly more points; otherwise team1 is recorded.
    game["winner"] = team_mapping["team0"] if game["pts_team0"] > game["pts_team1"] else team_mapping["team1"]
    # Turn the Series into a one-row DataFrame.
    game = game.to_frame().T

    game["season"] = read_season_info(soup)
    # The first 8 characters of the file name are parsed as a YYYYMMDD date.
    game["date"] = os.path.basename(box_score)[:8]
    game["date"] = pd.to_datetime(game["date"], format="%Y%m%d")
    games.append(game)

    # Progress report after every 100 parsed games.
    if len(games) % 100 == 0:
        print(f"{len(games)} / {len(box_scores)}")

100 / 1793
200 / 1793
300 / 1793
400 / 1793
500 / 1793
600 / 1793
700 / 1793
800 / 1793
900 / 1793
1000 / 1793
1100 / 1793
1200 / 1793
1300 / 1793
1400 / 1793
1500 / 1793
1600 / 1793
1700 / 1793


### Drop Unnecessary Columns

In [53]:
# Columns removed from every game frame. Not every frame has all of them,
# which is why the drop below ignores missing labels.
UNNEEDED_COLUMNS = [
    "unnamed: 16_team1",
    "unnamed: 16_team0",
    "bpm_team1",
    "bpm_team0",
]

# errors="ignore" replaces the four separate `if name in columns` guards;
# building new frames (instead of inplace=True) keeps this cell safe to re-run.
games = [game.drop(columns=UNNEEDED_COLUMNS, errors="ignore") for game in games]

### Find Extraneous Columns

In [97]:
# The first game's column set is the reference every other game is compared to.
unique_columns = set(games[0].columns)

# Print the position of every game whose columns differ from the reference;
# no output means all frames share the same columns.
# (The former `i = 0` was dead code: enumerate rebinds `i` on each iteration.)
for i, game in enumerate(games):
    if set(game.columns) != unique_columns:
        print(i)

In [98]:
# Bare expressions that are not the last line of a cell are evaluated but not displayed.
games[0].columns
# Reload the previously saved frame and discard the index column that was written with it.
master_df = pd.read_csv("master_df.csv").drop(columns="Unnamed: 0")
master_df
# Stack the per-game one-row frames into a single frame (rows are appended).
games_df = pd.concat(games)

games_df

Unnamed: 0,mp_team0,fg_team0,fga_team0,fg%_team0,3p_team0,3pa_team0,3p%_team0,ft_team0,fta_team0,ft%_team0,...,stl%_team1,blk%_team1,tov%_team1,usg%_team1,ortg_team1,drtg_team1,team1,winner,season,date
Team Totals,240.0,43.0,83.0,0.518,15.0,37.0,0.405,17.0,23.0,0.739,...,6.3,6.5,11.7,100.0,115.3,123.7,BRK,BOS,2024,2024-02-13
Team Totals,240.0,41.0,84.0,0.488,12.0,38.0,0.316,23.0,25.0,0.92,...,6.0,13.0,18.1,100.0,109.3,117.3,ATL,MIA,2024,2023-11-11
Team Totals,240.0,48.0,98.0,0.49,15.0,38.0,0.395,14.0,17.0,0.824,...,2.1,10.0,5.2,100.0,116.8,129.3,CLE,IND,2024,2023-10-28
Team Totals,240.0,43.0,89.0,0.483,18.0,41.0,0.439,24.0,25.0,0.96,...,8.2,18.8,13.1,100.0,94.5,131.4,PHI,NYK,2024,2024-01-05
Team Totals,240.0,31.0,91.0,0.341,8.0,40.0,0.2,16.0,22.0,0.727,...,3.1,9.8,11.1,100.0,108.1,88.5,MIA,MIA,2023,2023-05-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Team Totals,240.0,40.0,94.0,0.426,17.0,45.0,0.378,16.0,25.0,0.64,...,4.9,14.3,12.3,100.0,117.2,111.3,MIA,MIA,2024,2023-12-25
Team Totals,265.0,38.0,91.0,0.418,11.0,35.0,0.314,27.0,34.0,0.794,...,4.4,12.5,8.6,100.0,104.6,100.2,MIN,MIN,2023,2023-02-01
Team Totals,240.0,32.0,89.0,0.36,9.0,43.0,0.209,18.0,21.0,0.857,...,8.9,13.0,9.5,100.0,114.0,90.2,GSW,GSW,2023,2023-03-02
Team Totals,240.0,43.0,87.0,0.494,10.0,27.0,0.37,21.0,27.0,0.778,...,8.7,13.3,12.2,100.0,106.1,112.9,SAS,MIN,2024,2023-11-10


In [4]:
# Inspection cell: only the final expression of a cell is displayed.
# NOTE(review): the recorded output is a NameError, i.e. this cell was last
# run in a kernel where games_df had not been created yet.
games_df.columns

games_df

NameError: name 'games_df' is not defined

In [92]:
# Inspection cell: only the final expression (the frame itself) is displayed.
master_df.columns

master_df

Unnamed: 0,mp_team0,fg_team0,fga_team0,fg%_team0,3p_team0,3pa_team0,3p%_team0,ft_team0,fta_team0,ft%_team0,...,stl%_team1,blk%_team1,tov%_team1,usg%_team1,ortg_team1,drtg_team1,team1,winner,season,date
0,240.0,47.0,97.0,0.485,15.0,41.0,0.366,13.0,19.0,0.684,...,4.9,16.1,16.0,100.0,118.6,119.6,GSW,HOU,2018,2017-10-17
1,240.0,36.0,88.0,0.409,8.0,32.0,0.250,19.0,25.0,0.760,...,3.0,7.1,15.3,100.0,102.7,99.7,CLE,CLE,2018,2017-10-17
2,240.0,33.0,83.0,0.398,12.0,45.0,0.267,27.0,29.0,0.931,...,5.2,10.5,15.5,100.0,103.9,109.1,SAC,HOU,2018,2017-10-18
3,240.0,29.0,73.0,0.397,9.0,30.0,0.300,23.0,29.0,0.793,...,14.2,7.0,7.3,100.0,103.6,91.4,DET,DET,2018,2017-10-18
4,240.0,48.0,94.0,0.511,9.0,18.0,0.500,12.0,15.0,0.800,...,5.1,5.3,13.6,100.0,112.6,118.7,DAL,ATL,2018,2017-10-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7597,240.0,45.0,84.0,0.536,17.0,35.0,0.486,12.0,14.0,0.857,...,6.4,10.2,8.4,100.0,131.1,127.9,BOS,BOS,2024,2024-03-20
7598,240.0,41.0,95.0,0.432,13.0,37.0,0.351,7.0,18.0,0.389,...,7.1,20.7,19.7,100.0,116.7,103.5,PHO,PHO,2024,2024-03-20
7599,240.0,50.0,88.0,0.568,11.0,29.0,0.379,11.0,18.0,0.611,...,7.8,1.7,12.7,100.0,100.7,119.3,DET,IND,2024,2024-03-20
7600,240.0,38.0,83.0,0.458,13.0,36.0,0.361,18.0,19.0,0.947,...,3.4,4.3,11.5,100.0,117.9,121.3,CLE,MIA,2024,2024-03-20


In [3]:
# Stack the first five rows of each frame under a fresh 0..n-1 index
# (presumably a check that the two frames line up; confirm intent).
# NOTE(review): the recorded output is a NameError because games_df was not defined when this ran.
combined_df_test = pd.concat([games_df.head(), master_df.head()], axis=0, ignore_index=True)

NameError: name 'games_df' is not defined

In [39]:
# Order games chronologically, then renumber rows 0..n-1 and discard the old index.
games_df = games_df.sort_values(by="date").reset_index(drop=True)

In [40]:
# Display the date-sorted, re-indexed frame.
games_df 

Unnamed: 0,mp_team0,fg_team0,fga_team0,fg%_team0,3p_team0,3pa_team0,3p%_team0,ft_team0,fta_team0,ft%_team0,...,stl%_team1,blk%_team1,tov%_team1,usg%_team1,ortg_team1,drtg_team1,team1,winner,season,date
0,240.0,47.0,97.0,0.485,15.0,41.0,0.366,13.0,19.0,0.684,...,4.9,16.1,16.0,100.0,118.6,119.6,GSW,HOU,2018,2017-10-17
1,240.0,36.0,88.0,0.409,8.0,32.0,0.25,19.0,25.0,0.76,...,3.0,7.1,15.3,100.0,102.7,99.7,CLE,CLE,2018,2017-10-17
2,240.0,33.0,83.0,0.398,12.0,45.0,0.267,27.0,29.0,0.931,...,5.2,10.5,15.5,100.0,103.9,109.1,SAC,HOU,2018,2017-10-18
3,240.0,29.0,73.0,0.397,9.0,30.0,0.3,23.0,29.0,0.793,...,14.2,7.0,7.3,100.0,103.6,91.4,DET,DET,2018,2017-10-18
4,240.0,48.0,94.0,0.511,9.0,18.0,0.5,12.0,15.0,0.8,...,5.1,5.3,13.6,100.0,112.6,118.7,DAL,ATL,2018,2017-10-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7597,240.0,45.0,84.0,0.536,17.0,35.0,0.486,12.0,14.0,0.857,...,6.4,10.2,8.4,100.0,131.1,127.9,BOS,BOS,2024,2024-03-20
7598,240.0,41.0,95.0,0.432,13.0,37.0,0.351,7.0,18.0,0.389,...,7.1,20.7,19.7,100.0,116.7,103.5,PHO,PHO,2024,2024-03-20
7599,240.0,50.0,88.0,0.568,11.0,29.0,0.379,11.0,18.0,0.611,...,7.8,1.7,12.7,100.0,100.7,119.3,DET,IND,2024,2024-03-20
7600,240.0,38.0,83.0,0.458,13.0,36.0,0.361,18.0,19.0,0.947,...,3.4,4.3,11.5,100.0,117.9,121.3,CLE,MIA,2024,2024-03-20


In [34]:
# Persist the combined frame. to_csv also writes the index as an unnamed first
# column, which is why the cell that reads master_df.csv back drops "Unnamed: 0".
games_df.to_csv("master_df.csv")

## Encoding Data

- label encoding --> used for now, but it may be a poor fit because it implies an ordering between teams that does not exist
- one-hot encoding --> this one will add 30 columns to the df, is this what we want?

In [None]:
# need to confirm previous years do not have different team encodings

# Integer label for each team abbreviation, grouped by division.
team_encoding = { 
    # ATLANTIC
    "TOR": 1,
    "BOS": 2,
    "NYK": 3, 
    "BRK": 4,
    "PHI": 5,

    # CENTRAL
    "CLE": 6,
    "IND": 7,
    "DET": 8,
    "CHI": 9,
    "MIL": 10,

    # SOUTHEAST
    "MIA": 11,
    "ATL": 12,
    "CHO": 13,
    "WAS": 14,
    "ORL": 15,

    # NORTHWEST
    "OKC": 16,
    "POR": 17,
    "UTA": 18,
    "DEN": 19,
    "MIN": 20,

    # PACIFIC
    "GSW": 21, 
    "LAC": 22,
    "SAC": 23,
    "PHO": 24,
    "LAL": 25,

    # SOUTH WEST
    "SAS": 26,
    "DAL": 27,
    "MEM": 28,
    "HOU": 29,
    "NOP": 30
}

# Encode into a NEW frame. `replace` without inplace returns a copy, so
# games_df keeps its team abbreviations. The previous
# `encoded_games_df = games_df` only aliased the frame, so the in-place
# replace silently overwrote games_df as well.
encoded_games_df = games_df.replace(team_encoding)

encoded_games_df



### Cumulative Average Logic    

In [41]:
"""
Goal: We want to calculate the cumulative average of each column in the dataframe. We define the cumulative average as the
average of a particular stat (say, points) over a team's games in the season up to, but not including, the current game.

Implications: This means that for the first game of each team of each season, each column will have a 0. 

Algorithm:
- We first need to identify the first date of each season we're parsing. This is because the cumulative average of each 
stat will be reset for each season

- For each season, we will leverage a hashmap to store each team's cumulative average. Quick Demo:

{
    Raptors: {
        num_games: 2
        cum_pts: 108
        cum_fgs: 12
        cum_rbs: 20
    }
}

- In this hashmap, we will update num_games every time that team plays a game.

"""

# Placeholder: the cell only prints a marker; the cumulative-average algorithm
# described in the string above is not implemented in this cell.
print("Hello")




Hello
