### Import Frameworks + Libraries

In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import os


### Pointer Logic

- Create pointer variables to save .html files to a particular directory

In [3]:
SCORE_DIR = "data/scores"
box_scores = os.listdir(SCORE_DIR)
box_scores = [os.path.join(SCORE_DIR, f) for f in box_scores if f.endswith(".html")]

### Parse HTML Function

- Takes in a particular HTML file for a specific game, and removes excess table data that is not required in the df (s.decompose())

In [4]:
def parse_html(box_score):
    with open(box_score) as f:
        html = f.read()

    soup = BeautifulSoup(html)

    [s.decompose() for s in soup.select("tr.over_header")]
    [s.decompose() for s in soup.select("tr.thead")]

    return soup
    

### Parse Line Score

- Takes in a BeautifulSoup object to parse the total points/teams playing

In [5]:
def read_line_score(soup):
    line_score = pd.read_html(str(soup), attrs={"id": "line_score"})[0]
    cols = list(line_score.columns)
    cols[0] = "team"
    cols[-1] = "total"
    line_score.columns = cols

    line_score = line_score[["team", "total"]]
    return line_score

### Read Stats

- Reads in the boxscore HTML table and converts it to a dataframe using Pandas

In [6]:
def read_stats(soup, team, stat) -> pd.DataFrame:
    df = pd.read_html(str(soup), attrs={"id": f"box-{team}-game-{stat}"}, index_col=0)[0]
    df = df.apply(pd.to_numeric, errors="coerce")

    return df

### Read Season Info

- Returns the particular season info of the game we are currently parsing

In [7]:
def read_season_info(soup) -> pd.DataFrame:
    nav = soup.select("#bottom_nav_container")[0]
    hrefs = [a["href"] for a in nav.find_all("a")]
    season = os.path.basename(hrefs[1]).split("_")[0]
    return season

In [8]:
games = []

for box_score in box_scores:
    soup = parse_html(box_score)
    line_score = read_line_score(soup)

    teams = list(line_score["team"])
    summaries = []
    team_mapping = {}

    for i, team in enumerate(teams):
        team_mapping[f"team{i}"] = team
        basic = read_stats(soup, team, "basic")
        advanced = read_stats(soup, team, "advanced")

        # only take the final row in the DataFrame and concatenate basic and advanced stats 
        totals = pd.concat([basic.iloc[-1,:], advanced.iloc[-1,:]])

        totals.index = totals.index.str.lower() + f"_team{i}"
            
        totals[f"team{i}"] = team
        summaries.append(totals)

    game = pd.concat(summaries, axis=0)
    game["winner"] = team_mapping["team0"] if game["pts_team0"] > game["pts_team1"] else team_mapping["team1"]
    game = game.to_frame().T

    game["season"] = read_season_info(soup)
    game["date"] = os.path.basename(box_score)[:8]
    game["date"] = pd.to_datetime(game["date"], format="%Y%m%d")
    games.append(game)

    if len(games) % 100 == 0:
        print(f"{len(games)} / {len(box_scores)}")

100 / 7602
200 / 7602
300 / 7602
400 / 7602
500 / 7602
600 / 7602
700 / 7602
800 / 7602
900 / 7602
1000 / 7602
1100 / 7602
1200 / 7602
1300 / 7602
1400 / 7602
1500 / 7602
1600 / 7602
1700 / 7602
1800 / 7602
1900 / 7602
2000 / 7602
2100 / 7602
2200 / 7602
2300 / 7602
2400 / 7602
2500 / 7602
2600 / 7602
2700 / 7602
2800 / 7602
2900 / 7602
3000 / 7602
3100 / 7602
3200 / 7602
3300 / 7602
3400 / 7602
3500 / 7602
3600 / 7602
3700 / 7602
3800 / 7602
3900 / 7602
4000 / 7602
4100 / 7602
4200 / 7602
4300 / 7602
4400 / 7602
4500 / 7602
4600 / 7602
4700 / 7602
4800 / 7602
4900 / 7602
5000 / 7602
5100 / 7602
5200 / 7602
5300 / 7602
5400 / 7602
5500 / 7602
5600 / 7602
5700 / 7602
5800 / 7602
5900 / 7602
6000 / 7602
6100 / 7602
6200 / 7602
6300 / 7602
6400 / 7602
6500 / 7602
6600 / 7602
6700 / 7602
6800 / 7602
6900 / 7602
7000 / 7602
7100 / 7602
7200 / 7602
7300 / 7602
7400 / 7602
7500 / 7602
7600 / 7602


### Drop Unnecessary Columns

In [35]:
for game in games:
    columns = game.columns
    
    if "unnamed: 16_team1" in columns:
        game.drop("unnamed: 16_team1", axis=1, inplace=True)
    if "unnamed: 16_team0" in columns:
        game.drop("unnamed: 16_team0", axis=1, inplace=True)
    if "bpm_team1" in columns:
        game.drop("bpm_team1", axis=1, inplace=True)
    if "bpm_team0" in columns:
        game.drop("bpm_team0", axis=1, inplace=True)

### Find Extraneous Columns

In [37]:
i = 0
unique_columns = set(games[0].columns)

for i, game in enumerate(games):
    if set(game.columns) != unique_columns:
        print(i)

In [38]:
games_df = pd.concat(games, axis=0)
games_df.reset_index(drop=True, inplace=True)

games_df

Unnamed: 0,mp_team0,fg_team0,fga_team0,fg%_team0,3p_team0,3pa_team0,3p%_team0,ft_team0,fta_team0,ft%_team0,...,stl%_team1,blk%_team1,tov%_team1,usg%_team1,ortg_team1,drtg_team1,team1,winner,season,date
0,240.0,38.0,72.0,0.528,16.0,37.0,0.432,18.0,21.0,0.857,...,11.7,5.7,9.9,100.0,114.1,117.3,MIA,POR,2023,2022-11-07
1,240.0,41.0,78.0,0.526,8.0,24.0,0.333,15.0,19.0,0.789,...,2.3,5.6,12.3,100.0,102.7,119.8,DAL,CLE,2023,2022-12-14
2,240.0,37.0,87.0,0.425,7.0,33.0,0.212,32.0,35.0,0.914,...,9.5,9.3,8.6,100.0,119.4,107.1,TOR,TOR,2023,2022-12-07
3,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,23.0,0.739,...,8.5,16.7,10.2,100.0,113.3,104.8,SAS,SAS,2018,2017-10-18
4,240.0,27.0,86.0,0.314,6.0,26.0,0.231,15.0,20.0,0.75,...,10.2,13.3,7.0,100.0,94.0,76.7,MEM,MEM,2021,2021-04-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7597,240.0,32.0,89.0,0.36,9.0,43.0,0.209,18.0,21.0,0.857,...,8.9,13.0,9.5,100.0,114.0,90.2,GSW,GSW,2023,2023-03-02
7598,240.0,49.0,85.0,0.576,12.0,27.0,0.444,16.0,18.0,0.889,...,10.1,10.3,10.7,100.0,128.8,127.8,SAC,SAC,2023,2022-12-28
7599,240.0,40.0,89.0,0.449,6.0,32.0,0.188,23.0,34.0,0.676,...,3.9,5.3,11.6,100.0,91.8,105.4,NYK,MIL,2019,2018-12-25
7600,240.0,34.0,95.0,0.358,7.0,33.0,0.212,17.0,23.0,0.739,...,3.1,4.8,12.6,100.0,109.8,93.6,LAC,LAC,2019,2018-10-19


In [39]:
games_df = games_df.sort_values(by="date")
games_df = games_df.reset_index(drop=True)

In [40]:
games_df 

Unnamed: 0,mp_team0,fg_team0,fga_team0,fg%_team0,3p_team0,3pa_team0,3p%_team0,ft_team0,fta_team0,ft%_team0,...,stl%_team1,blk%_team1,tov%_team1,usg%_team1,ortg_team1,drtg_team1,team1,winner,season,date
0,240.0,47.0,97.0,0.485,15.0,41.0,0.366,13.0,19.0,0.684,...,4.9,16.1,16.0,100.0,118.6,119.6,GSW,HOU,2018,2017-10-17
1,240.0,36.0,88.0,0.409,8.0,32.0,0.25,19.0,25.0,0.76,...,3.0,7.1,15.3,100.0,102.7,99.7,CLE,CLE,2018,2017-10-17
2,240.0,33.0,83.0,0.398,12.0,45.0,0.267,27.0,29.0,0.931,...,5.2,10.5,15.5,100.0,103.9,109.1,SAC,HOU,2018,2017-10-18
3,240.0,29.0,73.0,0.397,9.0,30.0,0.3,23.0,29.0,0.793,...,14.2,7.0,7.3,100.0,103.6,91.4,DET,DET,2018,2017-10-18
4,240.0,48.0,94.0,0.511,9.0,18.0,0.5,12.0,15.0,0.8,...,5.1,5.3,13.6,100.0,112.6,118.7,DAL,ATL,2018,2017-10-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7597,240.0,45.0,84.0,0.536,17.0,35.0,0.486,12.0,14.0,0.857,...,6.4,10.2,8.4,100.0,131.1,127.9,BOS,BOS,2024,2024-03-20
7598,240.0,41.0,95.0,0.432,13.0,37.0,0.351,7.0,18.0,0.389,...,7.1,20.7,19.7,100.0,116.7,103.5,PHO,PHO,2024,2024-03-20
7599,240.0,50.0,88.0,0.568,11.0,29.0,0.379,11.0,18.0,0.611,...,7.8,1.7,12.7,100.0,100.7,119.3,DET,IND,2024,2024-03-20
7600,240.0,38.0,83.0,0.458,13.0,36.0,0.361,18.0,19.0,0.947,...,3.4,4.3,11.5,100.0,117.9,121.3,CLE,MIA,2024,2024-03-20


In [34]:
games_df.to_csv("master_df.csv")

## Encoding Data

- label encoding --> went with this one for now, but am not sure if this is good since it implies some type of ordering which is not true
- one-hot encoding --> this one will add 30 columns to the df, is this what we want?

In [None]:
# need to confirm previous years do not have different team encodings

team_encoding = { 
    # ATLANTIC
    "TOR": 1,
    "BOS": 2,
    "NYK": 3, 
    "BRK": 4,
    "PHI": 5,

    # CENTRAL
    "CLE": 6,
    "IND": 7,
    "DET": 8,
    "CHI": 9,
    "MIL": 10,

    # SOUTHEAST
    "MIA": 11,
    "ATL": 12,
    "CHO": 13,
    "WAS": 14,
    "ORL": 15,

    # NORTHWEST
    "OKC": 16,
    "POR": 17,
    "UTA": 18,
    "DEN": 19,
    "MIN": 20,

    # PACIFIC
    "GSW": 21, 
    "LAC": 22,
    "SAC": 23,
    "PHO": 24,
    "LAL": 25,

    # SOUTH WEST
    "SAS": 26,
    "DAL": 27,
    "MEM": 28,
    "HOU": 29,
    "NOP": 30
}

# update those values
encoded_games_df = games_df

encoded_games_df.replace(team_encoding, inplace=True)

encoded_games_df



### Cumulative Average Logic    

In [None]:
"""
Goal: We want to calculative the cumulative average of each column in the dataframe. We define the cumulative average as the
average of each particular stat (let's say points) up until (and not including) the game. 

Implications: This means that for the first game of each team of each season, each column will have a 0. 

Algorithm:
- We first need to identify the first date of each season we're parsing. This is because the cumulative average of each 
stat will be reset for each season

- For each season, we will leverage a hashmap to store each team's cumulative average. Quick Demo:

{
    Raptors: {
        num_games: 2
        cum_pts: 108
        cum_fgs: 12
        cum_rbs: 20
    }
}

- In this hashmap, we will be updating num_games everytime the particular team plays the game. 

"""



