# Transform Stage Prototype

In [1]:
from pathlib import Path
import time
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

from config import db_username, db_password, db_endpoint, port

<hr>

### Read in the CSVs

In [2]:
data_folder = Path("data")

# Team Stats -------------------------------------------------------------------

# The manifest file holds one entry per line: the stat type names come first,
# followed by the seasons.
NUM_STAT_TYPES = 4

with open(data_folder / "stat_types_and_seasons.txt", "r") as opened_file:
    # splitlines() does not depend on the file ending with a newline, so the
    # last season is never dropped; blank lines are ignored.
    data = [line.strip() for line in opened_file.read().splitlines() if line.strip()]
stat_types = data[:NUM_STAT_TYPES]
seasons = data[NUM_STAT_TYPES:]

# One dict of {stat_type: DataFrame} per season type. Regular-season files have
# no filename prefix; playoff files are prefixed with "playoffs_".
all_stats = []
for filename_prefix in ("", "playoffs_"):
    season_type_stats = {}
    for stat_type in stat_types:
        # "Teams General Advanced" -> "teams_general_advanced"
        stat_type_underscores = "_".join(stat_type.split()).lower()

        # Create filepath for reading from
        table_filepath = data_folder / f"{filename_prefix}{stat_type_underscores}.csv"

        # Read the DataFrame from file
        season_type_stats[stat_type] = pd.read_csv(table_filepath)
    all_stats.append(season_type_stats)
team_stats, team_stats_playoffs = all_stats

# Playoffs ---------------------------------------------------------------------
playoff_teams_filepath = data_folder / "playoff_teams_df.csv"
playoff_teams_df = pd.read_csv(playoff_teams_filepath)

# Champions --------------------------------------------------------------------
champions_filepath = data_folder / "champions_df.csv"
# The SEASON column is used as the index, and the index is then left unnamed.
champions_df = pd.read_csv(champions_filepath, index_col="SEASON")
champions_df.index.name = None

### Test That the Data Loaded Correctly

In [3]:
# Report the row count of every regular-season stat table, then preview one of
# the tables as the cell output.
for checked_stat_type in (
    "Teams General Traditional",
    "Teams General Advanced",
    "Teams General Misc",
    "Teams Clutch Traditional",
):
    print(f"{checked_stat_type} length:", len(team_stats[checked_stat_type]))
team_stats["Teams General Advanced"]

Teams General Traditional length: 772
Teams General Advanced length: 772
Teams General Misc length: 772
Teams Clutch Traditional length: 772


Unnamed: 0,TEAM,GP,W,L,MIN,OFFRTG,DEFRTG,NETRTG,AST%,AST/TO,...,OREB%,DREB%,REB%,TOV%,EFG%,TS%,PACE,PIE,POSS,SEASON
0,Phoenix Suns,82,64,18,3946.0,114.2,106.8,7.5,62.7,2.12,...,26.4,72.8,50.3,12.9,54.9,58.1,100.26,54.8,8242,2021-22
1,Memphis Grizzlies,82,56,26,3956.0,114.3,108.9,5.3,59.7,1.97,...,33.8,72.6,52.6,13.0,52.2,55.3,100.52,53.0,8295,2021-22
2,Golden State Warriors,82,53,29,3946.0,112.1,106.6,5.5,66.9,1.82,...,26.9,73.6,51.0,15.0,55.2,58.2,98.74,53.6,8121,2021-22
3,Miami Heat,82,53,29,3971.0,113.0,108.4,4.5,64.4,1.75,...,27.8,73.5,51.0,14.9,54.7,58.4,96.53,52.9,7987,2021-22
4,Dallas Mavericks,82,52,30,3951.0,112.5,109.1,3.5,59.5,1.87,...,25.6,73.3,49.6,13.0,53.8,57.2,95.64,51.1,7871,2021-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,Philadelphia 76ers,82,22,60,3956.0,102.6,109.5,-6.9,56.4,1.18,...,35.9,65.4,50.4,17.9,47.0,51.8,97.05,45.3,8006,1996-97
768,Denver Nuggets,82,21,61,3986.0,103.1,109.7,-6.6,64.4,1.39,...,32.4,66.6,49.6,17.5,48.6,53.0,93.67,46.1,7776,1996-97
769,San Antonio Spurs,82,20,62,3946.0,102.1,110.8,-8.8,58.8,1.34,...,34.4,64.3,49.0,17.1,47.2,51.2,88.45,44.0,7268,1996-97
770,Boston Celtics,82,15,67,3981.0,102.9,110.1,-7.2,58.4,1.34,...,32.9,65.8,48.2,16.7,47.4,52.0,96.78,43.4,8014,1996-97


In [4]:
# Report the row count of every playoff stat table, then preview one of the
# tables as the cell output.
for checked_stat_type in (
    "Teams General Traditional",
    "Teams General Advanced",
    "Teams General Misc",
    "Teams Clutch Traditional",
):
    print(f"Playoffs {checked_stat_type} length:", len(team_stats_playoffs[checked_stat_type]))
team_stats_playoffs["Teams General Advanced"]

Playoffs Teams General Traditional length: 416
Playoffs Teams General Advanced length: 416
Playoffs Teams General Misc length: 416
Playoffs Teams Clutch Traditional length: 405


Unnamed: 0,TEAM,GP,W,L,MIN,OFFRTG,DEFRTG,NETRTG,AST%,AST/TO,...,OREB%,DREB%,REB%,TOV%,EFG%,TS%,PACE,PIE,POSS,SEASON
0,Boston Celtics,3,3,0,144.0,118.6,114.5,4.1,62.6,1.79,...,35.4,68.0,51.2,15.1,56.7,60.7,94.67,54.5,285.0,2021-22
1,Golden State Warriors,4,3,1,192.0,125.4,116.2,9.3,66.5,2.09,...,26.2,65.6,46.6,14.1,62.7,64.9,97.25,54.7,389.0,2021-22
2,Miami Heat,4,3,1,192.0,118.4,104.0,14.5,66.7,2.08,...,30.4,77.5,53.7,13.7,55.3,58.7,94.75,58.1,380.0,2021-22
3,Milwaukee Bucks,4,3,1,192.0,107.7,94.2,13.5,58.8,1.54,...,22.9,80.9,53.2,15.2,54.8,56.8,100.13,55.8,402.0,2021-22
4,Philadelphia 76ers,4,3,1,197.0,120.7,111.7,9.0,60.3,1.52,...,29.2,71.1,52.5,16.1,58.3,63.2,91.01,55.5,372.0,2021-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,Portland Trail Blazers,4,1,3,192.0,102.6,109.9,-7.4,60.3,1.27,...,29.9,66.5,47.8,17.7,47.9,52.8,87.75,41.8,350.0,1996-97
412,Charlotte Hornets,3,0,3,144.0,107.1,117.7,-10.6,50.5,1.38,...,35.5,74.8,52.3,14.9,49.2,53.5,89.00,42.0,268.0,1996-97
413,Los Angeles Clippers,3,0,3,144.0,104.9,118.9,-13.9,50.0,1.36,...,26.8,63.5,43.2,13.6,47.6,52.7,88.17,36.2,264.0,1996-97
414,Minnesota Timberwolves,3,0,3,144.0,111.2,122.9,-11.7,60.0,2.57,...,31.6,63.6,46.1,10.4,48.7,51.6,90.00,42.5,269.0,1996-97


In [5]:
# Preview the playoff teams table that was read from playoff_teams_df.csv.
playoff_teams_df

Unnamed: 0,2021-22,2020-21,2019-20,2018-19,2017-18,2016-17,2015-16,2014-15,2013-14,2012-13,...,2005-06,2004-05,2003-04,2002-03,2001-02,2000-01,1999-00,1998-99,1997-98,1996-97
0,Boston Celtics,Milwaukee Bucks,Los Angeles Lakers,Toronto Raptors,Golden State Warriors,Golden State Warriors,Cleveland Cavaliers,Golden State Warriors,San Antonio Spurs,Miami Heat,...,Miami Heat,San Antonio Spurs,Detroit Pistons,San Antonio Spurs,Los Angeles Lakers,Los Angeles Lakers,Los Angeles Lakers,San Antonio Spurs,Chicago Bulls,Chicago Bulls
1,Golden State Warriors,Phoenix Suns,Miami Heat,Golden State Warriors,Cleveland Cavaliers,Cleveland Cavaliers,Golden State Warriors,Cleveland Cavaliers,Miami Heat,San Antonio Spurs,...,Dallas Mavericks,Detroit Pistons,Los Angeles Lakers,New Jersey Nets,New Jersey Nets,Philadelphia 76ers,Indiana Pacers,New York Knicks,Utah Jazz,Utah Jazz
2,Miami Heat,Atlanta Hawks,Boston Celtics,Milwaukee Bucks,Houston Rockets,Boston Celtics,Oklahoma City Thunder,Houston Rockets,Indiana Pacers,Indiana Pacers,...,Detroit Pistons,Miami Heat,Indiana Pacers,Dallas Mavericks,Sacramento Kings,Milwaukee Bucks,Portland Trail Blazers,Indiana Pacers,Indiana Pacers,Houston Rockets
3,Milwaukee Bucks,LA Clippers,Denver Nuggets,Portland Trail Blazers,Boston Celtics,San Antonio Spurs,Toronto Raptors,Atlanta Hawks,Oklahoma City Thunder,Memphis Grizzlies,...,Phoenix Suns,Phoenix Suns,Minnesota Timberwolves,Detroit Pistons,Boston Celtics,San Antonio Spurs,New York Knicks,Portland Trail Blazers,Los Angeles Lakers,Miami Heat
4,Philadelphia 76ers,Brooklyn Nets,Toronto Raptors,Philadelphia 76ers,New Orleans Pelicans,Washington Wizards,Miami Heat,Los Angeles Clippers,Washington Wizards,Golden State Warriors,...,Los Angeles Clippers,Seattle SuperSonics,New Jersey Nets,Sacramento Kings,Dallas Mavericks,Charlotte Hornets,Miami Heat,Utah Jazz,Charlotte Hornets,New York Knicks
5,Dallas Mavericks,Philadelphia 76ers,LA Clippers,Denver Nuggets,Philadelphia 76ers,Houston Rockets,San Antonio Spurs,Washington Wizards,Los Angeles Clippers,New York Knicks,...,Cleveland Cavaliers,Dallas Mavericks,Sacramento Kings,Los Angeles Lakers,Charlotte Hornets,Toronto Raptors,Philadelphia 76ers,Los Angeles Lakers,San Antonio Spurs,Seattle SuperSonics
6,Memphis Grizzlies,Utah Jazz,Milwaukee Bucks,Houston Rockets,Utah Jazz,Toronto Raptors,Portland Trail Blazers,Memphis Grizzlies,Portland Trail Blazers,Oklahoma City Thunder,...,San Antonio Spurs,Indiana Pacers,San Antonio Spurs,Philadelphia 76ers,Detroit Pistons,Dallas Mavericks,Phoenix Suns,Philadelphia 76ers,New York Knicks,Los Angeles Lakers
7,Minnesota Timberwolves,Denver Nuggets,Houston Rockets,Boston Celtics,Toronto Raptors,Utah Jazz,Atlanta Hawks,Chicago Bulls,Brooklyn Nets,Chicago Bulls,...,New Jersey Nets,Washington Wizards,Miami Heat,Boston Celtics,San Antonio Spurs,Sacramento Kings,Utah Jazz,Atlanta Hawks,Seattle SuperSonics,Atlanta Hawks
8,New Orleans Pelicans,Dallas Mavericks,Oklahoma City Thunder,San Antonio Spurs,Indiana Pacers,LA Clippers,Charlotte Hornets,San Antonio Spurs,Atlanta Hawks,Brooklyn Nets,...,Los Angeles Lakers,Boston Celtics,New Orleans Hornets,Orlando Magic,Indiana Pacers,New York Knicks,Milwaukee Bucks,Detroit Pistons,Houston Rockets,Detroit Pistons
9,Phoenix Suns,Los Angeles Lakers,Utah Jazz,LA Clippers,Milwaukee Bucks,Atlanta Hawks,Indiana Pacers,Brooklyn Nets,Dallas Mavericks,Atlanta Hawks,...,Chicago Bulls,Houston Rockets,Dallas Mavericks,Portland Trail Blazers,Philadelphia 76ers,Utah Jazz,Sacramento Kings,Miami Heat,Miami Heat,Orlando Magic


In [6]:
# Move the season labels out of the index into a regular SEASON column so they
# are written to the database as data (the load step uses index=False).
# The guard keeps this cell idempotent: re-running it does not add a second
# SEASON column, and no frame is mutated in place.
if "SEASON" not in champions_df.columns:
    champions_df = champions_df.rename_axis("SEASON").reset_index()
champions_df = champions_df[["TEAM", "SEASON"]]
champions_df

Unnamed: 0,TEAM,SEASON
0,Milwaukee Bucks,2020-21
1,Los Angeles Lakers,2019-20
2,Toronto Raptors,2018-19
3,Golden State Warriors,2017-18
4,Golden State Warriors,2016-17
5,Cleveland Cavaliers,2015-16
6,Golden State Warriors,2014-15
7,San Antonio Spurs,2013-14
8,Miami Heat,2012-13
9,Miami Heat,2011-12


<br>
<hr>
<br>

## Transform Stage

### Current Variables

#### Team Stats
* `team_stats`: *dict* of the form `{<stat_type>: <DataFrame>, ...}` with the following 4 stat_type keys:
    * "Teams General Traditional"
    * "Teams General Advanced"
    * "Teams General Misc"
    * "Teams Clutch Traditional"
        
    <br>
        
    * Example Usage:
    ```python
    team_stats["Teams General Advanced"]
    ```

#### Playoff Team Stats
* `team_stats_playoffs`: *dict* of the form `{<stat_type>: <DataFrame>, ...}` with the following 4 stat_type keys:
    * "Teams General Traditional"
    * "Teams General Advanced"
    * "Teams General Misc"
    * "Teams Clutch Traditional"
        
    <br>
        
    * Example Usage:
    ```python
    team_stats_playoffs["Teams General Advanced"]
    ```
    
#### Playoff Teams
* `playoff_teams_df`: *DataFrame*, each column is a season, and the entries in a column are the playoff teams for that season

#### Champions
* `champions_df`: *DataFrame* with one row per season; the `TEAM` column is the name of the champion team and the `SEASON` column is the season it won

### Connect to the Database

In [8]:
database = "nba_stats"

# Percent-encode the credentials so that characters such as "@", ":" or "/" in
# the username or password cannot be misread as URI delimiters.
# NOTE(review): this assumes config holds the raw, unencoded credentials --
# confirm they are not already percent-encoded.
db_uri = (
    f"postgresql://{quote_plus(db_username)}:{quote_plus(db_password)}"
    f"@{db_endpoint}:{port}/{database}"
)
engine = create_engine(db_uri)

### Load: Regular Season Team Statistics

In [None]:
# perf_counter() is a monotonic clock, so the elapsed time cannot be skewed by
# system clock adjustments the way time.time() can.
start_time = time.perf_counter()
teams_gen_traditional = team_stats["Teams General Traditional"]
teams_gen_advanced = team_stats["Teams General Advanced"]
teams_gen_misc = team_stats["Teams General Misc"]
teams_clutch_trad = team_stats["Teams Clutch Traditional"]

# Destination table name -> DataFrame to write.
regular_season_tables = {
    'teams_gen_traditional': teams_gen_traditional,
    'teams_gen_advanced': teams_gen_advanced,
    'teams_gen_misc': teams_gen_misc,
    'teams_clutch_trad': teams_clutch_trad,
}

# if_exists='replace' overwrites any existing table of the same name;
# index=False keeps the DataFrame index out of the table.
for table_name, table_df in regular_season_tables.items():
    table_df.to_sql(name=table_name, con=engine, if_exists='replace', index=False)

print(f'Reg Season Done. {time.perf_counter() - start_time} total seconds elapsed')

### Load: Playoff Team Statistics

In [None]:
# perf_counter() is a monotonic clock, so the elapsed time cannot be skewed by
# system clock adjustments the way time.time() can.
start_time = time.perf_counter()
playoffs_gen_traditional = team_stats_playoffs["Teams General Traditional"]
playoffs_gen_advanced = team_stats_playoffs["Teams General Advanced"]
playoffs_gen_misc = team_stats_playoffs["Teams General Misc"]
playoffs_clutch_trad = team_stats_playoffs["Teams Clutch Traditional"]

# Destination table name -> DataFrame to write.
playoff_tables = {
    'playoffs_gen_traditional': playoffs_gen_traditional,
    'playoffs_gen_advanced': playoffs_gen_advanced,
    'playoffs_gen_misc': playoffs_gen_misc,
    'playoffs_clutch_trad': playoffs_clutch_trad,
}

# if_exists='replace' overwrites any existing table of the same name;
# index=False keeps the DataFrame index out of the table.
for table_name, table_df in playoff_tables.items():
    table_df.to_sql(name=table_name, con=engine, if_exists='replace', index=False)

print(f'Playoffs Done. {time.perf_counter() - start_time} total seconds elapsed')

### Load: Playoff Teams and Champions

In [9]:
# Both tables are written with the same settings: use the shared engine,
# overwrite any existing table, and leave the DataFrame index out.
load_options = dict(con=engine, if_exists='replace', index=False)

playoff_teams_df.to_sql(name='playoffs_teams', **load_options)
champions_df.to_sql(name='champions', **load_options)