In [1]:
import os
from pathlib import Path
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, inspect, Integer, String, DateTime, Float, SmallInteger, Boolean


# Notebook setup: load environment variables, build the MySQL engine, locate the
# raw shot CSVs and report whether the target table already exists.
load_dotenv()

# DATABASE_URL is read from the environment, but the engine below is built from
# the explicit local MySQL endpoint rather than from this value.
database_url = os.getenv("DATABASE_URL")

# Fail fast: a missing password would otherwise be formatted into the URL as the
# literal string "None" and only surface later as an authentication failure.
sql_pass = os.environ.get('MYSQL_ROOT_PASSWORD')
if sql_pass is None:
    raise RuntimeError("MYSQL_ROOT_PASSWORD is not set (check the environment or the .env file)")

# Percent-encode the password so characters such as '@', ':' or '/' cannot be
# mistaken for URL delimiters when the connection string is parsed.
db_url = f"mysql://root:{quote(sql_pass, safe='')}@127.0.0.1:3306/data_pipeline"
engine = create_engine(db_url)

# Open a single connection to prove the database is reachable, and release it
# again when the `with` block exits instead of leaving it checked out.
with engine.connect():
    print("Successfully connected to engine")

# its either /app/raw_data/nbashots or raw_data/nba_shots
path = Path("./raw_data/nbashots/")
# Sorted so the files are always processed in the same (lexicographic) order.
csv_files = sorted(path.glob('*.csv'))

table_name = "shot"
inspector = inspect(engine)
table_exists = inspector.has_table(table_name)
print("Shots table exists: ", table_exists)



Successfully connected to engine
Shots table exists:  True


In [2]:
# Column name -> SQLAlchemy column type for the shots data.
# Key order and types are unchanged; only the construction style differs.
shots_dtypes = dict(
    # who took the shot
    PLAYER_ID=Integer(),
    PLAYER_NAME=String(255),
    TEAM_ID=Integer(),
    TEAM_NAME=String(255),
    # which game
    GAME_ID=Integer(),
    GAME_DATE=DateTime(),
    # what happened
    EVENT_TYPE=String(50),
    SHOT_MADE=Boolean(),
    ACTION_TYPE=String(100),
    SHOT_TYPE=String(50),
    # where on the court
    BASIC_ZONE=String(50),
    ZONE_NAME=String(100),
    ZONE_ABB=String(20),
    ZONE_RANGE=String(50),
    LOC_X=Float(),
    LOC_Y=Float(),
    SHOT_DISTANCE=SmallInteger(),
    # when in the game
    QUARTER=SmallInteger(),
    MINS_LEFT=SmallInteger(),
    SECS_LEFT=SmallInteger(),
    # season / roster context
    SEASON_1=Integer(),
    SEASON_2=String(20),
    POSITION_GROUP=String(20),
    POSITION=String(20),
    HOME_TEAM=String(20),
    AWAY_TEAM=String(20),
)

In [3]:
# Read every CSV in 50,000-row chunks, parse GAME_DATE to datetime, and collect
# the chunks in `all_data` so they can be concatenated into one frame afterwards.
#
# NOTE(review): this loop previously also carried a (commented-out) chunked
# `to_sql` insert into the `shot` table; it was disabled, presumably because the
# table already exists. It has been dropped here together with the loop index
# and row counter that only that code used.
all_data = []
for file in csv_files:
    # The context manager closes the underlying file handle even if a chunk
    # fails to parse part-way through the file.
    with pd.read_csv(file, chunksize=50000) as reader:
        for chunk in reader:
            # Bracket assignment always targets the column; attribute-style
            # assignment is discouraged for setting DataFrame columns.
            chunk["GAME_DATE"] = pd.to_datetime(chunk["GAME_DATE"])
            all_data.append(chunk)


In [4]:
# Stack all collected chunks row-wise into one frame with a fresh 0..n-1 index.
combined_df = pd.concat(
    objs=all_data,
    axis=0,
    ignore_index=True,
)

In [5]:
# Total number of shot rows loaded across all CSV files.
len(combined_df)

4231262

In [90]:
def _distinct(columns):
    """Return the rows of `combined_df[columns]` with exact duplicates removed."""
    return combined_df[columns].drop_duplicates()


# Candidate dimension tables: one de-duplicated projection per entity.
unique_players = _distinct(['PLAYER_ID'])
unique_players_2 = _distinct(['PLAYER_ID', 'PLAYER_NAME'])
unique_teams = _distinct(['TEAM_ID', 'TEAM_NAME'])
# NOTE(review): reset_index() without drop=True keeps the old row labels as an
# extra "index" column; the other frames keep their original index — confirm
# this difference is intended.
unique_seasons = _distinct(['SEASON_1', 'SEASON_2']).reset_index()
unique_games = _distinct(
    ['GAME_ID', 'GAME_DATE', 'HOME_TEAM', 'AWAY_TEAM', 'SEASON_1', 'SEASON_2']
)
unique_zones = _distinct(['BASIC_ZONE', 'ZONE_NAME', 'ZONE_ABB', 'ZONE_RANGE'])
unique_pos = _distinct(['POSITION_GROUP', 'POSITION'])
# Abbreviations are taken from HOME_TEAM only.
unique_team_abbrev = _distinct(['HOME_TEAM'])
unique_team_abbrev

Unnamed: 0,HOME_TEAM
0,POR
8,BOS
19,SAS
50,UTA
61,MEM
63,IND
64,MIA
90,HOU
94,ORL
98,WAS


In [65]:
# Preview the distinct (TEAM_ID, TEAM_NAME) pairs.
unique_teams

Unnamed: 0,TEAM_ID,TEAM_NAME
0,1610612747,Los Angeles Lakers
1,1610612757,Portland Trail Blazers
8,1610612737,Atlanta Hawks
13,1610612738,Boston Celtics
19,1610612743,Denver Nuggets
20,1610612759,San Antonio Spurs
50,1610612762,Utah Jazz
61,1610612763,Memphis Grizzlies
63,1610612741,Chicago Bulls
64,1610612751,New Jersey Nets


In [66]:
# Number of distinct PLAYER_ID values.
len(unique_players)

2162

In [67]:
# Number of distinct (PLAYER_ID, PLAYER_NAME) pairs. A count larger than the
# number of distinct PLAYER_ID values means some ids appear with more than one name.
len(unique_players_2)

2171

In [68]:
# Find PLAYER_ID values that occur in more than one (id, name) pair.
b = unique_players_2
a = b[['PLAYER_ID']]
# keep=False flags every occurrence of a repeated id, not just the later ones.
duplicates = a.loc[a.duplicated(keep=False)]

In [69]:
# Keep only the (id, name) pairs whose PLAYER_ID was flagged as repeated.
b = b.loc[b['PLAYER_ID'].isin(duplicates['PLAYER_ID'])]


In [70]:
# Show the (PLAYER_ID, PLAYER_NAME) pairs whose id appears under more than one
# name (e.g. "O.G. Anunoby" / "OG Anunoby" in the recorded output).
b

Unnamed: 0,PLAYER_ID,PLAYER_NAME
1941170,203493,Reggie Bullock
2769262,1628384,O.G. Anunoby
2840842,1628408,P.J. Dozier
3203573,1628384,OG Anunoby
3387672,1630231,Kenyon Martin Jr.
3387825,1630197,Alekesej Pokusevski
3389330,1630214,Xavier Tillman Sr.
3579108,1630527,Brandon Boston
3593612,1630288,Jeff Dowtin
3810432,1630288,Jeff Dowtin Jr.


In [71]:
# Number of distinct (TEAM_ID, TEAM_NAME) pairs.
len(unique_teams)

36

In [72]:
# Preview the distinct (TEAM_ID, TEAM_NAME) pairs.
# NOTE(review): this repeats an earlier display of the same frame — candidate for removal.
unique_teams

Unnamed: 0,TEAM_ID,TEAM_NAME
0,1610612747,Los Angeles Lakers
1,1610612757,Portland Trail Blazers
8,1610612737,Atlanta Hawks
13,1610612738,Boston Celtics
19,1610612743,Denver Nuggets
20,1610612759,San Antonio Spurs
50,1610612762,Utah Jazz
61,1610612763,Memphis Grizzlies
63,1610612741,Chicago Bulls
64,1610612751,New Jersey Nets


In [73]:
# Find TEAM_ID values that occur in more than one (id, name) pair.
b = unique_teams
a = b[['TEAM_ID']]
# keep=False flags every occurrence of a repeated id, not just the later ones.
duplicates = a.loc[a.duplicated(keep=False)]

In [74]:
# Keep only the (id, name) pairs whose TEAM_ID was flagged as repeated.
b = b.loc[b['TEAM_ID'].isin(duplicates['TEAM_ID'])]

In [75]:
# Show the (TEAM_ID, TEAM_NAME) pairs whose id appears under more than one name
# (e.g. 1610612751 as "New Jersey Nets" and "Brooklyn Nets" in the recorded output).
b

Unnamed: 0,TEAM_ID,TEAM_NAME
64,1610612751,New Jersey Nets
103,1610612740,New Orleans Hornets
104,1610612746,Los Angeles Clippers
107,1610612760,Seattle SuperSonics
190053,1610612766,Charlotte Bobcats
387593,1610612740,New Orleans/Oklahoma City Hornets
978394,1610612760,Oklahoma City Thunder
1739304,1610612751,Brooklyn Nets
1941057,1610612740,New Orleans Pelicans
2145547,1610612766,Charlotte Hornets


In [76]:
# Number of distinct (TEAM_ID, TEAM_NAME) pairs.
# NOTE(review): this repeats an earlier cell with the same expression — candidate for removal.
len(unique_teams)

36

In [91]:
# Number of distinct (GAME_ID, GAME_DATE, HOME_TEAM, AWAY_TEAM, SEASON_1, SEASON_2) rows.
len(unique_games)

25227

In [92]:
# Set up the GAME_ID uniqueness check: `b` is the full distinct-game frame,
# `a` is its GAME_ID column as a one-column frame.
b = unique_games
a = b[["GAME_ID"]]

In [93]:
# keep=False flags every occurrence of a repeated GAME_ID, not just the later ones.
duplicates = a.loc[a.duplicated(keep=False)]
# Keep only the distinct-game rows whose GAME_ID was flagged as repeated.
b = b.loc[b['GAME_ID'].isin(duplicates['GAME_ID'])]

In [94]:
# Show the distinct-game rows that share a GAME_ID with another row.
# NOTE(review): the recorded output shows GAME_ID 22000000 attached to different
# dates and matchups in 2020-21, so GAME_ID alone does not identify a game in
# this data. Presumably the ids lost their low-order digits upstream — TODO
# confirm against the source CSV before using GAME_ID as a key.
b

Unnamed: 0,GAME_ID,GAME_DATE,HOME_TEAM,AWAY_TEAM,SEASON_1,SEASON_2
3575310,22000000,2020-12-25,MIA,NOP,2021,2020-21
3576179,22000000,2020-12-23,PHX,DAL,2021,2020-21
3576180,22000000,2020-12-23,BOS,MIL,2021,2020-21
3578254,22000000,2020-12-22,BKN,GSW,2021,2020-21
3578256,22000000,2020-12-22,LAL,LAC,2021,2020-21


In [95]:
# Largest GAME_ID among the distinct-game rows (22301230 in the recorded run,
# which fits in a 32-bit signed integer).
a["GAME_ID"].max()

np.int64(22301230)