In [1]:
import pandas as pd
import numpy as np
import sqlite3

In [2]:
nba = pd.read_csv('https://raw.githubusercontent.com/jkropko/contrans/main/examples/ASA%20All%20NBA%20Raw%20Data.csv')

In [3]:
nba.columns

Index(['game_id', 'game_date', 'OT', 'H_A', 'Team_Abbrev', 'Team_Score',
       'Team_pace', 'Team_efg_pct', 'Team_tov_pct', 'Team_orb_pct',
       'Team_ft_rate', 'Team_off_rtg', 'Inactives', 'Opponent_Abbrev',
       'Opponent_Score', 'Opponent_pace', 'Opponent_efg_pct',
       'Opponent_tov_pct', 'Opponent_orb_pct', 'Opponent_ft_rate',
       'Opponent_off_rtg', 'player', 'player_id', 'starter', 'mp', 'fg', 'fga',
       'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb',
       'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus',
       'did_not_play', 'is_inactive', 'ts_pct', 'efg_pct', 'fg3a_per_fga_pct',
       'fta_per_fga_pct', 'orb_pct', 'drb_pct', 'trb_pct', 'ast_pct',
       'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct', 'off_rtg', 'def_rtg', 'bpm',
       'season', 'minutes', 'double_double', 'triple_double', 'DKP', 'FDP',
       'SDP', 'DKP_per_minute', 'FDP_per_minute', 'SDP_per_minute',
       'pf_per_minute', 'ts', 'last_60_minutes_per_game_s

In [4]:
nba = nba[['game_id', 'game_date', 'OT', 'H_A', 'Team_Abbrev', 'Team_Score',
       'Team_pace', 'Team_efg_pct', 'Team_tov_pct', 'Team_orb_pct',
       'Team_ft_rate', 'Team_off_rtg', 'Inactives', 'Opponent_Abbrev',
       'player', 'player_id', 'starter', 'mp', 'fg', 'fga',
       'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb',
       'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus',
       'did_not_play', 'is_inactive', 'off_rtg', 'def_rtg', 'bpm']]

In [5]:
pd.set_option('display.max_rows',90)
nba.head(3).T

Unnamed: 0,0,1,2
game_id,202204100BRK,202204100BRK,202204100BRK
game_date,2022-04-10,2022-04-10,2022-04-10
OT,0,0,0
H_A,A,A,A
Team_Abbrev,IND,IND,IND
Team_Score,126,126,126
Team_pace,103.9,103.9,103.9
Team_efg_pct,0.543,0.543,0.543
Team_tov_pct,5.9,5.9,5.9
Team_orb_pct,20.8,20.8,20.8


## First normal form.

Every table has a primary key?  Yes.

Non-atomic data?  Yes, Inactives.  But this is redundant, so we just delete this column.

Repeating groups?  Not anymore!

So we have first normal form relational database!  Yay!

In [6]:
nba = nba.drop(['Inactives'],axis=1)
nba.head(3).T

Unnamed: 0,0,1,2
game_id,202204100BRK,202204100BRK,202204100BRK
game_date,2022-04-10,2022-04-10,2022-04-10
OT,0,0,0
H_A,A,A,A
Team_Abbrev,IND,IND,IND
Team_Score,126,126,126
Team_pace,103.9,103.9,103.9
Team_efg_pct,0.543,0.543,0.543
Team_tov_pct,5.9,5.9,5.9
Team_orb_pct,20.8,20.8,20.8


## Second normal form.

Is the data in first normal form (1NF)?  Yup!

Every non-prime column must depend on the ENTIRE primary key (game_id + player_id) and not just part of the primary key (just game_id or just player_id).  (i.e., can't have attributes that are functionally dependent on *part* of a primary key)  right now, we have non-prime attributes that depend on the game but not the player; this needs to be resolved before we can proceed.

In [7]:
nba['game_player_id'] = nba['game_id'] + '_' + nba['player_id']
nba.head(3).T

Unnamed: 0,0,1,2
game_id,202204100BRK,202204100BRK,202204100BRK
game_date,2022-04-10,2022-04-10,2022-04-10
OT,0,0,0
H_A,A,A,A
Team_Abbrev,IND,IND,IND
Team_Score,126,126,126
Team_pace,103.9,103.9,103.9
Team_efg_pct,0.543,0.543,0.543
Team_tov_pct,5.9,5.9,5.9
Team_orb_pct,20.8,20.8,20.8


Now that we've created game_player_id that is the primary key, we're now good on 2NF.

## Third normal form.

Data is in 2NF.

Every non-prime attribute is non-transitively dependent on every attribute.  (i.e., no transitive dependencies between/amongst non-primary keys)

Calculated columns: 
- fg_pct
- fg3_pct
- ft_pct
- trb
We will delete these.

In [8]:
nba = nba.drop(['fg_pct','fg3_pct','ft_pct','trb'],axis=1)

In [12]:
nba.head(6).T

Unnamed: 0,0,1,2,3,4,5
game_id,202204100BRK,202204100BRK,202204100BRK,202204100BRK,202204100BRK,202204100BRK
game_date,2022-04-10,2022-04-10,2022-04-10,2022-04-10,2022-04-10,2022-04-10
OT,0,0,0,0,0,0
H_A,A,A,A,A,A,A
Team_Abbrev,IND,IND,IND,IND,IND,IND
Team_Score,126,126,126,126,126,126
Team_pace,103.9,103.9,103.9,103.9,103.9,103.9
Team_efg_pct,0.543,0.543,0.543,0.543,0.543,0.543
Team_tov_pct,5.9,5.9,5.9,5.9,5.9,5.9
Team_orb_pct,20.8,20.8,20.8,20.8,20.8,20.8


Transitive dependencies:
- Some columns depend on player
- Some columns depend on game
- Some columns depend on the team
- Some columns depend on the team + game (e.g., team score)

Each of these will get a separate table.

In [15]:
nba_teamgame = nba[['game_id','H_A', 'Team_Abbrev', 'Team_Score',
       'Team_pace', 'Team_efg_pct', 'Team_tov_pct', 'Team_orb_pct',
       'Team_ft_rate', 'Team_off_rtg', 'Opponent_Abbrev']]
nba_teamgame = nba_teamgame.drop_duplicates()

In [16]:
nba_teamgame

Unnamed: 0,game_id,H_A,Team_Abbrev,Team_Score,Team_pace,Team_efg_pct,Team_tov_pct,Team_orb_pct,Team_ft_rate,Team_off_rtg,Opponent_Abbrev
0,202204100BRK,A,IND,126,103.9,0.543,5.9,20.8,0.125,121.3,BRK
12,202204100BRK,H,BRK,134,103.9,0.691,17.9,29.6,0.272,129.0,IND
25,202204100CHO,A,WAS,108,97.7,0.489,8.7,31.5,0.170,110.5,CHO
37,202204100CHO,H,CHO,124,97.7,0.640,15.2,29.7,0.112,126.9,WAS
52,202204100CLE,A,MIL,115,101.9,0.511,10.5,17.4,0.284,112.9,CLE
...,...,...,...,...,...,...,...,...,...,...,...
21491,202112190MIN,H,MIN,111,91.8,0.565,9.0,15.0,0.312,120.9,DAL
21492,202112210DAL,A,MIN,102,93.9,0.538,16.2,20.9,0.215,108.6,DAL
21629,202112280MIN,H,MIN,88,93.1,0.441,9.6,20.4,0.153,94.5,NYK
21708,202112230UTA,A,MIN,116,102.1,0.530,11.6,30.0,0.089,113.7,UTA


In [17]:
nba_game = nba[['game_id','game_date','OT']].drop_duplicates()
nba_game

Unnamed: 0,game_id,game_date,OT
0,202204100BRK,2022-04-10,0
25,202204100CHO,2022-04-10,0
52,202204100CLE,2022-04-10,0
77,202204100DAL,2022-04-10,0
103,202204100DEN,2022-04-10,1
...,...,...,...
19708,202110300MIN,2021-10-30,0
19726,202112150DEN,2021-12-15,0
19748,202202010MIN,2022-02-01,0
20615,202203270BOS,2022-03-27,0


In [18]:
nba_player = nba[['player_id','player']].drop_duplicates()
nba_player

Unnamed: 0,player_id,player
0,halibty01,Tyrese Haliburton
1,hieldbu01,Buddy Hield
2,brissos01,Oshae Brissett
3,jacksis01,Isaiah Jackson
4,mccontj01,T.J. McConnell
...,...,...
31515,garrema01,Marcus Garrett
31535,chalmma01,Mario Chalmers
31538,holmaar01,Aric Holman
31540,scrubja01,Jay Scrubb


In [19]:
nba_playergame = nba.drop(['player','game_date','OT','H_A', 'Team_Abbrev', 'Team_Score',
       'Team_pace', 'Team_efg_pct', 'Team_tov_pct', 'Team_orb_pct',
       'Team_ft_rate', 'Team_off_rtg', 'Opponent_Abbrev'],axis=1)

In [20]:
nba_playergame

Unnamed: 0,game_id,player_id,starter,mp,fg,fga,fg3,fg3a,ft,fta,...,tov,pf,pts,plus_minus,did_not_play,is_inactive,off_rtg,def_rtg,bpm,game_player_id
0,202204100BRK,halibty01,1,39:28,7,14,2,5,1,1,...,1,0,17,-9,0,0,137,132,1.7,202204100BRK_halibty01
1,202204100BRK,hieldbu01,1,35:53,8,23,5,14,0,0,...,2,3,21,0,0,0,94,128,-2.3,202204100BRK_hieldbu01
2,202204100BRK,brissos01,1,35:47,10,20,5,10,3,4,...,0,5,28,-9,0,0,137,133,4.4,202204100BRK_brissos01
3,202204100BRK,jacksis01,1,32:01,3,4,0,0,1,2,...,2,5,7,3,0,0,89,128,-9.2,202204100BRK_jacksis01
4,202204100BRK,mccontj01,1,30:52,5,15,3,7,1,2,...,0,3,14,7,0,0,104,126,-1.7,202204100BRK_mccontj01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31603,202201130NOP,gabriwe01,0,4:26,1,1,1,1,0,0,...,2,2,3,-4,0,0,62,110,-6.4,202201130NOP_gabriwe01
31604,202201150SAS,gabriwe01,0,0:00,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0.0,202201150SAS_gabriwe01
31605,202112220SAC,wrighmo01,0,1:28,0,0,0,0,0,0,...,0,0,0,1,0,0,217,103,24.4,202112220SAC_wrighmo01
31606,202112260LAC,wrighmo01,0,0:00,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0.0,202112260LAC_wrighmo01


Our 3NF database has 4 tables (entities):

In [21]:
nba_game

Unnamed: 0,game_id,game_date,OT
0,202204100BRK,2022-04-10,0
25,202204100CHO,2022-04-10,0
52,202204100CLE,2022-04-10,0
77,202204100DAL,2022-04-10,0
103,202204100DEN,2022-04-10,1
...,...,...,...
19708,202110300MIN,2021-10-30,0
19726,202112150DEN,2021-12-15,0
19748,202202010MIN,2022-02-01,0
20615,202203270BOS,2022-03-27,0


In [23]:
nba_player

Unnamed: 0,player_id,player
0,halibty01,Tyrese Haliburton
1,hieldbu01,Buddy Hield
2,brissos01,Oshae Brissett
3,jacksis01,Isaiah Jackson
4,mccontj01,T.J. McConnell
...,...,...
31515,garrema01,Marcus Garrett
31535,chalmma01,Mario Chalmers
31538,holmaar01,Aric Holman
31540,scrubja01,Jay Scrubb


In [25]:
nba_teamgame
#primary key here is superkey, which is game_id + Team_Abbrev

Unnamed: 0,game_id,H_A,Team_Abbrev,Team_Score,Team_pace,Team_efg_pct,Team_tov_pct,Team_orb_pct,Team_ft_rate,Team_off_rtg,Opponent_Abbrev
0,202204100BRK,A,IND,126,103.9,0.543,5.9,20.8,0.125,121.3,BRK
12,202204100BRK,H,BRK,134,103.9,0.691,17.9,29.6,0.272,129.0,IND
25,202204100CHO,A,WAS,108,97.7,0.489,8.7,31.5,0.170,110.5,CHO
37,202204100CHO,H,CHO,124,97.7,0.640,15.2,29.7,0.112,126.9,WAS
52,202204100CLE,A,MIL,115,101.9,0.511,10.5,17.4,0.284,112.9,CLE
...,...,...,...,...,...,...,...,...,...,...,...
21491,202112190MIN,H,MIN,111,91.8,0.565,9.0,15.0,0.312,120.9,DAL
21492,202112210DAL,A,MIN,102,93.9,0.538,16.2,20.9,0.215,108.6,DAL
21629,202112280MIN,H,MIN,88,93.1,0.441,9.6,20.4,0.153,94.5,NYK
21708,202112230UTA,A,MIN,116,102.1,0.530,11.6,30.0,0.089,113.7,UTA


In [26]:
nba_playergame

Unnamed: 0,game_id,player_id,starter,mp,fg,fga,fg3,fg3a,ft,fta,...,tov,pf,pts,plus_minus,did_not_play,is_inactive,off_rtg,def_rtg,bpm,game_player_id
0,202204100BRK,halibty01,1,39:28,7,14,2,5,1,1,...,1,0,17,-9,0,0,137,132,1.7,202204100BRK_halibty01
1,202204100BRK,hieldbu01,1,35:53,8,23,5,14,0,0,...,2,3,21,0,0,0,94,128,-2.3,202204100BRK_hieldbu01
2,202204100BRK,brissos01,1,35:47,10,20,5,10,3,4,...,0,5,28,-9,0,0,137,133,4.4,202204100BRK_brissos01
3,202204100BRK,jacksis01,1,32:01,3,4,0,0,1,2,...,2,5,7,3,0,0,89,128,-9.2,202204100BRK_jacksis01
4,202204100BRK,mccontj01,1,30:52,5,15,3,7,1,2,...,0,3,14,7,0,0,104,126,-1.7,202204100BRK_mccontj01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31603,202201130NOP,gabriwe01,0,4:26,1,1,1,1,0,0,...,2,2,3,-4,0,0,62,110,-6.4,202201130NOP_gabriwe01
31604,202201150SAS,gabriwe01,0,0:00,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0.0,202201150SAS_gabriwe01
31605,202112220SAC,wrighmo01,0,1:28,0,0,0,0,0,0,...,0,0,0,1,0,0,217,103,24.4,202112220SAC_wrighmo01
31606,202112260LAC,wrighmo01,0,0:00,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0.0,202112260LAC_wrighmo01


In [27]:
nba_db = sqlite3.connect("nba.db")

In [28]:
nba_game.to_sql('games', nba_db, index=False, chunksize=1000, if_exists='replace')
nba_playergame.to_sql('player_game', nba_db, index=False, chunksize=1000, if_exists='replace')
nba_teamgame.to_sql('team_game', nba_db, index=False, chunksize=1000, if_exists='replace')
nba_player.to_sql('players', nba_db, index=False, chunksize=1000, if_exists='replace')

621

In [41]:
myquery = '''
select * 
from team_game
where Team_Abbrev like '%CL%' and Team_Score > 120
'''
pd.read_sql(myquery, nba_db)

Unnamed: 0,game_id,H_A,Team_Abbrev,Team_Score,Team_pace,Team_efg_pct,Team_tov_pct,Team_orb_pct,Team_ft_rate,Team_off_rtg,Opponent_Abbrev
0,202204100CLE,H,CLE,133,101.9,0.644,9.0,23.3,0.128,130.5,MIL
1,202203080IND,A,CLE,127,98.2,0.542,6.1,33.3,0.253,129.4,IND
2,202110200MEM,A,CLE,121,101.1,0.581,9.0,14.9,0.14,119.7,MEM
3,202111070NYK,A,CLE,126,96.4,0.65,13.7,30.6,0.1,130.7,NYK
4,202112100MIN,A,CLE,123,103.6,0.618,13.7,16.7,0.212,118.7,MIN
5,202112150CLE,H,CLE,124,96.4,0.675,14.2,22.5,0.2,128.7,HOU
6,202112260CLE,H,CLE,144,107.2,0.632,9.7,28.9,0.094,134.4,TOR
7,202202280CLE,H,CLE,122,96.8,0.628,18.4,30.3,0.392,126.0,MIN


In [46]:
nba_playergame.columns

Index(['game_id', 'player_id', 'starter', 'mp', 'fg', 'fga', 'fg3', 'fg3a',
       'ft', 'fta', 'orb', 'drb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
       'plus_minus', 'did_not_play', 'is_inactive', 'off_rtg', 'def_rtg',
       'bpm', 'game_player_id'],
      dtype='object')