In [1]:
import pandas as pd

In [None]:

# Read all CSV files
ranking = pd.read_csv("datasets/ranking.csv", parse_dates=["STANDINGSDATE"])
teams = pd.read_csv("datasets/teams.csv")
players = pd.read_csv("datasets/players.csv")
players_data = pd.read_csv("datasets/players_data.csv")
games = pd.read_csv("datasets/games.csv", parse_dates=["GAME_DATE_EST"])
game_details = pd.read_csv("datasets/games_details.csv")

In [3]:
# Step 1: Prepare dim_season (combine seasons from all datasets)
# From ranking.csv (SEASON_ID like 22022 -> 2022)
ranking['season_type'] = ranking['SEASON_ID'].astype(str).str[0]
ranking = ranking[ranking['season_type'] == '2'].copy()
ranking['season_year'] = ranking['SEASON_ID'].astype(str).str[1:].astype(int)

# From players.csv (SEASON like 2019)
players['season_year'] = players['SEASON'].astype(int)

# From players_data.csv (season like 1996-97)
players_data['season_year'] = players_data['season'].apply(lambda x: int(x.split('-')[0]))

# From games.csv (SEASON like 2022)
games['season_year'] = games['SEASON'].astype(int)

# Combine unique season years from all datasets
all_season_years = pd.concat([
    pd.Series(ranking['season_year'].unique()),
    pd.Series(players['season_year'].unique()),
    pd.Series(players_data['season_year'].unique()),
    pd.Series(games['season_year'].unique())
]).unique()

# sort the unique season years
all_season_years.sort()


dim_season = pd.DataFrame(all_season_years, columns=['season_year'])
dim_season['season_id'] = dim_season.index + 1

# check the data types of the columns
# print(dim_season.dtypes)

# Convert 'season_id' and 'season_year' to Python int type (not numpy.int64)
dim_season['season_id'] = dim_season['season_id'].astype(int)
dim_season['season_year'] = dim_season['season_year'].astype(int)


In [4]:
# Map season_year to season_id in all datasets
ranking = ranking.merge(dim_season[['season_year', 'season_id']], on='season_year', how='left')
players = players.merge(dim_season[['season_year', 'season_id']], on='season_year', how='left')
players_data = players_data.merge(dim_season[['season_year', 'season_id']], on='season_year', how='left')
games = games.merge(dim_season[['season_year', 'season_id']], on='season_year', how='left')

In [5]:
# Step 2: Prepare dim_team from teams.csv
dim_team = teams[['TEAM_ID', 'ABBREVIATION', 'NICKNAME', 'YEARFOUNDED', 'MIN_YEAR', 'MAX_YEAR', 'CITY', 'ARENA', 'ARENACAPACITY', 'OWNER', 'GENERALMANAGER', 'HEADCOACH']].copy()
dim_team.rename(columns={
    'TEAM_ID': 'team_id',
    'ABBREVIATION': 'abbreviation',
    'NICKNAME': 'nickname',
    'YEARFOUNDED': 'year_founded',
    'MIN_YEAR': 'start_year',
    'MAX_YEAR': 'end_year',
    'CITY': 'city',
    'ARENA': 'arena',
    'ARENACAPACITY': 'arena_capacity',
    'OWNER': 'owner',
    'GENERALMANAGER': 'general_manager',
    'HEADCOACH': 'head_coach'
}, inplace=True)

# Ensure team_id is an integer
dim_team['team_id'] = dim_team['team_id'].astype(int)

# Ensure year_founded, start_year, end_year are integers
dim_team['year_founded'] = dim_team['year_founded'].astype(int)
dim_team['start_year'] = dim_team['start_year'].astype(int)
dim_team['end_year'] = dim_team['end_year'].astype(int)

# Fill NaN in arena_capacity with 0 and ensure it's an integer
dim_team['arena_capacity'] = dim_team['arena_capacity'].fillna(0).astype(int)

# Fill NaN in string columns with 'Unknown'
dim_team.fillna({
    'abbreviation': 'Unknown',
    'nickname': 'Unknown',
    'city': 'Unknown',
    'arena': 'Unknown',
    'owner': 'Unknown',
    'general_manager': 'Unknown',
    'head_coach': 'Unknown'
}, inplace=True)

# Add conference from ranking.csv
team_conference = ranking[['TEAM_ID', 'CONFERENCE']].drop_duplicates().copy()
team_conference.rename(columns={'TEAM_ID': 'team_id', 'CONFERENCE': 'conference'}, inplace=True)
team_conference = (team_conference.groupby('team_id')['conference']
                   .agg(lambda x: x.mode()[0] if not x.empty else 'Unknown')
                   .reset_index())
dim_team = dim_team.merge(team_conference, on='team_id', how='left')
dim_team['conference'] = dim_team['conference'].fillna('Unknown')

# Ensure team_id is unique
dim_team = dim_team.drop_duplicates(subset=['team_id'])


In [6]:
# Step 3: Prepare dim_ranking from ranking.csv
# Drop unnecessary columns
ranking.drop(columns=['RETURNTOPLAY', 'LEAGUE_ID', 'TEAM', 'W_PCT', 'HOME_RECORD', 'ROAD_RECORD'], inplace=True)

# Get the last entry for each TEAM_ID per season_year
latest_per_season = (ranking.sort_values(["TEAM_ID", "season_year", "STANDINGSDATE"])
                     .groupby(["TEAM_ID", "season_year"])
                     .last()
                     .reset_index())

# Step 4: Prepare dim_date (combine dates from ranking.csv and games.csv)
# Dates from ranking.csv
ranking_dates = pd.DataFrame(latest_per_season['STANDINGSDATE'].unique(), columns=['date'])

# Dates from games.csv
game_dates = pd.DataFrame(games['GAME_DATE_EST'].unique(), columns=['date'])

# Combine unique dates
all_dates = pd.concat([ranking_dates, game_dates]).drop_duplicates().reset_index(drop=True)
all_dates = all_dates.sort_values(by='date')
dim_date = pd.DataFrame(all_dates, columns=['date'])
dim_date['date'] = pd.to_datetime(dim_date['date'])
dim_date['year'] = dim_date['date'].dt.year.astype(int)
dim_date['month'] = dim_date['date'].dt.month.astype(int)
dim_date['day'] = dim_date['date'].dt.day.astype(int)
dim_date = dim_date.sort_values(by='date', ascending=True).reset_index(drop=True)
dim_date['date_id'] = dim_date.index + 1

print(dim_date)
# Map STANDINGSDATE to date_id
latest_per_season = latest_per_season.merge(dim_date[['date', 'date_id']], 
                                          left_on='STANDINGSDATE', 
                                          right_on='date', 
                                          how='left')

# Prepare dim_ranking with date_id
dim_ranking = latest_per_season[['TEAM_ID', 'season_id', 'CONFERENCE', 'G', 'W', 'L', 'date_id']].copy()
dim_ranking.rename(columns={
    'TEAM_ID': 'team_id',
    'CONFERENCE': 'conference',
    'G': 'game_played',
    'W': 'wins',
    'L': 'lose'
}, inplace=True)
dim_ranking['ranking_id'] = dim_ranking.index + 1

# Ensure numeric columns are integers
dim_ranking['team_id'] = dim_ranking['team_id'].astype(int)
dim_ranking['season_id'] = dim_ranking['season_id'].astype(int)
dim_ranking['game_played'] = dim_ranking['game_played'].astype(int)
dim_ranking['wins'] = dim_ranking['wins'].astype(int)
dim_ranking['lose'] = dim_ranking['lose'].astype(int)
dim_ranking['date_id'] = dim_ranking['date_id'].astype(int)
dim_ranking['ranking_id'] = dim_ranking['ranking_id'].astype(int)

# Fill any missing string columns
dim_ranking['conference'] = dim_ranking['conference'].fillna('Unknown')

# Drop temporary columns
dim_ranking.drop(columns=['SEASON_ID', 'season_year', 'season_type'], inplace=True, errors='ignore')


           date  year  month  day  date_id
0    2003-10-04  2003     10    4        1
1    2003-10-05  2003     10    5        2
2    2003-10-06  2003     10    6        3
3    2003-10-07  2003     10    7        4
4    2003-10-08  2003     10    8        5
...         ...   ...    ...  ...      ...
4319 2022-12-18  2022     12   18     4320
4320 2022-12-19  2022     12   19     4321
4321 2022-12-20  2022     12   20     4322
4322 2022-12-21  2022     12   21     4323
4323 2022-12-22  2022     12   22     4324

[4324 rows x 5 columns]


In [7]:
# Step 5: Prepare dim_player_static and dim_player_dynamic
# Prepare dim_player_static
dim_player_static = players[['PLAYER_ID', 'PLAYER_NAME']].drop_duplicates().copy()
dim_player_static.rename(columns={'PLAYER_ID': 'player_id', 'PLAYER_NAME': 'player_name'}, inplace=True)


# Ensure player_id is an integer
dim_player_static['player_id'] = dim_player_static['player_id'].astype(int)

# Merge with players_data to get static attributes
players_data_static = players_data[['player_name', 'player_height', 'college', 'country', 'draft_year', 'draft_round', 'draft_number']].drop_duplicates(subset=['player_name']).copy()
dim_player_static = dim_player_static.merge(players_data_static, on='player_name', how='left')

dim_player_static['draft_year'] = dim_player_static['draft_year'].replace('Undrafted', 0)
dim_player_static['draft_round'] = dim_player_static['draft_year'].replace('Undrafted', 0)
dim_player_static['draft_number'] = dim_player_static['draft_year'].replace('Undrafted', 0)
dim_player_static.rename(columns={'player_height': 'height'}, inplace=True)

# Fill missing static attributes with defaults
dim_player_static.fillna({
    'draft_year': 0,
    'draft_round': 0,
    'draft_number': 0,
    'height': 0,
    'college': 'Unknown',
    'country': 'Unknown'
}, inplace=True)


# Ensure player_id is unique
if not dim_player_static['player_id'].is_unique:
    dim_player_static = dim_player_static.drop_duplicates(subset=['player_id'])

# Prepare dim_player_dynamic
# From players.csv (no weight data)
dim_player_dynamic_players = players[['PLAYER_ID', 'TEAM_ID', 'season_id']].copy()
dim_player_dynamic_players.rename(columns={'PLAYER_ID': 'player_id', 'TEAM_ID': 'team_id'}, inplace=True)
dim_player_dynamic_players['weight'] = None
dim_player_dynamic_players['source'] = 'players'

# Ensure player_id and team_id are integers
dim_player_dynamic_players['player_id'] = dim_player_dynamic_players['player_id'].astype(int)
dim_player_dynamic_players['team_id'] = dim_player_dynamic_players['team_id'].astype(int)
dim_player_dynamic_players['season_id'] = dim_player_dynamic_players['season_id'].astype(int)

# From players_data.csv (has weight data)
dim_player_dynamic_data = players_data[['player_name', 'team_abbreviation', 'season_id', 'player_weight']].copy()
dim_player_dynamic_data.rename(columns={'player_weight': 'weight'}, inplace=True)

# Ensure weight is numeric
dim_player_dynamic_data['weight'] = pd.to_numeric(dim_player_dynamic_data['weight'], errors='coerce').round(2)


# Map player_name to player_id
dim_player_dynamic_data = dim_player_dynamic_data.merge(dim_player_static[['player_name', 'player_id']], on='player_name', how='left')

# Drop rows with NaN player_id (unmatched players)
dim_player_dynamic_data = dim_player_dynamic_data.dropna(subset=['player_id'])

# Ensure player_id is an integer after merge
dim_player_dynamic_data['player_id'] = dim_player_dynamic_data['player_id'].astype(int)

# Map team_abbreviation to team_id using dim_team
dim_player_dynamic_data = dim_player_dynamic_data.merge(dim_team[['abbreviation', 'team_id']], 
                                                        left_on='team_abbreviation', 
                                                        right_on='abbreviation', 
                                                        how='left')

# Fill NaN team_id with 0 (indicating unknown team)
dim_player_dynamic_data['team_id'] = dim_player_dynamic_data['team_id'].fillna(0).astype(int)
dim_player_dynamic_data.drop(columns=['abbreviation'], inplace=True)


# Ensure season_id is an integer
dim_player_dynamic_data['season_id'] = dim_player_dynamic_data['season_id'].astype(int)


# Select columns for dim_player_dynamic
dim_player_dynamic_data = dim_player_dynamic_data[['player_id', 'team_id', 'season_id', 'weight']]
dim_player_dynamic_data['source'] = 'players_data'


# Combine both sources
if not dim_player_dynamic_data.empty:
    dim_player_dynamic = pd.concat([dim_player_dynamic_players, dim_player_dynamic_data], ignore_index=True)
else:
    print("\nWarning: dim_player_dynamic_data is empty after dropping unmatched players.")
    dim_player_dynamic = dim_player_dynamic_players.copy()

# Ensure player_id, team_id, and season_id are integers after concatenation
dim_player_dynamic['player_id'] = dim_player_dynamic['player_id'].astype(int)
dim_player_dynamic['team_id'] = dim_player_dynamic['team_id'].astype(int)
dim_player_dynamic['season_id'] = dim_player_dynamic['season_id'].astype(int)

# Clean dim_player_dynamic
dim_player_dynamic = dim_player_dynamic.drop_duplicates(subset=['player_id', 'team_id', 'season_id'])

# Fill missing weights with 0
dim_player_dynamic['weight'] = dim_player_dynamic['weight'].fillna(0)

# Drop the source column
dim_player_dynamic.drop(columns=['source'], inplace=True)

  dim_player_dynamic = pd.concat([dim_player_dynamic_players, dim_player_dynamic_data], ignore_index=True)


In [8]:
# Step 6: Prepare fact_game from games.csv
fact_game = games[['GAME_ID', 'GAME_DATE_EST', 'season_id', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 
                   'PTS_home', 'FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home', 'AST_home', 'REB_home',
                   'PTS_away', 'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away', 'REB_away',
                   'HOME_TEAM_WINS']].copy()

fact_game.fillna(0, inplace=True)

# Convert score-related fields to integers
score_columns = ['PTS_home', 'AST_home', 'REB_home', 'PTS_away', 'AST_away', 'REB_away']
for col in score_columns:
    fact_game[col] = fact_game[col].astype(int)

fact_game.rename(columns={
    'GAME_ID': 'game_id',
    'HOME_TEAM_ID': 'home_team_id',
    'VISITOR_TEAM_ID': 'visitor_team_id',
    'PTS_home': 'home_points',
    'FG_PCT_home': 'home_field_goal_percentage',
    'FT_PCT_home': 'home_free_throw_percentage',
    'FG3_PCT_home': 'home_three_pointer_percentage',
    'AST_home': 'home_assists',
    'REB_home': 'home_rebounds',
    'PTS_away': 'visitor_points',
    'FG_PCT_away': 'visitor_field_goal_percentage',
    'FT_PCT_away': 'visitor_free_throw_percentage',
    'FG3_PCT_away': 'visitor_three_pointer_percentage',
    'AST_away': 'visitor_assists',
    'REB_away': 'visitor_rebounds',
    'HOME_TEAM_WINS': 'home_team_wins'
}, inplace=True)


# Map GAME_DATE_EST to date_id
fact_game = fact_game.merge(dim_date[['date', 'date_id']], 
                            left_on='GAME_DATE_EST', 
                            right_on='date', 
                            how='left')

# Drop the date column
fact_game.drop(columns=['GAME_DATE_EST', 'date'], inplace=True)

# Ensure home_team_wins is boolean
fact_game['home_team_wins'] = fact_game['home_team_wins'].astype(bool)

In [9]:
# Step 7: Prepare dim_player_performance from game_details.csv
dim_player_performance = game_details[['GAME_ID', 'PLAYER_ID', 'TEAM_ID', 'MIN', 'FGM', 'FGA', 'FG_PCT', 
                                      'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 
                                      'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS']].copy()

dim_player_performance.fillna(0, inplace=True)

# Convert score-related fields to integers
score_columns = ['FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB', 'REB', 
                 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS']
for col in score_columns:
    dim_player_performance[col] = dim_player_performance[col].astype(int)

dim_player_performance.rename(columns={
    'GAME_ID': 'game_id',
    'PLAYER_ID': 'player_id',
    'TEAM_ID': 'team_id',
    'MIN': 'minutes_played',
    'FGM': 'field_goals_made',
    'FGA': 'field_goals_attempted',
    'FG_PCT': 'field_goal_percentage',
    'FG3M': 'three_pointers_made',
    'FG3A': 'three_pointers_attempted',
    'FG3_PCT': 'three_pointer_percentage',
    'FTM': 'free_throws_made',
    'FTA': 'free_throws_attempted',
    'FT_PCT': 'free_throw_percentage',
    'OREB': 'offensive_rebounds',
    'DREB': 'defensive_rebounds',
    'REB': 'total_rebounds',
    'AST': 'assists',
    'STL': 'steals',
    'BLK': 'blocks',
    'TO': 'turnovers',
    'PF': 'personal_fouls',
    'PTS': 'points'
}, inplace=True)

# Generate performance_id
dim_player_performance['performance_id'] = dim_player_performance.index + 1

# Ensure only valid player_id and team_id (foreign key constraints)
dim_player_performance = dim_player_performance[dim_player_performance['player_id'].isin(dim_player_static['player_id'])]
dim_player_performance = dim_player_performance[dim_player_performance['team_id'].isin(dim_team['team_id'])]

## Game Fact

In [10]:
fact_game

Unnamed: 0,game_id,season_id,home_team_id,visitor_team_id,home_points,home_field_goal_percentage,home_free_throw_percentage,home_three_pointer_percentage,home_assists,home_rebounds,visitor_points,visitor_field_goal_percentage,visitor_free_throw_percentage,visitor_three_pointer_percentage,visitor_assists,visitor_rebounds,home_team_wins,date_id
0,22200477,27,1610612740,1610612759,126,0.484,0.926,0.382,25,46,117,0.478,0.815,0.321,23,44,True,4324
1,22200478,27,1610612762,1610612764,120,0.488,0.952,0.457,16,40,112,0.561,0.765,0.333,20,37,True,4324
2,22200466,27,1610612739,1610612749,114,0.482,0.786,0.313,22,37,106,0.470,0.682,0.433,20,46,True,4323
3,22200467,27,1610612755,1610612765,113,0.441,0.909,0.297,27,49,93,0.392,0.735,0.261,15,46,True,4323
4,22200468,27,1610612737,1610612741,108,0.429,1.000,0.378,22,47,110,0.500,0.773,0.292,20,47,False,4323
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26646,11400007,19,1610612737,1610612740,93,0.419,0.821,0.421,24,50,87,0.366,0.643,0.375,17,43,True,2463
26647,11400004,19,1610612741,1610612764,81,0.338,0.719,0.381,18,40,85,0.411,0.636,0.267,17,47,False,2463
26648,11400005,19,1610612747,1610612743,98,0.448,0.682,0.500,29,45,95,0.387,0.659,0.500,19,43,True,2463
26649,11400002,19,1610612761,1610612758,99,0.440,0.771,0.333,21,30,94,0.469,0.725,0.385,18,45,True,2462


## Dimensions

### Player Data

In [11]:
dim_player_static

Unnamed: 0,player_id,player_name,height,college,country,draft_year,draft_round,draft_number
0,1626220,Royce O'Neale,198.12,Baylor,USA,0,0,0
1,202711,Bojan Bogdanovic,203.20,Unknown,Croatia,2011,2011,2011
2,203497,Rudy Gobert,215.90,Unknown,France,2013,2013,2013
3,1628378,Donovan Mitchell,190.50,Louisville,USA,2017,2017,2017
4,201144,Mike Conley,185.42,Ohio State,USA,2007,2007,2007
...,...,...,...,...,...,...,...,...
1764,201831,Lanny Smith,0.00,Unknown,Unknown,0,0,0
1765,201999,Warren Carter,0.00,Unknown,Unknown,0,0,0
1766,201834,Bennet Davis,0.00,Unknown,Unknown,0,0,0
1767,201646,Brian Hamilton,0.00,Unknown,Unknown,0,0,0


In [12]:
dim_player_dynamic

Unnamed: 0,player_id,team_id,season_id,weight
0,1626220,1610612762,24,0.00
1,202711,1610612762,24,0.00
2,203497,1610612762,24,0.00
3,1628378,1610612762,24,0.00
4,201144,1610612762,24,0.00
...,...,...,...,...
16246,203925,1610612751,27,99.79
16247,204060,1610612749,27,99.79
16248,203954,1610612755,27,127.01
16249,1628381,1610612737,27,102.51


### Player Performance

In [13]:
dim_player_performance

Unnamed: 0,game_id,player_id,team_id,minutes_played,field_goals_made,field_goals_attempted,field_goal_percentage,three_pointers_made,three_pointers_attempted,three_pointer_percentage,...,offensive_rebounds,defensive_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points,performance_id
0,22200477,1629641,1610612759,18:06,1,1,1.000,0,0,0.0,...,1,1,2,0,1,0,2,5,2,1
2,22200477,1627751,1610612759,21:42,6,9,0.667,0,0,0.0,...,1,3,4,1,1,0,2,4,13,3
5,22200477,1628380,1610612759,18:04,2,6,0.333,0,0,0.0,...,1,1,2,2,0,0,0,3,6,6
6,22200477,203926,1610612759,16:55,2,8,0.250,1,5,0.2,...,1,3,4,1,0,0,1,0,5,7
7,22200477,1626196,1610612759,21:20,5,11,0.455,2,5,0.4,...,1,2,3,3,0,0,1,2,14,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
668623,11200005,202706,1610612743,19,4,9,0.444,3,6,0.5,...,0,2,2,0,2,0,1,3,17,668624
668624,11200005,202702,1610612743,23,7,11,0.636,0,0,0.0,...,1,0,1,1,1,0,3,3,18,668625
668625,11200005,201585,1610612743,15,3,7,0.429,0,0,0.0,...,3,5,8,0,1,0,0,3,6,668626
668626,11200005,202389,1610612743,19,1,1,1.000,0,0,0.0,...,1,2,3,1,0,0,4,2,2,668627


### Team

In [14]:
dim_team

Unnamed: 0,team_id,abbreviation,nickname,year_founded,start_year,end_year,city,arena,arena_capacity,owner,general_manager,head_coach,conference
0,1610612737,ATL,Hawks,1949,1949,2019,Atlanta,State Farm Arena,18729,Tony Ressler,Travis Schlenk,Lloyd Pierce,East
1,1610612738,BOS,Celtics,1946,1946,2019,Boston,TD Garden,18624,Wyc Grousbeck,Danny Ainge,Brad Stevens,East
2,1610612740,NOP,Pelicans,2002,2002,2019,New Orleans,Smoothie King Center,0,Tom Benson,Trajan Langdon,Alvin Gentry,East
3,1610612741,CHI,Bulls,1966,1966,2019,Chicago,United Center,21711,Jerry Reinsdorf,Gar Forman,Jim Boylen,East
4,1610612742,DAL,Mavericks,1980,1980,2019,Dallas,American Airlines Center,19200,Mark Cuban,Donnie Nelson,Rick Carlisle,West
5,1610612743,DEN,Nuggets,1976,1976,2019,Denver,Pepsi Center,19099,Stan Kroenke,Tim Connelly,Michael Malone,West
6,1610612745,HOU,Rockets,1967,1967,2019,Houston,Toyota Center,18104,Tilman Fertitta,Daryl Morey,Mike D'Antoni,West
7,1610612746,LAC,Clippers,1970,1970,2019,Los Angeles,Staples Center,19060,Steve Ballmer,Michael Winger,Doc Rivers,West
8,1610612747,LAL,Lakers,1948,1948,2019,Los Angeles,Staples Center,19060,Jerry Buss Family Trust,Rob Pelinka,Frank Vogel,West
9,1610612748,MIA,Heat,1988,1988,2019,Miami,AmericanAirlines Arena,19600,Micky Arison,Pat Riley,Erik Spoelstra,East


### Ranking

In [15]:
dim_ranking

Unnamed: 0,team_id,season_id,conference,game_played,wins,lose,date_id,ranking_id
0,1610612737,7,East,82,35,47,1,1
1,1610612737,8,East,82,28,54,235,2
2,1610612737,9,East,82,13,69,454,3
3,1610612737,10,East,82,26,56,685,4
4,1610612737,11,East,82,30,52,911,5
...,...,...,...,...,...,...,...,...
623,1610612766,23,East,82,39,43,3601,624
624,1610612766,24,East,65,23,42,3817,625
625,1610612766,25,East,72,33,39,4018,626
626,1610612766,26,East,82,43,39,4245,627


### Season

In [16]:
dim_season

Unnamed: 0,season_year,season_id
0,1996,1
1,1997,2
2,1998,3
3,1999,4
4,2000,5
5,2001,6
6,2002,7
7,2003,8
8,2004,9
9,2005,10


### Date

In [17]:
dim_date

Unnamed: 0,date,year,month,day,date_id
0,2003-10-04,2003,10,4,1
1,2003-10-05,2003,10,5,2
2,2003-10-06,2003,10,6,3
3,2003-10-07,2003,10,7,4
4,2003-10-08,2003,10,8,5
...,...,...,...,...,...
4319,2022-12-18,2022,12,18,4320
4320,2022-12-19,2022,12,19,4321
4321,2022-12-20,2022,12,20,4322
4322,2022-12-21,2022,12,21,4323


### Export DataFrames Into CSV

In [18]:
import os


folder_path = 'output_csv'
os.makedirs(folder_path, exist_ok=True)

dfs = {
    'fact_game': fact_game,
    'dim_player_performance': dim_player_performance,
    'dim_date': dim_date,
    'dim_season': dim_season,
    'dim_team': dim_team,
    'dim_ranking': dim_ranking,
    'dim_player_static': dim_player_static,
    'dim_player_dynamic': dim_player_dynamic
}

for name, df in dfs.items():
    df.to_csv(os.path.join(folder_path, f"{name}.csv"), index=False)

## Insert DataFrames Into PostgreSQL

In [19]:
import psycopg2
from psycopg2.extras import execute_values
import pandas as pd
import numpy as np
import logging

In [20]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Database connection parameters
db_params = {
    "dbname": "NBA",
    "user": "postgres",
    "password": "password",
    "host": "localhost",
    "port": "5433"
}

# Schema creation SQL statements
sql_schema_file = 'create_schema.sql'
try:
    with open(sql_schema_file, 'r') as file:
        schema_sql = file.read()
except Exception as e:
    logger.error(f"Error reading schema file: {e}")
    raise

In [21]:
def create_schema(connection, schema_sql):
    """Create database schema."""
    try:
        cursor = connection.cursor()
        cursor.execute(schema_sql)
        connection.commit()
        logger.info("Database schema created successfully.")
    except Exception as e:
        logger.error(f"Error creating schema: {e}")
        connection.rollback()
        raise
    finally:
        cursor.close()

def insert_data(connection, insert_query, values, table_name):
    """Insert data into a table using batch insert."""
    try:
        cursor = connection.cursor()
        logger.info(f"Inserting {len(values)} rows into {table_name}")
        # Log types of first row for debugging
        if values:
            logger.debug(f"First row types: {[type(v) for v in values[0]]}")
        execute_values(cursor, insert_query, values)
        connection.commit()
        logger.info(f"Data inserted successfully into {table_name}")
    except Exception as e:
        logger.error(f"Error inserting data into {table_name}: {e}")
        connection.rollback()
        raise
    finally:
        cursor.close()

In [None]:
# Main execution
try:
    # Establish database connection
    connection = psycopg2.connect(**db_params)
    logger.info("Connected to PostgreSQL database.")

    # Create schema
    create_schema(connection, schema_sql)

    # Insert dim_season
    logger.info("Preparing dim_season")
    logger.info(f"dim_season dtypes:\n{dim_season.dtypes}")
    # Explicitly convert to Python int in the values list
    values = [(int(row['season_id']), int(row['season_year'])) for index, row in dim_season.iterrows()]
    insert_query = """
    INSERT INTO dim_season (season_id, season_year)
    VALUES %s
    ON CONFLICT (season_id) DO NOTHING;
    """
    insert_data(connection, insert_query, values, "dim_season")

    # Insert dim_date
    logger.info("Preparing dim_date")
    logger.info(f"dim_date dtypes:\n{dim_date.dtypes}")
    dim_date['date'] = pd.to_datetime(dim_date['date'])
    values = [
        (int(row['date_id']), row['date'], int(row['year']), int(row['month']), int(row['day']))
        for index, row in dim_date.iterrows()
    ]
    insert_query = """
    INSERT INTO dim_date (date_id, date, year, month, day)
    VALUES %s
    ON CONFLICT (date_id) DO NOTHING;
    """
    insert_data(connection, insert_query, values, "dim_date")

    # Insert dim_team
    logger.info("Preparing dim_team")
    logger.info(f"dim_team dtypes:\n{dim_team.dtypes}")
    values = [
        (
            int(row['team_id']), row['abbreviation'], row['nickname'], int(row['year_founded']),
            int(row['start_year']), int(row['end_year']), row['city'], row['arena'],
            int(row['arena_capacity']), row['owner'], row['general_manager'], row['head_coach'],
            row['conference']
        ) for index, row in dim_team.iterrows()
    ]
    insert_query = """
    INSERT INTO dim_team (team_id, abbreviation, nickname, year_founded, start_year, end_year,
                          city, arena, arena_capacity, owner, general_manager, head_coach, conference)
    VALUES %s
    ON CONFLICT (team_id) DO NOTHING;
    """
    insert_data(connection, insert_query, values, "dim_team")

    # Insert dim_player_static
    logger.info("Preparing dim_player_static")
    logger.info(f"dim_player_static dtypes:\n{dim_player_static.dtypes}")
    values = [
        (
            int(row['player_id']), row['player_name'], float(row['height']) if pd.notnull(row['height']) else None,
            row['college'], row['country'], int(row['draft_year']), int(row['draft_round']),
            int(row['draft_number'])
        ) for index, row in dim_player_static.iterrows()
    ]
    insert_query = """
    INSERT INTO dim_player_static (player_id, player_name, height, college, country,
                                  draft_year, draft_round, draft_number)
    VALUES %s
    ON CONFLICT (player_id) DO NOTHING;
    """
    insert_data(connection, insert_query, values, "dim_player_static")

    # Insert dim_player_dynamic
    logger.info("Preparing dim_player_dynamic")
    logger.info(f"dim_player_dynamic dtypes:\n{dim_player_dynamic.dtypes}")
    dim_player_dynamic = dim_player_dynamic[dim_player_dynamic['team_id'] != 0]  # Remove invalid team_id
    values = [
        (
            int(row['player_id']), int(row['team_id']), int(row['season_id']),
            float(row['weight']) if pd.notnull(row['weight']) else None
        ) for index, row in dim_player_dynamic.iterrows()
    ]
    insert_query = """
    INSERT INTO dim_player_dynamic (player_id, team_id, season_id, weight)
    VALUES %s
    ON CONFLICT (player_id, team_id, season_id) DO NOTHING;
    """
    insert_data(connection, insert_query, values, "dim_player_dynamic")

    # Insert dim_ranking
    logger.info("Preparing dim_ranking")
    logger.info(f"dim_ranking dtypes:\n{dim_ranking.dtypes}")
    values = [
        (
            int(row['ranking_id']), int(row['team_id']), int(row['season_id']), row['conference'],
            int(row['game_played']), int(row['wins']), int(row['lose']), int(row['date_id'])
        ) for index, row in dim_ranking.iterrows()
    ]
    insert_query = """
    INSERT INTO dim_ranking (ranking_id, team_id, season_id, conference, game_played, wins, lose, date_id)
    VALUES %s
    ON CONFLICT (ranking_id) DO NOTHING;
    """
    insert_data(connection, insert_query, values, "dim_ranking")

    # Insert fact_game
    logger.info("Preparing fact_game")
    logger.info(f"fact_game dtypes:\n{fact_game.dtypes}")
    values = [
        (
            int(row['game_id']), int(row['season_id']), int(row['home_team_id']), int(row['visitor_team_id']),
            int(row['home_points']), float(row['home_field_goal_percentage']) if pd.notnull(row['home_field_goal_percentage']) else None,
            float(row['home_free_throw_percentage']) if pd.notnull(row['home_free_throw_percentage']) else None,
            float(row['home_three_pointer_percentage']) if pd.notnull(row['home_three_pointer_percentage']) else None,
            int(row['home_assists']), int(row['home_rebounds']), int(row['visitor_points']),
            float(row['visitor_field_goal_percentage']) if pd.notnull(row['visitor_field_goal_percentage']) else None,
            float(row['visitor_free_throw_percentage']) if pd.notnull(row['visitor_free_throw_percentage']) else None,
            float(row['visitor_three_pointer_percentage']) if pd.notnull(row['visitor_three_pointer_percentage']) else None,
            int(row['visitor_assists']), int(row['visitor_rebounds']), bool(row['home_team_wins']),
            int(row['date_id'])
        ) for index, row in fact_game.iterrows()
    ]
    insert_query = """
    INSERT INTO fact_game (
        game_id, season_id, home_team_id, visitor_team_id, home_points, home_field_goal_percentage,
        home_free_throw_percentage, home_three_pointer_percentage, home_assists, home_rebounds,
        visitor_points, visitor_field_goal_percentage, visitor_free_throw_percentage,
        visitor_three_pointer_percentage, visitor_assists, visitor_rebounds, home_team_wins, date_id
    )
    VALUES %s
    ON CONFLICT (game_id) DO NOTHING;
    """
    insert_data(connection, insert_query, values, "fact_game")

    # Insert dim_player_performance
    logger.info("Preparing dim_player_performance")
    logger.info(f"dim_player_performance dtypes:\n{dim_player_performance.dtypes}")
    values = [
        (
            int(row['performance_id']), int(row['game_id']), int(row['player_id']), int(row['team_id']),
            str(row['minutes_played']), int(row['field_goals_made']), int(row['field_goals_attempted']),
            float(row['field_goal_percentage']) if pd.notnull(row['field_goal_percentage']) else None,
            int(row['three_pointers_made']), int(row['three_pointers_attempted']),
            float(row['three_pointer_percentage']) if pd.notnull(row['three_pointer_percentage']) else None,
            int(row['free_throws_made']), int(row['free_throws_attempted']),
            float(row['free_throw_percentage']) if pd.notnull(row['free_throw_percentage']) else None,
            int(row['offensive_rebounds']), int(row['defensive_rebounds']), int(row['total_rebounds']),
            int(row['assists']), int(row['steals']), int(row['blocks']),
            int(row['turnovers']), int(row['personal_fouls']), int(row['points'])
        ) for index, row in dim_player_performance.iterrows()
    ]
    insert_query = """
    INSERT INTO dim_player_performance (
        performance_id, game_id, player_id, team_id, minutes_played, field_goals_made,
        field_goals_attempted, field_goal_percentage, three_pointers_made, three_pointers_attempted,
        three_pointer_percentage, free_throws_made, free_throws_attempted, free_throw_percentage,
        offensive_rebounds, defensive_rebounds, total_rebounds, assists, steals, blocks,
        turnovers, personal_fouls, points
    )
    VALUES %s
    ON CONFLICT (performance_id) DO NOTHING;
    """
    insert_data(connection, insert_query, values, "dim_player_performance")

except Exception as e:
    logger.error(f"An error occurred: {e}")
    if 'connection' in locals():
        connection.rollback()
finally:
    if 'connection' in locals():
        connection.close()
        logger.info("Database connection closed.")
