In [1]:
import pandas as pd

In [2]:

# Read all CSV files
ranking = pd.read_csv("ranking.csv", parse_dates=["STANDINGSDATE"])
teams = pd.read_csv("teams.csv")
players = pd.read_csv("players.csv")
players_data = pd.read_csv("players_data.csv")
games = pd.read_csv("games.csv", parse_dates=["GAME_DATE_EST"])
game_details = pd.read_csv("games_details.csv")


  game_details = pd.read_csv("games_details.csv")


In [3]:
# Step 1: Prepare dim_season (combine seasons from all datasets)
# From ranking.csv (SEASON_ID like 22022 -> 2022)
ranking['season_type'] = ranking['SEASON_ID'].astype(str).str[0]
ranking = ranking[ranking['season_type'] == '2'].copy()
ranking['season_year'] = ranking['SEASON_ID'].astype(str).str[1:].astype(int)

# From players.csv (SEASON like 2019)
players['season_year'] = players['SEASON'].astype(int)

# From players_data.csv (season like 1996-97)
players_data['season_year'] = players_data['season'].apply(lambda x: int(x.split('-')[0]))

# From games.csv (SEASON like 2022)
games['season_year'] = games['SEASON'].astype(int)

# Combine unique season years from all datasets
all_season_years = pd.concat([
    pd.Series(ranking['season_year'].unique()),
    pd.Series(players['season_year'].unique()),
    pd.Series(players_data['season_year'].unique()),
    pd.Series(games['season_year'].unique())
]).unique()
dim_season = pd.DataFrame(all_season_years, columns=['season_year'])
dim_season['season_id'] = dim_season.index + 1

# Convert 'season_id' and 'season_year' to Python int type (not numpy.int64)
dim_season['season_id'] = dim_season['season_id'].astype(int)
dim_season['season_year'] = dim_season['season_year'].astype(int)

In [4]:
dim_season

Unnamed: 0,season_year,season_id
0,2022,1
1,2021,2
2,2020,3
3,2019,4
4,2013,5
5,2012,6
6,2011,7
7,2010,8
8,2009,9
9,2008,10


In [5]:
# Map season_year to season_id in all datasets
ranking = ranking.merge(dim_season[['season_year', 'season_id']], on='season_year', how='left')
players = players.merge(dim_season[['season_year', 'season_id']], on='season_year', how='left')
players_data = players_data.merge(dim_season[['season_year', 'season_id']], on='season_year', how='left')
games = games.merge(dim_season[['season_year', 'season_id']], on='season_year', how='left')

In [6]:
# Step 2: Prepare dim_team from teams.csv
dim_team = teams[['TEAM_ID', 'ABBREVIATION', 'NICKNAME', 'YEARFOUNDED', 'MIN_YEAR', 'MAX_YEAR', 'CITY', 'ARENA', 'ARENACAPACITY', 'OWNER', 'GENERALMANAGER', 'HEADCOACH']].copy()
dim_team.rename(columns={
    'TEAM_ID': 'team_id',
    'ABBREVIATION': 'abbreviation',
    'NICKNAME': 'nickname',
    'YEARFOUNDED': 'year_founded',
    'MIN_YEAR': 'start_year',
    'MAX_YEAR': 'end_year',
    'CITY': 'city',
    'ARENA': 'arena',
    'ARENACAPACITY': 'arena_capacity',
    'OWNER': 'owner',
    'GENERALMANAGER': 'general_manager',
    'HEADCOACH': 'head_coach'
}, inplace=True)

# Ensure team_id is an integer
dim_team['team_id'] = dim_team['team_id'].astype(int)

# Ensure year_founded, start_year, end_year are integers
dim_team['year_founded'] = dim_team['year_founded'].astype(int)
dim_team['start_year'] = dim_team['start_year'].astype(int)
dim_team['end_year'] = dim_team['end_year'].astype(int)

# Fill NaN in arena_capacity with 0 and ensure it's an integer
dim_team['arena_capacity'] = dim_team['arena_capacity'].fillna(0).astype(int)

# Fill NaN in string columns with 'Unknown'
dim_team.fillna({
    'abbreviation': 'Unknown',
    'nickname': 'Unknown',
    'city': 'Unknown',
    'arena': 'Unknown',
    'owner': 'Unknown',
    'general_manager': 'Unknown',
    'head_coach': 'Unknown'
}, inplace=True)

# Add conference from ranking.csv
team_conference = ranking[['TEAM_ID', 'CONFERENCE']].drop_duplicates().copy()
team_conference.rename(columns={'TEAM_ID': 'team_id', 'CONFERENCE': 'conference'}, inplace=True)
team_conference = (team_conference.groupby('team_id')['conference']
                   .agg(lambda x: x.mode()[0] if not x.empty else 'Unknown')
                   .reset_index())
dim_team = dim_team.merge(team_conference, on='team_id', how='left')
dim_team['conference'] = dim_team['conference'].fillna('Unknown')

# Ensure team_id is unique
dim_team = dim_team.drop_duplicates(subset=['team_id'])


In [7]:
dim_team

Unnamed: 0,team_id,abbreviation,nickname,year_founded,start_year,end_year,city,arena,arena_capacity,owner,general_manager,head_coach,conference
0,1610612737,ATL,Hawks,1949,1949,2019,Atlanta,State Farm Arena,18729,Tony Ressler,Travis Schlenk,Lloyd Pierce,East
1,1610612738,BOS,Celtics,1946,1946,2019,Boston,TD Garden,18624,Wyc Grousbeck,Danny Ainge,Brad Stevens,East
2,1610612740,NOP,Pelicans,2002,2002,2019,New Orleans,Smoothie King Center,0,Tom Benson,Trajan Langdon,Alvin Gentry,East
3,1610612741,CHI,Bulls,1966,1966,2019,Chicago,United Center,21711,Jerry Reinsdorf,Gar Forman,Jim Boylen,East
4,1610612742,DAL,Mavericks,1980,1980,2019,Dallas,American Airlines Center,19200,Mark Cuban,Donnie Nelson,Rick Carlisle,West
5,1610612743,DEN,Nuggets,1976,1976,2019,Denver,Pepsi Center,19099,Stan Kroenke,Tim Connelly,Michael Malone,West
6,1610612745,HOU,Rockets,1967,1967,2019,Houston,Toyota Center,18104,Tilman Fertitta,Daryl Morey,Mike D'Antoni,West
7,1610612746,LAC,Clippers,1970,1970,2019,Los Angeles,Staples Center,19060,Steve Ballmer,Michael Winger,Doc Rivers,West
8,1610612747,LAL,Lakers,1948,1948,2019,Los Angeles,Staples Center,19060,Jerry Buss Family Trust,Rob Pelinka,Frank Vogel,West
9,1610612748,MIA,Heat,1988,1988,2019,Miami,AmericanAirlines Arena,19600,Micky Arison,Pat Riley,Erik Spoelstra,East


In [8]:
# Step 3: Prepare dim_ranking from ranking.csv
# Drop unnecessary columns
ranking.drop(columns=['RETURNTOPLAY', 'LEAGUE_ID', 'TEAM', 'W_PCT', 'HOME_RECORD', 'ROAD_RECORD'], inplace=True)

# Get the last entry for each TEAM_ID per season_year
latest_per_season = (ranking.sort_values(["TEAM_ID", "season_year", "STANDINGSDATE"])
                     .groupby(["TEAM_ID", "season_year"])
                     .last()
                     .reset_index())
# Step 4: Prepare dim_date (combine dates from ranking.csv and games.csv)
# Dates from ranking.csv
ranking_dates = pd.DataFrame(latest_per_season['STANDINGSDATE'].unique(), columns=['date'])

# Dates from games.csv
game_dates = pd.DataFrame(games['GAME_DATE_EST'].unique(), columns=['date'])

# Combine unique dates
all_dates = pd.concat([ranking_dates, game_dates]).drop_duplicates().reset_index(drop=True)
dim_date = pd.DataFrame(all_dates, columns=['date'])
dim_date['date'] = pd.to_datetime(dim_date['date'])
dim_date['year'] = dim_date['date'].dt.year.astype(int)
dim_date['month'] = dim_date['date'].dt.month.astype(int)
dim_date['day'] = dim_date['date'].dt.day.astype(int)
dim_date['date_id'] = dim_date.index + 1

# Map STANDINGSDATE to date_id
latest_per_season = latest_per_season.merge(dim_date[['date', 'date_id']], 
                                          left_on='STANDINGSDATE', 
                                          right_on='date', 
                                          how='left')

# Prepare dim_ranking with date_id
dim_ranking = latest_per_season[['TEAM_ID', 'season_id', 'CONFERENCE', 'G', 'W', 'L', 'date_id']].copy()
dim_ranking.rename(columns={
    'TEAM_ID': 'team_id',
    'CONFERENCE': 'conference',
    'G': 'game_played',
    'W': 'wins',
    'L': 'lose'
}, inplace=True)
dim_ranking['ranking_id'] = dim_ranking.index + 1

# Ensure numeric columns are integers
dim_ranking['team_id'] = dim_ranking['team_id'].astype(int)
dim_ranking['season_id'] = dim_ranking['season_id'].astype(int)
dim_ranking['game_played'] = dim_ranking['game_played'].astype(int)
dim_ranking['wins'] = dim_ranking['wins'].astype(int)
dim_ranking['lose'] = dim_ranking['lose'].astype(int)
dim_ranking['date_id'] = dim_ranking['date_id'].astype(int)
dim_ranking['ranking_id'] = dim_ranking['ranking_id'].astype(int)

# Fill any missing string columns
dim_ranking['conference'] = dim_ranking['conference'].fillna('Unknown')

# Drop temporary columns
dim_ranking.drop(columns=['SEASON_ID', 'season_year', 'season_type'], inplace=True, errors='ignore')


In [9]:
dim_ranking

Unnamed: 0,team_id,season_id,conference,game_played,wins,lose,date_id,ranking_id
0,1610612737,16,East,82,35,47,1,1
1,1610612737,15,East,82,28,54,2,2
2,1610612737,14,East,82,13,69,3,3
3,1610612737,13,East,82,26,56,4,4
4,1610612737,12,East,82,30,52,5,5
...,...,...,...,...,...,...,...,...
623,1610612766,17,East,82,39,43,17,624
624,1610612766,4,East,65,23,42,18,625
625,1610612766,3,East,72,33,39,19,626
626,1610612766,2,East,82,43,39,20,627


In [26]:
# Step 5: Prepare dim_player_static and dim_player_dynamic
# Prepare dim_player_static
dim_player_static = players[['PLAYER_ID', 'PLAYER_NAME']].drop_duplicates().copy()
dim_player_static.rename(columns={'PLAYER_ID': 'player_id', 'PLAYER_NAME': 'player_name'}, inplace=True)

# Ensure player_id is an integer
dim_player_static['player_id'] = dim_player_static['player_id'].astype(int)

# Merge with players_data to get static attributes
players_data_static = players_data[['player_name', 'player_height', 'college', 'country', 'draft_year', 'draft_round', 'draft_number']].drop_duplicates(subset=['player_name']).copy()
dim_player_static = dim_player_static.merge(players_data_static, on='player_name', how='left')

# Rename player_height to height and keep it as a numeric value (in cm)
dim_player_static.rename(columns={'player_height': 'height'}, inplace=True)

dim_player_static['draft_year'] = dim_player_static['draft_year'].replace('Undrafted', 0)
dim_player_static['draft_round'] = dim_player_static['draft_year'].replace('Undrafted', 0)
dim_player_static['draft_number'] = dim_player_static['draft_year'].replace('Undrafted', 0)

# Fill missing static attributes with defaults
dim_player_static.fillna({
    'draft_year': 0,
    'draft_round': 0,
    'draft_number': 0,
    'height': 0,
    'college': 'Unknown',
    'country': 'Unknown'
}, inplace=True)


# Ensure player_id is unique
dim_player_static = dim_player_static.drop_duplicates(subset=['player_id'])

# Prepare dim_player_dynamic
# From players.csv (no weight data)
dim_player_dynamic_players = players[['PLAYER_ID', 'TEAM_ID', 'season_id']].copy()
dim_player_dynamic_players.rename(columns={'PLAYER_ID': 'player_id', 'TEAM_ID': 'team_id'}, inplace=True)
dim_player_dynamic_players['weight'] = None
dim_player_dynamic_players['source'] = 'players'

# Ensure player_id and team_id are integers
dim_player_dynamic_players['player_id'] = dim_player_dynamic_players['player_id'].astype(int)
dim_player_dynamic_players['team_id'] = dim_player_dynamic_players['team_id'].astype(int)
dim_player_dynamic_players['season_id'] = dim_player_dynamic_players['season_id'].astype(int)

# From players_data.csv (has weight data)
dim_player_dynamic_data = players_data[['player_name', 'team_abbreviation', 'season_id', 'player_weight']].copy()
dim_player_dynamic_data.rename(columns={'player_weight': 'weight'}, inplace=True)

# Ensure weight is numeric
dim_player_dynamic_data['weight'] = pd.to_numeric(dim_player_dynamic_data['weight'], errors='coerce')

# Map player_name to player_id
dim_player_dynamic_data = dim_player_dynamic_data.merge(dim_player_static[['player_name', 'player_id']], on='player_name', how='left')

# Drop rows with NaN player_id (unmatched players)
dim_player_dynamic_data = dim_player_dynamic_data.dropna(subset=['player_id'])

# Ensure player_id is an integer after merge
dim_player_dynamic_data['player_id'] = dim_player_dynamic_data['player_id'].astype(int)

# Map team_abbreviation to team_id using dim_team
dim_player_dynamic_data = dim_player_dynamic_data.merge(dim_team[['abbreviation', 'team_id']], 
                                                        left_on='team_abbreviation', 
                                                        right_on='abbreviation', 
                                                        how='left')

# Fill NaN team_id with 0 (indicating unknown team)
dim_player_dynamic_data['team_id'] = dim_player_dynamic_data['team_id'].fillna(0).astype(int)

# Ensure season_id is an integer
dim_player_dynamic_data['season_id'] = dim_player_dynamic_data['season_id'].astype(int)

# Select columns for dim_player_dynamic
dim_player_dynamic_data = dim_player_dynamic_data[['player_id', 'team_id', 'season_id', 'weight']]
dim_player_dynamic_data['source'] = 'players_data'

# Combine both sources
if not dim_player_dynamic_data.empty:
    dim_player_dynamic = pd.concat([dim_player_dynamic_players, dim_player_dynamic_data], ignore_index=True)
else:
    print("\nWarning: dim_player_dynamic_data is empty after dropping unmatched players.")
    dim_player_dynamic = dim_player_dynamic_players.copy()

# Ensure player_id, team_id, and season_id are integers after concatenation
dim_player_dynamic['player_id'] = dim_player_dynamic['player_id'].astype(int)
dim_player_dynamic['team_id'] = dim_player_dynamic['team_id'].astype(int)
dim_player_dynamic['season_id'] = dim_player_dynamic['season_id'].astype(int)

# Clean dim_player_dynamic
dim_player_dynamic = dim_player_dynamic.drop_duplicates(subset=['player_id', 'team_id', 'season_id'])

# Fill missing weights with 0
dim_player_dynamic['weight'] = dim_player_dynamic['weight'].fillna(0)

# Drop the source column
dim_player_dynamic.drop(columns=['source'], inplace=True)

  dim_player_dynamic = pd.concat([dim_player_dynamic_players, dim_player_dynamic_data], ignore_index=True)


In [27]:
dim_player_dynamic

Unnamed: 0,player_id,team_id,season_id,weight
0,1626220,1610612762,4,0.000000
1,202711,1610612762,4,0.000000
2,203497,1610612762,4,0.000000
3,1628378,1610612762,4,0.000000
4,201144,1610612762,4,0.000000
...,...,...,...,...
16246,203925,1610612751,1,99.790240
16247,204060,1610612749,1,99.790240
16248,203954,1610612755,1,127.005760
16249,1628381,1610612737,1,102.511792


In [12]:
dim_player_static

Unnamed: 0,player_id,player_name,height,college,country,draft_year,draft_round,draft_number
0,1626220,Royce O'Neale,198.12,Baylor,USA,0,0,0
1,202711,Bojan Bogdanovic,203.20,Unknown,Croatia,2011,2011,2011
2,203497,Rudy Gobert,215.90,Unknown,France,2013,2013,2013
3,1628378,Donovan Mitchell,190.50,Louisville,USA,2017,2017,2017
4,201144,Mike Conley,185.42,Ohio State,USA,2007,2007,2007
...,...,...,...,...,...,...,...,...
1764,201831,Lanny Smith,0.00,Unknown,Unknown,0,0,0
1765,201999,Warren Carter,0.00,Unknown,Unknown,0,0,0
1766,201834,Bennet Davis,0.00,Unknown,Unknown,0,0,0
1767,201646,Brian Hamilton,0.00,Unknown,Unknown,0,0,0


In [13]:
# Step 6: Prepare fact_game from games.csv
fact_game = games[['GAME_ID', 'GAME_DATE_EST', 'season_id', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'PTS_home', 'PTS_away', 'HOME_TEAM_WINS']].copy()
fact_game.rename(columns={
    'GAME_ID': 'game_id',
    'HOME_TEAM_ID': 'home_team_id',
    'VISITOR_TEAM_ID': 'visitor_team_id',
    'PTS_home': 'home_points',
    'PTS_away': 'visitor_points',
    'HOME_TEAM_WINS': 'home_team_wins'
}, inplace=True)

print(fact_game.isna().sum())
print(fact_game.dtypes)

# Map GAME_DATE_EST to date_id
fact_game = fact_game.merge(dim_date[['date', 'date_id']], 
                            left_on='GAME_DATE_EST', 
                            right_on='date', 
                            how='left')

# Drop the date column
fact_game.drop(columns=['GAME_DATE_EST', 'date'], inplace=True)

# Ensure numeric columns are integers
# fact_game['game_id'] = fact_game['game_id'].astype(int)
# fact_game['season_id'] = fact_game['season_id'].astype(int)
# fact_game['home_team_id'] = fact_game['home_team_id'].astype(int)
# fact_game['visitor_team_id'] = fact_game['visitor_team_id'].astype(int)
# fact_game['home_points'] = fact_game['home_points'].astype(int)
# fact_game['visitor_points'] = fact_game['visitor_points'].astype(int)
# fact_game['date_id'] = fact_game['date_id'].astype(int)

# Ensure home_team_wins is boolean
fact_game['home_team_wins'] = fact_game['home_team_wins'].astype(bool)

# Step 7: Prepare dim_player_performance from game_details.csv
dim_player_performance = game_details[['GAME_ID', 'PLAYER_ID', 'TEAM_ID', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS']].copy()
dim_player_performance.rename(columns={
    'GAME_ID': 'game_id',
    'PLAYER_ID': 'player_id',
    'TEAM_ID': 'team_id',
    'MIN': 'minutes_played',
    'FGM': 'field_goals_made',
    'FGA': 'field_goals_attempted',
    'FG_PCT': 'field_goal_percentage',
    'FG3M': 'three_pointers_made',
    'FG3A': 'three_pointers_attempted',
    'FG3_PCT': 'three_pointer_percentage',
    'FTM': 'free_throws_made',
    'FTA': 'free_throws_attempted',
    'FT_PCT': 'free_throw_percentage',
    'OREB': 'offensive_rebounds',
    'DREB': 'defensive_rebounds',
    'REB': 'total_rebounds',
    'AST': 'assists',
    'STL': 'steals',
    'BLK': 'blocks',
    'TO': 'turnovers',
    'PF': 'personal_fouls',
    'PTS': 'points',
}, inplace=True)

# Generate performance_id
dim_player_performance['performance_id'] = dim_player_performance.index + 1

# Ensure numeric columns are integers where appropriate
# dim_player_performance['game_id'] = dim_player_performance['game_id'].astype(int)
# dim_player_performance['player_id'] = dim_player_performance['player_id'].astype(int)
# dim_player_performance['team_id'] = dim_player_performance['team_id'].astype(int)
# dim_player_performance['field_goals_made'] = dim_player_performance['field_goals_made'].fillna(0).astype(int)
# dim_player_performance['field_goals_attempted'] = dim_player_performance['field_goals_attempted'].fillna(0).astype(int)
# dim_player_performance['three_pointers_made'] = dim_player_performance['three_pointers_made'].fillna(0).astype(int)
# dim_player_performance['three_pointers_attempted'] = dim_player_performance['three_pointers_attempted'].fillna(0).astype(int)
# dim_player_performance['free_throws_made'] = dim_player_performance['free_throws_made'].fillna(0).astype(int)
# dim_player_performance['free_throws_attempted'] = dim_player_performance['free_throws_attempted'].fillna(0).astype(int)
# dim_player_performance['offensive_rebounds'] = dim_player_performance['offensive_rebounds'].fillna(0).astype(int)
# dim_player_performance['defensive_rebounds'] = dim_player_performance['defensive_rebounds'].fillna(0).astype(int)
# dim_player_performance['total_rebounds'] = dim_player_performance['total_rebounds'].fillna(0).astype(int)
# dim_player_performance['assists'] = dim_player_performance['assists'].fillna(0).astype(int)
# dim_player_performance['steals'] = dim_player_performance['steals'].fillna(0).astype(int)
# dim_player_performance['blocks'] = dim_player_performance['blocks'].fillna(0).astype(int)
# dim_player_performance['turnovers'] = dim_player_performance['turnovers'].fillna(0).astype(int)
# dim_player_performance['personal_fouls'] = dim_player_performance['personal_fouls'].fillna(0).astype(int)
# dim_player_performance['points'] = dim_player_performance['points'].fillna(0).astype(int)
# dim_player_performance['plus_minus'] = dim_player_performance['plus_minus'].fillna(0).astype(int)
# dim_player_performance['performance_id'] = dim_player_performance['performance_id'].astype(int)

# # Fill missing percentages with 0
# dim_player_performance['field_goal_percentage'] = dim_player_performance['field_goal_percentage'].fillna(0)
# dim_player_performance['three_pointer_percentage'] = dim_player_performance['three_pointer_percentage'].fillna(0)
# dim_player_performance['free_throw_percentage'] = dim_player_performance['free_throw_percentage'].fillna(0)

# Fill missing minutes_played with '0:00'
# dim_player_performance['minutes_played'] = dim_player_performance['minutes_played'].fillna('0:00')

# Ensure only valid player_id and team_id (foreign key constraints)
dim_player_performance = dim_player_performance[dim_player_performance['player_id'].isin(dim_player_static['player_id'])]
dim_player_performance = dim_player_performance[dim_player_performance['team_id'].isin(dim_team['team_id'])]


game_id             0
GAME_DATE_EST       0
season_id           0
home_team_id        0
visitor_team_id     0
home_points        99
visitor_points     99
home_team_wins      0
dtype: int64
game_id                     int64
GAME_DATE_EST      datetime64[ns]
season_id                   int32
home_team_id                int64
visitor_team_id             int64
home_points               float64
visitor_points            float64
home_team_wins              int64
dtype: object


In [14]:
fact_game

Unnamed: 0,game_id,season_id,home_team_id,visitor_team_id,home_points,visitor_points,home_team_wins,date_id
0,22200477,1,1610612740,1610612759,126.0,117.0,True,21
1,22200478,1,1610612762,1610612764,120.0,112.0,True,21
2,22200466,1,1610612739,1610612749,114.0,106.0,True,22
3,22200467,1,1610612755,1610612765,113.0,93.0,True,22
4,22200468,1,1610612737,1610612741,108.0,110.0,False,22
...,...,...,...,...,...,...,...,...
26646,11400007,21,1610612737,1610612740,93.0,87.0,True,4322
26647,11400004,21,1610612741,1610612764,81.0,85.0,False,4322
26648,11400005,21,1610612747,1610612743,98.0,95.0,True,4322
26649,11400002,21,1610612761,1610612758,99.0,94.0,True,4323


In [15]:
dim_player_performance

Unnamed: 0,game_id,player_id,team_id,minutes_played,field_goals_made,field_goals_attempted,field_goal_percentage,three_pointers_made,three_pointers_attempted,three_pointer_percentage,...,offensive_rebounds,defensive_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points,performance_id
0,22200477,1629641,1610612759,18:06,1.0,1.0,1.000,0.0,0.0,0.0,...,1.0,1.0,2.0,0.0,1.0,0.0,2.0,5.0,2.0,1
2,22200477,1627751,1610612759,21:42,6.0,9.0,0.667,0.0,0.0,0.0,...,1.0,3.0,4.0,1.0,1.0,0.0,2.0,4.0,13.0,3
5,22200477,1628380,1610612759,18:04,2.0,6.0,0.333,0.0,0.0,0.0,...,1.0,1.0,2.0,2.0,0.0,0.0,0.0,3.0,6.0,6
6,22200477,203926,1610612759,16:55,2.0,8.0,0.250,1.0,5.0,0.2,...,1.0,3.0,4.0,1.0,0.0,0.0,1.0,0.0,5.0,7
7,22200477,1626196,1610612759,21:20,5.0,11.0,0.455,2.0,5.0,0.4,...,1.0,2.0,3.0,3.0,0.0,0.0,1.0,2.0,14.0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
668623,11200005,202706,1610612743,19,4.0,9.0,0.444,3.0,6.0,0.5,...,0.0,2.0,2.0,0.0,2.0,0.0,1.0,3.0,17.0,668624
668624,11200005,202702,1610612743,23,7.0,11.0,0.636,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,3.0,3.0,18.0,668625
668625,11200005,201585,1610612743,15,3.0,7.0,0.429,0.0,0.0,0.0,...,3.0,5.0,8.0,0.0,1.0,0.0,0.0,3.0,6.0,668626
668626,11200005,202389,1610612743,19,1.0,1.0,1.000,0.0,0.0,0.0,...,1.0,2.0,3.0,1.0,0.0,0.0,4.0,2.0,2.0,668627


In [16]:
import psycopg2
import sqlalchemy


In [17]:
import psycopg2

# Replace with your PostgreSQL credentials
connection = psycopg2.connect(
    dbname="demo",
    user="postgres",
    password="0000",
    host="localhost",  # or use the host IP if not local
    port="5432"  # Default port
)

cursor = connection.cursor()

# Optional: Verify the connection by running a simple query (e.g., check PostgreSQL version)
cursor.execute("SELECT version();")
db_version = cursor.fetchone()
print("PostgreSQL version:", db_version)



PostgreSQL version: ('PostgreSQL 17.4 on x86_64-windows, compiled by msvc-19.43.34808, 64-bit',)


In [18]:

# SQL insert statement (ensure player_id is the primary key)
insert_query = """
INSERT INTO dim_player_static (player_id, player_name, height, college, country, draft_year, draft_round, draft_number)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (player_id) DO NOTHING;
"""

# Loop through the DataFrame and insert each row into PostgreSQL
for index, row in dim_player_static.iterrows():
    cursor.execute(insert_query, (
        row['player_id'], 
        row['player_name'], 
        row['height'], 
        row['college'], 
        row['country'], 
        row['draft_year'], 
        row['draft_round'], 
        row['draft_number']
    ))

# Commit the transaction to ensure the changes are saved
connection.commit()

# Close the cursor and connection after the operation is done
cursor.close()
connection.close()

print("Data has been successfully inserted into the PostgreSQL table.")


Data has been successfully inserted into the PostgreSQL table.


In [19]:
# Replace with your PostgreSQL credentials
connection = psycopg2.connect(
    dbname="demo",
    user="postgres",
    password="0000",
    host="localhost",  # or use the host IP if not local
    port="5432"  # Default port
)

cursor = connection.cursor()

# Optional: Verify the connection by running a simple query (e.g., check PostgreSQL version)
cursor.execute("SELECT version();")
db_version = cursor.fetchone()
print("PostgreSQL version:", db_version)


PostgreSQL version: ('PostgreSQL 17.4 on x86_64-windows, compiled by msvc-19.43.34808, 64-bit',)


In [20]:

# SQL Insert Query
insert_query = """
INSERT INTO dim_team (team_id, abbreviation, nickname, year_founded, start_year, end_year, city, arena, arena_capacity, owner, general_manager, head_coach)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (team_id) DO NOTHING;  -- To avoid inserting duplicates (use DO UPDATE if you want to update existing rows)
"""

# Loop through the DataFrame and insert each row into the PostgreSQL table
for index, row in dim_team.iterrows():
    try:
        cursor.execute(insert_query, (
            row['team_id'], 
            row['abbreviation'], 
            row['nickname'], 
            row['year_founded'], 
            row['start_year'], 
            row['end_year'], 
            row['city'], 
            row['arena'], 
            row['arena_capacity'], 
            row['owner'], 
            row['general_manager'], 
            row['head_coach']
        ))
    except Exception as e:
        print(f"Error inserting row {index}: {e}")
        connection.rollback()  # Rollback the current transaction on error
        continue  # Skip to the next row

# Commit the transaction to ensure the changes are saved
connection.commit()

# Close the cursor and connection after the operation is done
cursor.close()
connection.close()

print("Data has been successfully inserted into the PostgreSQL table.")


Data has been successfully inserted into the PostgreSQL table.


In [21]:
# Replace with your PostgreSQL credentials

import psycopg2
from psycopg2.extras import execute_values

connection = psycopg2.connect(
    dbname="demo",
    user="postgres",
    password="0000",
    host="localhost",  # or use the host IP if not local
    port="5432"  # Default port
)

cursor = connection.cursor()

# Convert 'season_id' and 'season_year' columns to native Python int
dim_season['season_id'] = dim_season['season_id'].map(int)
dim_season['season_year'] = dim_season['season_year'].map(int)

# Prepare a list of tuples for batch insert
values = [(int(row['season_id']), int(row['season_year'])) for index, row in dim_season.iterrows()]

# Insert multiple rows at once using execute_values
insert_query = """
INSERT INTO dim_season (season_id, season_year)
VALUES %s
ON CONFLICT (season_id) DO NOTHING;
"""

# Use execute_values for batch insert
execute_values(cursor, insert_query, values)

# Commit the transaction to ensure the changes are saved
connection.commit()

# Close the cursor and connection after the operation is done
cursor.close()
connection.close()

print("Data has been successfully inserted into the PostgreSQL table.")



Data has been successfully inserted into the PostgreSQL table.


In [22]:
dim_date

Unnamed: 0,date,year,month,day,date_id
0,2003-10-04,2003,10,4,1
1,2004-10-11,2004,10,11,2
2,2005-10-09,2005,10,9,3
3,2006-10-04,2006,10,4,4
4,2007-10-05,2007,10,5,5
...,...,...,...,...,...
4319,2014-10-08,2014,10,8,4320
4320,2014-10-07,2014,10,7,4321
4321,2014-10-06,2014,10,6,4322
4322,2014-10-05,2014,10,5,4323


In [23]:
connection = psycopg2.connect(
    dbname="demo",
    user="postgres",
    password="0000",
    host="localhost",  # or use the host IP if not local
    port="5432"  # Default port
)

cursor = connection.cursor()


# Convert 'date' column to datetime if not already done
dim_date['date'] = pd.to_datetime(dim_date['date'])

# Explicitly convert 'year', 'month', and 'day' to Python native int type
dim_date['year'] = dim_date['year'].apply(int)
dim_date['month'] = dim_date['month'].apply(int)
dim_date['day'] = dim_date['day'].apply(int)

# Prepare a list of tuples for batch insert
values = [(row['date'], row['year'], row['month'], row['day']) for index, row in dim_date.iterrows()]

# SQL Insert Query for dim_date
insert_query = """
INSERT INTO dim_date (date, year, month, day)
VALUES %s
ON CONFLICT (date_id) DO NOTHING;  -- To avoid inserting duplicates
"""

# Use execute_values for batch insert
execute_values(cursor, insert_query, values)

# Commit the transaction to ensure the changes are saved
connection.commit()

# Close the cursor and connection after the operation is done
cursor.close()
connection.close()

print("Data has been successfully inserted into the PostgreSQL table.")


Data has been successfully inserted into the PostgreSQL table.


In [31]:
dim_player_dynamic

Unnamed: 0,player_id,team_id,season_id,weight
0,1626220,1610612762,4,0.000000
1,202711,1610612762,4,0.000000
2,203497,1610612762,4,0.000000
3,1628378,1610612762,4,0.000000
4,201144,1610612762,4,0.000000
...,...,...,...,...
16246,203925,1610612751,1,99.790240
16247,204060,1610612749,1,99.790240
16248,203954,1610612755,1,127.005760
16249,1628381,1610612737,1,102.511792


In [39]:
# Escaping the backslashes
excel_path = "C:\\Users\\User\\Desktop\\uob jamil\\MS\\Spring 1 SAPIENZA\\data management\\project\\data\\jj.xlsx"

dim_player_dynamic.to_excel(excel_path, index=False)  # index=False to exclude the index column from being saved

print(f"Data has been saved to {excel_path}")


Data has been saved to C:\Users\User\Desktop\uob jamil\MS\Spring 1 SAPIENZA\data management\project\data\jj.xlsx


In [41]:
connection = psycopg2.connect(
    dbname="demo",
    user="postgres",
    password="0000",
    host="localhost",  # or use the host IP if not local
    port="5432"  # Default port
)

cursor = connection.cursor()
# Drop rows where 'team_id' is 0 using the drop method
dim_player_dynamic = dim_player_dynamic.drop(dim_player_dynamic[dim_player_dynamic['team_id'] == 0].index)

# Convert columns to native Python int type and weight to float
#dim_player_dynamic['player_id'] = dim_player_dynamic['player_id'].apply(int)
#dim_player_dynamic['team_id'] = dim_player_dynamic['team_id'].apply(int)
#dim_player_dynamic['season'] = dim_player_dynamic['season'].apply(int)
#dim_player_dynamic['weight'] = dim_player_dynamic['weight'].apply(float)

# Prepare a list of tuples for batch insert
values = [(row['player_id'], row['team_id'], row['season_id'], row['weight']) for index, row in dim_player_dynamic.iterrows()]

# SQL Insert Query for dim_player_dynamic
insert_query = """
INSERT INTO dim_player_dynamic (player_id, team_id, season_id, weight)
VALUES %s
ON CONFLICT (player_id, team_id, season_id) DO NOTHING;  -- To avoid inserting duplicates
"""

# Use execute_values for batch insert
execute_values(cursor, insert_query, values)

# Commit the transaction to ensure the changes are saved
connection.commit()

# Close the cursor and connection after the operation is done
cursor.close()
connection.close()

print("Data has been successfully inserted into the PostgreSQL table.")


Data has been successfully inserted into the PostgreSQL table.
