In [1]:
import pandas as pd
import os

In [2]:
#read in csv
csv_file_path = os.path.join("..","..","Resources","mlb-2023-UTC.csv")
mlb_df = pd.read_csv(csv_file_path)
mlb_df.head()

Unnamed: 0,Match Number,Round Number,Date,Location,Home Team,Away Team,Result
0,1,1,30/03/2023 17:05,Nationals Park,Washington Nationals,Atlanta Braves,2 - 7
1,2,1,30/03/2023 17:05,Yankee Stadium,New York Yankees,San Francisco Giants,5 - 0
2,3,1,30/03/2023 18:10,Fenway Park,Boston Red Sox,Baltimore Orioles,9 - 10
3,4,1,30/03/2023 18:20,Wrigley Field,Chicago Cubs,Milwaukee Brewers,4 - 0
4,5,1,30/03/2023 19:10,Tropicana Field,Tampa Bay Rays,Detroit Tigers,4 - 0


In [3]:
#Split date column into date and time
mlb_df[['Date', 'Time']] = mlb_df['Date'].str.split(pat=' ', n=1, expand=True)

mlb_df.head()

Unnamed: 0,Match Number,Round Number,Date,Location,Home Team,Away Team,Result,Time
0,1,1,30/03/2023,Nationals Park,Washington Nationals,Atlanta Braves,2 - 7,17:05
1,2,1,30/03/2023,Yankee Stadium,New York Yankees,San Francisco Giants,5 - 0,17:05
2,3,1,30/03/2023,Fenway Park,Boston Red Sox,Baltimore Orioles,9 - 10,18:10
3,4,1,30/03/2023,Wrigley Field,Chicago Cubs,Milwaukee Brewers,4 - 0,18:20
4,5,1,30/03/2023,Tropicana Field,Tampa Bay Rays,Detroit Tigers,4 - 0,19:10


In [4]:
#change date format
mlb_df['Date'] = pd.to_datetime(mlb_df['Date'], format='%d/%m/%Y')
mlb_df['Date'] = mlb_df['Date'].dt.strftime('%m/%d/%Y')
mlb_df.head()

Unnamed: 0,Match Number,Round Number,Date,Location,Home Team,Away Team,Result,Time
0,1,1,03/30/2023,Nationals Park,Washington Nationals,Atlanta Braves,2 - 7,17:05
1,2,1,03/30/2023,Yankee Stadium,New York Yankees,San Francisco Giants,5 - 0,17:05
2,3,1,03/30/2023,Fenway Park,Boston Red Sox,Baltimore Orioles,9 - 10,18:10
3,4,1,03/30/2023,Wrigley Field,Chicago Cubs,Milwaukee Brewers,4 - 0,18:20
4,5,1,03/30/2023,Tropicana Field,Tampa Bay Rays,Detroit Tigers,4 - 0,19:10


In [5]:
#changing time zone to central time zone (CST)
mlb_df['Time'] = pd.to_datetime(mlb_df['Time'], utc=True)
# Convert the time column to CST time zone
mlb_df['CST Gametime'] = mlb_df['Time'].dt.tz_convert('America/Chicago')

# Only show the time in 12 hr format
mlb_df['CST Gametime'] = mlb_df['CST Gametime'].apply(lambda x: x.strftime('%I:%M %p'))

# Print the DataFrame
mlb_df.head()

Unnamed: 0,Match Number,Round Number,Date,Location,Home Team,Away Team,Result,Time,CST Gametime
0,1,1,03/30/2023,Nationals Park,Washington Nationals,Atlanta Braves,2 - 7,2023-08-29 17:05:00+00:00,12:05 PM
1,2,1,03/30/2023,Yankee Stadium,New York Yankees,San Francisco Giants,5 - 0,2023-08-29 17:05:00+00:00,12:05 PM
2,3,1,03/30/2023,Fenway Park,Boston Red Sox,Baltimore Orioles,9 - 10,2023-08-29 18:10:00+00:00,01:10 PM
3,4,1,03/30/2023,Wrigley Field,Chicago Cubs,Milwaukee Brewers,4 - 0,2023-08-29 18:20:00+00:00,01:20 PM
4,5,1,03/30/2023,Tropicana Field,Tampa Bay Rays,Detroit Tigers,4 - 0,2023-08-29 19:10:00+00:00,02:10 PM


In [6]:
#Drop old time column
mlb_df = mlb_df.drop(columns=['Time'])
mlb_df.head()

Unnamed: 0,Match Number,Round Number,Date,Location,Home Team,Away Team,Result,CST Gametime
0,1,1,03/30/2023,Nationals Park,Washington Nationals,Atlanta Braves,2 - 7,12:05 PM
1,2,1,03/30/2023,Yankee Stadium,New York Yankees,San Francisco Giants,5 - 0,12:05 PM
2,3,1,03/30/2023,Fenway Park,Boston Red Sox,Baltimore Orioles,9 - 10,01:10 PM
3,4,1,03/30/2023,Wrigley Field,Chicago Cubs,Milwaukee Brewers,4 - 0,01:20 PM
4,5,1,03/30/2023,Tropicana Field,Tampa Bay Rays,Detroit Tigers,4 - 0,02:10 PM


In [7]:
# Rename column
new_column_name = 'Week Number'
mlb_df.rename(columns={'Round Number': new_column_name}, inplace=True)
mlb_df.head()

Unnamed: 0,Match Number,Week Number,Date,Location,Home Team,Away Team,Result,CST Gametime
0,1,1,03/30/2023,Nationals Park,Washington Nationals,Atlanta Braves,2 - 7,12:05 PM
1,2,1,03/30/2023,Yankee Stadium,New York Yankees,San Francisco Giants,5 - 0,12:05 PM
2,3,1,03/30/2023,Fenway Park,Boston Red Sox,Baltimore Orioles,9 - 10,01:10 PM
3,4,1,03/30/2023,Wrigley Field,Chicago Cubs,Milwaukee Brewers,4 - 0,01:20 PM
4,5,1,03/30/2023,Tropicana Field,Tampa Bay Rays,Detroit Tigers,4 - 0,02:10 PM


In [8]:
#drop Match Number column
mlb_df = mlb_df.drop(columns=["Match Number"])
mlb_df.head()

Unnamed: 0,Week Number,Date,Location,Home Team,Away Team,Result,CST Gametime
0,1,03/30/2023,Nationals Park,Washington Nationals,Atlanta Braves,2 - 7,12:05 PM
1,1,03/30/2023,Yankee Stadium,New York Yankees,San Francisco Giants,5 - 0,12:05 PM
2,1,03/30/2023,Fenway Park,Boston Red Sox,Baltimore Orioles,9 - 10,01:10 PM
3,1,03/30/2023,Wrigley Field,Chicago Cubs,Milwaukee Brewers,4 - 0,01:20 PM
4,1,03/30/2023,Tropicana Field,Tampa Bay Rays,Detroit Tigers,4 - 0,02:10 PM


In [10]:
# Read in `team` csv so appropriate IDs can be added
team_path = os.path.join("..","..","Resources","team.csv")
team_df = pd.read_csv(team_path)

# Take only mlb teams and reset index

mlb_teams_df = team_df[team_df['league_id'] == 4].reset_index()

mlb_teams_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   index      30 non-null     int64 
 1   team_id    30 non-null     int64 
 2   team       30 non-null     object
 3   venue_id   30 non-null     int64 
 4   league_id  30 non-null     int64 
dtypes: int64(4), object(1)
memory usage: 1.3+ KB


In [11]:
# Confirm all team names match with mlb_teams_df
team_nf = []

for i in range(len(mlb_df)):
    home_team = mlb_df.loc[i, 'Home Team']
    away_team = mlb_df.loc[i, 'Away Team']
    
    if home_team not in list(mlb_teams_df['team']):
        if home_team not in team_nf:
            print(f"{home_team} not found. Adding to tracker...")
            team_nf.append(home_team)
    
    if away_team not in list(mlb_teams_df['team']):
        if away_team not in team_nf:
            print(f"{away_team} not found. Adding to tracker...")
            team_nf.append(away_team)
            
print(team_nf)

[]


In [12]:
# Assign each row a 'home_id', 'away_id', and 'venue_id'
mlb_df['home_id'] = ''
mlb_df['away_id'] = ''
mlb_df['venue_id'] = ''

for i in range(len(mlb_df)):
    home_team = mlb_teams_df[mlb_teams_df['team'] == mlb_df.loc[i, 'Home Team']]
    away_team = mlb_teams_df[mlb_teams_df['team'] == mlb_df.loc[i, 'Away Team']]
    
    mlb_df.loc[i, 'home_id'] = int(home_team['team_id'])
    mlb_df.loc[i, 'venue_id'] = int(home_team['venue_id'])
    mlb_df.loc[i, 'away_id'] = int(away_team['team_id'])

mlb_df
    

Unnamed: 0,Week Number,Date,Location,Home Team,Away Team,Result,CST Gametime,home_id,away_id,venue_id
0,1,03/30/2023,Nationals Park,Washington Nationals,Atlanta Braves,2 - 7,12:05 PM,111,122,98
1,1,03/30/2023,Yankee Stadium,New York Yankees,San Francisco Giants,5 - 0,12:05 PM,124,113,111
2,1,03/30/2023,Fenway Park,Boston Red Sox,Baltimore Orioles,9 - 10,01:10 PM,104,114,91
3,1,03/30/2023,Wrigley Field,Chicago Cubs,Milwaukee Brewers,4 - 0,01:20 PM,123,95,110
4,1,03/30/2023,Tropicana Field,Tampa Bay Rays,Detroit Tigers,4 - 0,02:10 PM,121,101,108
...,...,...,...,...,...,...,...,...,...,...
2425,27,10/01/2023,Citi Field,New York Mets,Philadelphia Phillies,,02:10 PM,99,100,86
2426,27,10/01/2023,Guaranteed Rate Field,Chicago White Sox,San Diego Padres,,02:10 PM,107,115,94
2427,27,10/01/2023,Kauffman Stadium,Kansas City Royals,New York Yankees,,02:10 PM,108,124,95
2428,27,10/01/2023,American Family Field,Milwaukee Brewers,Chicago Cubs,,02:10 PM,95,123,82


In [13]:
mlb_formatted_df = mlb_df[["Date", "CST Gametime", "home_id", "away_id", "venue_id"]]
mlb_formatted_df = mlb_formatted_df.rename(columns={"Date":"event_date",
                                                   "CST Gametime": "event_time_cst"})
mlb_formatted_df.head()

Unnamed: 0,event_date,event_time_cst,home_id,away_id,venue_id
0,03/30/2023,12:05 PM,111,122,98
1,03/30/2023,12:05 PM,124,113,111
2,03/30/2023,01:10 PM,104,114,91
3,03/30/2023,01:20 PM,123,95,110
4,03/30/2023,02:10 PM,121,101,108


In [14]:
output_path = os.path.join("..","..","Resources","mlb_events.csv")
mlb_formatted_df.to_csv(output_path)