In [65]:
# Web Pages
# MLB - https://www.baseball-reference.com/leagues/MLB-schedule.shtml#today
# NFL - https://www.pro-football-reference.com/years/2023/games.htm
# NBA - https://www.basketball-reference.com/leagues/NBA_2024_games.html
# NHL - https://www.hockey-reference.com/leagues/NHL_2024_games.html
# MLS - https://fbref.com/en/comps/22/schedule/Major-League-Soccer-Scores-and-Fixtures

In [14]:
!pip install webdriver-manager

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.0-py2.py3-none-any.whl (27 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-1.0.0 webdriver-manager-4.0.0


In [1]:
import pandas as pd
import datetime as dt
from bs4 import BeautifulSoup
from splinter import Browser
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

# MLB

In [2]:
# Read in MLB schedule data with BeautifulSoup
browser = Browser('chrome', service=ChromeService(ChromeDriverManager().install()))
browser.visit('https://www.baseball-reference.com/leagues/MLB-schedule.shtml#today')

mlb_html = browser.html
mlb_soup = BeautifulSoup(mlb_html, 'html.parser')

In [4]:
full_schedule = mlb_soup.find('div', class_='section_content')

date_divs = full_schedule.find_all('div')

In [5]:
passed_today = 0
mlb_schedule_dict = {
    'date':[],
    'time':[],
    'away':[],
    'home':[]
}

for div in date_divs:
    date = div.find('h3')
    if date.find('span'):
        passed_today = 1
    if passed_today == 1:
        games = div.find_all(class_='game')
        for game in games:
            mlb_schedule_dict['date'].append(date.text)
            time = game.find('span').text
            mlb_schedule_dict['time'].append(time)
            teams = game.find_all('a')
            mlb_schedule_dict['away'].append(teams[0].text)
            mlb_schedule_dict['home'].append(teams[1].text)

In [83]:
mlb = pd.DataFrame(mlb_schedule_dict)
print(mlb.info())
mlb.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 376 entries, 0 to 375
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    376 non-null    object
 1   time    376 non-null    object
 2   away    376 non-null    object
 3   home    376 non-null    object
dtypes: object(4)
memory usage: 11.9+ KB
None


Unnamed: 0,date,time,away,home
0,Today's Games,2:10 pm,Chicago White Sox,Kansas City Royals
1,Today's Games,2:20 pm,San Francisco Giants,Chicago Cubs
2,Today's Games,4:05 pm,Houston Astros,Texas Rangers
3,Today's Games,4:07 pm,Toronto Blue Jays,Oakland Athletics
4,Today's Games,4:10 pm,Colorado Rockies,Arizona D'Backs


In [84]:
# CLEANING

# Combine 'date' and 'time' columns
mlb.loc[mlb['date'] == "Today's Games", 'date'] = "Monday, September 4, 2023"

# Update remaining dates
for i, row in mlb.iterrows():
    datetime_string = f"{row['date']} {row['time']}"

    datetime = dt.datetime.strptime(datetime_string, '%A, %B %d, %Y %I:%M %p')
    mlb.loc[i,'datetime'] = datetime
    
mlb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 376 entries, 0 to 375
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      376 non-null    object        
 1   time      376 non-null    object        
 2   away      376 non-null    object        
 3   home      376 non-null    object        
 4   datetime  376 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 14.8+ KB


In [151]:
# Bring in 'team.csv'

team_path = "../../Resources/team.csv"
team_df = pd.read_csv(team_path)

In [93]:
# Confirm all team names match with teams_mlb
teams_mlb = team_df[team_df['league_id'] == 4].reset_index()
team_nf = []

for i, row in mlb.iterrows():
    home_team = mlb.loc[i, 'home']
    away_team = mlb.loc[i, 'away']
    
    if home_team not in list(teams_mlb['team']):
        if home_team not in team_nf:
            print(f"{home_team} not found. Adding to tracker...")
            team_nf.append(home_team)
    
    if away_team not in list(teams_mlb['team']):
        if away_team not in team_nf:
            print(f"{away_team} not found. Adding to tracker...")
            team_nf.append(away_team)
            
print(team_nf)

Arizona D'Backs not found. Adding to tracker...
["Arizona D'Backs"]


In [98]:
# Update "Arizona D'Backs" to "Arizona Diamondbacks"
mlb = mlb.replace("Arizona D'Backs", "Arizona Diamondbacks")

In [99]:
# Assign each row a 'home_id', 'away_id', and 'venue_id'
mlb['home_id'] = ''
mlb['away_id'] = ''
mlb['venue_id'] = ''

for i, row in mlb.iterrows():
    home_team = teams_mlb[teams_mlb['team'] == mlb.loc[i, 'home']]
    away_team = teams_mlb[teams_mlb['team'] == mlb.loc[i, 'away']]
    
    mlb.loc[i, 'home_id'] = int(home_team['team_id'])
    mlb.loc[i, 'venue_id'] = int(home_team['venue_id'])
    mlb.loc[i, 'away_id'] = int(away_team['team_id'])

mlb.head()

Unnamed: 0,date,time,away,home,datetime,home_id,away_id,venue_id
0,"Monday, September 4, 2023",2:10 pm,Chicago White Sox,Kansas City Royals,2023-09-04 14:10:00,108,107,95
1,"Monday, September 4, 2023",2:20 pm,San Francisco Giants,Chicago Cubs,2023-09-04 14:20:00,123,113,110
2,"Monday, September 4, 2023",4:05 pm,Houston Astros,Texas Rangers,2023-09-04 16:05:00,105,110,92
3,"Monday, September 4, 2023",4:07 pm,Toronto Blue Jays,Oakland Athletics,2023-09-04 16:07:00,112,118,99
4,"Monday, September 4, 2023",4:10 pm,Colorado Rockies,Arizona Diamondbacks,2023-09-04 16:10:00,98,102,85


In [100]:
# remove unwanted columns 
cols = ['home_id','away_id','venue_id','datetime']
mlb_formatted = mlb[cols]
mlb_formatted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 376 entries, 0 to 375
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   home_id   376 non-null    object        
 1   away_id   376 non-null    object        
 2   venue_id  376 non-null    object        
 3   datetime  376 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 11.9+ KB


# NFL

In [107]:
nfl = pd.read_html('https://www.pro-football-reference.com/years/2023/games.htm')[0]

nfl.head()

Unnamed: 0,Week,Day,Unnamed: 2,VisTm,Pts,Unnamed: 5,HomeTm,Pts.1,Time
0,Pre0,Thu,August 3,New York Jets,16,@,Cleveland Browns,21,8:00 PM
1,Pre1,Thu,August 10,Houston Texans,20,@,New England Patriots,9,7:00 PM
2,Pre1,Thu,August 10,Minnesota Vikings,13,@,Seattle Seahawks,24,10:00 PM
3,Pre1,Fri,August 11,Green Bay Packers,36,@,Cincinnati Bengals,19,7:00 PM
4,Pre1,Fri,August 11,New York Giants,16,@,Detroit Lions,21,7:00 PM


In [121]:
# Drop unwanted rows and columns

# Rows
nfl = nfl.drop(nfl[nfl['Week'].str.contains('(Pre\d|Week)')].index).reset_index()

# Columns
nfl = nfl[['Unnamed: 2','Time','VisTm','HomeTm']]
nfl = nfl.rename(columns={
    'Unnamed: 2':'date',
    'Time':'time',
    'VisTm':'away',
    'HomeTm':'home'
})

nfl.head(30)

  nfl = nfl.drop(nfl[nfl['Week'].str.contains('(Pre\d|Week)')].index).reset_index()


Unnamed: 0,date,time,away,home
0,September 7,8:20 PM,Detroit Lions,Kansas City Chiefs
1,September 10,1:00 PM,Carolina Panthers,Atlanta Falcons
2,September 10,1:00 PM,Cincinnati Bengals,Cleveland Browns
3,September 10,1:00 PM,Jacksonville Jaguars,Indianapolis Colts
4,September 10,1:00 PM,Tampa Bay Buccaneers,Minnesota Vikings
5,September 10,1:00 PM,Tennessee Titans,New Orleans Saints
6,September 10,1:00 PM,San Francisco 49ers,Pittsburgh Steelers
7,September 10,1:00 PM,Houston Texans,Baltimore Ravens
8,September 10,1:00 PM,Arizona Cardinals,Washington Commanders
9,September 10,4:25 PM,Green Bay Packers,Chicago Bears


In [122]:
# Combine dates
for i, row in nfl.iterrows():
    datetime_string = f"{row['date']}, 2023 {row['time']}"

    datetime = dt.datetime.strptime(datetime_string, '%B %d, %Y %I:%M %p')
    nfl.loc[i,'datetime'] = datetime
    
nfl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      272 non-null    object        
 1   time      272 non-null    object        
 2   away      272 non-null    object        
 3   home      272 non-null    object        
 4   datetime  272 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 10.8+ KB


In [126]:
# Confirm all team names match with teams_mlb
teams = team_df[team_df['league_id'] == 2].reset_index()
team_nf = []

for i, row in nfl.iterrows():
    home_team = row['home']
    away_team = row['away']
    
    if home_team not in list(teams['team']):
        if home_team not in team_nf:
            print(f"{home_team} not found. Adding to tracker...")
            team_nf.append(home_team)
    
    if away_team not in list(teams['team']):
        if away_team not in team_nf:
            print(f"{away_team} not found. Adding to tracker...")
            team_nf.append(away_team)
            
print(team_nf)

[]


In [127]:
# Assign each row a 'home_id', 'away_id', and 'venue_id'
nfl['home_id'] = ''
nfl['away_id'] = ''
nfl['venue_id'] = ''

for i, row in nfl.iterrows():
    home_team = teams[teams['team'] == row['home']]
    away_team = teams[teams['team'] == row['away']]
    
    nfl.loc[i, 'home_id'] = int(home_team['team_id'])
    nfl.loc[i, 'venue_id'] = int(home_team['venue_id'])
    nfl.loc[i, 'away_id'] = int(away_team['team_id'])

nfl.head()

Unnamed: 0,date,time,away,home,datetime,home_id,away_id,venue_id
0,September 7,8:20 PM,Detroit Lions,Kansas City Chiefs,2023-09-07 20:20:00,32,39,31
1,September 10,1:00 PM,Carolina Panthers,Atlanta Falcons,2023-09-10 13:00:00,50,34,49
2,September 10,1:00 PM,Cincinnati Bengals,Cleveland Browns,2023-09-10 13:00:00,38,55,37
3,September 10,1:00 PM,Jacksonville Jaguars,Indianapolis Colts,2023-09-10 13:00:00,47,61,46
4,September 10,1:00 PM,Tampa Bay Buccaneers,Minnesota Vikings,2023-09-10 13:00:00,62,56,59


In [128]:
# remove unwanted columns 
cols = ['home_id','away_id','venue_id','datetime']
nfl_formatted = nfl[cols]
nfl_formatted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   home_id   272 non-null    object        
 1   away_id   272 non-null    object        
 2   venue_id  272 non-null    object        
 3   datetime  272 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 8.6+ KB


# NBA

In [137]:
nba = pd.read_html('https://www.basketball-reference.com/leagues/NBA_2024_games.html')[0]

nba.head()

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Arena,Notes
0,"Tue, Oct 24, 2023",7:30p,Los Angeles Lakers,,Denver Nuggets,,,,,Ball Arena,
1,"Tue, Oct 24, 2023",10:00p,Phoenix Suns,,Golden State Warriors,,,,,Chase Center,
2,"Wed, Oct 25, 2023",7:00p,Houston Rockets,,Orlando Magic,,,,,Amway Center,
3,"Wed, Oct 25, 2023",7:00p,Boston Celtics,,New York Knicks,,,,,Madison Square Garden (IV),
4,"Wed, Oct 25, 2023",7:00p,Washington Wizards,,Indiana Pacers,,,,,Gainbridge Fieldhouse,


In [138]:
# Drop unwanted rows and columns

# Rows
nba = nba.drop(nba[nba['Date'] == 'Date'].index).reset_index()

# Columns
nba = nba[['Date','Start (ET)','Visitor/Neutral','Home/Neutral']]
nba = nba.rename(columns={
    'Date':'date',
    'Start (ET)':'time',
    'Visitor/Neutral':'away',
    'Home/Neutral':'home'
})

nba.head(30)

Unnamed: 0,date,time,away,home
0,"Tue, Oct 24, 2023",7:30p,Los Angeles Lakers,Denver Nuggets
1,"Tue, Oct 24, 2023",10:00p,Phoenix Suns,Golden State Warriors
2,"Wed, Oct 25, 2023",7:00p,Houston Rockets,Orlando Magic
3,"Wed, Oct 25, 2023",7:00p,Boston Celtics,New York Knicks
4,"Wed, Oct 25, 2023",7:00p,Washington Wizards,Indiana Pacers
5,"Wed, Oct 25, 2023",7:00p,Atlanta Hawks,Charlotte Hornets
6,"Wed, Oct 25, 2023",7:30p,Detroit Pistons,Miami Heat
7,"Wed, Oct 25, 2023",7:30p,Minnesota Timberwolves,Toronto Raptors
8,"Wed, Oct 25, 2023",7:30p,Cleveland Cavaliers,Brooklyn Nets
9,"Wed, Oct 25, 2023",8:00p,New Orleans Pelicans,Memphis Grizzlies


In [139]:
# Confirm all team names match with teams_mlb
teams = team_df[team_df['league_id'] == 1].reset_index()
team_nf = []

for i, row in nba.iterrows():
    home_team = row['home']
    away_team = row['away']
    
    if home_team not in list(teams['team']):
        if home_team not in team_nf:
            print(f"{home_team} not found. Adding to tracker...")
            team_nf.append(home_team)
    
    if away_team not in list(teams['team']):
        if away_team not in team_nf:
            print(f"{away_team} not found. Adding to tracker...")
            team_nf.append(away_team)
            
print(team_nf)

[]


In [140]:
# Combine dates
for i, row in nba.iterrows():
    datetime_string = f"{row['date']} {row['time']}m"

    datetime = dt.datetime.strptime(datetime_string, '%a, %b %d, %Y %I:%M%p')
    nba.loc[i,'datetime'] = datetime
    
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      54 non-null     object        
 1   time      54 non-null     object        
 2   away      54 non-null     object        
 3   home      54 non-null     object        
 4   datetime  54 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 2.2+ KB


In [141]:
# Assign each row a 'home_id', 'away_id', and 'venue_id'
nba['home_id'] = ''
nba['away_id'] = ''
nba['venue_id'] = ''

for i, row in nba.iterrows():
    home_team = teams[teams['team'] == row['home']]
    away_team = teams[teams['team'] == row['away']]
    
    nba.loc[i, 'home_id'] = int(home_team['team_id'])
    nba.loc[i, 'venue_id'] = int(home_team['venue_id'])
    nba.loc[i, 'away_id'] = int(away_team['team_id'])

nba.head()

Unnamed: 0,date,time,away,home,datetime,home_id,away_id,venue_id
0,"Tue, Oct 24, 2023",7:30p,Los Angeles Lakers,Denver Nuggets,2023-10-24 19:30:00,4,9,4
1,"Tue, Oct 24, 2023",10:00p,Phoenix Suns,Golden State Warriors,2023-10-24 22:00:00,7,12,7
2,"Wed, Oct 25, 2023",7:00p,Houston Rockets,Orlando Magic,2023-10-25 19:00:00,2,27,2
3,"Wed, Oct 25, 2023",7:00p,Boston Celtics,New York Knicks,2023-10-25 19:00:00,17,26,16
4,"Wed, Oct 25, 2023",7:00p,Washington Wizards,Indiana Pacers,2023-10-25 19:00:00,14,6,13


In [142]:
# remove unwanted columns 
cols = ['home_id','away_id','venue_id','datetime']
nba_formatted = nba[cols]
nba_formatted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   home_id   54 non-null     object        
 1   away_id   54 non-null     object        
 2   venue_id  54 non-null     object        
 3   datetime  54 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 1.8+ KB


# NHL

In [164]:
nhl = pd.read_html('https://www.hockey-reference.com/leagues/NHL_2024_games.html')[0]
nhl.head()

Unnamed: 0,Date,Visitor,G,Home,G.1,Unnamed: 5,Att.,LOG,Notes
0,2023-10-10,Chicago Blackhawks,,Pittsburgh Penguins,,,,,
1,2023-10-10,Nashville Predators,,Tampa Bay Lightning,,,,,
2,2023-10-10,Seattle Kraken,,Vegas Golden Knights,,,,,
3,2023-10-11,Chicago Blackhawks,,Boston Bruins,,,,,
4,2023-10-11,Ottawa Senators,,Carolina Hurricanes,,,,,


In [165]:
# Drop unwanted rows and columns

# Rows
nhl = nhl.drop(nhl[nhl['Date'] == 'Date'].index).reset_index()

# Columns
nhl = nhl[['Date','Visitor','Home']]
nhl = nhl.rename(columns={
    'Date':'date',
    'Visitor':'away',
    'Home':'home'
})
nhl.head()

Unnamed: 0,date,away,home
0,2023-10-10,Chicago Blackhawks,Pittsburgh Penguins
1,2023-10-10,Nashville Predators,Tampa Bay Lightning
2,2023-10-10,Seattle Kraken,Vegas Golden Knights
3,2023-10-11,Chicago Blackhawks,Boston Bruins
4,2023-10-11,Ottawa Senators,Carolina Hurricanes


In [166]:
# Confirm all team names match with teams_mlb
teams = team_df[team_df['league_id'] == 3].reset_index()
team_nf = []

for i, row in nhl.iterrows():
    home_team = row['home']
    away_team = row['away']
    
    if home_team not in list(teams['team']):
        if home_team not in team_nf:
            print(f"{home_team} not found. Adding to tracker...")
            team_nf.append(home_team)
    
    if away_team not in list(teams['team']):
        if away_team not in team_nf:
            print(f"{away_team} not found. Adding to tracker...")
            team_nf.append(away_team)
            
print(team_nf)

[]


In [167]:
# Combine dates
for i, row in nhl.iterrows():
    #datetime_string = f"{row['date']} {row['time']}m"

    datetime = dt.datetime.strptime(row['date'], '%Y-%m-%d')
    nhl.loc[i,'datetime'] = datetime
    
nhl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1312 entries, 0 to 1311
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      1312 non-null   object        
 1   away      1312 non-null   object        
 2   home      1312 non-null   object        
 3   datetime  1312 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 41.1+ KB


In [168]:
for i, row in nhl.iterrows():
    home_team = teams[teams['team'] == row['home']]
    away_team = teams[teams['team'] == row['away']]
    
    nhl.loc[i, 'home_id'] = int(home_team['team_id'])
    nhl.loc[i, 'venue_id'] = int(home_team['venue_id'])
    nhl.loc[i, 'away_id'] = int(away_team['team_id'])

nhl.head()

Unnamed: 0,date,away,home,datetime,home_id,venue_id,away_id
0,2023-10-10,Chicago Blackhawks,Pittsburgh Penguins,2023-10-10,82.0,73.0,92.0
1,2023-10-10,Nashville Predators,Tampa Bay Lightning,2023-10-10,63.0,60.0,67.0
2,2023-10-10,Seattle Kraken,Vegas Golden Knights,2023-10-10,90.0,79.0,71.0
3,2023-10-11,Chicago Blackhawks,Boston Bruins,2023-10-11,89.0,25.0,92.0
4,2023-10-11,Ottawa Senators,Carolina Hurricanes,2023-10-11,81.0,72.0,69.0


In [169]:
# remove unwanted columns 
cols = ['home_id','away_id','venue_id','datetime']
nhl_formatted = nhl[cols]
nhl_formatted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1312 entries, 0 to 1311
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   home_id   1312 non-null   float64       
 1   away_id   1312 non-null   float64       
 2   venue_id  1312 non-null   float64       
 3   datetime  1312 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(3)
memory usage: 41.1 KB


# MLS

In [156]:
mls = pd.read_html('https://fbref.com/en/comps/22/schedule/Major-League-Soccer-Scores-and-Fixtures')[0]
mls.head()

Unnamed: 0,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee,Match Report,Notes
0,Sat,2023-02-25,15:30,Nashville,1.3,2–0,0.4,NYCFC,28051,Geodis Park,Armando Villarreal,Match Report,
1,Sat,2023-02-25,19:30,FC Cincinnati,1.7,2–1,1.4,Dynamo FC,25513,TQL Stadium,Chris Penso,Match Report,
2,Sat,2023-02-25,19:30,FC Dallas,0.9,0–1,0.8,Minnesota Utd,19096,Toyota Stadium,Ramy Touchan,Match Report,
3,Sat,2023-02-25,19:30,Atlanta Utd,1.9,2–1,1.2,San Jose,67538,Mercedes-Benz Stadium,Jon Freemon,Match Report,
4,Sat,2023-02-25,19:30,Philadelphia,3.2,4–1,0.6,Columbus Crew,18510,Subaru Park,Lukasz Szpala,Match Report,


In [170]:
# Drop unwanted rows and columns

# Rows
mls = mls.drop(mls[mls['Date'] == 'Date'].index).reset_index()

# Columns
mls = mls[['Date','Time','Away','Home']]
mls = mls.rename(columns={
    'Date':'date',
    'Time':'time',
    'Away':'away',
    'Home':'home'
})
mls.head()

Unnamed: 0,date,time,away,home
0,2023-02-25,15:30,NYCFC,Nashville
1,2023-02-25,19:30,Dynamo FC,FC Cincinnati
2,2023-02-25,19:30,Minnesota Utd,FC Dallas
3,2023-02-25,19:30,San Jose,Atlanta Utd
4,2023-02-25,19:30,Columbus Crew,Philadelphia


In [176]:
# Confirm all team names match with teams_mlb
teams = team_df[team_df['league_id'] == 5].reset_index()
team_nf = []

for i, row in mls.iterrows():
    home_team = row['home']
    away_team = row['away']
    
    if home_team not in list(teams['team']):
        if home_team not in team_nf:
            #print(f"{home_team} not found. Adding to tracker...")
            team_nf.append(home_team)
    
    if away_team not in list(teams['team']):
        if away_team not in team_nf:
            #print(f"{away_team} not found. Adding to tracker...")
            team_nf.append(away_team)
            
print(team_nf)

[]


In [175]:
# Update Team names to match schedule
mls_replace_dict = {
    'Nashville':'Nashville SC',
    'NYCFC':'New York City FC',
    'Dynamo FC':'Houston Dynamo FC',
    'Minnesota Utd':'Minnesota United FC',
    'Atlanta Utd':'Atlanta United FC',
    'San Jose':'San Jose Earthquakes',
    'Philadelphia':'Philadelphia Union',
    'Orlando City':'Orlando City SC',
    'NY Red Bulls':'New York Red Bulls',
    'Austin':'Austin FC',
    'St. Louis':'St. Louis City SC',
    'Charlotte':'Charlotte FC',
    'New England':'New England Revolution',
    'Vancouver':'Vancouver Whitecaps FC',
    'Inter Miami':'Inter Miami CF',
    'Seattle':'Seattle Sounders FC',
    'Sporting KC':'Sporting Kansas City',
    'Chicago Fire':'Chicago Fire FC'
}
mls = mls.replace(mls_replace_dict)

In [179]:
# Combine dates
for i, row in mls.iterrows():
    datetime_string = f"{row['date']} {row['time']}"

    datetime = dt.datetime.strptime(datetime_string, '%Y-%m-%d %H:%M')
    mls.loc[i,'datetime'] = datetime
    
mls.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      493 non-null    object        
 1   time      493 non-null    object        
 2   away      493 non-null    object        
 3   home      493 non-null    object        
 4   datetime  493 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 19.4+ KB


In [180]:
for i, row in mls.iterrows():
    home_team = teams[teams['team'] == row['home']]
    away_team = teams[teams['team'] == row['away']]
    
    mls.loc[i, 'home_id'] = int(home_team['team_id'])
    mls.loc[i, 'venue_id'] = int(home_team['venue_id'])
    mls.loc[i, 'away_id'] = int(away_team['team_id'])

mls.head()

Unnamed: 0,date,time,away,home,datetime,home_id,venue_id,away_id
0,2023-02-25,15:30,New York City FC,Nashville SC,2023-02-25 15:30:00,139.0,51.0,151.0
1,2023-02-25,19:30,Houston Dynamo FC,FC Cincinnati,2023-02-25 19:30:00,150.0,132.0,141.0
2,2023-02-25,19:30,Minnesota United FC,FC Dallas,2023-02-25 19:30:00,149.0,131.0,125.0
3,2023-02-25,19:30,San Jose Earthquakes,Atlanta United FC,2023-02-25 19:30:00,138.0,49.0,140.0
4,2023-02-25,19:30,Columbus Crew,Philadelphia Union,2023-02-25 19:30:00,148.0,130.0,136.0
