In [621]:
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, NavigableString, Tag
import urllib.request
import requests
import pandas as pd
import warnings; warnings.simplefilter('ignore')

In [622]:
# Download the 2019 MLB schedule/results page and parse it into a soup tree.
SCHEDULE_URL = 'https://www.baseball-reference.com/leagues/MLB/2019-schedule.shtml'
# A timeout keeps the cell from hanging indefinitely if the site does not answer.
response = requests.get(SCHEDULE_URL, timeout=30)
# Fail loudly on an HTTP error status instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text,'html.parser')

In [623]:
# Every <h3> heading on the schedule page; the scraping loop below reads each
# heading's text as a date (it re-queries the headings itself).
alldates = soup.find_all(name='h3')

In [624]:
# Empty results table; its columns are filled from the scraped lists later on.
data = pd.DataFrame(
    columns=[
        'Date',
        'VisitingTeam',
        'VisitingScore',
        'HomeTeam',
        'HomeScore',
    ],
)

In [625]:
### Scrape game results for all MLB games in 2019
##
## Each <h3> heading in the selected slice is read as a date. Every tag that
## follows the heading (up to the next <h2>) is read as one game, with the
## visiting team, visiting score, home team and home score taken from lines
## 1, 2, 4 and 5 of the tag's text.

datelist=[]
VisitingTeamList = []
VisitingTeamScoreList = []
HomeTeamList = []
HomeScoreList = []
flag = 0 ## becomes 1 once the "Today's Games" heading stops the scrape

for header in soup.find_all('h3')[8:190]: ## Games start at 8

    if header.text == 'Today\'s Games': ## Stop scraping when getting to Today's date
        flag += 1
        break

    for nextNode in header.next_siblings:
        # Whitespace/text nodes between tags carry no game data.
        if not isinstance(nextNode, Tag):
            continue
        if nextNode.name == "h2":
            break

        # The date is recorded for every tag, including tags that turn out not
        # to hold a game; the following cell drops the surplus entry per date.
        datelist.append(header.text) ## Append date

        lines = nextNode.get_text().splitlines()
        # Append the four game fields together or not at all, so the per-game
        # lists always stay the same length as each other.
        if len(lines) < 6:
            continue
        VisitingTeamList.append(lines[1])
        VisitingTeamScoreList.append(lines[2])
        HomeTeamList.append(lines[4])
        HomeScoreList.append(lines[5])
        
            
 

In [626]:
## Above scraping gets one instance of the date in excess, remove it.
## The trim only runs while there are more dates than scraped games, so
## re-running this cell does not strip a further entry per date.
if len(datelist) > len(VisitingTeamList):
    for x in set(datelist):
        datelist.remove(x)  # removes the first occurrence of each distinct date
data['Date'] = datelist
data['VisitingTeam'] = pd.Series(VisitingTeamList)
data['VisitingScore'] = pd.Series(VisitingTeamScoreList)
data['HomeTeam'] = pd.Series(HomeTeamList)
data['HomeScore'] = pd.Series(HomeScoreList)

In [627]:
## Remove surrounding spaces and parentheses from the scraped score text.
## The vectorised .str accessor leaves missing values as NaN instead of
## raising AttributeError the way calling .strip() on a float NaN would.
for score_col in ('VisitingScore', 'HomeScore'):
    data[score_col] = data[score_col].str.strip(' ()')
## Parse the heading text collected as the date into real timestamps.
data['Date'] = pd.to_datetime(data['Date'])

In [628]:
## Flag the winner of each game: HomeWin is 1 when the home team scored more,
## 0 when the visitors did, and stays missing otherwise.
## The scores are still text at this point, so compare them as numbers;
## comparing the strings directly would rank '10' below '9'.
home_runs = pd.to_numeric(data['HomeScore'], errors='coerce')
visiting_runs = pd.to_numeric(data['VisitingScore'], errors='coerce')
data.loc[home_runs > visiting_runs,'HomeWin'] = 1
data.loc[home_runs < visiting_runs,'HomeWin'] = 0

In [629]:
# Trim stray whitespace so team names compare equal to the dictionary keys.
for side in ('VisitingTeam', 'HomeTeam'):
    data[side] = data[side].str.strip()

# Full team name -> short label used as the prefix of that team's columns.
intteamdict = {
    'Toronto Blue Jays': 'Toronto',
    'New York Yankees': 'Yankees',
    'Boston Red Sox': 'Redsox',
    'Baltimore Orioles': 'Orioles',
}

def WinColumn(team):
    """Add a '<label>Win' column for `team` to the module-level `data` frame.

    `team` is a full team name that must be a key of `intteamdict`; the
    column label is the mapped short name plus 'Win'. The column holds 1
    where the team won, -1 where it lost, and "" on every other row (games
    the team was not part of, or games whose HomeWin value is neither 0 nor 1).
    Works by mutating `data` in place and returns nothing.
    """
    result_col = intteamdict[team] + 'Win'
    data[result_col] = ""

    was_home = data['HomeTeam'] == team
    was_visitor = data['VisitingTeam'] == team
    home_side_won = data['HomeWin'] == 1
    home_side_lost = data['HomeWin'] == 0

    # Home games follow HomeWin directly; road games invert it.
    data.loc[was_home & home_side_won, result_col] = 1
    data.loc[was_home & home_side_lost, result_col] = -1
    data.loc[was_visitor & home_side_lost, result_col] = 1
    data.loc[was_visitor & home_side_won, result_col] = -1
    
# Build the win/loss column for each of the four tracked teams.
for full_name in (
    'Toronto Blue Jays',
    'New York Yankees',
    'Boston Red Sox',
    'Baltimore Orioles',
):
    WinColumn(full_name)


In [630]:
### Create team specific df
## A team's '<Team>Win' cell is "" unless that team played the game and the
## result is known, so the non-empty rows are that team's games.
## .copy() makes each frame independent of `data`, so the columns added below
## are not written onto a slice of the original frame.
Toronto_df = data.loc[data['TorontoWin'] != ""].copy()
Orioles_df = data.loc[data['OriolesWin'] != ""].copy()
Yankees_df = data.loc[data['YankeesWin'] != ""].copy()
Redsox_df = data.loc[data['RedsoxWin'] != ""].copy()

## Number each team's games 1..N in row order. The frame names must match the
## capitalised variables defined above (lower-case spellings are undefined on
## a fresh kernel).
Toronto_df['GameNumber'] = list(range(1,Toronto_df.shape[0]+1))
Orioles_df['GameNumber'] = list(range(1,Orioles_df.shape[0]+1))
Yankees_df['GameNumber'] = list(range(1,Yankees_df.shape[0]+1))
Redsox_df['GameNumber'] = list(range(1,Redsox_df.shape[0]+1))

In [631]:
## Create WinLoss and WinLoss10 columns

def WinLossColumn(team_df,team):
    """Add running win/loss totals to `team_df` in place.

    Two columns are written: 'WinLossRatio' (cumulative sum of the
    '<team>Win' column) and 'WinLossRatioLast10' (rolling 10-row sum of the
    same column). Both are shifted by one game, so the row with GameNumber 1
    gets 0 and each later row gets the total over the rows before it.
    For every team other than 'Toronto' the column names are prefixed with
    `team`.

    Assumes GameNumber runs 1..len(team_df) in row order, as assigned when
    the team frames are built.
    """
    # Toronto's columns are unprefixed; every other team's carry its label.
    prefix = '' if team == 'Toronto' else team
    overall_col = prefix + 'WinLossRatio'
    recent_col = prefix + 'WinLossRatioLast10'

    results = team_df[team + 'Win']
    first_game = team_df['GameNumber'] == 1
    later_games = team_df['GameNumber'].isin(list(range(2, team_df.shape[0]+1)))

    # Nothing has been played before the first game.
    team_df.loc[first_game, overall_col] = 0
    team_df.loc[first_game, recent_col] = 0

    # Dropping the final total and writing from game 2 onward shifts the
    # series by one row, so each game only sees results that preceded it.
    team_df.loc[later_games, overall_col] = results.cumsum()[:-1].values
    team_df.loc[later_games, recent_col] = results.rolling(min_periods=1, window=10).sum()[:-1].values
    

# Add the running totals to each team's frame (each frame is modified in place).
for frame, label in (
    (Toronto_df, 'Toronto'),
    (Orioles_df, 'Orioles'),
    (Redsox_df, 'Redsox'),
    (Yankees_df, 'Yankees'),
):
    WinLossColumn(frame, label)

In [632]:
## Set datetime so you are able to merge on it
def setdatetime(df):
    """Convert `df`'s 'Date' column to datetimes and make it the index, in place.

    Returns nothing. Calling it again on a frame whose index is already named
    'Date' just re-coerces that index, so re-running the calling cell does not
    fail with a KeyError on the now-missing 'Date' column.
    """
    if 'Date' not in df.columns and df.index.name == 'Date':
        # Already indexed by date from an earlier call.
        df.index = pd.to_datetime(df.index, format='%Y-%m-%d').rename('Date')
        return
    df['Date'] = pd.to_datetime(df['Date'],format= '%Y-%m-%d')
    df.set_index('Date',inplace=True,drop=True)

# Index every team frame by date so they can be aligned on it later.
for frame in (Toronto_df, Yankees_df, Orioles_df, Redsox_df):
    setdatetime(frame)


In [633]:
## For the other teams only the two running-total columns are kept; the date
## travels along as the index.
Redsox_df = Redsox_df.loc[:, ['RedsoxWinLossRatio','RedsoxWinLossRatioLast10']]
Orioles_df = Orioles_df.loc[:, ['OriolesWinLossRatio','OriolesWinLossRatioLast10']]
Yankees_df = Yankees_df.loc[:, ['YankeesWinLossRatio','YankeesWinLossRatioLast10']]

In [634]:
## Merge on date.
## merge_asof with its default settings attaches, to each Toronto row, the
## rival's latest row whose date is on or before that Toronto date.
for rival_df in (Redsox_df, Orioles_df, Yankees_df):
    Toronto_df = pd.merge_asof(
        Toronto_df,
        rival_df,
        left_index=True,
        right_index=True,
    )


In [635]:
## Create GamesBack column: Toronto's running total minus the highest running
## total among the three rivals on the same row (negative when a rival is ahead).
## NOTE(review): this is a difference of win-minus-loss totals over these three
## rivals only - confirm that is the intended definition of "games back".
rival_totals = Toronto_df[['YankeesWinLossRatio','OriolesWinLossRatio','RedsoxWinLossRatio']]
best_rival_total = rival_totals.max(axis=1)
Toronto_df['GamesBack'] = Toronto_df['WinLossRatio'] - best_rival_total

In [636]:
## Use same team names as historical data
## Maps the full team name found in the scraped schedule to a three-letter code.
teamdict = {
"Arizona DBacks":"ARI",
# The schedule page appears to spell this club with an apostrophe; accept
# both spellings so its games are not left unmapped. TODO confirm spelling.
"Arizona D'Backs":"ARI",
"Atlanta Braves":"ATL",
"Baltimore Orioles":"BAL",
"Boston Red Sox":"BOS",
"Chicago Cubs":"CHN",
"Chicago White Sox":"CHA",
"Cincinnati Reds":"CIN",
"Cleveland Indians":"CLE",
"Colorado Rockies":"COL",
"Detroit Tigers":"DET",
"Houston Astros":"HOU",
"Kansas City Royals":"KCA",
"Los Angeles Angels":"LAA",
"Los Angeles Dodgers":"LAN",
"Miami Marlins":"FLO",
"Milwaukee Brewers":"MIL",
"Minnesota Twins":"MIN",
"New York Mets":"NYN",
"New York Yankees":"NYA",
"Oakland Athletics":"OAK",
"Philadelphia Phillies":"PHI",
"Pittsburgh Pirates":"PIT",
"San Diego Padres":"SDN",
"San Francisco Giants":"SFN",
"Seattle Mariners":"SEA",
"St. Louis Cardinals":"SLN",
"Tampa Bay Rays":"TBA",
"Texas Rangers":"TEX",
"Toronto Blue Jays":"TOR",
"Washington Nationals":"WAS",
}

# Replace full team names with their codes. Values that are not keys of
# teamdict (already-converted codes, unexpected spellings, missing values) are
# left as they are, so re-running this cell does not blank the columns and
# unmapped names stay visible instead of silently becoming NaN.
for side in ('HomeTeam', 'VisitingTeam'):
    Toronto_df[side] = Toronto_df[side].map(lambda name: teamdict.get(name, name))

In [637]:
# Keep only the columns that go into the exported file.
Toronto_df = Toronto_df.loc[:, [
    'WinLossRatio',
    'WinLossRatioLast10',
    'RedsoxWinLossRatio',
    'OriolesWinLossRatio',
    'YankeesWinLossRatio',
    'GamesBack',
    'VisitingTeam',
]]

In [638]:
# Write the Toronto table to a CSV file in the working directory.
Toronto_df.to_csv(path_or_buf='Toronto2019WinLoss.csv')