In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
from bs4 import BeautifulSoup
import urllib.request

## Import all columns from the webpage

In [3]:
def _parse_schedule_table(page, team, year):
    """Convert one schedule-and-scores page into a cleaned DataFrame.

    Parameters
    ----------
    page : file-like object or str
        HTML of the page; handed straight to BeautifulSoup.
    team : str
        Team abbreviation, stored in the ``home_team`` column.
    year : str
        Season as a string; stored as an int in the ``year`` column.

    Raises
    ------
    ValueError
        If the page has no ``<table id="team_schedule">``.
    """
    soup = BeautifulSoup(page, 'html.parser')
    table = soup.find("table", {'id': 'team_schedule'})
    if table is None:
        raise ValueError("no table with id 'team_schedule' found on page")

    # One list of <td> texts per <tr>; rows without any <td> become empty
    # lists, i.e. all-NaN rows that dropna() removes below.
    rows = [[cell.text for cell in tr.find_all('td')]
            for tr in table.find_all('tr')]
    df = pd.DataFrame(rows)

    # Positional column 1 is skipped on purpose; the rest get names.
    df = df[[0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]
    df.columns = ['date', 'team', 'home-away-indicator', 'opponent',  'win-loss-tie', 'runs',
                  'runs_allowed', 'innings', 'record', 'rank', 'gb', 'win', 'loss', 'save',
                  'time', 'day_night', 'attendance', 'cLI', 'streak']
    df = df.dropna(how='all', axis=0)
    df['year'] = int(year)
    df['home_team'] = team
    return df.drop(columns=['team'])


def extract_team_data(team_name = None, year_start = None, year_end = None):
    """Scrape baseball-reference schedule-and-scores tables into one DataFrame.

    Parameters
    ----------
    team_name : str or iterable of str
        Team abbreviation(s) used in the page URL. A single string is
        treated as one team rather than being iterated character by
        character.
    year_start, year_end : int
        First and last season to fetch (both inclusive).

    Returns
    -------
    pandas.DataFrame
        The per-team, per-season tables stacked row-wise, with the scraped
        columns (minus ``team``) plus ``year`` and ``home_team``. Each
        table keeps its own row labels, so the index is not unique. An
        empty DataFrame is returned when nothing could be scraped.

    Raises
    ------
    TypeError
        If any of the three arguments is missing.

    Notes
    -----
    A failure for one team/season (network error, missing table,
    unexpected layout) is printed as year, team, error and that season is
    skipped; scraping continues with the next one.
    """
    if team_name is None or year_start is None or year_end is None:
        raise TypeError("team_name, year_start and year_end are all required")

    teams = [team_name] if isinstance(team_name, str) else team_name
    print(teams)
    years = np.arange(year_start, year_end + 1)
    frames = []

    for team in teams:
        print("Scraping data for: ", team, "from year:", year_start, "to year:", year_end)
        for year in years:
            year = str(year)
            try:
                url = 'https://www.baseball-reference.com/teams/' + team + '/' + year + '-schedule-scores.shtml'
                # Fetch and parse inside one try so the real cause of a
                # failure is reported, and close the response when done.
                with urllib.request.urlopen(url) as page:
                    frames.append(_parse_schedule_table(page, team, year))
            except Exception as e:
                print(year)
                print(team)
                print(e)

    # pd.concat raises on an empty list, so keep the empty-frame result.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

#Adapted From:
#https://github.com/OlivierLej/DataIsTheNewOil/blob/4b8bc4d5e56a7c5442843cf2ad1d0a0971f0945e/scraping_baseballreference.py


In [4]:
def partition_home_games(data = None):
    """Return the rows of ``data`` that are home games.

    A row counts as a home game when its 'home-away-indicator' value is
    the empty string.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'home-away-indicator' column.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.

    Raises
    ------
    TypeError
        If ``data`` is not supplied.
    """
    if data is None:
        raise TypeError("data must be a DataFrame with a 'home-away-indicator' column")
    home_games = data.loc[data['home-away-indicator'] == '']
    return home_games

def partition_away_games(data=None):
    """Return the rows of ``data`` that are away games.

    A row counts as an away game when its 'home-away-indicator' value is
    '@'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'home-away-indicator' column.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.

    Raises
    ------
    TypeError
        If ``data`` is not supplied.
    """
    if data is None:
        raise TypeError("data must be a DataFrame with a 'home-away-indicator' column")
    away_games = data.loc[data['home-away-indicator'] == '@']
    return away_games

## Fundamental data cleanup on columns

In [5]:
# Scrape every season from 2015 through 2021 for all 30 clubs.
# The list order decides the row order of the combined frame.
data = extract_team_data(
    team_name=[
        'NYM', 'ATL', 'PHI', 'WSN', 'MIA',  # NL East
        'MIL', 'CIN', 'CHC', 'STL', 'PIT',  # NL Central
        'SFG', 'LAD', 'SDP', 'COL', 'ARI',  # NL West
        'BOS', 'TBR', 'NYY', 'TOR', 'BAL',  # AL East
        'CHW', 'CLE', 'DET', 'MIN', 'KCR',  # AL Central
        'HOU', 'OAK', 'SEA', 'LAA', 'TEX',  # AL West
    ],
    year_start=2015,
    year_end=2021,
)

['NYM', 'ATL', 'PHI', 'WSN', 'MIA', 'MIL', 'CIN', 'CHC', 'STL', 'PIT', 'SFG', 'LAD', 'SDP', 'COL', 'ARI', 'BOS', 'TBR', 'NYY', 'TOR', 'BAL', 'CHW', 'CLE', 'DET', 'MIN', 'KCR', 'HOU', 'OAK', 'SEA', 'LAA', 'TEX']
Scraping data for:  NYM from year: 2015 to year: 2021
Scraping data for:  ATL from year: 2015 to year: 2021
Scraping data for:  PHI from year: 2015 to year: 2021
Scraping data for:  WSN from year: 2015 to year: 2021
Scraping data for:  MIA from year: 2015 to year: 2021
Scraping data for:  MIL from year: 2015 to year: 2021
Scraping data for:  CIN from year: 2015 to year: 2021
Scraping data for:  CHC from year: 2015 to year: 2021
Scraping data for:  STL from year: 2015 to year: 2021
Scraping data for:  PIT from year: 2015 to year: 2021
Scraping data for:  SFG from year: 2015 to year: 2021
Scraping data for:  LAD from year: 2015 to year: 2021
Scraping data for:  SDP from year: 2015 to year: 2021
Scraping data for:  COL from year: 2015 to year: 2021
Scraping data for:  ARI from year

In [6]:
# Preview the first five rows of the scraped frame.
data.head(n=5)

Unnamed: 0,date,home-away-indicator,opponent,win-loss-tie,runs,runs_allowed,innings,record,rank,gb,win,loss,save,time,day_night,attendance,cLI,streak,year,home_team
1,"Monday, Apr 6",@,WSN,W,3,1,,1-0,1,Tied,Colon,Scherzer,Carlyle,2:35,D,42295,1.05,+,2015,NYM
2,"Wednesday, Apr 8",@,WSN,L,1,2,,1-1,2,1.5,Zimmermann,deGrom,Storen,2:21,N,25999,1.11,-,2015,NYM
3,"Thursday, Apr 9",@,WSN,W,6,3,,2-1,2,1.0,Harvey,Strasburg,,2:54,D,25327,1.11,+,2015,NYM
4,"Friday, Apr 10",@,ATL,L,3,5,,2-2,2,2.0,Johnson,Montero,Grilli,3:01,N,46279,1.16,-,2015,NYM
5,"Saturday, Apr 11",@,ATL,L,3,5,,2-3,3,3.0,Teheran,Gee,Johnson,2:25,N,36056,1.16,--,2015,NYM


In [7]:
# Remove the commas from the attendance strings first ...
data['attendance'] = data['attendance'].str.replace(',', '')
# ... then convert the column to a numeric dtype.
data['attendance'] = pd.to_numeric(data.attendance)

### Split out the date information into separate columns

In [8]:
# Split the date text on spaces: weekday, month, day-of-month and an
# optional fourth token (presumably a doubleheader marker such as "(1)" --
# confirm against the scraped data).
# reindex guarantees columns 0-3 exist, so the 'multi-game' assignment
# does not raise KeyError when no date in the frame has a fourth token.
date_thangs = data['date'].str.split(" ", expand=True).reindex(columns=range(4))

# The weekday token carries a trailing comma ("Monday,"); replace it and trim.
data['day'] = date_thangs[0].str.replace(",", " ").str.strip()
data['month'] = date_thangs[1]
data['num-date'] = date_thangs[2]
data['multi-game'] = date_thangs[3]

# The raw text column is no longer needed once its parts are split out.
data.drop(columns=['date'], inplace=True)

In [9]:
# Persist the cleaned frame next to the notebook for later sessions.
data.to_pickle("./mlb_data.pkl")