In [40]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
import warnings
warnings.filterwarnings("ignore")
from datetime import date
import lxml 


In [41]:
def load_data(team_1, start, end=None, team_2=None): 
    """
    Returns: a DataFrame of game results fetched from boydsworld.com
    
    The frame has the columns date, team_1, team_1_score, team_2,
    team_2_score and field, one row per game.
    
    Parameter team_1: team whose games to request
    Precondition: team_1 is a string 
    Parameter start: first year to request
    Precondition: start is an int (or anything str() renders as a year)
    Parameter end: last year to request; defaults to start (a single year)
    Parameter team_2: opponent to restrict the request to; defaults to "all"
    
    Raises: requests.HTTPError if the server answers with an error status,
    requests.Timeout if it does not answer within 30 seconds.
    
    rtype: DataFrame
    """
    if end is None: 
        end = start
    if team_2 is None: 
        team_2 = "all"
    payload = {"team1":team_1,"firstyear":str(start),"team2":team_2,"lastyear":str(end),"format":"HTML","submit":"Fetch"}
    url = "http://www.boydsworld.com/cgi/scores.pl?" 
    # requests has no default timeout; without one a stalled server would
    # block the notebook forever.
    r = requests.get(url, params=payload, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    r.raise_for_status()
    tables = pd.read_html(StringIO(r.text), parse_dates=True)
    # The scores are taken from the second table parsed from the page;
    # columns that are entirely empty are discarded before renaming.
    df = tables[1].dropna(how="all", axis=1)
    col_names = ["date", "team_1", "team_1_score", "team_2", "team_2_score", "field"]
    df.columns = col_names
    df["date"] = df["date"].astype("datetime64[ns]")
    return df

In [42]:
def enrich_data(df, team_1, team_2=None):
    """
    Returns: the games involving team_1, re-expressed from that team's
    point of view. 
    
    The columns opponent, runs_scored, runs_allowed and run_difference
    (runs_scored - runs_allowed) are appended to the original columns.
    Every row in which team_1 appears in either team column is kept,
    whatever the score, so tied games are retained. Rows where team_1
    is listed first come before rows where it is listed second.
    The input frame is not modified.
    
    Parameter df: games to enrich
    Precondition: df has the columns team_1, team_1_score, team_2 and
    team_2_score, and the score columns are numeric
    Parameter team_1: team whose perspective to take
    Parameter team_2: currently unused
    """
    # .assign builds new frames, so the boolean-mask slices are never
    # written to (no chained assignment) and df itself stays untouched.
    listed_first = df[df["team_1"] == team_1].assign(
        opponent=lambda games: games["team_2"],
        runs_scored=lambda games: games["team_1_score"],
        runs_allowed=lambda games: games["team_2_score"],
    )
    listed_second = df[df["team_2"] == team_1].assign(
        opponent=lambda games: games["team_1"],
        runs_scored=lambda games: games["team_2_score"],
        runs_allowed=lambda games: games["team_1_score"],
    )
    enriched = pd.concat([listed_first, listed_second])
    enriched["run_difference"] = enriched["runs_scored"] - enriched["runs_allowed"]
    return enriched

In [43]:
def set_dtypes(df):
    """
    Returns: df, with its run columns cast to int
    
    The columns run_difference, runs_allowed and runs_scored are
    replaced on df itself (the frame is modified in place) and the same
    frame is handed back so the function can be used with .pipe.
    
    Parameter df: frame to convert
    Precondition: df has the three run columns and their values can be
    cast to int
    """
    for run_column in ("run_difference", "runs_allowed", "runs_scored"):
        df[run_column] = df[run_column].astype(int)
    return df

In [44]:
def get_games(team_1, start, end=None, team_2=None):
    """
    Returns: a dataframe of all games played for a given team inclusive of given start & end year
    Data from boydsworld.com
    
    The result is sorted by date (oldest first) and keeps the columns
    date, field, opponent, runs_scored, runs_allowed and run_difference.
    
    Parameter team_1: team whose games to select 
    Precondition: team_1 is a string naming the team (e.g. "Brown")
    Parameter start: the start year of games
    Precondition: start is an int >= 1992
    Parameter end: the end year of games. To select only games from one
    year, leave it as None and only start is used.
    Precondition: end is None or an int <= 2020
    Parameter team_2: opponent to restrict the download to; None selects
    games against every opponent
    
    Note: boydsworld sometimes struggles with single year inquiries 
    """
    downloaded = load_data(team_1, start, end, team_2)
    from_team_view = enrich_data(downloaded, team_1)
    typed = set_dtypes(from_team_view)
    # The raw team/score columns are superseded by opponent, runs_scored
    # and runs_allowed.
    superseded = ["team_1", "team_1_score", "team_2", "team_2_score"]
    return typed.drop(columns=superseded).sort_values(by="date")

In [50]:
# One frame of Brown games per season, 2010 through 2020, in year order.
dfs = [get_games("Brown", season) for season in range(2010, 2021)]

In [60]:
# An empty subscript is a SyntaxError; show the last season in dfs (2020).
dfs[-1]

Unnamed: 0,date,field,opponent,runs_scored,runs_allowed,run_difference
0,2020-02-21,@neutral,Presbyterian,3,6,-3
1,2020-02-22,@Wofford,Wofford,3,2,1
2,2020-02-22,@Wofford,Wofford,3,12,-9
3,2020-02-23,@South Carolina-Upstate,South Carolina-Upstate,5,10,-5
4,2020-02-28,@neutral,Sacred Heart,1,2,-1
5,2020-02-29,@Wake Forest,Wake Forest,0,10,-10
6,2020-02-29,@Wake Forest,Wake Forest,4,7,-3
7,2020-03-01,@neutral,LaSalle,4,3,1
8,2020-03-06,@Kennesaw State,Kennesaw State,19,5,14
9,2020-03-07,@Kennesaw State,Kennesaw State,0,2,-2
