In [None]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
import warnings
# warnings.filterwarnings("ignore")
from datetime import date
import lxml 

In [None]:
def load_data(team_1,
              start,
              end=None,
              team_2="all",
              url="http://www.boydsworld.com/cgi/scores.pl?",
              col_names=["date", "team_1", "team_1_score", "team_2", "team_2_score", "field"],
              parse_dates=True,
              timeout=30
             ):
    """
    Download the score listing for a team and return it as a DataFrame.

    Sends a GET request to `url` with the query built from the arguments,
    parses the HTML tables in the response and returns the table at index 1
    with its all-empty columns dropped and its columns renamed to `col_names`.

    Parameter team_1: team name sent as the "team1" query field
    Parameter start: first year of the query (sent as "firstyear")
    Parameter end: last year of the query (sent as "lastyear"); defaults to `start`
    Parameter team_2: opponent filter sent as "team2" ("all" by default)
    Parameter url: endpoint to query
    Parameter col_names: column labels applied to the parsed table; the count
        must match the number of columns left after empty ones are dropped
    Parameter parse_dates: forwarded to pandas.read_html; when truthy the
        "date" column is also cast to datetime64[ns]
    Parameter timeout: seconds to wait for the server before giving up

    Returns: DataFrame with one row per game and the columns in `col_names`

    Raises: requests.HTTPError if the server answers with an error status
    Raises: IndexError if the response holds fewer than two HTML tables
    """
    if end is None:
        end = start
    # build payload
    payload = {"team1":team_1,"firstyear":str(start),"team2":team_2,"lastyear":str(end),"format":"HTML","submit":"Fetch"}
    # send GET request; the timeout keeps a stalled server from hanging the kernel
    response = requests.get(url, params=payload, timeout=timeout)
    # fail here with a clear HTTP error instead of trying to parse an error page
    response.raise_for_status()
    # parse HTML into DataFrames; read_html expects a file-like object, not a raw HTML string
    tables = pd.read_html(StringIO(response.text), parse_dates=parse_dates)
    if len(tables) < 2:
        raise IndexError(
            "expected at least 2 HTML tables in the response, found {}".format(len(tables))
        )
    # the code has always taken the table at index 1 as the results table
    df = tables[1].dropna(how="all", axis=1)
    # reset column names
    df.columns = list(col_names)
    if parse_dates:
        # make sure dates are parsed as type datetime64[ns]
        df["date"] = df["date"].astype("datetime64[ns]")
    return df

In [None]:
def enrich_data(df, team_1):
    """
    Re-express a two-sided score table from the point of view of `team_1`.

    Only rows where team_1_score is strictly greater than team_2_score are
    kept, so rows with equal scores are dropped. A kept row counts as a win
    when `team_1` is in the "team_1" column and as a loss when it is in the
    "team_2" column.
    NOTE(review): this presumably relies on the data source listing the
    winning side first -- confirm against the upstream format.

    Parameter df: DataFrame with columns "team_1", "team_1_score", "team_2"
        and "team_2_score"; it is not modified
    Parameter team_1: name of the team the result is oriented around

    Returns: a new DataFrame (wins first, then losses, original index labels
    kept) with the input columns plus "opponent", "runs_scored",
    "runs_allowed" and "run_difference" (= runs_scored - runs_allowed)
    """
    first_side_ahead = df["team_1_score"] > df["team_2_score"]

    # .assign builds a new frame, so no column is ever set on a slice of `df`
    # (the original pattern triggered SettingWithCopyWarning).
    wins = df.loc[(df["team_1"] == team_1) & first_side_ahead].assign(
        opponent=lambda games: games["team_2"],
        runs_scored=lambda games: games["team_1_score"],
        runs_allowed=lambda games: games["team_2_score"],
    )
    losses = df.loc[(df["team_2"] == team_1) & first_side_ahead].assign(
        opponent=lambda games: games["team_1"],
        runs_scored=lambda games: games["team_2_score"],
        runs_allowed=lambda games: games["team_1_score"],
    )

    combined = pd.concat([wins, losses])
    combined["run_difference"] = combined["runs_scored"] - combined["runs_allowed"]
    return combined

In [None]:
def set_dtypes(df):
    """
    Cast the run-count columns of `df` to int.

    The columns "run_difference", "runs_allowed" and "runs_scored" are
    converted one after another, in that order, directly on the frame that
    was passed in.

    Parameter df: DataFrame containing the three run-count columns

    Returns: the same DataFrame object, now with integer run columns
    """
    for run_column in ("run_difference", "runs_allowed", "runs_scored"):
        df[run_column] = df[run_column].astype(int)
    return df

In [None]:
def get_games(team_1, start, end=None, team_2="all"):
    """
    Returns: a dataframe of the games found for a team between the start and
    end year (both included), sorted by date in ascending order.
    Data from boydsworld.com

    The result is oriented around `team_1`: the raw "team_1", "team_1_score",
    "team_2" and "team_2_score" columns are dropped after the opponent and
    run columns have been derived from them.

    Parameter team_1: team whose games to select
    Precondition: team_1 is a string naming the team (the original note asked
    for lowercase; nothing here enforces it)
    Parameter start: the start year of games
    Precondition: start is an int >= 1992
    Parameter end: the end year of games. To select only games from one year,
    leave it as None and `start` is used for both
    Precondition: end is None or an int <= 2020
    Parameter team_2: opponent to restrict the query to; "all" by default
    """
    raw_games = load_data(team_1, start, end, team_2)
    team_games = set_dtypes(enrich_data(raw_games, team_1))
    trimmed_games = team_games.drop(
        columns=["team_1", "team_1_score", "team_2", "team_2_score"]
    )
    # boydsworld sometimes struggles with single year inquiries
    return trimmed_games.sort_values(by="date", axis=0, ascending=True)

In [None]:
# Example: games for Brown in the single year 2016 (end is left to default to start)
get_games(team_1="Brown", start=2016)

In [None]:
# Scratch cell removed: `dfs[]` was a syntax error, and `dfs` only exists as a
# local variable inside load_data, so this cell could never run on a fresh kernel.