In [17]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [130]:
# FUNCTION FROM boydsworld_scraper notebook. 
# TODO: boydsworld_scraper into module, properly import into this notebook
# Imports
import requests
from io import StringIO
from datetime import date
import lxml 

In [103]:
def load_data(team_1, start, end=None, team_2=None): 
    """
    rtype: DataFrame
    """
    if end is None: 
        end = start
    if team_2 is None: 
        team_2 = "all"
    payload = {"team1":team_1,"firstyear":str(start),"team2":team_2,"lastyear":str(end),"format":"HTML","submit":"Fetch"}
    url = "http://www.boydsworld.com/cgi/scores.pl?" 
    s = requests.Session()
    r = requests.get(url, params=payload)
    response = r.text 
    dfs = pd.read_html(StringIO(response), parse_dates=True)
    df = dfs[1].dropna(how="all", axis=1)
    col_names = ["date", "team_1", "team_1_score", "team_2", "team_2_score", "field"]
    df.columns = col_names
    df["date"] = df["date"].astype("datetime64[ns]")
    return df

In [104]:
def enrich_data(df, team_1, team_2=None):
    """
    """
    wins = df[(df["team_1"] == team_1) & (df["team_1_score"] > df["team_2_score"])]
    losses = df[(df["team_2"] == team_1) & (df["team_1_score"] > df["team_2_score"])]
    wins["opponent"] = wins["team_2"]
    losses["opponent"] = losses["team_1"]
    wins["runs_scored"] = wins["team_1_score"]
    wins["runs_allowed"] = wins["team_2_score"]
    losses["runs_scored"] = losses["team_2_score"]
    losses["runs_allowed"] = losses["team_1_score"]       
    df = pd.concat([wins,losses])
    df["run_difference"] = df["runs_scored"] - df["runs_allowed"]
    return df 

In [105]:
def set_dtypes(df):
    """
    """
    df["run_difference"] = df["run_difference"].astype(int)
    df["runs_allowed"] = df["runs_allowed"].astype(int)
    df["runs_scored"] = df["runs_scored"].astype(int)
    return df

In [106]:
def get_games(team_1, start, end=None, team_2=None):
    """
    Returns: a dataframe of all games played for a given team inclusive of given start & end year
    Data from boydsworld.com
    
    Parameter team_name: team whose games to select 
    Precondition: team_name is a lowercase string
    Parameter start: the start year of games. To select only games from one year, leave  
    Precondition: start is an int >= 1992
    Parameter end: the end year of games
    Precondition: end is an int <= 2020
    """
    df = (load_data(team_1, start, end, team_2)
#             .pipe(handle_errors())
            .pipe(enrich_data, team_1)
            .pipe(set_dtypes)
            .drop(columns=["team_1","team_1_score","team_2","team_2_score"])
            .sort_values(by="date",axis=0, ascending=True)
         )
    # boydsworld sometimes struggles with single year inquiries 
    return df

In [115]:
def calculate_actual_win_pct(team_name=None, games=None, start=None, end=None):
    """
    Returns: The actual (i.e. experimental) winning percentage of a given team over given games. 

    actual winning percentage = # game won / # games plated
   
    Parameter team_name: team to return actual winning % for 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: games over which to calculate
    Precondition: games is a DataFrame returned by getGames() function
    Parameter start: the start year of games
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int
    """
    if games is None: 
            games = get_games(team_name, start, end)     
    actual_win_pct = len(games[games.run_difference > 0]) / len(games)
    return actual_win_pct

In [116]:
def calculate_pythagenpat_win_pct(team_name=None, games=None, start=None, end=None):
    """
    Returns: The PythagenPat winning percentage expectation of a given team over given games. 

    W% = R^x/(R^x + RA^x)
    where x = (RPG)^.287
    Developed by David Smyth and Patriot

    Parameter team_name: team to return expected winning % for 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: games over which to calculate
    Precondition: games is a DataFrame returned by getGames() function
    Parameter start: the start year of games
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int
    """
    if games is None: 
        games = get_games(team_name, start, end)
    runs_scored_total = games.runs_scored.sum()
    runs_allowed_total = games.runs_allowed.sum()
    games_played_count = len(games) 
    if len(games) == 0: 
        runs_per_game = 0 
    else: 
        runs_per_game = runs_scored_total / games_played_count
        x = runs_per_game ** 0.287
    expected_win_pct = (runs_scored_total**x)/((runs_scored_total**x)+(runs_allowed_total**x))
    return expected_win_pct

In [227]:
# ACTUAL VS EXPECTED WINNING PCTS FOR IVY LEAGUE
season_results = []
teams = ["Brown", "Columbia", "Cornell", "Dartmouth", "Harvard", "Pennsylvania", "Princeton", "Yale"]
for team_name in teams:
    for i in range(2010,2021):
        games = get_games(team_1=team_name, start=i)
        games[games["opponent"].isin(teams)]
        result = [team_name, i, calculate_actual_win_pct(games=games), calculate_pythagenpat_win_pct(games=games)]
        season_results.append(result)
res = pd.DataFrame(season_results)
res.columns = ["team", "season", "actual_win_pct", "expected_win_pct"]
res

Unnamed: 0,team,season,actual_win_pct,expected_win_pct
0,Brown,2010,0.295455,0.371905
1,Brown,2011,0.309524,0.304792
2,Brown,2012,0.204545,0.335728
3,Brown,2013,0.175000,0.275952
4,Brown,2014,0.351351,0.401809
...,...,...,...,...
83,Yale,2016,0.404255,0.383164
84,Yale,2017,0.653846,0.569542
85,Yale,2018,0.511628,0.559347
86,Yale,2019,0.439024,0.493352


In [215]:
# ACTUAL VS EXPECTED WINNING PCTS FOR IVY LEAGUE
season_results = []
teams = ["Brown", "Columbia", "Cornell", "Dartmouth", "Harvard", "Pennsylvania", "Princeton", "Yale"]
for team_name in teams:
    for i in range(2010,2021):
        games = get_games(team_1=team_name, start=i)
        result = [team_name, i, calculate_actual_win_pct(games=games), calculate_pythagenpat_win_pct(games=games)]
        season_results.append(result)
res = pd.DataFrame(season_results)
res.columns = ["team", "season", "actual_win_pct", "expected_win_pct"]
res

Unnamed: 0,team,season,actual_win_pct,expected_win_pct
0,Brown,2010,0.295455,0.371905
1,Brown,2011,0.309524,0.304792
2,Brown,2012,0.204545,0.335728
3,Brown,2013,0.175000,0.275952
4,Brown,2014,0.351351,0.401809
...,...,...,...,...
83,Yale,2016,0.404255,0.383164
84,Yale,2017,0.653846,0.569542
85,Yale,2018,0.511628,0.559347
86,Yale,2019,0.439024,0.493352


In [179]:
# CONSTRUCT ROSTER
df = pd.read_excel("data/cornell/cornell_batting_individual_season_totals_2015_to_2020.xlsx")
unique = df.drop_duplicates("player_id")
roster = unique[["player_id", "player_name"]]
roster.to_csv("data/cornell/cornell_roster_2015_to_2020.csv", index=False)
pd.read_csv("data/cornell/cornell_roster_2015_to_2020.csv")

Unnamed: 0,player_id,player_name
0,1547004,"Scorza, Spencer"
1,1324528,"Tatum, Kevin"
2,1547003,"Karl, Ryan"
3,1439156,"Morris, Dan"
4,1546995,"Padulo, Frankie"
...,...,...
81,2347203,"Ross, Ryan"
82,2347220,"Alonso, Franco"
83,2347219,"Kaplan, Sam"
84,2111747,"Rodriguez, Diobel"


In [228]:
pd.read_csv("data/cornell/cornell_roster_2015_to_2020.csv").

Unnamed: 0,player_id,player_name
0,1547004,"Scorza, Spencer"
1,1324528,"Tatum, Kevin"
2,1547003,"Karl, Ryan"
3,1439156,"Morris, Dan"
4,1546995,"Padulo, Frankie"
...,...,...
81,2347203,"Ross, Ryan"
82,2347220,"Alonso, Franco"
83,2347219,"Kaplan, Sam"
84,2111747,"Rodriguez, Diobel"


In [244]:
def calculate_woba(player_id, season):
    """
    Returns: The Weighted On-Base Average for a given player in a given season
    Data from NCAA

    wOBA = (wBB×uBB + wHBP×HBP + w1B×1B + w2B×2B + w3B×3B +
    wHR×HR) / (AB + BB – IBB + SF + HBP)
    PA = AB + BB - IBB + SF + HBP 

    Parameter player_id: The NCAA ID of player to return for
    Precondition: player_id is an int
    Parameter year: The season to return for 
    Precondition: year is an int, 2015 <= year <= 2020
    """
    linear_weights = pd.read_csv("data/ncaa_d1_woba_linear_weights.csv")
    season_weights = linear_weights[linear_weights.Season==season]
    wbb = season_weights["wBB"].values[0]
    whbp = season_weights["wHBP"].values[0]
    w1b = season_weights["w1B"].values[0]
    w2b = season_weights["w2B"].values[0]
    w3b = season_weights["w3B"].values[0]
    whr = season_weights["wHR"].values[0]

    player_batting = (pd.read_excel("data/cornell/cornell_batting_individual_season_totals_2015_to_2020.xlsx")
                      .query(f"""player_id=={player_id}""")
                      .query(f"""season=={season}""")
                      .fillna(0)
                     )
    if len(player_batting) > 0: 
        walks = player_batting["BB"].values[0]
        hits_by_pitch = player_batting["HBP"].values[0]
        doubles = player_batting["2B"].values[0]
        triples = player_batting["3B"].values[0]
        home_runs = player_batting["HR"].values[0]
        hits =  player_batting["H"].values[0]
        singles = hits-(doubles+triples+home_runs)
        at_bats = player_batting["AB"].values[0]
        sac_flies = player_batting["SF"].values[0]
        sac_bunts = player_batting["SH"].values[0]

        plate_appearances = at_bats+walks+sac_flies+sac_bunts+hits_by_pitch
        if plate_appearances <= 0:
            res = 0 
        else: 
            res = (wbb*walks+whbp*hits_by_pitch+w1b*singles+w2b*doubles+w3b*triples+whr*home_runs)/plate_appearances
    else: 
        res = f"""No batting records found for player_id {player_id} within given range"""
    return res

In [245]:
calculate_woba(2347219, 2020)

0.12259846153846153

In [250]:
def calculate_wraa(player_id, season):
    """
    Returns: The Weighted Runs Above Average (wRAA) for a given player in a given season
    Data from NCAA

    [(wOBA−leagueWOBA) / wOBAscale] ∗ PA
    PA = AB + BB - IBB + SF + HBP 

    Parameter player_id: The NCAA ID of player to return for
    Precondtion: player_id is an int
    Parameter year: The season to return wRC for 
    Precondition: year is an int
    """
    player_batting = (pd.read_excel("data/cornell/cornell_batting_individual_season_totals_2015_to_2020.xlsx")
                          .query(f"""player_id=={player_id}""")
                          .query(f"""season=={season}""")
                          .fillna(0)
                      )
    if len(player_batting) > 0: 
        at_bats = player_batting["AB"].values[0]
        walks = player_batting["BB"].values[0]
        sac_flies = player_batting["SF"].values[0]
        sac_bunts = player_batting["SH"].values[0]
        hits_by_pitch = player_batting["HBP"].values[0]

        linear_weights = pd.read_csv("data/ncaa_d1_woba_linear_weights.csv")
        season_weights = linear_weights[linear_weights.Season == season]
        league_woba = season_weights["wOBA"].values[0]
        woba_scale = season_weights["wOBAScale"].values[0]
        league_runs_per_pa = season_weights["R/PA"].values[0]

        player_woba = calculate_woba(player_id, season)
        plate_appearances = at_bats+walks+sac_flies+sac_bunts+hits_by_pitch

        if plate_appearances < 0: 
            res = 0
        else: 
            res = ((((player_woba-league_woba)/woba_scale)))*(plate_appearances)
    else: 
        res = f"""No batting records found for player_id {player_id} within given range"""
    return res

In [251]:
calculate_wraa(2347219, 2020)

-2.652255707762557

In [262]:
def calculate_wrc(player_id, season):
    """
    Returns: The Weighted Runs Created for a given player in a given season
    Data from NCAA

    wRC = [((wOBA - lgwOBA) / wOBAScale) + (lgR / PA))] * PA
    PA = AB + BB - IBB + SF + HBP 
    
    wRC = (((wOBA-League wOBA)/wOBA Scale)+(League R/PA))*PA


    Parameter player_id: The ID of player to return for
    Precondition: player_id is an int
    Parameter year: The season to return for 
    Precondition: year is an int 
    """
    player_batting = (pd.read_excel("data/cornell/cornell_batting_individual_season_totals_2015_to_2020.xlsx")
                          .query(f"""player_id=={player_id}""")
                          .query(f"""season=={season}""")
                          .fillna(0)
                      )
    if len(player_batting) > 0: 
        at_bats = player_batting["AB"].values[0]
        walks = player_batting["BB"].values[0]
        sac_flies = player_batting["SF"].values[0]
        sac_bunts = player_batting["SH"].values[0]
        hits_by_pitch = player_batting["HBP"].values[0]

        plate_appearances = at_bats+walks+sac_flies+sac_bunts+hits_by_pitch
        if plate_appearances <= 0:
            res = 0 
        else:     
            linear_weights = pd.read_csv("data/ncaa_d1_woba_linear_weights.csv")
            season_weights = linear_weights[linear_weights.Season == season]
            league_woba = season_weights["wOBA"].values[0]
            woba_scale = season_weights["wOBAScale"].values[0]
            league_runs_per_pa = season_weights["R/PA"].values[0]

            player_woba = calculate_woba(player_id, season)   
            res = ((((player_woba-league_woba)/woba_scale)+league_runs_per_pa))*plate_appearances
    else:
        res = f"""No batting records found for player_id {player_id} within given range"""
    return res 

In [264]:
calculate_wrc(2347219, 2020)

-0.8452557077625568

In [282]:
def calculate_lw_metrics(player_id, season):
    """
    returns: woba, wraa, wrc for given player
    """
    player_batting = (pd.read_excel("data/cornell/cornell_batting_individual_season_totals_2015_to_2020.xlsx")
                          .query(f"""player_id=={player_id}""")
                          .query(f"""season=={season}""")
                          .fillna(0)
                         )
    assert len(player_batting) > 0, f"""no records found for {player_id} in given range"""  
    walks = player_batting["BB"].values[0]
    hits_by_pitch = player_batting["HBP"].values[0]
    doubles = player_batting["2B"].values[0]
    triples = player_batting["3B"].values[0]
    home_runs = player_batting["HR"].values[0]
    hits =  player_batting["H"].values[0]
    singles = hits-(doubles+triples+home_runs)
    at_bats = player_batting["AB"].values[0]
    sac_flies = player_batting["SF"].values[0]
    sac_bunts = player_batting["SH"].values[0]

    plate_appearances = at_bats+walks+sac_flies+sac_bunts+hits_by_pitch

    if plate_appearances > 0:
        linear_weights = pd.read_csv("data/ncaa_d1_woba_linear_weights.csv")
        season_weights = linear_weights[linear_weights.Season==season]
        wbb = season_weights["wBB"].values[0]
        whbp = season_weights["wHBP"].values[0]
        w1b = season_weights["w1B"].values[0]
        w2b = season_weights["w2B"].values[0]
        w3b = season_weights["w3B"].values[0]
        whr = season_weights["wHR"].values[0]
        league_woba = season_weights["wOBA"].values[0]
        woba_scale = season_weights["wOBAScale"].values[0]
        league_runs_per_pa = season_weights["R/PA"].values[0]

        woba = (wbb*walks+whbp*hits_by_pitch+w1b*singles+w2b*doubles+w3b*triples+whr*home_runs)/plate_appearances
        wraa = ((((woba-league_woba)/woba_scale)))*(plate_appearances)
        wrc = ((((woba-league_woba)/woba_scale)+league_runs_per_pa))*plate_appearances
    else: 
        woba = 0 
        wraa = 0
        wrc = 0

    return {"wOBA":round(woba, 3), "wRAA":round(wraa, 3), "wRC":round(wrc, 3)}

In [283]:
calculate_lw_metrics(2347219, 2020)

{'wOBA': 0.123, 'wRAA': -2.652, 'wRC': -0.845}

In [12]:
def get_cornell_woba(year):
    """
    Returns: The Weighted On-Base Average  for a given player in a given season
    
    wOBA = (wBB×uBB + wHBP×HBP + w1B×1B + w2B×2B + w3B×3B +
            wHR×HR) / (AB + BB – IBB + SF + HBP))
    PA = AB + BB - IBB + SF + HBP 

    Parameter year: The season to return for 
    Precondition: year is an int, 2012 <= year <= 2020
    """
    linear_weights = pd.read_csv("data/ncaa_d1_woba_linear_weights.csv")
    season_weights = linear_weights[linear_weights.Season==year]
    wbb = season_weights["wBB"].values[0]
    whbp = season_weights["wHBP"].values[0]
    w1b = season_weights["w1B"].values[0]
    w2b = season_weights["w2B"].values[0]
    w3b = season_weights["w3B"].values[0]
    whr = season_weights["wHR"].values[0]
    totals = pd.read_excel("data/cornell_totals_2012_to_2020.xlsx")
    season_totals = totals[totals.Season==year]
    walks = season_totals["BB"].values[0]
    hits_by_pitch = season_totals["HBP"].values[0]
    doubles = season_totals["2B"].values[0]
    triples = season_totals["3B"].values[0]
    home_runs = season_totals["HR"].values[0]
    hits = season_totals["H"].values[0]
    singles =  hits-(doubles+triples+home_runs)
    at_bats = season_totals["AB"].values[0]
    sac_flies = season_totals["SF"].values[0]
    sac_bunts = season_totals["SH"].values[0]
    plate_appearances = at_bats+walks+sac_flies+sac_bunts+hits_by_pitch
    woba = (wbb*walks+whbp*hits_by_pitch+w1b*singles+w2b*doubles+w3b*triples+whr*home_runs)/plate_appearances
    return woba

In [10]:
def get_ivy_woba(year):
    """
    Returns: Ivy League Weighted On-Base Average in a given season

    wOBA = (wBB×uBB + wHBP×HBP + w1B×1B + w2B×2B + w3B×3B +
    wHR×HR) / (AB + BB – IBB + SF + HBP)
    PA = AB + BB - IBB + SF + HBP 

    Parameter year: The season to return for 
    Precondition: year is an int, 2012 <= year <= 2020
    """  
    linear_weights = pd.read_csv("data/ncaa_d1_woba_linear_weights.csv")
    season_weights = linear_weights[linear_weights.Season==year]
    wbb = season_weights["wBB"].values[0]
    whbp = season_weights["wHBP"].values[0]
    w1b = season_weights["w1B"].values[0]
    w2b = season_weights["w2B"].values[0]
    w3b = season_weights["w3B"].values[0]
    whr = season_weights["wHR"].values[0]
    ivy_totals = pd.read_excel("data/ivy_league_totals_2012_to_2020.xlsx", sheet_name="batting")
    season_totals = ivy_totals[ivy_totals.Season==year]
    walks = season_totals["BB"].values[0]
    hits_by_pitch = season_totals["HBP"].values[0]
    doubles = season_totals["2B"].values[0]
    triples = season_totals["3B"].values[0]
    home_runs = season_totals["HR"].values[0]
    hits = season_totals["H"].values[0]
    singles =  hits-(doubles+triples+home_runs)
    at_bats = season_totals["AB"].values[0]
    sac_flies = season_totals["SF"].values[0]
    sac_bunts = season_totals["SH"].values[0]
    plate_appearances = at_bats+walks+sac_flies+sac_bunts+hits_by_pitch
    woba = (wbb*walks+whbp*hits_by_pitch+w1b*singles+w2b*doubles+w3b*triples+whr*home_runs)/plate_appearances
    return woba

In [9]:
def get_fip(player_id, year):
    """
    Returns: Fielding Independent Pitching for a given player in a given season
    
    FIP = ((13 * HR)+(3 * (BB + HBP))-(2 * K))/IP + constant
    FIP Constant = lgERA – (((13 * lgHR) + (3 * (lgBB+lgHBP))-(2 * lgK))/ lgIP)

    Parameter player_id: The ID of player to return for
    Precondition: player_id is an int
    Parameter year: The season to return wRC for 
    Precondition: year is an int
    """
    season_pitching = pd.read_csv("data/cornellpitching"+str(year)+".csv")
    player_pitching =  season_pitching[season_pitching.player_id==player_id]
    player_pitching = player_pitching.fillna(0)
    strike_outs = player_pitching["SO"].values[0]
    hit_batters = player_pitching["HB"].values[0]
    walks_given = player_pitching["BB"].values[0]
    home_runs_allowed = player_pitching["HR-A"].values[0]
    strike_outs = player_pitching["IP"].values[0]
    linear_weights = pd.read_csv("data/ncaa_d1_woba_linear_weights.csv")
    season_weights = linear_weights[linear_weights.Season== year]
    fip_constant =season_weights["cFIP"].values[0]
    fip = (((13*home_runs_allowed) + (3*(walks_given+hit_batters))-
            (2*strike_outs))/strike_outs) + fip_constant
    return fip

In [28]:
def get_cornell_fip(year):
    """
    Returns: Cornell total  Fielding Independent Pitching (FIP) for a given player in a given season

    FIP = ((13*HR)+(3*(BB+HBP))-(2*K))/IP + constant
    FIP Constant = lgERA – (((13*lgHR)+(3*(lgBB+lgHBP))-(2*lgK))/lgIP)

    Parameter player_id: The ID of player to return for
    Precondition: player_id is 
    Parameter year: The season to return wRC for 
    Precondition: year is an INT 
    """
    cu_pitching = pd.read_csv("data/cornellpitching"+str(year)+".csv")
    season_totals = cu_pitching[cu_pitching.Player=="Totals"]
    strike_outs = season_totals["SO"].values[0]
    hit_batters = season_totals["HB"].values[0]
    walks_given = season_totals["BB"].values[0]
    home_runs_allowed = season_totals["HR-A"].values[0]
    strike_outs = season_totals["IP"].values[0]
    linear_weights = pd.read_csv("data/ncaa_d1_woba_linear_weights.csv")
    season_weights = linear_weights[linear_weights.Season== year]
    fip_constant =season_weights["cFIP"].values[0]
    fip = (((13*home_runs_allowed)+(3*(walks_given+hit_batters))-(2*strike_outs))/strike_outs) + fip_constant
    return fip

3.850534334078569

In [29]:
def get_ivy_fip(year):
    """
    Returns: Ivy League total Fielding Independent Pitching (FIP) for a given season
    
    FIP = ((13*HR)+(3*(BB+HBP))-(2*K))/IP + constant
    FIP Constant = lgERA – (((13*lgHR)+(3*(lgBB+lgHBP))-(2*lgK))/lgIP)

    Parameter year: The season to return fip for 
    Precondition: year is an INT 
    """
    ivy_pitching = pd.read_excel("data/ivy_league_totals_2012_to_2020.xlsx", sheet_name="pitching")
    season_totals = ivy_pitching[ivy_pitching.Season==year]
    walks = season_totals["BB"].values[0]   
    strike_outs = season_totals["SO"].values[0]
    hit_batters = season_totals["HB"].values[0]
    walks_given = season_totals["BB"].values[0]
    home_runs_allowed = season_totals["HR-A"].values[0]
    strike_outs = season_totals["IP"].values[0]
    linear_weights = pd.read_csv("data/ncaa_d1_woba_linear_weights.csv")
    season_weights = linear_weights[linear_weights.Season== year]
    fip_constant =season_weights["cFIP"].values[0]
    fip = (((13*home_runs_allowed)+(3*(walks_given+hit_batters))-(2*strike_outs))/strike_outs) + fip_constant
    return fip

4.506829547519535

In [30]:
def get_era(player_id, year): 
    """
    Returns: ERA for a given player in a given season 
    
    Parameter player_id: The ID of player to return for
    Precondition: player_id is 
    Parameter year: The season to return wRC for 
    Precondition: year is an INT 
    """
    season_pitching = pd.read_csv("data/cornellpitching"+str(year)+".csv")
    player_pitching =  season_pitching[season_pitching.player_id==player_id]
    player_pitching = player_pitching.fillna(0)
    era = player_pitching["ERA"].values[0]
    return era

2.18

In [8]:
def get_runs_per_ip(player_id, year):
    """
    Returns: runs per innings pitched
    
    runs allowed / innings pitched

    """
    season_pitching = pd.read_csv("data/cornellpitching"+str(year)+".csv")
    player_pitching =  season_pitching[season_pitching.player_id==player_id]
    player_pitching = player_pitching.fillna(0)
    runs_allowed = player_pitching["R"].values[0]
    innings_pitched = player_pitching["IP"].values[0]
    res = runs_allowed/innings_pitched
    return res

In [7]:
def get_cornell_batting_stats(start=2015):
    """
    """
    batting = pd.read_csv("data/battingsince"+str(start)+".csv")
    batting = batting.loc[batting["BA"] > 0, :]
    team_list = {"player_id":[], "name":[], "year":[], "AB":[], "woba":[], "wrc":[], "wraa":[], "class_year":[]}
    for i in range(len(batting)):
        player_id = batting.iloc[i, 0]
        player = batting.iloc[i,6]
        year = batting.iloc[i, 1]
        at_bats = batting.iloc[i,15]
        class_year = batting.iloc[i,7]
        woba = get_woba(player_id, year)
        wrc  = get_wrc(player_id, year)
        wraa  = get_wraa(player_id, year)
        team_list["player_id"].append(player_id)
        team_list["AB"].append(at_bats)
        team_list["class_year"].append(class_year)
        team_list["name"].append(player)
        team_list["year"].append(year)
        team_list["woba"].append(woba)
        team_list["wrc"].append(wrc)
        team_list["wraa"].append(wraa)
    res = pd.DataFrame(team_list)
    return res

In [33]:
def get_cornell_pitching_stats(start=2015):
    """
    Returns: Dataframe of advanced pitching statistics
    """
    pitching = pd.read_csv("data/pitchingsince"+str(start)+".csv")
    pitching = pitching.loc[pitching['App']>0, :]
    team_list = {"player_id":[], "name":[], "year":[], "innings_pitched":[], "runs":[], "era":[], "fip":[], "runs_per_ip":[],"class_year":[]}
    for i in range(len(pitching)):
        player_id = pitching.iloc[i, 0]
        player = pitching.iloc[i,6]
        year = pitching.iloc[i, 1]
        class_year = pitching.iloc[i,7]
        innings_pitched = pitching.iloc[i, 13]
        runs_allowed = pitching.iloc[i, 15]
        era = get_era(player_id, year)
        fip = get_fip(player_id, year)
        runs_per_ip = get_runs_per_ip(player_id, year)
        team_list["player_id"].append(player_id)
        team_list["innings_pitched"].append(innings_pitched)
        team_list["runs"].append(runs_allowed)
        team_list["class_year"].append(class_year)
        team_list["name"].append(player)
        team_list["year"].append(year)
        team_list["era"].append(era)
        team_list["fip"].append(fip)
        team_list["runs_per_ip"].append(runs_per_ip)
    res = pd.DataFrame(team_list)
    return res

Unnamed: 0,player_id,name,year,innings_pitched,runs,era,fip,runs_per_ip,class_year
0,2111707.0,"Lillios, Nikolas",2020,5.1,1.0,1.69,2.497235,0.196078,So
1,2111755.0,"Zacharias, Jon",2020,6.2,9.0,6.75,6.909000,1.451613,So
2,1997345.0,"Davis, Trevor Daniel",2020,4.2,2.0,1.93,6.909000,0.476190,Jr
3,1997324.0,"Urbon, Seth",2020,7.0,6.0,6.43,7.766143,0.857143,Sr
4,1884389.0,"Wyatt, Colby",2020,14.1,17.0,5.65,6.377085,1.205674,Sr
...,...,...,...,...,...,...,...,...,...
79,1547001.0,"Brewer, Ray",2015,12.1,8.0,2.19,4.542917,0.661157,So
80,1652392.0,"McCulley, Zach",2015,12.1,10.0,7.30,5.121430,0.826446,Sr
81,1546997.0,"Veenema, Ryne",2015,5.0,10.0,18.00,9.733000,2.000000,So
82,1439149.0,"Horton, Matt",2015,1.0,5.0,27.00,10.733000,5.000000,Jr


[PythagenPat Expectation](http://tangotiger.net/wiki_archive/PythagenPat.html)    
[Explaination from one of the developers of PythagenPat](http://walksaber.blogspot.com/2009/01/runs-per-win-from-pythagenpat.html)

In [None]:
# Pythagenpat intra-conference win %'s by team for the Ivy League
def generate_ivy_pythags(start, end):
    """
    Returns: Dataframe of actual and expected winning percentages of Ivy League for a given set of seasons. 
    Parameter start: start year, inclusive
    Precondition: start is an int YYYY
    Parameter end: end year, inclusive
    Precondtion: end is an int YYYY
    """
    ivy_pythagenpat = {"team_name":[], "pythagenpat_pct":[], "actual_pct":[], "deviation":[]};
    for i in ["Brown", "Columbia", "Cornell", "Dartmouth", "Harvard", "Pennsylvania", "Princeton", "Yale"]:
        games = get_games(i, start=start, end=end, ivy_only=False)
        pythagenpat_pct = calculate_pythag_win_percentage(i,games)  
        actual_pct = calculate_actual_win_percentage(i,games)  
        deviation = round(pythagenpat_pct - actual_pct,3)
        wins =
        ivy_pythagenpat["team_name"].append(i)
        ivy_pythagenpat["pythagenpat_pct"].append(round(pythagenpat_pct,3))
        ivy_pythagenpat["actual_pct"].append(round(actual_pct,3))
        ivy_pythagenpat["deviation"].append(deviation)
        res = pd.DataFrame(ivy_pythagenpat)
    return res

In [11]:
get_wrc(1779085,2017)
get_cornell_batting_stats()
get_cornell_pitching_stats()
get_runs_per_ip(1546998, 2017)
get_era(1546998, 2017)
get_ivy_fip(2017)
get_cornell_fip(2015)
get_fip(1546998, 2017)
get_ivy_woba(2018)
get_cornell_woba(2019)
get_wrc(1779085,2017)

NameError: name 'get_wrc' is not defined