In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
import requests
import tempfile
import warnings
warnings.filterwarnings('ignore')

In [8]:
#Global variables
school = "Cornell" # string of school name, first letter capitalized ex. "Brown"
start = "2015"  # string of school name, inclusive,  ex. "2012"
end = "2020" # string of ending year, inclusive, ex. "2020"

In [9]:
def get_games(team_name, start,end):
    """
    Returns: a dataframe of all games played for a given team inclusive of given start & end year
    
    Data Fd
    
    Parameter team_name: team whose games to select 
    Precondition: team_name is a lowercase string
    Parameter start: the start year of games. To select only games from one year, leave  
    Precondition: start is a string of form "YYYY" 
    Parameter end: the end year of games
    Precondition: end is a string of form "YYYY"
    """
    assert type(team_name) == str
    assert type(start) == str && len(start) == 4
    assert type(end) == str && len(start) == 4
    team_name = team_name.capitalize()
    payload = {"team1":team_name, "firstyear":start, "team2":"all","lastyear":end, "format":"Text", "submit":"Fetch"}
    url = "http://www.boydsworld.com/cgi/scores.pl?" 
    s = requests.Session()
    r = requests.get(url, params = payload)
    response = r.text 
    if len(response) < 25:
        return pd.DataFrame
    else:
        temp = tempfile.NamedTemporaryFile(delete=False) 
        temp.write(str.encode(response))
        games = pd.read_fwf(temp.name, header=None, prefix="X")
        temp.close()
        col_names = ["date", "team_1", "team_1_score", "team_2", "team_2_score", "field"]
        try: 
            games.columns = col_names
        except:            
            games.drop("Unnamed: 4", inplace=True, axis=1)
            games.columns = col_names
        return games

get_games(school, start, end)

Unnamed: 0,date,team_1,team_1_score,team_2,team_2_score,field
0,2/21/2015,Gardner-Webb,2,Cornell,1,@Gardner-Webb
1,2/21/2015,Gardner-Webb,3,Cornell,2,@Gardner-Webb
2,2/22/2015,Gardner-Webb,2,Cornell,1,@Gardner-Webb
3,2/27/2015,Seton Hall,3,Cornell,0,@neutral
4,2/28/2015,Cornell,4,Hartford,3,@neutral
...,...,...,...,...,...,...
194,2/29/2020,Niagara,8,Cornell,6,@neutral
195,3/01/2020,Cornell,6,Niagara,5,@neutral
196,3/06/2020,South Carolina,10,Cornell,2,@South Carolina
197,3/07/2020,South Carolina,10,Cornell,3,@South Carolina


In [37]:
def get_wins(team_name, start, end):
    """
    Returns a dataframe of victories of a given team

    Parameter team_name: team to return victories of 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: Games to filter 
    Precondition: games is a DataFrame returned by getGames() function
    """        
    assert type(team_name) == str, "team_name invalid --> must be string"
    games = get_games(school,start,end)
    team_name = team_name.capitalize() 
    wins = games[(games["team_1"] == team_name) & (games["team_1_score"] > games["team_2_score"])]
    return wins

get_wins(school, start, end)

Unnamed: 0,date,team_1,team_1_score,team_2,team_2_score,field
4,2/28/2015,Cornell,4,Hartford,3,@neutral
7,3/14/2015,Cornell,8,Bowling Green State,6,@neutral
11,3/21/2015,Cornell,3,Bucknell,0,@Bucknell
14,3/29/2015,Cornell,10,Yale,1,@Cornell
15,3/29/2015,Cornell,7,Yale,5,@Cornell
...,...,...,...,...,...,...
184,4/28/2019,Cornell,8,Pennsylvania,7,@Cornell
185,5/01/2019,Cornell,5,Binghamton,2,@Binghamton
186,5/04/2019,Cornell,2,Dartmouth,1,@Dartmouth
187,5/04/2019,Cornell,8,Dartmouth,6,@Dartmouth


In [63]:
def get_wins_from_df(team_name, games):
    """
    Returns a dataframe of victories of a given team

    Parameter team_name: team to return victories of 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: Games to filter 
    Precondition: games is a DataFrame returned by getGames() function
    """        
    assert type(team_name) == str, "team_name invalid --> must be string"
    team_name = team_name.capitalize() 
    wins = games[(games["team_1"] == team_name) & (games["team_1_score"] > games["team_2_score"])]
    return wins

get_wins_from_df(school, games)

Unnamed: 0,date,team_1,team_1_score,team_2,team_2_score,field
4,2/28/2015,Cornell,4,Hartford,3,@neutral
7,3/14/2015,Cornell,8,Bowling Green State,6,@neutral
11,3/21/2015,Cornell,3,Bucknell,0,@Bucknell
14,3/29/2015,Cornell,10,Yale,1,@Cornell
15,3/29/2015,Cornell,7,Yale,5,@Cornell
...,...,...,...,...,...,...
184,4/28/2019,Cornell,8,Pennsylvania,7,@Cornell
185,5/01/2019,Cornell,5,Binghamton,2,@Binghamton
186,5/04/2019,Cornell,2,Dartmouth,1,@Dartmouth
187,5/04/2019,Cornell,8,Dartmouth,6,@Dartmouth


In [39]:
def losses(team_name, start, end):
    """
    Returns a dataframe of losses of a given dataframe o 

    Parameter team_name: team to return losses of 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: Games to filter 
    Precondition: games is a DataFrame returned by getGames() function
    """        
    assert type(team_name) == str, "team_name invalid. must be string"
    games = (school,start,end)
    losses = games[(games["team_2"] == team_name) & (games["team_1_score"] > games["team_2_score"])]
    return losses

get_losses(school, start, end)

Unnamed: 0,date,team_1,team_1_score,team_2,team_2_score,field
0,2/21/2015,Gardner-Webb,2,Cornell,1,@Gardner-Webb
1,2/21/2015,Gardner-Webb,3,Cornell,2,@Gardner-Webb
2,2/22/2015,Gardner-Webb,2,Cornell,1,@Gardner-Webb
3,2/27/2015,Seton Hall,3,Cornell,0,@neutral
5,3/01/2015,Virginia,5,Cornell,4,@neutral
...,...,...,...,...,...,...
193,2/29/2020,Niagara,8,Cornell,4,@neutral
194,2/29/2020,Niagara,8,Cornell,6,@neutral
196,3/06/2020,South Carolina,10,Cornell,2,@South Carolina
197,3/07/2020,South Carolina,10,Cornell,3,@South Carolina


In [62]:
def get_losses_from_df(team_name, games):
    """
    Returns a dataframe of losses of a given dataframe o 

    Parameter team_name: team to return losses of 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: Games to filter 
    Precondition: games is a DataFrame returned by getGames() function
    """        
    assert type(team_name) == str, "team_name invalid. must be string"
    losses = games[(games["team_2"] == team_name) & (games["team_1_score"] > games["team_2_score"])]
    return losses

get_losses_from_df(school, games)

Unnamed: 0,date,team_1,team_1_score,team_2,team_2_score,field
0,2/21/2015,Gardner-Webb,2,Cornell,1,@Gardner-Webb
1,2/21/2015,Gardner-Webb,3,Cornell,2,@Gardner-Webb
2,2/22/2015,Gardner-Webb,2,Cornell,1,@Gardner-Webb
3,2/27/2015,Seton Hall,3,Cornell,0,@neutral
5,3/01/2015,Virginia,5,Cornell,4,@neutral
...,...,...,...,...,...,...
193,2/29/2020,Niagara,8,Cornell,4,@neutral
194,2/29/2020,Niagara,8,Cornell,6,@neutral
196,3/06/2020,South Carolina,10,Cornell,2,@South Carolina
197,3/07/2020,South Carolina,10,Cornell,3,@South Carolina


In [51]:
def get_intra_ivy(team_name, start, end):
    """
    Returns: data frame of in-conference games of given team.
    
     Creates a temporary column "intra_ivy" equal to 1 if against an Ivy-League opponent, 0 if not, and then 
     selects the rows for which this intra_ivy is one. 
    
    Plan to make this work for any team. Conference changes get tricky, will need helper to get conference team list for each season, 
    check each game against per-season list. 
    
    Parameter team_name: team whose games to select 
    Precondition: team_name is a lowercase string
    Parameter start: the start year of games. To select only games from one year, leave  
    Precondition: start is a string of form "YYYY" 
    Parameter end: the end year of games
    Precondition: end is a string of form "YYYY"
    """
    assert type(team_name) == str
    assert type(start) == str 
    assert type(end) == str
    conference = ["Brown","Cornell","Columbia","Dartmouth","Harvard","Pennsylvania","Princeton","Yale"]
    conference.remove(team_name)
    games = get_games(team_name,start,end)
    wins = get_wins_from_df(team_name, games)
    losses = get_losses_from_df(team_name, games)
    wins["intra_ivy"] = [1 if x in conference else 0 for x in wins["team_2"]]
    losses["intra_ivy"] = [1 if x in conference else 0 for x in losses["team_1"]]
    new_df = pd.concat([wins,losses])
    in_conference = new_df[new_df["intra_ivy"] == 1]
    return in_conference

get_intra_ivy(school, start, end)

Unnamed: 0,date,team_1,team_1_score,team_2,team_2_score,field,intra_ivy
14,3/29/2015,Cornell,10,Yale,1,@Cornell,1
15,3/29/2015,Cornell,7,Yale,5,@Cornell,1
17,3/30/2015,Cornell,10,Brown,6,@Cornell,1
20,4/04/2015,Cornell,1,Dartmouth,0,@Dartmouth,1
22,4/05/2015,Cornell,5,Harvard,4,@Harvard,1
...,...,...,...,...,...,...,...
177,4/13/2019,Harvard,8,Cornell,6,@Harvard,1
180,4/20/2019,Princeton,7,Cornell,2,@Cornell,1
181,4/21/2019,Princeton,4,Cornell,2,@Cornell,1
183,4/27/2019,Pennsylvania,6,Cornell,2,@Cornell,1


In [83]:
def get_intra_ivy_from_df(team_name, games):  
    """
    Returns: data frame of in-conference games of given team.
    
     Creates a temporary column "intra_ivy" equal to 1 if against an Ivy-League opponent, 0 if not, and then 
     selects the rows for which this intra_ivy is one. 
    
    Plan to make this work for any team. Conference changes get tricky, will need helper to get conference team list for each season, 
    check each game against per-season list. 
    
    Parameter team_name: team whose games to select 
    Precondition: team_name is a lowercase string
    Parameter start: the start year of games. To select only games from one year, leave  
    Precondition: start is a string of form "YYYY" 
    Parameter end: the end year of games
    Precondition: end is a string of form "YYYY"
    """
    assert type(team_name) == str 
    assert type(start) == str
    assert type(end) == str
    conference = ["Brown","Cornell","Columbia","Dartmouth","Harvard","Pennsylvania","Princeton","Yale"]
    conference.remove(team_name)
    wins = get_wins_from_df(team_name, games)
    losses = get_losses_from_df(team_name, games)
    wins["intra_ivy"] = [1 if x in conference else 0 for x in wins["team_2"]]
    losses["intra_ivy"] = [1 if x in conference else 0 for x in losses["team_1"]]
    new_df = pd.concat([wins,losses])
    in_conference = new_df[new_df["intra_ivy"] == 1]
    return in_conference

get_intra_ivy_from_df(school, games)

Unnamed: 0,date,team_1,team_1_score,team_2,team_2_score,field,intra_ivy
14,3/29/2015,Cornell,10,Yale,1,@Cornell,1
15,3/29/2015,Cornell,7,Yale,5,@Cornell,1
17,3/30/2015,Cornell,10,Brown,6,@Cornell,1
20,4/04/2015,Cornell,1,Dartmouth,0,@Dartmouth,1
22,4/05/2015,Cornell,5,Harvard,4,@Harvard,1
...,...,...,...,...,...,...,...
177,4/13/2019,Harvard,8,Cornell,6,@Harvard,1
180,4/20/2019,Princeton,7,Cornell,2,@Cornell,1
181,4/21/2019,Princeton,4,Cornell,2,@Cornell,1
183,4/27/2019,Pennsylvania,6,Cornell,2,@Cornell,1


In [82]:
def get_runs_scored(team_name,start, end):
    """
    Returns: int equal to total number of runs scored by team_name over given games
    
    Parameter team_name: team to calculate runs scored for 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: games to aggregate over 
    Precondition: games is a DataFrame returned by getGames() function
    """
    assert type(team_name) == str
    games = get_games(team_name, start, end)
    wins = get_wins_from_df(team_name, games)
    wins["winner_runs_scored"] = wins["team_1_score"]
    losses = get_losses_from_df(team_name, games)
    losses["loser_runs_scored"] = losses["team_2_score"] 
    sum_runs_scored_in_wins = wins["winner_runs_scored"].sum()
    sum_runs_scored_in_losses = losses["loser_runs_scored"].sum()
    result = sum_runs_scored_in_wins+sum_runs_scored_in_losses
    return result

get_runs_scored(school, start, end)

888

In [58]:
def get_runs_scored_from_df (team_name, games):
    """
    Returns: int equal to total number of runs scored by team_name over given games

    Parameter team_name: team to calculate runs scored for 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: games to aggregate over 
    Precondition: games is a DataFrame returned by getGames() function
    """
    assert type(team_name) == str
    wins = get_wins_from_df(team_name, games)
    wins["winner_runs_scored"] = wins["team_1_score"]
    losses = get_losses_from_df(team_name, games)
    losses["loser_runs_scored"] = losses["team_2_score"] 
    sum_runs_scored_in_wins = wins["winner_runs_scored"].sum()
    sum_runs_scored_in_losses = losses["loser_runs_scored"].sum()
    result = sum_runs_scored_in_wins+sum_runs_scored_in_losses
    return result

get_runs_scored_from_df(school, games)

888

In [70]:
def get_runs_allowed(team_name, games):
    """
    Returns: int equal to total number of runs allowed by team_name over given games
    
    Parameter team_name: team to calculate runs allowed for 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: games to aggregate over 
    Precondition: games is a DataFrame returned by getGames() function
    """
    assert type(team_name) == str
    games = get_games(team_name,start,end)
    wins = get_wins_from_df(team_name, games)
    wins["winner_runs_allowed"] = wins["team_2_score"]
    losses = get_losses_from_df(team_name, games)
    losses["loser_runs_allowed"] = losses["team_1_score"] 
    sum_runs_allowed_in_wins = wins["winner_runs_allowed"].sum()
    sum_runs_allowed_in_losses = losses["loser_runs_allowed"].sum()
    result = sum_runs_allowed_in_wins + sum_runs_allowed_in_losses
    return result
                        
get_runs_allowed_from_df(school, games)

1143

In [71]:
def get_runs_allowed_from_df(team_name, games):
    """
    Returns: int equal to total number of runs allowed by team_name over given games
    
    Parameter team_name: team to calculate runs allowed for 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: games to aggregate over 
    Precondition: games is a DataFrame returned by getGames() function
    """
    assert type(team_name) == str
    wins = get_wins_from_df(team_name, games)
    wins["winner_runs_allowed"] = wins["team_2_score"]
    losses = get_losses_from_df(team_name, games)
    losses["loser_runs_allowed"] = losses["team_1_score"] 
    sum_runs_allowed_in_wins = wins["winner_runs_allowed"].sum()
    sum_runs_allowed_in_losses = losses["loser_runs_allowed"].sum()
    result = sum_runs_allowed_in_wins + sum_runs_allowed_in_losses
    return result
                        
get_runs_allowed_from_df(school, games)

1143

In [75]:
def get_run_difference(team_name, start, end):
    """
    Returns: The total run difference across a given set of games 
    
    Parameter team_name: team to return run difference of
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: Games to filter 
    Precondition: games is a DataFrame returned by getGames() function
    """
    assert type(team_name) == str
    wins = get_wins_from_df(team_name, games)
    wins["run_diff"] = wins["team_1_score"] - wins["team_2_score"]
    losses = get_losses_from_df(team_name, games)
    losses["run_diff"] = losses["team_2_score"] - losses["team_1_score"]
    sum_win_diffs = wins["run_diff"].sum()
    sum_loss_diffs = losses["run_diff"].sum()
    result = sum_win_diffs+sum_loss_diffs
    return result
get_run_difference(school, start, end)

-255

In [79]:
def get_run_difference_from_df(team_name, games):
    """
    Returns: The total run difference across a given set of games 
    
    Parameter team_name: team to return run difference of
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: Games to filter 
    Precondition: games is a DataFrame returned by getGames() function
    """
    assert type(team_name) == str
    wins = get_wins_from_df(team_name, games)
    wins["run_diff"] = wins["team_1_score"] - wins["team_2_score"]
    losses = get_losses_from_df(team_name, games)
    losses["run_diff"] = losses["team_2_score"] - losses["team_1_score"]
    sum_win_diffs = wins["run_diff"].sum()
    sum_loss_diffs = losses["run_diff"].sum()
    result = sum_win_diffs+sum_loss_diffs
    return result

get_run_difference_from_df(school, games)

-255

[PythagenPat Expectation](http://tangotiger.net/wiki_archive/PythagenPat.html)    

[Explaination from one of the developers of PythagenPat](http://walksaber.blogspot.com/2009/01/runs-per-win-from-pythagenpat.html)

In [46]:
def pythag_win_percentage(team_name, start, end):
    """
    Returns: The PythagenPat winning percentage expectation of a given team over given games. 

    W% = R^x/(R^x + RA^x)
    where x = (RPG)^.287
    Developed by David Smyth and Patriot
   
    Parameter team_name: team to return expected winning % for 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: games over which to calculate
    Precondition: games is a DataFrame returned by getGames() function
    """
    assert type(team_name) == str
    assert type(start) == str 
    assert type(end) == str
    team_name  = team_name.capitalize()
    games = get_games(school,start,end)

    run_diff =  get_run_difference(team_name, start, end)
    runs_scored = get_runs_scored(team_name, start, end)
    runs_allowed = get_runs_allowed(team_name, start, end)
    games = get_games(school,start,end)
    
    num_games = len(games.index)
    if not num_games == 0:
        runs_per_game = runs_scored / num_games
    else:
        runs_per_game = 0
    x = runs_per_game ** 0.287
    result  = (runs_scored ** x) / ((runs_scored ** x) + (runs_allowed ** x))
    return result


games = get_games(school,start,end)
pythag_win_percentage(school,games)

TypeError: pythag_win_percentage() missing 1 required positional argument: 'end'

In [14]:
def actual_win_percentage(team_name, games):
    """
    Returns: The actual (i.e. experimental) winning percentage of a given team over given games. 

    winning_percentage = games_won / games_plated
   
    Parameter team_name: team to return actual winning % for 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: games over which to calculate
    Precondition: games is a DataFrame returned by getGames() function
    """
    assert type(team_name) == str
    wins = get_wins(team_name, games)
    losses = get_losses(team_name,games)
    num_wins = len(wins.index)
    num_losses = len(losses.index)
    win_percentage = num_wins / (num_wins+num_losses)
    return win_percentage

games = get_games(school,start,end)
actual_win_percentage(school,games)


0.4678111587982833

In [15]:
# Pythagenpat intra-conference win %'s by team for the Ivy League
ivy_pythagenpat_percentages = {"school":[], "pythagenpat_percentage":[]};
start = "2015"
end = "2020"
for i in ["Brown", "Columbia", "Cornell", "Dartmouth", "Harvard", "Pennsylvania", "Princeton", "Yale"]:
    games = get_games(i,start,end)
    conference_games = get_intra_ivy(i,games)
    pythagenpat_percentage = pythag_win_percentage(i,conference_games)  
    ivy_pythagenpat_percentages["school"].append(i)
    ivy_pythagenpat_percentages["pythagenpat_percentage"].append(pythagenpat_percentage)
    results = pd.DataFrame(ivy_pythagenpat_percentages)
    
results

Unnamed: 0,school,pythagenpat_percentage
0,Brown,0.396092
1,Columbia,0.602637
2,Cornell,0.435074
3,Dartmouth,0.539251
4,Harvard,0.531602
5,Pennsylvania,0.616569
6,Princeton,0.372442
7,Yale,0.522967


In [16]:
# Actual - Pythagenpat expected intra-conference win %'s by team for the Ivy League
results = {"school":[], "difference":[]};
start = "2015"
end = "2020"
for i in ["Brown", "Columbia", "Cornell", "Dartmouth", "Harvard", "Pennsylvania", "Princeton", "Yale"]:
    games = get_games(i,start,end)
    conference_games = get_intra_ivy(i,games)
    pythagenpat_percentage = pythag_win_percentage(i, conference_games)
    actual_percentage =  actual_win_percentage(i, conference_games)
    result = pythagenpat_percentage - actual_percentage
    results["school"].append(i)
    results["difference"].append(result)
    df = pd.DataFrame(results)
    
df

Unnamed: 0,school,difference
0,Brown,0.04315
1,Columbia,-0.018985
2,Cornell,0.023309
3,Dartmouth,-0.022654
4,Harvard,0.041218
5,Pennsylvania,0.054665
6,Princeton,-0.021789
7,Yale,-0.064189
