In [136]:
import pandas as pd
import numpy as np
from datetime import datetime
import requests
import tempfile
from io import StringIO


In [137]:
def get_games(team_name, start, end):
    """
    Returns: a dataframe of all games played for a given team inclusive of given start & end year
    
    Data Fd
    
    Parameter team_name: team whose games to select 
    Precondition: team_name is a lowercase string
    Parameter start: the start year of games. To select only games from one year, leave  
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int

    assert type(team_name) == str
    assert type(start) == str
    assert type(end) == str
    team_name = team_name.capitalize()
    payload = {"team1":team_name, "firstyear":start, "team2":"all","lastyear":end, "format":"Text", "submit":"Fetch"}
    url = "http://www.boydsworld.com/cgi/scores.pl?" 
    s = requests.Session()
    r = requests.get(url, params = payload)
    response = r.text 
    print(response)
    if len(response) < 10:
        return pd.DataFrame
    else:
        temp = tempfile.NamedTemporaryFile(delete=False) 
        temp.write(str.encode(response))
        games = pd.read_fwf(temp.name, header=None, prefix="X")
        col_names = ["date", "team_1", "team_1_score", "team_2", "team_2_score", "field"]
        try: 
            games.columns = col_names
        except:            
            games.drop("Unnamed: 4", inplace=True, axis=1)
            games.columns = col_names
        return games
    """
    pass

In [138]:
def get_games(team_name, start,end):
    """
    Returns: a dataframe of all games played for a given team inclusive of given start & end year
    
    Data Fd
    
    Parameter team_name: team whose games to select 
    Precondition: team_name is a lowercase string
    Parameter start: the start year of games. To select only games from one year, leave  
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int
    """
    assert type(team_name) == str
    assert type(start) == int
    assert type(end) == int
    team_name = team_name.capitalize()
    payload = {"team1":team_name, "firstyear":str(start), "team2":"all","lastyear":str(end), "format":"Text", "submit":"Fetch"}
    url = "http://www.boydsworld.com/cgi/scores.pl?" 
    s = requests.Session()
    r = requests.get(url, params = payload)
    response = r.text 
    if len(response) < 10:
        return pd.DataFrame
    else:
        df = pd.read_fwf(StringIO(response), encoding = 'utf8',header=None)
        col_names = ["date", "team_1", "team_1_score", "team_2", "team_2_score", "field"]
        try: 
            df.shift(periods=1)
            df.columns = col_names
        except:            
            df.drop("Unnamed: 4", inplace=True, axis=1)
            df.columns = col_names
        return df

In [139]:
get_games("Yale", 2018, 2020)

Unnamed: 0,date,team_1,team_1_score,team_2,team_2_score,field
0,2/23/2018,Campbell,9,Yale,5,@Campbell
1,2/24/2018,"Miami, Ohio",2,Yale,1,@Yale
2,2/24/2018,"Miami, Ohio",4,Yale,3,"@Miami, Ohio"
3,2/25/2018,Western Michigan,6,Yale,0,@Yale
4,3/03/2018,Virginia,5,Yale,4,@Virginia
...,...,...,...,...,...,...
89,3/01/2020,Yale,19,Old Dominion,7,@Old Dominion
90,3/06/2020,Ball State,10,Yale,4,@neutral
91,3/07/2020,Sacred Heart,3,Yale,2,@neutral
92,3/08/2020,Yale,12,Richmond,3,@neutral


In [140]:
def get_wins(team_name, start, end):
    """
    Returns a dataframe of victories of a given team

    Parameter team_name: team to return victories of 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter start: the start year of games. To select only games from one year, leave  
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int
    """        
    assert type(team_name) == str, "team_name invalid --> must be string"
    assert type(start) == int, "team_name invalid --> must be int"
    assert type(end) == int, "team_name invalid --> must be int"
    games = get_games(team_name,start,end)
    team_name = team_name.capitalize() 
    wins = games[(games["team_1"] == team_name) & (games["team_1_score"] > games["team_2_score"])]
    return wins

In [141]:
def get_wins_from_df(team_name, games):
    """
    Returns a dataframe of victories of a given team

    Parameter team_name: team to return victories of 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: Games to filter 
    Precondition: games is a DataFrame returned by getGames() function
    """        
    assert type(team_name) == str, "team_name invalid --> must be string"
    team_name = team_name.capitalize() 
    wins = games[(games["team_1"] == team_name) & (games["team_1_score"] > games["team_2_score"])]
    return wins

In [142]:
def get_losses(team_name, start, end):
    """
    Returns a dataframe of losses of a given dataframe o 

    Parameter team_name: team to return losses of 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter start: the start year of games. To select only games from one year, leave  
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int
    """        
    assert type(team_name) == str, "team_name invalid --> must be string"
    assert type(start) == int, "start years invalid--> must be int"
    assert type(end) == int, "end year invalid --> must be int"
    games = get_games(team_name,start,end)
    losses = games[(games["team_2"] == team_name) & (games["team_1_score"] > games["team_2_score"])]
    return losses

In [143]:
def get_losses_from_df(team_name, games):
    """
    Returns a dataframe of losses of a given dataframe o 

    Parameter team_name: team to return losses of 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: Games to filter 
    Precondition: games is a DataFrame returned by getGames() function
    """        
    assert type(team_name) == str, "team_name invalid --> must be string"
    losses = games[(games["team_2"] == team_name) & (games["team_1_score"] > games["team_2_score"])]
    return losses

In [144]:
def get_intra_ivy(team_name, start, end):
    """
    Returns: data frame of in-conference games of given team.
    
     Creates a temporary column "intra_ivy" equal to 1 if against an Ivy-League opponent, 0 if not, and then 
     selects the rows for which this intra_ivy is one. 
    
    Plan to make this work for any team. Conference changes get tricky, will need helper to get conference team list for each season, 
    check each game against per-season list. 
    
    Parameter team_name: team whose games to select 
    Precondition: team_name is a lowercase string
    Parameter start: the start year of games. To select only games from one year, leave  
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int
    """
    assert type(team_name) == str, "team_name invalid --> must be str"
    assert type(start) == int, "start years invalid--> must be int"
    assert type(end) == int, "end year invalid --> must be int"
    conference = ["Brown","Cornell","Columbia","Dartmouth","Harvard","Pennsylvania","Princeton","Yale"]
    conference.remove(team_name)
    games = get_games(team_name, start, end)
    wins = get_wins_from_df(team_name, games)
    losses = get_losses_from_df(team_name, games)
    wins["intra_ivy"] = [1 if x in conference else 0 for x in wins["team_2"]]
    losses["intra_ivy"] = [1 if x in conference else 0 for x in losses["team_1"]]
    new_df = pd.concat([wins,losses])
    in_conference = new_df[new_df["intra_ivy"] == 1]
    return in_conference

In [145]:
def get_intra_ivy_from_df(team_name, games):  
    """
    Returns: data frame of in-conference games of given team.
    
     Creates a temporary column "intra_ivy" equal to 1 if against an Ivy-League opponent, 0 if not, and then 
     selects the rows for which this intra_ivy is one. 
    
    Plan to make this work for any team. Conference changes get tricky, will need helper to get conference team list for each season, 
    check each game against per-season list. 
    
    Parameter team_name: team whose games to select 
    Precondition: team_name is a lowercase string
    Parameter games: Games to filter 
    Precondition: games is a DataFrame returned by getGames() function
    """
    assert type(team_name) == str, "team_name invalid --> must be str"
    conference = ["Brown","Cornell","Columbia","Dartmouth","Harvard","Pennsylvania","Princeton","Yale"]
    conference.remove(team_name)
    wins = get_wins_from_df(team_name, games)
    losses = get_losses_from_df(team_name, games)
    wins["intra_ivy"] = [1 if x in conference else 0 for x in wins["team_2"]]
    losses["intra_ivy"] = [1 if x in conference else 0 for x in losses["team_1"]]
    new_df = pd.concat([wins,losses])
    in_conference = new_df[new_df["intra_ivy"] == 1]
    return in_conference

In [146]:
def get_runs_scored(team_name,start, end):
    """
    Returns: int equal to total number of runs scored by team_name over given games
    
    Parameter team_name: team to calculate runs scored for 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter start: the start year of games. To select only games from one year, leave  
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int
    """
    assert type(team_name) == str, "team_name invalid --> must be str"
    assert type(start) == int, "start years invalid--> must be int"
    assert type(end) == int, "end year invalid --> must be int"
    games = get_games(team_name, start, end)
    wins = get_wins_from_df(team_name, games)
    wins["winner_runs_scored"] = wins["team_1_score"]
    losses = get_losses_from_df(team_name, games)
    losses["loser_runs_scored"] = losses["team_2_score"] 
    sum_runs_scored_in_wins = wins["winner_runs_scored"].sum()
    sum_runs_scored_in_losses = losses["loser_runs_scored"].sum()
    result = sum_runs_scored_in_wins+sum_runs_scored_in_losses
    return result

In [147]:
def get_runs_scored_from_df (team_name, games):
    """
    Returns: int equal to total number of runs scored by team_name over given games

    Parameter team_name: team to calculate runs scored for 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: games to aggregate over 
    Precondition: games is a DataFrame returned by getGames() function
    """
    assert type(team_name) == str, "team_name invalid --> must be str"
    wins = get_wins_from_df(team_name, games)
    wins["winner_runs_scored"] = wins["team_1_score"]
    losses = get_losses_from_df(team_name, games)
    losses["loser_runs_scored"] = losses["team_2_score"] 
    sum_runs_scored_in_wins = wins["winner_runs_scored"].sum()
    sum_runs_scored_in_losses = losses["loser_runs_scored"].sum()
    result = sum_runs_scored_in_wins+sum_runs_scored_in_losses
    return result

In [148]:
def get_runs_allowed(team_name, start, end):
    """
    Returns: int equal to total number of runs allowed by team_name over given games
    
    Parameter team_name: team to calculate runs allowed for 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter start: the start year of games. To select only games from one year, leave  
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int
    """
    assert type(team_name) == str, "team_name invalid --> must be str"
    assert type(start) == int, "start years invalid--> must be int"
    assert type(end) == int, "end year invalid --> must be int"
    games = get_games(team_name,start,end)
    wins = get_wins_from_df(team_name, games)
    wins["winner_runs_allowed"] = wins["team_2_score"]
    losses = get_losses_from_df(team_name, games)
    losses["loser_runs_allowed"] = losses["team_1_score"] 
    sum_runs_allowed_in_wins = wins["winner_runs_allowed"].sum()
    sum_runs_allowed_in_losses = losses["loser_runs_allowed"].sum()
    result = sum_runs_allowed_in_wins + sum_runs_allowed_in_losses
    return result

In [149]:
def get_runs_allowed_from_df(team_name, games):
    """
    Returns: int equal to total number of runs allowed by team_name over given games
    
    Parameter team_name: team to calculate runs allowed for 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: games to aggregate over 
    Precondition: games is a DataFrame returned by getGames() function
    """
    assert type(team_name) == str, "team_name invalid --> must be str"
    wins = get_wins_from_df(team_name, games)
    wins["winner_runs_allowed"] = wins["team_2_score"]
    losses = get_losses_from_df(team_name, games)
    losses["loser_runs_allowed"] = losses["team_1_score"] 
    sum_runs_allowed_in_wins = wins["winner_runs_allowed"].sum()
    sum_runs_allowed_in_losses = losses["loser_runs_allowed"].sum()
    result = sum_runs_allowed_in_wins + sum_runs_allowed_in_losses
    return result

In [150]:
def get_run_difference(team_name, start, end):
    """
    Returns: The total run difference across a given set of games 
    
    Parameter team_name: team to return run difference of
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter start: the start year of games. To select only games from one year, leave  
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int
    """
    assert type(team_name) == str, "team_name invalid --> must be str"
    assert type(start) == int, "start years invalid--> must be int"
    assert type(end) == int, "end year invalid --> must be int"
    games  = get_games(team_name, start, end)
    scored = get_runs_allowed_from_df(team_name, games)
    allowed = get_runs_scored_from_df(team_name, games)
    result = scored - allowed
    return result

In [151]:
def get_run_difference_from_df(team_name, games):
    """
    Returns: The total run difference across a given set of games 
    
    Parameter team_name: team to return run difference of
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: Games to filter 
    Precondition: games is a DataFrame returned by getGames() function
    """
    assert type(team_name) == str, "team_name invalid --> must be str"
    scored = get_runs_allowed_from_df(team_name, games)
    allowed = get_runs_scored_from_df(team_name, games)
    result = scored - allowed
    return result

In [152]:
def add_run_difference_column(team_name, games):
    """
    Returns copy with a given dataframe with new column containing the run difference for each game for a given team.

    Parameter team_name: team to return run difference of
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"Precondtion: 
    Parameter games: Games to filter 
    Precondition: games is a DataFrame returned by getGames() function
    """
    assert type(team_name) == str, "team_name invalid --> must be str"
    df_copy = games.copy()
    dif_in_wins = df_copy["team_1_score"] - df_copy["team_2_score"]
    dif_in_losses = df_copy["team_2_score"] - df_copy["team_1_score" ]
    df_copy["run_difference"] = np.where(((df_copy["team_1"] == team_name) & (df_copy["team_1_score"] > df_copy["team_2_score"])), dif_in_wins, dif_in_losses)
    df_copy["opponent"] = np.where(((df_copy["team_1"] == team_name) & (df_copy["team_1_score"] > df_copy["team_2_score"])), df_copy["team_2"], df_copy["team_1"])
    df_copy["cumulative_rd"] = df_copy["run_difference"].cumsum()
    return df_copy

[PythagenPat Expectation](http://tangotiger.net/wiki_archive/PythagenPat.html)    

[Explaination from one of the developers of PythagenPat](http://walksaber.blogspot.com/2009/01/runs-per-win-from-pythagenpat.html)

In [153]:
def pythag_win_percentage_from_df(team_name, games):
    """
    Returns: The PythagenPat winning percentage expectation of a given team over given games. 

    W% = R^x/(R^x + RA^x)
    where x = (RPG)^.287
    Developed by David Smyth and Patriot
   
    Parameter team_name: team to return expected winning % for 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: games over which to calculate
    Precondition: games is a DataFrame returned by getGames() function
    """
    assert type(team_name) == str, "team_name invalid --> must be str"
    assert type(start) == int, "start years invalid--> must be int"
    assert type(end) == int, "end year invalid --> must be int"
    team_name  = team_name.capitalize()
    run_diff =  get_run_difference_from_df(team_name, games)
    runs_scored = get_runs_scored_from_df(team_name, games)
    runs_allowed = get_runs_allowed_from_df(team_name, games)    
    num_games = len(games.index)
    if not num_games == 0:
        runs_per_game = runs_scored / num_games
    else:
        runs_per_game = 0
    x = runs_per_game ** 0.287
    result  = (runs_scored ** x) / ((runs_scored ** x) + (runs_allowed ** x))
    return result

In [154]:
def actual_win_percentage_from_df(team_name, games):
    """
    Returns: The actual (i.e. experimental) winning percentage of a given team over given games. 

    winning_percentage = games_won / games_plated
   
    Parameter team_name: team to return actual winning % for 
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: games over which to calculate
    Precondition: games is a DataFrame returned by getGames() function
    """
    assert type(team_name) == str, "team_name invalid --> must be str"
    wins = get_wins_from_df(team_name, games)
    losses = get_losses_from_df(team_name,games)
    num_wins = len(wins.index)
    num_losses = len(losses.index)
    win_percentage = num_wins / (num_wins+num_losses)
    return win_percentage

In [155]:
# Pythagenpat intra-conference win %'s by team for the Ivy League
ivy_pythagenpat_percentages = {"team_name":[], "pythagenpat_percentage":[]};
start = 2015
end = 2020
for i in ["Brown", "Columbia", "Cornell", "Dartmouth", "Harvard", "Pennsylvania", "Princeton", "Yale"]:
    games = get_games(i,start,end)
    conference_games = get_intra_ivy_from_df(i,games)
    pythagenpat_percentage = pythag_win_percentage_from_df(i,conference_games)  
    ivy_pythagenpat_percentages["team_name"].append(i)
    ivy_pythagenpat_percentages["pythagenpat_percentage"].append(pythagenpat_percentage)
    results = pd.DataFrame(ivy_pythagenpat_percentages)

In [156]:
# Actual - Pythagenpat expected intra-conference win %'s by team for the Ivy League
results = {"team_name":[], "difference":[]};
start = 2015
end = 2020
for i in ["Brown", "Columbia", "Cornell", "Dartmouth", "Harvard", "Pennsylvania", "Princeton", "Yale"]:
    games = get_games(i,start,end)
    conference_games = get_intra_ivy_from_df(i,games)
    pythagenpat_percentage = pythag_win_percentage_from_df(i, conference_games)
    actual_percentage =  actual_win_percentage_from_df(i, conference_games)
    result = pythagenpat_percentage - actual_percentage
    results["team_name"].append(i)
    results["difference"].append(result)
    df = pd.DataFrame(results)
    
df

Unnamed: 0,team_name,difference
0,Brown,0.04315
1,Columbia,-0.018985
2,Cornell,0.023309
3,Dartmouth,-0.022654
4,Harvard,0.041218
5,Pennsylvania,0.054665
6,Princeton,-0.021789
7,Yale,-0.064189


## TESTING

In [157]:
team_name = "Cornell" # string of school name, first letter capitalized ex. "Brown"
start = 2015  # string of school name, inclusive,  ex. "2012"
end = 2020 # string of ending year, inclusive, ex. "2020"

get_games(team_name, start, end)
get_intra_ivy(team_name, start, end)
get_wins(team_name, start, end)
get_losses(team_name, start, end)
get_runs_scored(team_name, start, end)
get_runs_allowed(team_name, start, end)
get_run_difference(team_name, start, end)

games = get_games(team_name, start, end)
get_wins_from_df(team_name, games)
get_losses_from_df(team_name, games)
get_intra_ivy_from_df(team_name, games)
get_runs_allowed_from_df(team_name, games)
get_runs_scored_from_df(team_name, games)
get_run_difference_from_df(team_name, games)
pythag_win_percentage_from_df(team_name, games)
actual_win_percentage_from_df(team_name, games)
add_run_difference_column(team_name, games)


Unnamed: 0,date,team_1,team_1_score,team_2,team_2_score,field,run_difference,opponent,cumulative_rd
0,2/21/2015,Gardner-Webb,2,Cornell,1,@Gardner-Webb,-1,Gardner-Webb,-1
1,2/21/2015,Gardner-Webb,3,Cornell,2,@Gardner-Webb,-1,Gardner-Webb,-2
2,2/22/2015,Gardner-Webb,2,Cornell,1,@Gardner-Webb,-1,Gardner-Webb,-3
3,2/27/2015,Seton Hall,3,Cornell,0,@neutral,-3,Seton Hall,-6
4,2/28/2015,Cornell,4,Hartford,3,@neutral,1,Hartford,-5
...,...,...,...,...,...,...,...,...,...
194,2/29/2020,Niagara,8,Cornell,6,@neutral,-2,Niagara,-236
195,3/01/2020,Cornell,6,Niagara,5,@neutral,1,Niagara,-235
196,3/06/2020,South Carolina,10,Cornell,2,@South Carolina,-8,South Carolina,-243
197,3/07/2020,South Carolina,10,Cornell,3,@South Carolina,-7,South Carolina,-250
