In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import requests
import tempfile
from io import StringIO
import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_games(team_name, start,end):
    """
    Returns: a dataframe of all games played for a given team inclusive of given start & end year

    The scores are requested from the boydsworld.com scores CGI as fixed-width text and
    parsed into the columns date, team_1, team_1_score, team_2, team_2_score and field.
    When the response body is shorter than 10 characters (treated as "no games") an
    empty DataFrame with those same six columns is returned, so callers can filter it
    like any other result.

    Parameter team_name: team whose games to select
    Precondition: team_name is a string (spelling as expected by the data source -- TODO confirm casing rules)
    Parameter start: the start year of games. To select only games from one year, pass the same year as end
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int

    Raises: AssertionError if a precondition on the argument types is violated
    """
    assert type(team_name) == str, "team_name invalid, must be string"
    assert type(start) == int, "start years invalid, must be int"
    assert type(end) == int, "end year invalid, must be int"
    col_names = ["date", "team_1", "team_1_score", "team_2", "team_2_score", "field"]
    payload = {"team1":team_name, "firstyear":str(start), "team2":"all","lastyear":str(end), "format":"Text", "submit":"Fetch"}
    url = "http://www.boydsworld.com/cgi/scores.pl?"
    # A timeout keeps the notebook from hanging forever if the server never answers.
    r = requests.get(url, params=payload, timeout=30)
    response = r.text
    if len(response) < 10:
        # Return an *instance* (not the DataFrame class) that already has the expected schema.
        return pd.DataFrame(columns=col_names)
    df = pd.read_fwf(StringIO(response), encoding = 'utf8', header=None)
    if len(df.columns) != len(col_names):
        # The width inference sometimes yields an extra column at position 2;
        # drop it so the remaining columns line up with col_names.
        df = df.drop(columns=[2])
    # Raises ValueError if the column count still does not match the expected schema.
    df.columns = col_names
    return df

In [3]:
def get_wins(team_name, start, end):
    """
    Returns a dataframe of victories of a given team

    The games are fetched with get_games and a row counts as a victory when the team is
    listed as team_1 and team_1_score is greater than team_2_score.

    Parameter team_name: team to return victories of
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter start: the start year of games. To select only games from one year, pass the same year as end
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int

    Raises: AssertionError if a precondition on the argument types is violated
    """
    assert type(team_name) == str, "team_name invalid, must be string"
    assert type(start) == int, "start years invalid, must be int"
    assert type(end) == int, "end year invalid, must be int"
    games = get_games(team_name,start,end)
    team_listed_first = games["team_1"] == team_name
    first_outscored_second = games["team_1_score"] > games["team_2_score"]
    return games[team_listed_first & first_outscored_second]

In [4]:
def get_wins_from_df(team_name, games):
    """
    Returns a dataframe holding only the rows of games that are victories of team_name

    A row is a victory when team_name appears in the team_1 column and team_1_score is
    strictly greater than team_2_score.

    Parameter team_name: team to return victories of
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: Games to filter
    Precondition: games is a DataFrame with the columns produced by get_games
    """
    assert type(team_name) == str, "team_name invalid, must be string"
    team_listed_first = games["team_1"] == team_name
    first_outscored_second = games["team_1_score"] > games["team_2_score"]
    return games[team_listed_first & first_outscored_second]

In [5]:
def get_losses(team_name, start, end):
    """
    Returns a dataframe of losses of a given team

    The games are fetched with get_games; a row counts as a loss when the team is listed
    as team_2 and team_1_score is strictly greater than team_2_score.

    Parameter team_name: team to return losses of
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter start: the start year of games
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int
    """
    assert type(team_name) == str, "team_name invalid, must be string"
    assert type(start) == int, "start years invalid, must be int"
    assert type(end) == int, "end year invalid, must be int"
    fetched = get_games(team_name, start, end)
    team_listed_second = fetched["team_2"] == team_name
    first_outscored_second = fetched["team_1_score"] > fetched["team_2_score"]
    return fetched[team_listed_second & first_outscored_second]

In [6]:
def get_losses_from_df(team_name, games):
    """
    Returns a dataframe holding only the rows of games that are losses of team_name

    A row is a loss when team_name appears in the team_2 column and team_1_score is
    strictly greater than team_2_score.

    Parameter team_name: team to return losses of
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: Games to filter
    Precondition: games is a DataFrame with the columns produced by get_games
    """
    assert type(team_name) == str, "team_name invalid, must be string"
    team_listed_second = games["team_2"] == team_name
    first_outscored_second = games["team_1_score"] > games["team_2_score"]
    return games[team_listed_second & first_outscored_second]

In [7]:
def get_intra_ivy(team_name, start, end):
    """
    Returns: data frame of in-conference games of given team. Currently only functional for Ivy League Teams.

    The team is validated first (so an unsupported team fails before any data is fetched),
    then the games are fetched with get_games and filtered by get_intra_ivy_from_df.

    Plan to make this work for any team. Conference changes get tricky, will need helper to get conference team list for each season,
    check each game against per-season list.

    Parameter team_name: team whose games to select
    Precondition: team_name is one of "Brown", "Cornell", "Columbia", "Dartmouth", "Harvard",
    "Pennsylvania", "Princeton", "Yale"
    Parameter start: the start year of games
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int

    Raises: AssertionError if an argument has the wrong type; ValueError if team_name is
    not an Ivy League team
    """
    assert type(team_name) == str, "team_name invalid, must be string"
    assert type(start) == int, "start years invalid, must be int"
    assert type(end) == int, "end year invalid, must be int"
    ivy_league = ["Brown","Cornell","Columbia","Dartmouth","Harvard","Pennsylvania","Princeton","Yale"]
    if team_name not in ivy_league:
        # Same exception type list.remove() used to raise, but with an actionable message.
        raise ValueError(
            "team_name " + repr(team_name) + " is not an Ivy League team; expected one of "
            + ", ".join(ivy_league)
        )
    games = get_games(team_name, start, end)
    return get_intra_ivy_from_df(team_name, games)

In [8]:
def get_intra_ivy_from_df(team_name, games):
    """
    Returns: data frame of in-conference games of given team from get_games dataframe

    Wins and losses are selected with get_wins_from_df / get_losses_from_df, each gets a
    column "intra_ivy" equal to 1 if the opponent is an Ivy League team and 0 if not, and
    only the rows where intra_ivy is 1 are returned. The result lists the wins first,
    followed by the losses, and keeps the intra_ivy column. games itself is not modified.

    Plan to make this work for any team. Conference changes get tricky, will need helper to get conference team list for each season,
    check each game against per-season list.

    Parameter team_name: team whose games to select
    Precondition: team_name is one of "Brown", "Cornell", "Columbia", "Dartmouth", "Harvard",
    "Pennsylvania", "Princeton", "Yale"
    Parameter games: Games to filter
    Precondition: games is a DataFrame with the columns produced by get_games

    Raises: AssertionError if team_name is not a string; ValueError if team_name is not an
    Ivy League team
    """
    assert type(team_name) == str, "team_name invalid, must be string"
    ivy_league = ["Brown","Cornell","Columbia","Dartmouth","Harvard","Pennsylvania","Princeton","Yale"]
    if team_name not in ivy_league:
        # Same exception type list.remove() used to raise, but with an actionable message.
        raise ValueError(
            "team_name " + repr(team_name) + " is not an Ivy League team; expected one of "
            + ", ".join(ivy_league)
        )
    opponents = [team for team in ivy_league if team != team_name]
    wins = get_wins_from_df(team_name, games)
    losses = get_losses_from_df(team_name, games)
    # assign() builds new frames instead of writing into filtered slices of games.
    # In a win the opponent is team_2; in a loss the opponent is team_1.
    wins = wins.assign(intra_ivy=wins["team_2"].isin(opponents).astype("int64"))
    losses = losses.assign(intra_ivy=losses["team_1"].isin(opponents).astype("int64"))
    new_df = pd.concat([wins, losses])
    in_conference = new_df[new_df["intra_ivy"] == 1]
    return in_conference

In [9]:
def get_runs_scored(team_name,start, end):
    """
    Returns: total number of runs scored by team_name over the games fetched for the given years

    Runs scored are team_1_score in the team's wins plus team_2_score in its losses.

    Parameter team_name: team to calculate runs scored for
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter start: the start year of games
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int
    """
    assert type(team_name) == str, "team_name invalid, must be string"
    assert type(start) == int, "start years invalid, must be int"
    assert type(end) == int, "end year invalid, must be int"
    fetched = get_games(team_name, start, end)
    # In a win the team is team_1; in a loss it is team_2.
    scored_in_wins = get_wins_from_df(team_name, fetched)["team_1_score"].sum()
    scored_in_losses = get_losses_from_df(team_name, fetched)["team_2_score"].sum()
    return scored_in_wins + scored_in_losses

In [10]:
def get_runs_scored_from_df(team_name, games):
    """
    Returns: total number of runs scored by team_name over given games

    Runs scored are team_1_score in the team's wins plus team_2_score in its losses.

    Parameter team_name: team to calculate runs scored for
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: games to aggregate over
    Precondition: games is a DataFrame with the columns produced by get_games
    """
    assert type(team_name) == str, "team_name invalid, must be string"
    # In a win the team is team_1; in a loss it is team_2.
    scored_in_wins = get_wins_from_df(team_name, games)["team_1_score"].sum()
    scored_in_losses = get_losses_from_df(team_name, games)["team_2_score"].sum()
    return scored_in_wins + scored_in_losses

In [11]:
def get_runs_allowed(team_name, start, end):
    """
    Returns: total number of runs allowed by team_name over the games fetched for the given years

    Runs allowed are team_2_score in the team's wins plus team_1_score in its losses.

    Parameter team_name: team to calculate runs allowed for
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter start: the start year of games
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int
    """
    assert type(team_name) == str, "team_name invalid, must be string"
    assert type(start) == int, "start years invalid, must be int"
    assert type(end) == int, "end year invalid, must be int"
    fetched = get_games(team_name, start, end)
    # In a win the opponent is team_2; in a loss the opponent is team_1.
    allowed_in_wins = get_wins_from_df(team_name, fetched)["team_2_score"].sum()
    allowed_in_losses = get_losses_from_df(team_name, fetched)["team_1_score"].sum()
    return allowed_in_wins + allowed_in_losses

In [12]:
def get_runs_allowed_from_df(team_name, games):
    """
    Returns: total number of runs allowed by team_name over given get_games dataframe

    Runs allowed are team_2_score in the team's wins plus team_1_score in its losses.

    Parameter team_name: team to calculate runs allowed for
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: games to aggregate over
    Precondition: games is a DataFrame with the columns produced by get_games
    """
    assert type(team_name) == str, "team_name invalid, must be string"
    # In a win the opponent is team_2; in a loss the opponent is team_1.
    allowed_in_wins = get_wins_from_df(team_name, games)["team_2_score"].sum()
    allowed_in_losses = get_losses_from_df(team_name, games)["team_1_score"].sum()
    return allowed_in_wins + allowed_in_losses

In [13]:
def get_run_difference(team_name, start, end):
    """
    Returns: The total run difference (runs scored minus runs allowed) of a team over the
    games fetched for the given years. Positive means the team outscored its opponents.

    Parameter team_name: team to return run difference of
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter start: the start year of games
    Precondition: start is an int
    Parameter end: the end year of games
    Precondition: end is an int
    """
    assert type(team_name) == str, "team_name invalid, must be string"
    assert type(start) == int, "start years invalid, must be int"
    assert type(end) == int, "end year invalid, must be int"
    games = get_games(team_name, start, end)
    # Each name is bound to the matching helper; swapping them flips the sign of the result.
    scored = get_runs_scored_from_df(team_name, games)
    allowed = get_runs_allowed_from_df(team_name, games)
    result = scored - allowed
    return result

In [14]:
def get_run_difference_from_df(team_name, games):
    """
    Returns: The total run difference (runs scored minus runs allowed) of a team across a
    given get_games dataframe. Positive means the team outscored its opponents.

    Parameter team_name: team to return run difference of
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: Games to aggregate over
    Precondition: games is a DataFrame with the columns produced by get_games
    """
    assert type(team_name) == str, "team_name invalid, must be string"
    # Each name is bound to the matching helper; swapping them flips the sign of the result.
    scored = get_runs_scored_from_df(team_name, games)
    allowed = get_runs_allowed_from_df(team_name, games)
    result = scored - allowed
    return result

In [15]:
def add_run_difference_column(team_name, games):
    """
    Returns copy of a given dataframe with new columns describing each game from the point
    of view of a given team. games itself is not modified.

    Added columns:
      run_difference - the team's score minus the opponent's score for that game
      opponent       - the other team in that game
      cumulative_rd  - running total of run_difference in row order

    Which side the team is on is decided only by whether it appears in team_1, so tied
    games (and any game where the first-listed team did not win) still get the correct
    opponent and sign. Rows where the team is not team_1 are treated as if it were team_2.

    Parameter team_name: team to return run difference of
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: Games to annotate
    Precondition: games is a DataFrame with the columns produced by get_games
    """
    assert type(team_name) == str, "team_name invalid, must be string"
    df_copy = games.copy()
    team_listed_first = df_copy["team_1"] == team_name
    first_minus_second = df_copy["team_1_score"] - df_copy["team_2_score"]
    df_copy["run_difference"] = np.where(team_listed_first, first_minus_second, -first_minus_second)
    df_copy["opponent"] = np.where(team_listed_first, df_copy["team_2"], df_copy["team_1"])
    df_copy["cumulative_rd"] = df_copy["run_difference"].cumsum()
    return df_copy

In [22]:
def pythag_win_percentage_from_df(team_name, games):
    """
    Returns: The PythagenPat winning percentage expectation of a given team over given games.

    W% = R^x/(R^x + RA^x)
    where x = (RPG)^.287 and RPG = (R + RA) / G is the total number of runs scored by
    both sides per game
    Developed by David Smyth and Patriot

    G is taken as the number of rows in games. When no runs were scored by either side
    (including an empty games frame) the exponent is 0 and the result is 0.5.

    Parameter team_name: team to return expected winning % for
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: games over which to calculate
    Precondition: games is a DataFrame with the columns produced by get_games
    """
    assert type(team_name) == str, "team_name invalid, must be string"
    runs_scored = get_runs_scored_from_df(team_name, games)
    runs_allowed = get_runs_allowed_from_df(team_name, games)
    num_games = len(games.index)
    if num_games == 0:
        runs_per_game = 0
    else:
        # The exponent is driven by the run environment: runs by BOTH teams per game.
        runs_per_game = (runs_scored + runs_allowed) / num_games
    x = runs_per_game ** 0.287
    result = (runs_scored ** x) / ((runs_scored ** x) + (runs_allowed ** x))
    return result

[PythagenPat Expectation](http://tangotiger.net/wiki_archive/PythagenPat.html)    

[Explanation from one of the developers of PythagenPat](http://walksaber.blogspot.com/2009/01/runs-per-win-from-pythagenpat.html)

In [16]:
def actual_win_percentage_from_df(team_name, games):
    """
    Returns: The actual (i.e. experimental) winning percentage of a given team over given games.

    winning_percentage = games_won / (games_won + games_lost)

    Only rows selected by get_wins_from_df and get_losses_from_df are counted. If the team
    has neither wins nor losses in games, 0.0 is returned instead of dividing by zero.

    Parameter team_name: team to return actual winning % for
    Preconditions: team_name is a string format ex. "Cornell," "Colgate"
    Parameter games: games over which to calculate
    Precondition: games is a DataFrame with the columns produced by get_games
    """
    assert type(team_name) == str, "team_name invalid, must be string"
    num_wins = len(get_wins_from_df(team_name, games).index)
    num_losses = len(get_losses_from_df(team_name, games).index)
    decided_games = num_wins + num_losses
    if decided_games == 0:
        # No decided games: report .000 rather than raising ZeroDivisionError.
        return 0.0
    return num_wins / decided_games

## TESTING

In [20]:
def test_module(team_name, start, end):    
    print("running tests for "+team_name+" games from "+str(start)+" to "+str(end))
    get_games(team_name, start, end)
    print("function get_games passed")
    get_intra_ivy(team_name, start, end)
    print("function get_intra_ivy passed")
    get_wins(team_name, start, end)
    print("function get_wins passed")
    get_losses(team_name, start, end)
    print("function get_losses passed")
    get_runs_scored(team_name, start, end)
    print("function get_runs_scored passed")
    get_runs_allowed(team_name, start, end)
    print("function get_runs_allowed passed")
    get_run_difference(team_name, start, end)
    print("function get_run_difference passed")
    games = get_games(team_name, start, end)
    get_wins_from_df(team_name, games)
    print("function get_wins_from_df passed")
    get_losses_from_df(team_name, games)
    print("function get_losses_from_df passed")
    get_intra_ivy_from_df(team_name, games)
    print("function get_intra_ivy_from_df passed")
    get_runs_allowed_from_df(team_name, games)
    print("function get_runs_allowed_from_df passed")
    get_runs_scored_from_df(team_name, games)
    print("function get_runs_scored_from_df passed")
    get_run_difference_from_df(team_name, games)
    print("function get_run_difference_from_df passed")
    pythag_win_percentage_from_df(team_name, games)
    print("function pythag_win_percentage_from_df passed")
    actual_win_percentage_from_df(team_name, games)
    print("function actual_win_percentage_from_df passed")
    add_run_difference_column(team_name, games)
    print("function add_run_difference_column passed")
    print("all tests passed :)")

In [1]:
#test_module("Yale", 2015, 2020)