## Most Similar Games

Determine n most similar matchups based off of the ESPN efficiency rating for 1. Cal's offense and 2. the opponent's defense. I know we talked about determining the most similar matchups based off of 4 parameters (the offense and defense of both teams), but becuase we didn't decide how to weight the 4 parameters I use only 1. and 2.

In [2]:
import pandas as pd
import numpy as np
import urllib.request
from bs4 import BeautifulSoup
from collections import Counter
import csv
import re
from get_data import get_data

In [3]:
"""Extracts ESPN Offensive and Defensive Efficieny Ratings from 2005-2017

Returns:
    3 Pandas Data Frames: 
        1. awayTeam, homeTeam, year
        2. Def. Rating, Off. Rating, Team, year
        3. pxp data

"""

def get_ratings():
    
    def extract_team_name(x):
            match = re.search('\>([a-zA-Z\s\&\;\(\)\.\']*)\<\/a\>', x)
            if match:
                found = match.group(1)
            return found

    def extract_rating(x):
            match = re.search('\>(.*)\<', x)
            if match:
                found = match.group(1)
            return found
        
    pxp_data = get_data('all')

    game_data = pxp_data.groupby(pxp_data.gameId).first().loc[:, ["awayTeam", "homeTeam", "year"]]
    game_data = game_data.loc[game_data.year >= 2005]
    
    ### Remove teams that were not DI for all years between 2005 and 2017
    teams_to_remove = ["Appalachian St", "Texas State", "W Kentucky", "UMass", "Ga Southern", "Georgia State", "South Alabama"]

    for x in teams_to_remove:
        team_away = game_data.awayTeam == x
        team_home = game_data.homeTeam == x
        team_home = ~team_home.values
        team_away = ~team_away.values
        game_data = game_data[team_home*team_away]
                
    teams = []
    off_eff = []
    def_eff = []
    year = []

    ### Start of Web Scraper
    for x in np.arange(2005, 2018):
        url = "http://www.espn.com/college-football/statistics/teamratings/_/year/" + str(x) + "/sort/offEfficiency/tab/efficiency"
        page = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(page)

        raw_team_name = []
        raw_off_rating = []
        raw_def_rating = []

        for tr in soup.find_all('tr')[1:]:
            tds = tr.find_all('td')
            team = tds[1]
            off_rating = tds[2]
            def_rating = tds[3]

            raw_team_name.append(team)
            raw_off_rating.append(off_rating)
            raw_def_rating.append(def_rating)

        raw_team_name = [str(x) for x in raw_team_name]
        raw_team_name = raw_team_name[1:]
        raw_team_name = np.array(raw_team_name)

        column_headers = ~np.array(["TEAM" in x for x in raw_team_name])
        raw_team_name = raw_team_name[column_headers]
        team_names = [extract_team_name(x) for x in raw_team_name]

        raw_off_rating = raw_off_rating[1:]
        raw_off_rating = [str(x) for x in raw_off_rating]
        raw_off_rating = np.array(raw_off_rating)
        raw_off_rating = raw_off_rating[column_headers]

        off_ratings = np.array([extract_rating(x) for x in raw_off_rating])
        off_ratings = [float(x) for x in off_ratings]

        raw_def_rating = raw_def_rating[1:]
        raw_def_rating = [str(x) for x in raw_def_rating]
        raw_def_rating = np.array(raw_def_rating)
        raw_def_rating = raw_def_rating[column_headers]

        def_ratings = np.array([extract_rating(x) for x in raw_def_rating])
        def_ratings = [float(x) for x in def_ratings]

        teams.append(team_names)
        off_eff.append(off_ratings)
        def_eff.append(def_ratings)
        year.append([x]*len(team_names))

    teams = sum(teams, [])
    off_eff = sum(off_eff, [])
    def_eff = sum(def_eff, [])
    year = sum(year, [])
    
    ### Create Rating DataFrame
    rating_data = pd.DataFrame({"Team" : teams, "Off. Rating" : off_eff, "Year" : year, "Def. Rating" : def_eff})

    ### Rename Teams in the rating data so that it matches the game data
    rating_data.Team = pd.Series([x.replace("Texas A&amp;M'", "Texas A&M") for x in rating_data.Team.values])
    rating_data.Team = pd.Series([x.replace("Texas A&amp;M", "Texas A&M") for x in rating_data.Team.values])
    rating_data.Team = pd.Series([x.replace("Mich. St.", "Michigan State") for x in rating_data.Team.values])
    rating_data.Team = pd.Series([x.replace("Oregon St", "Oregon State") for x in rating_data.Team.values])
    rating_data.Team = pd.Series([x.replace("Oregon Stateate", "Oregon State") for x in rating_data.Team.values])

    ### Remove games where teams are playing an opponent for which no rating data exists
    keep_games = []
    
    for index, x in game_data.iterrows():
        keep_game = x.awayTeam in rating_data.Team.values and x.homeTeam in rating_data.Team.values
        keep_games.append(keep_game)

    keep_games = np.array(keep_games)
    game_data = game_data.loc[keep_games]
    
    return game_data, rating_data, pxp_data

In [4]:
"""Uses ESPN Efficiency data from above to find the n most similar games

Parameters: 
    off_rating - Offensive Rating of team of interest
    def_rating - Deffensive Rating of team of interest
    n - number of most similar games to be returned
    game_data - First Dataframe returned by get_ratings()
    rating_data - Second Dataframe returned by get_ratings()
    
Returns: 
    2 Objects:
        1. Array of lists where each list has the following elements:
            1. String - name of AWAY team
            2. String - name of HOME team
            3. Int - year of matchup
            4. Boolean - True if home team is on offense and away team is on defense (for False the sides are swapped)
            5. Float - Root Mean Squared Error of matchup to matchup of interest
        2. Pxp data of games in Array
        
"""

def most_similar(off_rating, def_rating, n, game_data, rating_data, pxp_data):
    RMSEs = []
    home_off_away_def = []

    for index, x in game_data.iterrows():
        x_year_ratings = rating_data.loc[rating_data.Year == x.year]
        
        away_def = float(x_year_ratings.loc[x_year_ratings.Team == x.awayTeam].loc[:, "Def. Rating"].values)
        away_off = float(x_year_ratings.loc[x_year_ratings.Team == x.awayTeam].loc[:, "Off. Rating"].values)
        home_def = float(x_year_ratings.loc[x_year_ratings.Team == x.homeTeam].loc[:, "Def. Rating"].values)
        home_off = float(x_year_ratings.loc[x_year_ratings.Team == x.homeTeam].loc[:, "Off. Rating"].values)
    
        ### Use Root Mean Squared Error for Test Statistic
        rmse_away_home = np.sqrt(((def_rating - away_def)**2 + (off_rating - home_off)**2)/2)
        rmse_home_away = np.sqrt(((def_rating - home_def)**2 + (off_rating - away_off)**2)/2)
        
        if rmse_away_home < rmse_home_away:
            home_off_away_def.append(True)
        else:
            home_off_away_def.append(False)
    
        game_rmse = np.array([rmse_away_home, rmse_home_away])
        RMSEs.append(game_rmse[np.argmin(game_rmse)])
    
    game_data.loc[:,"RMSEs"] = RMSEs
    game_data.loc[:, "home_off_away_def"] = home_off_away_def
    
    game_list = game_data.sort_values("RMSEs").reset_index().loc[0:n, ["awayTeam", "homeTeam", "year", "home_off_away_def", "RMSEs"]].values
    
    return_data = pd.DataFrame(columns = pxp_data.columns)

    for x in game_list:
        bool_away_team = pxp_data.awayTeam == x[0]
        bool_home_team = pxp_data.homeTeam == x[1]
        bool_year = pxp_data.year == x[2]
        return_data = return_data.append(pxp_data.loc[bool_away_team & bool_home_team & bool_year, :])
        
    bool_rep_arr = []
    for index, row in game_data.iterrows():
        num_plays_one_game = return_data.loc[return_data.gameId == index]
        bool_rep_arr.append(np.repeat(row["home_off_away_def"], (len(num_plays_one_game))))
       
    bool_rep_arr = np.concatenate(bool_rep_arr)
    return_data["home_off_away_def"] = bool_rep_arr
    return game_list, return_data

Here is an example of how the previous functions are used:

In [5]:
game_data, rating_data, pxp_data = get_ratings()
game_list, return_data = most_similar(off_rating = 80, def_rating = 90, n = 10, game_data = game_data, rating_data = rating_data, pxp_data=pxp_data)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [6]:
game_data

Unnamed: 0_level_0,awayTeam,homeTeam,year,RMSEs,home_off_away_def
gameId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
252440009,Temple,Arizona State,2005,52.189271,True
252440041,Buffalo,UConn,2005,51.061629,False
252440154,Vanderbilt,Wake Forest,2005,30.636579,False
252440202,Minnesota,Tulsa,2005,22.768948,False
252440248,Oregon,Houston,2005,21.778544,True
252440265,Idaho,Washington St,2005,42.851488,True
252442132,E Michigan,Cincinnati,2005,47.620164,False
252442579,UCF,South Carolina,2005,27.876244,False
252450254,Arizona,Utah,2005,22.742141,True
252452117,Indiana,Cent Michigan,2005,43.637140,False


In [7]:
return_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 2312 to 1970
Data columns (total 31 columns):
awayAbbr             1996 non-null object
awayId               1996 non-null object
awayScore            1996 non-null object
awayTeam             1996 non-null object
clock                1996 non-null object
defenseAbbr          1996 non-null object
defenseId            1996 non-null object
defenseTeam          1996 non-null object
description          1996 non-null object
distance             1996 non-null object
down                 1996 non-null object
driveIndex           1996 non-null object
endYardLine          1996 non-null object
gameId               1996 non-null object
homeAbbr             1996 non-null object
homeId               1996 non-null object
homeScore            1996 non-null object
homeTeam             1996 non-null object
isScoringPlay        398 non-null object
offenseAbbr          1996 non-null object
offenseId            1996 non-null object
offenseTe