### Find the n most similar defenses based on S&P+ ratings from 2014-2017

In [11]:
import pandas as pd
import numpy as np
import urllib.request
from bs4 import BeautifulSoup
from collections import Counter
import csv
import re

In [20]:
### does not utilize 2018 data yet
### only implemented for defenses for now

In [18]:
def get_defensive_s_and_p(first_year=2014, last_year=2017):
    """
    Extracts Defensive S&P+ ratings from Football Outsiders.

    One page is downloaded per season, so this performs network requests.

    Parameters:
        first_year - first season to download (default 2014)
        last_year - last season to download, inclusive (default 2017)

    Returns:
        Pandas Data Frame with the columns 'Team', 'Def. S&P+' (float) and
        'Year', sorted by ascending 'Def. S&P+' with duplicate rows removed.
        Rows whose team or rating could not be parsed are left out.

    """
    # Raw strings: same patterns as before without invalid-escape warnings.
    cell_text = re.compile(r'>(.*)<')
    decimal_number = re.compile(r'[0-9]*\.[0-9]*')

    def extract_team_name(x):
        # Text between the first '>' and the last '<' of the cell's HTML.
        match = cell_text.search(x)
        return match.group(1) if match else None

    def extract_def_sp(x):
        # First decimal-looking token in the cell's HTML. A cell such as
        # "Def. S&P+" yields just ".", which is discarded further down.
        match = decimal_number.search(x)
        return match.group(0) if match else None

    teams = []
    defsp = []
    year = []

    for season in range(first_year, last_year + 1):
        url = "https://www.footballoutsiders.com/stats/ncaadef" + str(season)
        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(url) as response:
            page = response.read()
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(page, "html.parser")

        # The first <tr> is skipped, as before.
        for tr in soup.find_all('tr')[1:]:
            tds = tr.find_all('td')
            if len(tds) < 2:
                # Not a data row (no team/rating cells) - nothing to extract.
                continue
            teams.append(extract_team_name(str(tds[0])))
            defsp.append(extract_def_sp(str(tds[1])))
            year.append(season)

    s_and_p_data = pd.DataFrame({'Team': teams, 'Def. S&P+': defsp, 'Year': year})

    # Convert by column name rather than position; anything that is not a
    # number (a lone "." or a missing match) becomes NaN and is dropped.
    s_and_p_data["Def. S&P+"] = pd.to_numeric(s_and_p_data["Def. S&P+"],
                                              errors="coerce")
    s_and_p_data = s_and_p_data.dropna(subset=["Team", "Def. S&P+"])
    s_and_p_data = s_and_p_data.sort_values("Def. S&P+").drop_duplicates()
    print("Done!")
    return s_and_p_data

In [19]:
# Download and parse the ratings once (this hits the network); later cells reuse `data`.
data = get_defensive_s_and_p()

Done!


In [16]:
def n_closest_defense(team_rating, n, data):
    """
    Uses S&P+ data to find the n most similar defenses

    Parameters:
        team_rating - numeric S&P+ rating of team of interest
        n - number of closest teams returned
        data - output of get_defensive_s_and_p(); must have the columns
               'Team', 'Year' and a numeric 'Def. S&P+'

    Returns:
        List of tuples, closest first, where each tuple has the following
        structure:
            (Team name, year, absolute percent difference from team of interest)
        Teams that are equally close keep the order they have in data.
        A team_rating of 0 makes the percent difference undefined (numpy
        yields inf/nan rather than raising).
    """
    ratings = data["Def. S&P+"].values
    distance = np.abs(ratings - team_rating)

    # mergesort is stable, so ties are returned in a reproducible order.
    nearest = np.argsort(distance, kind="mergesort")[:n]
    closest_data = data.iloc[nearest]

    # Divide by the magnitude of the rating so the result is a true absolute
    # percentage even when team_rating is negative.
    percent_diff = (distance[nearest] / abs(team_rating)) * 100

    return list(zip(closest_data.Team, closest_data.Year, percent_diff))

In [17]:
# Example: the 10 defenses whose Def. S&P+ is closest to a rating of 13.
n_closest_defense(13, 10, data)

[('Florida', 2016, 2.3076923076923128),
 ('LSU', 2016, 3.0769230769230793),
 ('Boston College', 2015, 4.6153846153846132),
 ('Michigan', 2015, 4.6153846153846132),
 ('Ohio State', 2016, 4.6153846153846132),
 ('Ole Miss', 2014, 6.1538461538461586),
 ('Clemson', 2014, 6.1538461538461586),
 ('Clemson', 2016, 6.1538461538461586),
 ('Alabama', 2017, 6.1538461538461586),
 ('Clemson', 2017, 6.923076923076926)]