In [1]:
import pandas as pd
import numpy as np
from pdb import set_trace

In [2]:
# Source: http://andr3w321.com/elo-ratings-part-2-margin-of-victory-adjustments/
# p1: the original rating of the first team
# p2: the original rating of the second team
# mov: Margin Of Victory
# week: The week of the season. This was initially included because I considered weighting
#          the importance of each week differently (it is currently unused)
def rate_1vs1(p1, p2, mov=1, week=8):
    """Update two Elo ratings after one head-to-head result.

    Parameters
    ----------
    p1, p2 : float
        Ratings of the first and second side before the game.
    mov : float
        Margin of victory from the first side's point of view. Positive
        means the first side won, negative means the second side won, and
        exactly 0 is treated as a draw.
    week : int
        Week of the season. Accepted for interface compatibility; it does
        not influence the result.

    Returns
    -------
    (new_p1, new_p2) : tuple of float
        The updated ratings.

    Notes
    -----
    The update size is scaled by ``log(|mov| + 1)``, so a margin of exactly
    0 gives a multiplier of 0 and leaves both ratings unchanged.
    """
    k = 20
    k_multiplier = np.log(abs(mov) + 1)

    # Autocorrelation correction: the rating gap must be measured as
    # winner minus loser. Using p1 - p2 unconditionally inverted the
    # correction whenever the second side won (mov < 0), which made the
    # result depend on the order the two sides were passed in.
    if mov < 0:
        winner_minus_loser = p2 - p1
    else:
        winner_minus_loser = p1 - p2
    corr_m = 2.2 / (winner_minus_loser * .001 + 2.2)

    # Standard logistic Elo expectation for each side.
    rp1 = 10 ** (p1 / 400)
    rp2 = 10 ** (p2 / 400)
    exp_p1 = rp1 / (rp1 + rp2)
    exp_p2 = rp2 / (rp1 + rp2)

    # Actual score: 1 for a win, 0 for a loss, 0.5 each for a draw.
    if mov == 0:
        s1, s2 = 0.5, 0.5
    elif mov > 0:
        s1, s2 = 1, 0
    else:
        s1, s2 = 0, 1

    step = k_multiplier * corr_m * k
    new_p1 = p1 + step * (s1 - exp_p1)
    new_p2 = p2 + step * (s2 - exp_p2)
    return new_p1, new_p2
                                        
                                        


In [3]:
# read in data
df = pd.read_csv("allSeasonScores.csv")

# create the dictionary that will hold the final elo values
# (shape built by the rating loop: eloDict[team][season][week] = {'off': ..., 'def': ...})
eloDict = dict()

# For every season, find the average difference between offensive points scored
# and the opposing defense's points, separately for home and away offenses.
# The seasons are taken from the data itself rather than a hard-coded year range,
# so a season missing from the file no longer yields a NaN average and a season
# outside the old range no longer causes a KeyError when it is looked up later.
avgPointsDict = dict()
for y in sorted(df.season.dropna().unique().tolist()):
    currSeason = df[df.season == y]
    HOPoints = np.array(currSeason.homeOffPoints)
    AOPoints = np.array(currSeason.awayOffPoints)
    HDPoints = np.array(currSeason.homeDefPoints)
    ADPoints = np.array(currSeason.awayDefPoints)
    homeMOV = np.mean(HOPoints - ADPoints)
    awayMOV = np.mean(AOPoints - HDPoints)
    avgPointsDict[y] = {'home' : homeMOV, 'away' : awayMOV}

In [4]:

# Returns the most recent ELO rating for the specified team and unit
# team: the team abbreviation (ARI, ATL, BAL, etc)
# unit: either 'off' or 'def' for offense or defense
def getMostRecentElo(team, unit):
    """Return the latest stored Elo rating for one unit of a team.

    Looks in ``eloDict[team]`` for the highest season key, then the highest
    week key within that season, and returns that entry's ``unit`` value.

    Parameters
    ----------
    team : str
        Team abbreviation (ARI, ATL, BAL, etc).
    unit : str
        Either 'off' or 'def' for offense or defense.

    Returns
    -------
    The stored rating, or 1500 when nothing is stored yet for this team
    and unit.
    """
    try:
        seasons = eloDict[team]
        latest_season = seasons[max(seasons)]
        return latest_season[max(latest_season)][unit]
    except (KeyError, ValueError):
        # KeyError: unknown team or unit. ValueError: max() of an empty dict,
        # i.e. the team entry exists but holds no seasons/weeks yet.
        # Anything else is a genuine error and is allowed to propagate.
        return 1500


In [5]:
# Just wanted to store all the ratings to make a KDE plot to show the distribution. Was useful when looking for errors
# since you would expect a pretty normal distribution
offenseRatingsList = list()
defenseRatingsList = list()

# Start from an empty ratings table so that re-running this cell recomputes the
# ratings from scratch instead of compounding on the previous run's results.
# Cleared in place so every existing reference to the dictionary stays valid.
eloDict.clear()


def _ensureSeasonEntry(team, season):
    """Make sure eloDict[team][season] exists, seeding it with a week-0 entry.

    The week-0 ratings are the team's most recent ratings pulled two thirds of
    the way back toward 1500 (one third of the distance from 1500 is kept).
    """
    teamSeasons = eloDict.setdefault(team, dict())
    if season not in teamSeasons:
        teamSeasons[season] = {0 : {'off' : ((getMostRecentElo(team, 'off') - 1500) * 1/3) + 1500,
                                    'def': ((getMostRecentElo(team, 'def') - 1500) * 1/3) + 1500}}


# for each row (a game) in the dataframe...
for index, row in df.iterrows():

    # initialize some variables
    homeTeam = row.homeTeam
    awayTeam = row.awayTeam
    y = row.season
    w = row.week
    HomeOffDiff = row.homeOffPoints - row.awayDefPoints
    AwayOffDiff = row.awayOffPoints - row.homeDefPoints

    # helps set up the dictionary object to avoid keyerrors
    _ensureSeasonEntry(homeTeam, y)
    _ensureSeasonEntry(awayTeam, y)

    # average margin of victory for home and away teams. Note that the split is there because of homefield advantage
    avgHomeMOV = avgPointsDict[y]['home']
    avgAwayMOV = avgPointsDict[y]['away']

    # HomeOffenseELO, AwayDefenseELO, AwayOffenseELO, HomeDefenseELO
    # All four are read before anything is written for this game.
    HOELO = getMostRecentElo(homeTeam, 'off')
    ADELO = getMostRecentElo(awayTeam, 'def')
    AOELO = getMostRecentElo(awayTeam, 'off')
    HDELO = getMostRecentElo(homeTeam, 'def')

    # the new elo values for homeO and awayD
    a, b = rate_1vs1(HOELO, ADELO, HomeOffDiff - avgHomeMOV, w)

    # the new elo values for awayO and homeD
    c, d = rate_1vs1(AOELO, HDELO, AwayOffDiff - avgAwayMOV, w)

    # sets up the dictionary object to avoid keyerrors
    eloDict[awayTeam][y][w] = dict()
    eloDict[homeTeam][y][w] = dict()

    # adds the newly calculated ELOs
    eloDict[homeTeam][y][w]['off'] = a
    eloDict[awayTeam][y][w]['def'] = b
    eloDict[awayTeam][y][w]['off'] = c
    eloDict[homeTeam][y][w]['def'] = d

    # just adds the new ratings to the lists declared above. Used below for the KDE plot
    offenseRatingsList.append(a)
    offenseRatingsList.append(c)
    defenseRatingsList.append(b)
    defenseRatingsList.append(d)
    

In [6]:
# Persist the ratings dictionary to disk as pretty-printed JSON.
import json

# NOTE: JSON object keys are always strings, so the season and week keys
# are written out as strings in the file.
with open('eloVals.json', 'w') as elo_file:
    json.dump(eloDict, elo_file, indent=4, sort_keys=True)

In [7]:

# Example of loading the saved ratings back into Python.
# NOTE: JSON object keys come back as strings, so lookups into testLoad
# need string season/week keys.
with open('eloVals.json', 'r') as elo_file:
    testLoad = json.load(elo_file)

In [8]:
# team is the team abbreviation (ARI, ATL, BAL, etc)
# year is the season. Note that if we are talking about the 2018-19 season, year would be 2018
# week is which week we are looking for the ELO rating in.
#     *** NOTE *** this function assumes that you are attempting to find the ELO at the start of a week, meaning
#                  that if you passed 'week = 1' it would be the teams rating BEFORE the game was played. 
#                  this explains why it is '...[week - 1]...' in the 'try' part of the below function
# unit is either 'off' (offense) or 'def' (defense)
# data is the dictionary object
def getInitialEloRating(team, year, week, unit, data = None):
    """Return a unit's Elo rating at the START of the given week.

    The rating going into week N is the one stored after week N - 1, so the
    lookup begins at ``week - 1``. If that week has no entry for the team
    (for instance the Bucs/Dolphins 2017 week 1 game, which was postponed to
    week 11 because of a hurricane), earlier weeks are tried in descending
    order down to week 0, the season's initial rating.

    Parameters
    ----------
    team : str
        Team abbreviation (ARI, ATL, BAL, etc).
    year : int
        Season; for the 2018-19 season this is 2018.
    week : int
        Week whose starting rating is wanted.
    unit : str
        Either 'off' (offense) or 'def' (defense).
    data : dict, optional
        Ratings dictionary shaped ``data[team][year][week][unit]``. When
        omitted, the module-level ``eloDict`` is used.

    Returns
    -------
    The rating found, or None when no week from ``week - 1`` down to 0 has
    an entry for ``unit``.

    Raises
    ------
    KeyError
        If ``team`` or ``year`` is not present in ``data``.
    """
    if data is None:
        # Resolve the default at call time so the function always sees the
        # current eloDict, even if that name was rebound after this
        # definition ran.
        data = eloDict

    season = data[team][year]

    try:
        return season[week - 1][unit]
    except KeyError:
        # That week (or that unit) is missing; fall back to earlier weeks.
        pass

    # descending order, starting just below the week already tried...
    for w in range(week - 2, -1, -1):
        try:
            return season[w][unit]
        except KeyError:
            continue
    return None

In [9]:
# KDE plot of every offense and defense rating produced by the rating loop,
# used as a sanity check on the shape of the two distributions.
# df2 is built from the reloaded JSON but is not used by the plot itself.
df2 = pd.DataFrame.from_dict(testLoad)
forPlot = pd.DataFrame(dict(Offense=offenseRatingsList, Defense=defenseRatingsList))
p = forPlot.plot(kind='kde')