# Data source

https://github.com/ryurko/nflscrapR-data provides ten years (2009–2018) of NFL game results, including the final score of every game. We'll take each regular season up to week 14 and build a logistic regression to predict the results of the remaining regular-season games and every post-season game. We'll then use that model to run 1,000 simulations of the 2021 post season.


Our hypothesis is that three variables influence the outcome of a game:

1. The point differential, i.e. how much a team won or lost each game by
2. How good the other team is
3. How much this team wins

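We'll start by loading all ten regular seasons; the first half of the resulting games is later used to train the model and the second half to validate it.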


In [105]:
import pandas as pd
import os

# One regular-season results table per year, 2009 through 2018, read from
# games_data/regular_season relative to the current working directory.
regular_season_dir = os.path.join(os.path.abspath(""), "games_data", "regular_season")
seasons = [
    pd.read_csv(os.path.join(regular_season_dir, f"reg_games_{year}.csv"))
    for year in range(2009, 2019)
]


In [106]:
# List the columns available in a season's games table (the 2009 season is shown).
seasons[0].columns


Index(['type', 'game_id', 'home_team', 'away_team', 'week', 'season',
       'state_of_game', 'game_url', 'home_score', 'away_score'],
      dtype='object')

In [107]:
import random


# Regular-season games in weeks before this one are turned into model
# features; games from this week onward are used as prediction targets.
FINAL_WEEK_TO_COUNT = 15

# Filled by the loop further down this cell: one entry per element of `seasons`,
# in the same order.
records_for_season = []
teams_for_season = []


def calculate_for_season(season, final_week=None):
    """Build per-team game records from the early part of one season.

    Only games with ``week < final_week`` are considered. For every team that
    hosted at least one of those games, a DataFrame is built with one row per
    game the team played (home games first, then away games) and the columns:

    - ``score``: point differential from that team's point of view
      (positive means the team won by that many points)
    - ``opponent``: the other team in the game
    - ``avg``: the opponent's mean ``score`` over the same window
    - ``team``: the opponent's name again (right-hand key of the merge)
    - ``adjusted_score``: ``score + avg / 2``
    - ``adjusted_score_2``: ``score + avg / 5``
    - ``adjusted_score_3``: ``score + avg * 2``

    Parameters
    ----------
    season : pandas.DataFrame
        Games table with at least the columns ``week``, ``home_team``,
        ``away_team``, ``home_score`` and ``away_score``. It is not modified.
    final_week : int, optional
        First week that is excluded. Defaults to the module-level
        ``FINAL_WEEK_TO_COUNT``.

    Returns
    -------
    tuple
        ``(teams, records)`` where ``teams`` is the array of home-team names
        seen in the window and ``records`` maps each of those names to its
        DataFrame.
    """
    if final_week is None:
        final_week = FINAL_WEEK_TO_COUNT

    # Work on a fresh frame: adding a column to a boolean-mask slice of
    # `season` is what used to raise SettingWithCopyWarning.
    counted_games = season[season["week"] < final_week]
    counted_games = counted_games.assign(
        differential=counted_games["home_score"] - counted_games["away_score"]
    )

    teams = counted_games["home_team"].unique()

    records = {}
    for team in teams:
        home_games = counted_games[counted_games["home_team"] == team]
        away_games = counted_games[counted_games["away_team"] == team]
        # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same
        # row order (home games, then away games).
        records[team] = pd.concat(
            [
                pd.DataFrame(
                    {
                        "score": home_games["differential"],
                        "opponent": home_games["away_team"],
                    }
                ),
                pd.DataFrame(
                    {
                        # Flip the sign so the margin is from the away team's side.
                        "score": -away_games["differential"],
                        "opponent": away_games["home_team"],
                    }
                ),
            ]
        )

    point_averages = pd.DataFrame(
        [
            {"avg": record["score"].mean(), "team": name}
            for (name, record) in records.items()
        ]
    )

    for team in teams:
        # Inner merge: a game against an opponent that has no entry in
        # `point_averages` (no home game in the window) is dropped here.
        with_opponent_avg = records[team].merge(
            point_averages, left_on="opponent", right_on="team"
        )
        records[team] = with_opponent_avg.assign(
            adjusted_score=with_opponent_avg["score"] + with_opponent_avg["avg"] / 2,
            adjusted_score_2=with_opponent_avg["score"] + with_opponent_avg["avg"] / 5,
            adjusted_score_3=with_opponent_avg["score"] + with_opponent_avg["avg"] * 2,
        )

    return teams, records


# Compute the early-season records for every loaded season, appending to the
# two accumulator lists so they stay aligned with `seasons`.
for season in seasons:
    teams, records = calculate_for_season(season)
    teams_for_season.append(teams)
    records_for_season.append(records)


# Display everything collected: one dict of per-team DataFrames per season.
records_for_season


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


[{'PIT':     score opponent        avg team  adjusted_score  adjusted_score_2  \
  0       3      TEN  -2.307692  TEN        1.846154          2.538462   
  1      10       SD   7.923077   SD       13.961538         11.584615   
  2      13      CLE -12.076923  CLE        6.961538         10.584615   
  3      -7      CLE -12.076923  CLE      -13.038462         -9.415385   
  4      10      MIN  11.230769  MIN       15.615385         12.246154   
  5      -6      CIN   3.615385  CIN       -4.192308         -5.276923   
  6      -3      CIN   3.615385  CIN       -1.192308         -2.276923   
  7      -3      OAK -12.384615  OAK       -9.192308         -5.476923   
  8      -3      CHI  -3.384615  CHI       -4.692308         -3.676923   
  9       8      DET -15.153846  DET        0.423077          4.969231   
  10     18      DEN   2.000000  DEN       19.000000         18.400000   
  11     -3       KC -10.461538   KC       -8.230769         -5.092308   
  12     -3      BAL   7.769231

In [108]:
# One post-season results table per year, 2009 through 2018, read from
# games_data/post_season relative to the current working directory.
post_season_dir = os.path.join(os.path.abspath(""), "games_data", "post_season")
post_seasons = [
    pd.read_csv(os.path.join(post_season_dir, f"post_games_{year}.csv"))
    for year in range(2009, 2019)
]

def _home_side_outcomes(games):
    """Describe each game in `games` from the home side.

    Returns a DataFrame with ``team`` (the home team), ``opponent`` (the away
    team) and ``win``.

    NOTE: ``win`` is True when the *away* score is higher than the home score,
    i.e. when ``opponent`` beat ``team``. The name is kept because later cells
    read this column.
    """
    return pd.DataFrame(
        {
            "team": games["home_team"],
            "opponent": games["away_team"],
            "win": games["away_score"] - games["home_score"] > 0,
        }
    )


# Prediction targets, one DataFrame per season: the post-season games followed
# by the regular-season games from FINAL_WEEK_TO_COUNT onward.
outputs = []
for season, post_season, teams in zip(seasons, post_seasons, teams_for_season):
    # Keep only post-season games hosted by a team seen in the regular season;
    # per the variable name this is meant to drop the all-star game.
    without_all_star_game = post_season[post_season["home_team"].isin(teams)]
    last_games_of_season = season[season["week"] >= FINAL_WEEK_TO_COUNT]
    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row
    # order and the original index labels.
    output = pd.concat(
        [
            _home_side_outcomes(without_all_star_game),
            _home_side_outcomes(last_games_of_season),
        ]
    )
    outputs.append(output)

outputs


[    team opponent    win
 0    CIN      NYJ   True
 1    DAL      PHI  False
 2     NE      BAL   True
 3    ARI       GB  False
 4     NO      ARI  False
 5    IND      BAL  False
 6    MIN      DAL  False
 7     SD      NYJ   True
 8    IND      NYJ  False
 9     NO      MIN  False
 11   IND       NO   True
 208  JAC      IND   True
 209   NO      DAL   True
 210  STL      HOU   True
 211   KC      CLE   True
 212  NYJ      ATL   True
 213  BUF       NE   True
 214  TEN      MIA  False
 215  DET      ARI   True
 216  DEN      OAK   True
 217   SD      CIN  False
 218  PIT       GB  False
 219  BAL      CHI  False
 220  PHI       SF  False
 221  SEA       TB   True
 222  CAR      MIN  False
 223  WAS      NYG   True
 224  TEN       SD   True
 225  ATL      BUF  False
 226  PIT      BAL  False
 227   NO       TB   True
 228   NE      JAC  False
 229  NYG      CAR   True
 230  MIA      HOU   True
 231  CIN       KC  False
 232  CLE      OAK  False
 233   GB      SEA  False
 234  ARI   

In [189]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


def build_and_test_model(score_to_use, train_validate_split=.5):
    """Fit a logistic regression on the per-team records and report its accuracy.

    For every game listed in ``outputs`` the feature vector is the ``team``'s
    per-game values of `score_to_use` followed by the ``opponent``'s, taken
    from the matching season in ``records_for_season``. The target is that
    game's ``win`` value.

    Parameters
    ----------
    score_to_use : str
        Column of the per-team record DataFrames to use as features
        (for example ``"score"`` or ``"adjusted_score"``).
    train_validate_split : float, optional
        Fraction of the games, taken from the start of the list, used for
        training. When it is below 1 the remaining games are used for
        validation and the accuracy and class probabilities are printed.

    Returns
    -------
    LogisticRegression
        The fitted model.
    """
    inputs = []
    season_targets = []

    for records, output in zip(records_for_season, outputs):
        for team, opponent in zip(output["team"], output["opponent"]):
            # Series.append was removed in pandas 2.0; concat gives the same
            # "team values, then opponent values" ordering.
            inputs.append(
                pd.concat(
                    [records[team][score_to_use], records[opponent][score_to_use]]
                ).to_numpy()
            )
        season_targets.append(output["win"])

    # Plain arrays make the slices below purely positional; the concatenated
    # Series would carry a duplicated integer index.
    targets = pd.concat(season_targets).to_numpy()

    train_count = int(len(inputs) * train_validate_split)

    log_reg = LogisticRegression()
    log_reg.fit(inputs[:train_count], targets[:train_count])

    if train_validate_split < 1:
        validation_inputs = inputs[train_count:]
        print(accuracy_score(targets[train_count:], log_reg.predict(validation_inputs)))
        print(log_reg.predict_proba(validation_inputs))

    return log_reg


In [190]:
# Fit on the "adjusted_score" feature with the default split (first half of the
# games for training); prints validation accuracy and class probabilities.
log_reg_adjusted_score = build_and_test_model("adjusted_score")


0.6440677966101694
[[0.45107027 0.54892973]
 [0.36461964 0.63538036]
 [0.60080981 0.39919019]
 [0.45706306 0.54293694]
 [0.46790942 0.53209058]
 [0.58626901 0.41373099]
 [0.54695198 0.45304802]
 [0.45824231 0.54175769]
 [0.81239891 0.18760109]
 [0.63283007 0.36716993]
 [0.28322578 0.71677422]
 [0.63950861 0.36049139]
 [0.36228774 0.63771226]
 [0.88954938 0.11045062]
 [0.82656916 0.17343084]
 [0.97921302 0.02078698]
 [0.46457089 0.53542911]
 [0.62431815 0.37568185]
 [0.96636202 0.03363798]
 [0.72175129 0.27824871]
 [0.84069147 0.15930853]
 [0.8282841  0.1717159 ]
 [0.75553754 0.24446246]
 [0.82807207 0.17192793]
 [0.86915508 0.13084492]
 [0.65063079 0.34936921]
 [0.42067147 0.57932853]
 [0.60666361 0.39333639]
 [0.07849871 0.92150129]
 [0.43080482 0.56919518]
 [0.78574739 0.21425261]
 [0.34454923 0.65545077]
 [0.2788496  0.7211504 ]
 [0.63919788 0.36080212]
 [0.69554888 0.30445112]
 [0.44510262 0.55489738]
 [0.71021185 0.28978815]
 [0.11813894 0.88186106]
 [0.60960408 0.39039592]
 [0.29

In [111]:
# Same experiment with the "adjusted_score_2" feature (score + opponent avg / 5).
log_reg_adjusted_score_2 = build_and_test_model("adjusted_score_2")


0.6331658291457286
[[0.30719876 0.69280124]
 [0.77373655 0.22626345]
 [0.21808715 0.78191285]
 [0.23235138 0.76764862]
 [0.4376973  0.5623027 ]
 [0.50313507 0.49686493]
 [0.44071764 0.55928236]
 [0.45356196 0.54643804]
 [0.31949984 0.68050016]
 [0.5720022  0.4279978 ]
 [0.35750675 0.64249325]
 [0.64130766 0.35869234]
 [0.16168754 0.83831246]
 [0.83922822 0.16077178]
 [0.8402633  0.1597367 ]
 [0.73981572 0.26018428]
 [0.27467843 0.72532157]
 [0.64760756 0.35239244]
 [0.70855724 0.29144276]
 [0.54871151 0.45128849]
 [0.27041786 0.72958214]
 [0.18096496 0.81903504]
 [0.65733309 0.34266691]
 [0.2911365  0.7088635 ]
 [0.65411074 0.34588926]
 [0.79852782 0.20147218]
 [0.58097085 0.41902915]
 [0.71244793 0.28755207]
 [0.75783445 0.24216555]
 [0.74339332 0.25660668]
 [0.47301028 0.52698972]
 [0.91497506 0.08502494]
 [0.43835185 0.56164815]
 [0.61983213 0.38016787]
 [0.78265027 0.21734973]
 [0.2446642  0.7553358 ]
 [0.59240901 0.40759099]
 [0.29194314 0.70805686]
 [0.19574219 0.80425781]
 [0.72

In [112]:
# Baseline: the raw point differential with no opponent adjustment.
log_reg_score = build_and_test_model("score")


0.6331658291457286
[[0.28434176 0.71565824]
 [0.80526942 0.19473058]
 [0.18102036 0.81897964]
 [0.23241747 0.76758253]
 [0.42080442 0.57919558]
 [0.48108589 0.51891411]
 [0.44387297 0.55612703]
 [0.43177496 0.56822504]
 [0.35281926 0.64718074]
 [0.57987484 0.42012516]
 [0.34733776 0.65266224]
 [0.64843063 0.35156937]
 [0.14743813 0.85256187]
 [0.83230464 0.16769536]
 [0.83449731 0.16550269]
 [0.7264791  0.2735209 ]
 [0.28095593 0.71904407]
 [0.63794956 0.36205044]
 [0.70083993 0.29916007]
 [0.56987164 0.43012836]
 [0.20330313 0.79669687]
 [0.14885988 0.85114012]
 [0.63131851 0.36868149]
 [0.29291899 0.70708101]
 [0.65984936 0.34015064]
 [0.83481999 0.16518001]
 [0.59573617 0.40426383]
 [0.77052629 0.22947371]
 [0.77127172 0.22872828]
 [0.7298441  0.2701559 ]
 [0.52170864 0.47829136]
 [0.91428787 0.08571213]
 [0.4181411  0.5818589 ]
 [0.61073239 0.38926761]
 [0.80855925 0.19144075]
 [0.24215864 0.75784136]
 [0.55832491 0.44167509]
 [0.26899919 0.73100081]
 [0.16477992 0.83522008]
 [0.70

In [191]:
# train_validate_split=1 puts every game in the training set, so nothing is
# held out and no validation output is printed.
log_reg_adjusted_score_with_all_data = build_and_test_model("adjusted_score", train_validate_split=1)

In [192]:
from collections import defaultdict

# Record column that simulated games build their features from.
score_to_use = "adjusted_score"

# 2021 playoff fields. Each conference's top seed is kept apart from the other
# six teams, which are listed in seed order starting at seed 2.
afc_top_seed = {"team": "Tennessee Titans", "seed": 1}
afc_teams_with_seeds = [
    {"team": name, "seed": seed}
    for seed, name in enumerate(
        [
            "Kansas City Chiefs",
            "Buffalo Bills",
            "Cincinnati Bengals",
            "Las Vegas Raiders",
            "New England Patriots",
            "Pittsburgh Steelers",
        ],
        start=2,
    )
]
nfc_top_seed = {"team": "Green Bay Packers", "seed": 1}
nfc_teams_with_seeds = [
    {"team": name, "seed": seed}
    for seed, name in enumerate(
        [
            "Tampa Bay Buccaneers",
            "Dallas Cowboys",
            "Los Angeles Rams",
            "Arizona Cardinals",
            "San Francisco 49ers",
            "Philadelphia Eagles",
        ],
        start=2,
    )
]

# Load the 2021 regular season and build the same per-team records that the
# historical seasons were given.
current_season_path = os.path.join(
    os.path.abspath(""), "games_data", "regular_season", "reg_games_2021.csv"
)
current_season = pd.read_csv(current_season_path)

teams, current_season_records = calculate_for_season(current_season)

def pick_matchups(teams_with_seeds):
    """Pair the best remaining seed with the worst remaining seed.

    Parameters
    ----------
    teams_with_seeds : list of dict
        Entries with a ``"seed"`` key (lower number = better seed). The list
        is not modified.

    Returns
    -------
    list of tuple
        ``(better_seed, worse_seed)`` pairs, ordered from the best seed's
        matchup outward: lowest vs highest, second lowest vs second highest,
        and so on. The tuples hold the same dict objects that were passed in.

    Raises
    ------
    ValueError
        If the number of teams is odd, since one team would be left without
        an opponent.
    """
    if len(teams_with_seeds) % 2:
        raise ValueError(
            f"cannot pair an odd number of teams ({len(teams_with_seeds)})"
        )

    # Sort once instead of repeatedly scanning for min/max on shrinking copies.
    by_seed = sorted(teams_with_seeds, key=lambda entry: entry["seed"])
    return [(by_seed[i], by_seed[-1 - i]) for i in range(len(by_seed) // 2)]


def play_game(teams_with_seeds, log_reg):
    """Simulate one game and return the entry of the team that wins it.

    Parameters
    ----------
    teams_with_seeds : sequence of two dict
        The two sides; each needs a ``"team"`` key that exists in
        ``current_season_records``.
    log_reg : fitted classifier exposing ``predict_proba``
        Model used to get the outcome probability.

    Returns
    -------
    dict
        ``teams_with_seeds[0]`` or ``teams_with_seeds[1]``, chosen at random
        according to the model's probability.
    """
    first_team, second_team = teams_with_seeds[0], teams_with_seeds[1]

    # Series.append was removed in pandas 2.0; concat gives the same
    # "first team's values, then second team's values" feature vector.
    features = pd.concat(
        [
            current_season_records[first_team["team"]][score_to_use],
            current_season_records[second_team["team"]][score_to_use],
        ]
    ).to_numpy()

    # Column 0 of predict_proba belongs to the model's first class. The models
    # in this notebook are fit on a boolean target that is False when the
    # first-listed side was not outscored, so this is used as the first team's
    # win probability.
    first_team_win_probability = log_reg.predict_proba([features])[0][0]

    if random.random() < first_team_win_probability:
        return first_team
    return second_team

def play_games(teams_with_seeds, log_reg):
    """Pair the field with pick_matchups, play each pairing, and return the winners in matchup order."""
    winners = []
    for pairing in pick_matchups(teams_with_seeds):
        winners.append(play_game(pairing, log_reg))
    return winners

# Running tally of simulated champions; a team that has not won yet counts as 0.
histogram = defaultdict(int)

def simulate_playoffs(log_reg):
    """Simulate one full post season and return the name of the champion.

    Each conference plays three rounds: the six lower seeds meet first, the
    top seed joins the three winners, and the field is re-paired by seed
    before every round. The AFC bracket is played before the NFC bracket, and
    the two conference winners then meet in the final game.
    """

    def conference_winner(lower_seeds, top_seed):
        # Returns a one-element list holding the conference champion's entry.
        survivors = play_games(lower_seeds, log_reg)
        survivors.append(top_seed)
        finalists = play_games(survivors, log_reg)
        return play_games(finalists, log_reg)

    afc_winner = conference_winner(afc_teams_with_seeds, afc_top_seed)
    nfc_winner = conference_winner(nfc_teams_with_seeds, nfc_top_seed)

    return play_game(afc_winner + nfc_winner, log_reg)["team"]

def simulation(iterations, log_reg):
    """Simulate the post season `iterations` times and print each team's title share.

    Every champion is also added to the module-level ``histogram`` so the
    running totals stay available. The printed percentages come from a tally
    of this call only: dividing the cumulative ``histogram`` by `iterations`
    would report more than 100% in total on a second call.

    Parameters
    ----------
    iterations : int
        Number of post seasons to simulate.
    log_reg : fitted classifier
        Model handed to ``simulate_playoffs``.
    """
    titles_this_run = {}
    for _ in range(iterations):
        name = simulate_playoffs(log_reg)
        titles_this_run[name] = titles_this_run.get(name, 0) + 1
        histogram[name] += 1

    # Empty when iterations == 0, so no division by zero can happen here.
    for key, value in titles_this_run.items():
        print(f"{key} have {value*100/iterations}% chance")

# Run 1,000 simulated post seasons with the model that was fit on every game.
# NOTE(review): no random seed is set in the visible cells, so the printed
# percentages will change from run to run.
simulation(1000, log_reg_adjusted_score_with_all_data)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Green Bay Packers have 14.1% chance
Los Angeles Rams have 6.3% chance
Tampa Bay Buccaneers have 14.2% chance
Tennessee Titans have 7.9% chance
Dallas Cowboys have 5.2% chance
Kansas City Chiefs have 12.2% chance
Cincinnati Bengals have 15.0% chance
Philadelphia Eagles have 8.4% chance
Pittsburgh Steelers have 0.9% chance
Buffalo Bills have 5.1% chance
New England Patriots have 7.2% chance
Arizona Cardinals have 1.8% chance
San Francisco 49ers have 1.3% chance
Las Vegas Raiders have 0.4% chance


In [182]:
# Test single game: the play_game defined below replaces the earlier one and
# additionally prints the probability it draws against.
score_to_use="adjusted_score"
def play_game(teams_with_seeds, log_reg):
    """Debug variant of play_game: simulate one game and print the probability used.

    This definition shadows the earlier ``play_game`` in the notebook; the only
    intended difference is the printed probability.

    Parameters
    ----------
    teams_with_seeds : sequence of two dict
        The two sides; each needs a ``"team"`` key that exists in
        ``current_season_records``.
    log_reg : fitted classifier exposing ``predict_proba``
        Model used to get the outcome probability.

    Returns
    -------
    dict
        ``teams_with_seeds[0]`` or ``teams_with_seeds[1]``.
    """
    first_team, second_team = teams_with_seeds[0], teams_with_seeds[1]

    # Series.append was removed in pandas 2.0; concat gives the same
    # "first team's values, then second team's values" feature vector.
    features = pd.concat(
        [
            current_season_records[first_team["team"]][score_to_use],
            current_season_records[second_team["team"]][score_to_use],
        ]
    ).to_numpy()

    # Column 0 of predict_proba is the probability of the model's first class,
    # which this notebook treats as "the first-listed team wins".
    probability = log_reg.predict_proba([features])[0][0]
    print(probability)
    if random.random() < probability:
        return first_team
    return second_team

# Uses log_reg_adjusted_score, i.e. the model trained with the default split
# (first half of the games), not the one fit on all data.
play_game([{"team": "Green Bay Packers"}, {"team": "San Francisco 49ers"}], log_reg_adjusted_score)

0.6519555972970412


{'team': 'Green Bay Packers'}

In [163]:
def play_games(teams_with_seeds, log_reg):
    """Debug variant of play_games: print the pairings, then play them and return the winners in order."""
    pairings = pick_matchups(teams_with_seeds)
    print(pairings)
    winners = []
    for pairing in pairings:
        winners.append(play_game(pairing, log_reg))
    return winners


def simulate_playoffs(log_reg):
    """Simulate one post season and return the champion's team name.

    Same bracket as the earlier definition of this function: in each
    conference the six lower seeds play, the top seed joins the winners, and
    two more rounds decide the conference. The AFC is played before the NFC,
    and the two conference winners meet in the last game.
    """
    conference_winners = []
    for lower_seeds, top_seed in (
        (afc_teams_with_seeds, afc_top_seed),
        (nfc_teams_with_seeds, nfc_top_seed),
    ):
        advancing = play_games(lower_seeds, log_reg)
        advancing.append(top_seed)
        advancing = play_games(advancing, log_reg)
        # The last round returns a one-element list holding the conference winner.
        conference_winners += play_games(advancing, log_reg)

    return play_game(conference_winners, log_reg)["team"]

# Trace one simulated post season, printing every pairing and probability.
# NOTE(review): log_reg_score was fit on the "score" column, but on a
# top-to-bottom run play_game builds its features from score_to_use
# ("adjusted_score") - confirm this train/predict mismatch is intended.
simulate_playoffs(log_reg_score)

[({'team': 'Kansas City Chiefs', 'seed': 2}, {'team': 'Pittsburgh Steelers', 'seed': 7}), ({'team': 'Buffalo Bills', 'seed': 3}, {'team': 'New England Patriots', 'seed': 6}), ({'team': 'Cincinnati Bengals', 'seed': 4}, {'team': 'Las Vegas Raiders', 'seed': 5})]
0.81121748536287
0.9201821094367056
0.8030298037824326
[({'team': 'Tennessee Titans', 'seed': 1}, {'team': 'Las Vegas Raiders', 'seed': 5}), ({'team': 'Kansas City Chiefs', 'seed': 2}, {'team': 'Buffalo Bills', 'seed': 3})]
0.8636578562050423
0.897648974189002
[({'team': 'Tennessee Titans', 'seed': 1}, {'team': 'Kansas City Chiefs', 'seed': 2})]
0.9484989412946755
[({'team': 'Tampa Bay Buccaneers', 'seed': 2}, {'team': 'Philadelphia Eagles', 'seed': 7}), ({'team': 'Dallas Cowboys', 'seed': 3}, {'team': 'San Francisco 49ers', 'seed': 6}), ({'team': 'Los Angeles Rams', 'seed': 4}, {'team': 'Arizona Cardinals', 'seed': 5})]
0.19247510893850106
0.8029385156862576
0.4840957333394791
[({'team': 'Green Bay Packers', 'seed': 1}, {'team'

'Tennessee Titans'

In [169]:
# Inspect the per-game records that serve as features for one playoff team.
current_season_records["Philadelphia Eagles"]


Unnamed: 0,score,opponent,avg,team,adjusted_score,adjusted_score_2,adjusted_score_3
0,-6,San Francisco 49ers,2.153846,San Francisco 49ers,-4.923077,-5.569231,-1.692308
1,-12,Kansas City Chiefs,6.384615,Kansas City Chiefs,-8.807692,-10.723077,0.769231
2,-6,Tampa Bay Buccaneers,8.692308,Tampa Bay Buccaneers,-1.653846,-4.261538,11.384615
3,-3,Los Angeles Chargers,1.153846,Los Angeles Chargers,-2.423077,-2.769231,-0.692308
4,11,New Orleans Saints,1.461538,New Orleans Saints,11.730769,11.292308,13.923077
5,26,Atlanta Falcons,-8.307692,Atlanta Falcons,21.846154,24.338462,9.384615
6,-20,Dallas Cowboys,7.153846,Dallas Cowboys,-16.423077,-18.569231,-5.692308
7,3,Carolina Panthers,-1.923077,Carolina Panthers,2.038462,2.615385,-0.846154
8,-11,Las Vegas Raiders,-5.923077,Las Vegas Raiders,-13.961538,-12.184615,-22.846154
9,38,Detroit Lions,-10.846154,Detroit Lions,32.576923,35.830769,16.307692


In [170]:
# Same inspection for the Eagles' first-round opponent in the simulated bracket.
current_season_records["Tampa Bay Buccaneers"]

Unnamed: 0,score,opponent,avg,team,adjusted_score,adjusted_score_2,adjusted_score_3
0,2,Dallas Cowboys,7.153846,Dallas Cowboys,5.576923,3.430769,16.307692
1,23,Atlanta Falcons,-8.307692,Atlanta Falcons,18.846154,21.338462,6.384615
2,13,Atlanta Falcons,-8.307692,Atlanta Falcons,8.846154,11.338462,-3.615385
3,28,Miami Dolphins,-2.615385,Miami Dolphins,26.692308,27.476923,22.769231
4,35,Chicago Bears,-7.769231,Chicago Bears,31.115385,33.446154,19.461538
5,20,New York Giants,-6.0,New York Giants,17.0,18.8,8.0
6,6,Buffalo Bills,10.307692,Buffalo Bills,11.153846,8.061538,26.615385
7,-10,Los Angeles Rams,5.615385,Los Angeles Rams,-7.192308,-8.876923,1.230769
8,2,New England Patriots,11.538462,New England Patriots,7.769231,4.307692,25.076923
9,6,Philadelphia Eagles,3.538462,Philadelphia Eagles,7.769231,6.707692,13.076923
