# Data source

https://github.com/ryurko/nflscrapR-data has data for 10 years about the games won and lost by each team throughout the season. We'll take the data for the pre-season and season and build a logistic regression to predict the results of every post season game. We'll then use that data to run 10,000 simulations of the post season in 2021.


To build our model we will gather aggregate data about how each team performs over a season. That will include: 

1. A team's win rate 
2. The mean of a team's net score in each game
3. The mean of a team's opponents net scores in each of their games (i.e. a measure of how strong their competition is)
4. The mean of a team's opponents win rates (again a measure of how strong the competition is)
5. The std of 3 and 4. To include some measure of the spread of teams

We will then build a model where the inputs are the concatenation of these data points for two teams that played each other, and the output is if the first team won. i.e. if we are looking at the Eagles and Patriots the input would be:
[ Eagles win rate, Eagles mean score, Mean of Eagles' opponents net scores, ..., Patriot win rate, ... ] and the output would be True if the Eagles won and False if they lost.

We train this on data from 2009-2019 then run 10k simulations of the upcoming playoffs using the predicted probability that each team wins their matchups.


In [28]:
import pandas as pd
import os

seasons = [
    pd.read_csv(
        os.path.join(
            os.path.abspath(""), "games_data", "regular_season", f"reg_games_{year}.csv"
        )
    )
    for year in range(2009, 2019)
]


In [29]:
seasons[0].columns


Index(['type', 'game_id', 'home_team', 'away_team', 'week', 'season',
       'state_of_game', 'game_url', 'home_score', 'away_score'],
      dtype='object')

In [38]:
import random

records_for_season = []
teams_for_season = []


def calculate_season_stats(season):
    season.loc[:, "differential"] = season["home_score"] - season["away_score"]

    teams = season.home_team.unique()

    record_dict = {}

    for team in teams:
        total_count = len(
            season[(season["home_team"] == team) | (season["away_team"] == team)]
        )
        win_rate = (
            len(
                season[
                    (season["home_team"] == team)
                    & (season["home_score"] > season["away_score"])
                ]
            )
            + len(
                season[
                    (season["away_team"] == team)
                    & (season["away_score"] > season["home_score"])
                ]
            )
        ) / total_count

        home_team_games = season["home_team"] == team
        away_team_games = season["away_team"] == team
        point_differential_per_game = (
            (season[home_team_games]["home_score"].sum())
            - (season[home_team_games]["away_score"].sum())
            + (season[away_team_games]["away_score"].sum())
            - (season[away_team_games]["home_score"].sum())
        ) / total_count

        record_dict[team] = {
            "win_rate": win_rate,
            "point_differential_per_game": point_differential_per_game,
        }
    record_df = pd.DataFrame.from_dict(record_dict, orient="index")
    for team in teams:
        opponents = season[(season["home_team"] == team)]["away_team"].append(
            season[(season["away_team"] == team)]["home_team"]
        )

        record_df.loc[team, "opponent_win_rate_avg"] = record_df.loc[
            record_df.index.isin(opponents), "win_rate"
        ].mean()
        record_df.loc[team, "opponent_win_rate_std"] = record_df.loc[
            record_df.index.isin(opponents), "win_rate"
        ].std()
        record_df.loc[team, "opponent_point_spread_avg"] = record_df.loc[
            record_df.index.isin(opponents), "point_differential_per_game"
        ].mean()
        record_df.loc[team, "opponent_point_spread_std"] = record_df.loc[
            record_df.index.isin(opponents), "point_differential_per_game"
        ].std()
        
    return record_df


for season in seasons:
    records_for_season.append(calculate_season_stats(season))


records_for_season


[     win_rate  point_differential_per_game  opponent_win_rate_avg  \
 PIT    0.5625                       2.7500               0.485577   
 CLE    0.3125                      -8.1250               0.495192   
 NO     0.8125                      10.5625               0.427885   
 TB     0.1875                      -9.7500               0.538462   
 HOU    0.5625                       3.4375               0.480769   
 IND    0.8750                       6.8125               0.466346   
 CIN    0.6250                       0.8750               0.495192   
 CAR    0.5000                       0.4375               0.543269   
 BAL    0.5625                       8.1250               0.528846   
 ATL    0.5625                       2.3750               0.504808   
 ARI    0.6250                       3.1250               0.480769   
 SEA    0.3125                      -6.8750               0.495192   
 NYG    0.5000                      -1.5625               0.533654   
 GB     0.6875      

In [39]:
post_seasons = [
    pd.read_csv(
        os.path.join(
            os.path.abspath(""), "games_data", "post_season", f"post_games_{year}.csv"
        )
    )
    for year in range(2009, 2019)
]

outputs = []
for record, post_season, season in zip(records_for_season, post_seasons, seasons):
    without_all_star_game = post_season[(post_season["home_team"].isin(record.index))]
    output = pd.DataFrame(
        {
            "team": without_all_star_game["home_team"],
            "opponent": without_all_star_game["away_team"],
            "win": without_all_star_game["away_score"]
            - without_all_star_game["home_score"]
            > 0,
        }
    )
    output = output.append(
        pd.DataFrame(
            {
                "team": season["home_team"],
                "opponent": season["away_team"],
                "win": season["away_score"]
                - season["home_score"]
                > 0,
            }
        )
    )
    outputs.append(output)

outputs


[    team opponent    win
 0    CIN      NYJ   True
 1    DAL      PHI  False
 2     NE      BAL   True
 3    ARI       GB  False
 4     NO      ARI  False
 ..   ...      ...    ...
 251   SD      WAS  False
 252  OAK      BAL   True
 253  DAL      PHI  False
 254  SEA      TEN   True
 255  NYJ      CIN  False
 
 [267 rows x 3 columns],
     team opponent    win
 0    SEA       NO  False
 1    IND      NYJ   True
 2     KC      BAL   True
 3    PHI       GB   True
 4    PIT      BAL  False
 ..   ...      ...    ...
 251  HOU      JAC  False
 252  PHI      DAL   True
 253  DEN       SD   True
 254   SF      ARI  False
 255  SEA      STL  False
 
 [267 rows x 3 columns],
     team opponent    win
 0    HOU      CIN  False
 1     NO      DET  False
 2    NYG      ATL  False
 3    DEN      PIT  False
 4     SF       NO  False
 ..   ...      ...    ...
 251  DEN       KC   True
 252  CLE      PIT   True
 253  CIN      BAL   True
 254  ARI      SEA  False
 255  NYG      DAL  False
 
 [267 ro

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

def preprocess(inputs):
    scaler = preprocessing.StandardScaler().fit(inputs)
    return scaler.transform(inputs)

def build_and_test_model(test_validation_ratio=0.5):
    inputs = []
    targets = None

    for records, output in zip(records_for_season, outputs):
        for team, opponent in zip(output["team"], output["opponent"]):
            inputs.append(records.loc[team, :].append(records.loc[opponent, :]))
        if targets is None:
            targets = output["win"]
        else:
            targets = targets.append(output["win"])


    scaled_inputs = preprocess(inputs)

    X_train, X_test, y_train, y_test = train_test_split(scaled_inputs, targets, test_size=1 - test_validation_ratio, random_state=42)

    log_reg = LogisticRegression(max_iter=1000)
    log_reg.fit(X_train, y_train)
    if test_validation_ratio < 1:
        print(f"Accuracy: {accuracy_score(y_test, log_reg.predict(X_test))}")

    return log_reg


In [50]:
log_reg = build_and_test_model(.8)


Accuracy: 0.7471910112359551


In [53]:
from collections import defaultdict

afc_top_seed = {"team": "Tennessee Titans", "seed": 1}
afc_teams_with_seeds = [
    {"team": "Kansas City Chiefs", "seed": 2},
    {"team": "Buffalo Bills", "seed": 3},
    {"team": "Cincinnati Bengals", "seed": 4},
    {"team": "Las Vegas Raiders", "seed": 5},
    {"team": "New England Patriots", "seed": 6},
    {"team": "Pittsburgh Steelers", "seed": 7},
]
nfc_top_seed = {"team": "Green Bay Packers", "seed": 1}
nfc_teams_with_seeds = [
    {"team": "Tampa Bay Buccaneers", "seed": 2},
    {"team": "Dallas Cowboys", "seed": 3},
    {"team": "Los Angeles Rams", "seed": 4},
    {"team": "Arizona Cardinals", "seed": 5},
    {"team": "San Francisco 49ers", "seed": 6},
    {"team": "Philadelphia Eagles", "seed": 7},
]

current_season = pd.read_csv(
    os.path.join(
        os.path.abspath(""), "games_data", "regular_season", "reg_games_2021.csv"
    )
)

current_season_records = calculate_season_stats(current_season)


def pick_matchups(teams_with_seeds):
    if len(teams_with_seeds) == 0:
        return []
    lowest = min(teams_with_seeds, key=lambda x: x["seed"])
    highest = max(teams_with_seeds, key=lambda x: x["seed"])
    copy = teams_with_seeds[:]
    copy.remove(lowest)
    copy.remove(highest)
    return [(lowest, highest), *pick_matchups(copy)]


def play_game(teams_with_seeds, log_reg):
    inputs = [
        current_season_records.loc[teams_with_seeds[0]["team"], :].append(
            current_season_records.loc[teams_with_seeds[1]["team"], :]
        )
    ]

    scaled_inputs = preprocess(inputs)

    if random.random() < log_reg.predict_proba(scaled_inputs)[0][0]:
        return teams_with_seeds[0]
    else:
        return teams_with_seeds[1]


def play_games(teams_with_seeds, log_reg):
    return [play_game(matchup, log_reg) for matchup in pick_matchups(teams_with_seeds)]




def simulate_playoffs(log_reg):
    new_matchups = play_games(afc_teams_with_seeds, log_reg)
    new_matchups.append(afc_top_seed)
    new_matchups = play_games(new_matchups, log_reg)
    afc_winner = play_games(new_matchups, log_reg)

    new_matchups = play_games(nfc_teams_with_seeds, log_reg)
    new_matchups.append(nfc_top_seed)
    new_matchups = play_games(new_matchups, log_reg)
    nfc_winner = play_games(new_matchups, log_reg)

    return [afc_winner[0]["team"], nfc_winner[0]["team"]]


def simulation(iterations, log_reg):
    histogram = defaultdict(lambda: 0)

    for i in range(iterations):
        names = simulate_playoffs(log_reg)
        histogram[names[0]] += 1
        histogram[names[1]] += 1

    for key, value in histogram.items():
        print(f"{key} have {value*100/iterations}% chance")


simulation(10000, log_reg)


Kansas City Chiefs have 17.12% chance
Green Bay Packers have 36.03% chance
Tennessee Titans have 36.2% chance
Tampa Bay Buccaneers have 17.35% chance
Los Angeles Rams have 11.07% chance
Dallas Cowboys have 14.37% chance
San Francisco 49ers have 6.88% chance
Buffalo Bills have 14.05% chance
Pittsburgh Steelers have 5.95% chance
Cincinnati Bengals have 11.62% chance
New England Patriots have 7.24% chance
Philadelphia Eagles have 6.34% chance
Arizona Cardinals have 7.96% chance
Las Vegas Raiders have 7.82% chance
