# Data source

https://github.com/ryurko/nflscrapR-data has data for 10 years about the games won and lost by each team throughout the season. We'll take the data for the pre-season and season and build a logistic regression to predict the results of every post season game. We'll then use that data to run 10,000 simulations of the post season in 2021.


Our hypothesis is that 3 variables influence our predictions.

1. The point spread i.e. how much they won or lost by
2. How good the other team is
3. How much this team wins

We'll start by looking at the first five years of data


In [1]:
import pandas as pd
import os

seasons = [
    pd.read_csv(
        os.path.join(
            os.path.abspath(""), "games_data", "regular_season", f"reg_games_{year}.csv"
        )
    )
    for year in range(2009, 2019)
]


In [2]:
seasons[0].columns


Index(['type', 'game_id', 'home_team', 'away_team', 'week', 'season',
       'state_of_game', 'game_url', 'home_score', 'away_score'],
      dtype='object')

In [3]:
FINAL_WEEK_TO_COUNT = 15

records_for_season = []
teams_for_season = []
for season in seasons:
    season_without_last_games = season[season["week"] < FINAL_WEEK_TO_COUNT]
    season_without_last_games["differential"] = (
        season_without_last_games["home_score"]
        - season_without_last_games["away_score"]
    )

    teams = season_without_last_games.home_team.unique()
    teams_for_season.append(teams)

    record = pd.DataFrame(
        index=season["home_team"].unique(), columns=["wins", "losses"]
    )

    records = {}
    for team in teams:
        record = pd.DataFrame(
            {
                "score": season_without_last_games[
                    season_without_last_games["home_team"] == team
                ]["differential"],
                "opponent": season_without_last_games[
                    season_without_last_games["home_team"] == team
                ]["away_team"],
            }
        ).append(
            pd.DataFrame(
                {
                    "score": -season_without_last_games[
                        season_without_last_games["away_team"] == team
                    ]["differential"],
                    "opponent": season_without_last_games[
                        season_without_last_games["away_team"] == team
                    ]["home_team"],
                }
            )
        )
        records[team] = record

    point_averages = pd.DataFrame(
        [
            {"avg": record["score"].mean(), "team": team}
            for (team, record) in records.items()
        ]
    )

    for team in teams:
        records[team] = records[team].merge(
            point_averages, left_on="opponent", right_on="team"
        )
        records[team]["adjusted_score"] = (
            records[team]["score"] + records[team]["avg"] / 2
        )

    records_for_season.append(records)

records_for_season


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  season_without_last_games["differential"] = (


[{'PIT':     score opponent        avg team  adjusted_score
  0       3      TEN  -2.307692  TEN        1.846154
  1      10       SD   7.923077   SD       13.961538
  2      13      CLE -12.076923  CLE        6.961538
  3      -7      CLE -12.076923  CLE      -13.038462
  4      10      MIN  11.230769  MIN       15.615385
  5      -6      CIN   3.615385  CIN       -4.192308
  6      -3      CIN   3.615385  CIN       -1.192308
  7      -3      OAK -12.384615  OAK       -9.192308
  8      -3      CHI  -3.384615  CHI       -4.692308
  9       8      DET -15.153846  DET        0.423077
  10     18      DEN   2.000000  DEN       19.000000
  11     -3       KC -10.461538   KC       -8.230769
  12     -3      BAL   7.769231  BAL        0.884615,
  'CLE':     score opponent        avg team  adjusted_score
  0     -14      MIN  11.230769  MIN       -8.384615
  1      -3      CIN   3.615385  CIN       -1.192308
  2      -9      CIN   3.615385  CIN       -7.192308
  3     -28       GB   7.769231

In [4]:
post_seasons = [
    pd.read_csv(
        os.path.join(
            os.path.abspath(""), "games_data", "post_season", f"post_games_{year}.csv"
        )
    )
    for year in range(2009, 2019)
]

outputs = []
for season, post_season, teams in zip(seasons, post_seasons, teams_for_season):
    without_all_start_game = post_season[(post_season["home_team"].isin(teams))]
    output = pd.DataFrame(
        {
            "team": without_all_start_game["home_team"],
            "opponent": without_all_start_game["away_team"],
            "win": without_all_start_game["away_score"] - without_all_start_game["home_score"] > 0,
        }
    )
    last_games_of_season = season[season["week"] >=FINAL_WEEK_TO_COUNT]
    output = output.append(pd.DataFrame({
            "team": last_games_of_season["home_team"],
            "opponent": last_games_of_season["away_team"],
            "win": last_games_of_season["away_score"] - last_games_of_season["home_score"] > 0,
    }))
    outputs.append(output)

outputs


[    team opponent    win
 0    CIN      NYJ   True
 1    DAL      PHI  False
 2     NE      BAL   True
 3    ARI       GB  False
 4     NO      ARI  False
 5    IND      BAL  False
 6    MIN      DAL  False
 7     SD      NYJ   True
 8    IND      NYJ  False
 9     NO      MIN  False
 11   IND       NO   True
 208  JAC      IND   True
 209   NO      DAL   True
 210  STL      HOU   True
 211   KC      CLE   True
 212  NYJ      ATL   True
 213  BUF       NE   True
 214  TEN      MIA  False
 215  DET      ARI   True
 216  DEN      OAK   True
 217   SD      CIN  False
 218  PIT       GB  False
 219  BAL      CHI  False
 220  PHI       SF  False
 221  SEA       TB   True
 222  CAR      MIN  False
 223  WAS      NYG   True
 224  TEN       SD   True
 225  ATL      BUF  False
 226  PIT      BAL  False
 227   NO       TB   True
 228   NE      JAC  False
 229  NYG      CAR   True
 230  MIA      HOU   True
 231  CIN       KC  False
 232  CLE      OAK  False
 233   GB      SEA  False
 234  ARI   

In [5]:

inputs = []
targets = None

for records, output in zip(records_for_season, outputs):
    for team, opponent in zip(output["team"], output["opponent"]):
        inputs.append(records[team]["adjusted_score"].append(records[opponent]["adjusted_score"]))
    if targets is None:
        targets = output["win"]
    else:
        targets = targets.append(output["win"])

    
inputs, targets

([0     -4.000000
  1      4.307692
  2      7.307692
  3     -9.538462
  4     33.307692
  5     13.884615
  6      6.884615
  7      2.961538
  8     -3.038462
  9      2.423077
  10    10.884615
  11    -9.192308
  12   -14.384615
  0     11.384615
  1    -12.615385
  2      5.846154
  3     -5.153846
  4      3.846154
  5     -5.538462
  6     -4.538462
  7     -4.000000
  8      8.807692
  9     18.461538
  10    -6.615385
  11    31.807692
  12    16.615385
  Name: adjusted_score, dtype: float64,
  0     -1.576923
  1     -6.576923
  2     11.807692
  3     15.884615
  4     19.038462
  5      0.346154
  6     10.807692
  7      0.961538
  8      6.615385
  9     -6.000000
  10     0.769231
  11     7.807692
  12    -6.115385
  0    -18.615385
  1     14.769231
  2     12.615385
  3     23.423077
  4      7.423077
  5     -1.576923
  6      2.346154
  7      9.346154
  8     25.807692
  9    -10.192308
  10    -4.038462
  11     2.307692
  12    26.884615
  Name: adjusted_score, 

In [6]:
from sklearn.linear_model import LogisticRegression

training_set_input = inputs[0:200]
training_set_target = targets[0:200]

log_reg = LogisticRegression()
log_reg.fit(training_set_input, training_set_target)

LogisticRegression()

In [13]:
from sklearn.metrics import accuracy_score

accuracy_score(targets[201:400], log_reg.predict(inputs[201:400]))

0.6331658291457286