In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import calibration
import sklearn.linear_model
from catboost import CatBoostClassifier, Pool, CatBoostRegressor
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
import numpy as np
import scipy.stats
from sklearn.base import clone
from sklearn.model_selection._search import ParameterSampler
from sklearn.metrics import brier_score_loss, make_scorer, log_loss, mean_squared_error
import os
os.chdir('/Users/lucashaupt/Documents/GitHub/nfl-live-win-probability')
from src.generate_data import get_game_data, get_division_data, get_odds_data, get_schedule_data
import statsmodels.api as sm

In [None]:
# Read the yardage rows and the cached game / division tables.
df = pd.read_csv("data/yards_gained_all_seasons.csv")
# Keep a single row per game_code so the join below cannot multiply yardage rows.
game_data = get_game_data(cache=True).drop_duplicates("game_code")
division_data = get_division_data(cache=True)
# Left join: every yardage row is kept even if its game is absent from game_data.
combined_df = pd.merge(
    df,
    game_data,
    how="left",
    on=["game_code", "season"],
    copy=False,
)


In [None]:
# Display, for each (season, league_id) pair, the non-null count of every other division_data column.
division_data.groupby(["season", "league_id"], as_index=False).count()

In [None]:
def get_rolling_value(df, freq=16, min_periods=16, shift=1):
    """Return lagged rolling means of the yardage columns next to the game identifiers.

    The mean is taken over a window of ``freq`` consecutive rows of ``df`` in
    their current order (the notebook sorts by ``game_date`` before calling),
    is only produced once ``min_periods`` rows are available, and is then
    shifted down by ``shift`` rows. With the default ``shift=1`` the value on a
    row is therefore computed from the rows that precede it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the identifier columns and the four yardage columns
        listed in the body.
    freq : int
        Rolling window length in rows.
    min_periods : int
        Minimum number of rows in the window required to emit a value.
    shift : int
        Number of rows by which the rolling means are lagged.

    Returns
    -------
    pandas.DataFrame
        The identifier columns of ``df`` followed by the lagged rolling means.
    """
    id_columns = [
        "game_code",
        "season",
        "game_date",
        "offense_team",
        "defense_team",
        "home_team_id",
        "away_team_id",
    ]
    stat_columns = [
        "total_expected_yards_gained",
        "total_yards_gained",
        "total_yards_added",
        "play_count",
    ]
    lagged_means = (
        df.rolling(freq, on="game_date", min_periods=min_periods)[stat_columns]
        .mean()
        .shift(shift)
    )
    # Select the stat columns explicitly: the rolling result also carries game_date.
    return pd.concat([df[id_columns], lagged_means[stat_columns]], axis=1)


In [None]:
# Sum the yardage columns so there is one row per combination of the grouping columns.
group_values = [
    "game_code",
    "season",
    "game_date",
    "offense_team",
    "defense_team",
    "home_team_id",
    "away_team_id",
]
aggregate_values = [
    "total_expected_yards_gained",
    "total_yards_gained",
    "total_yards_added",
    "play_count",
]
game_yards = (
    combined_df.loc[:, group_values + aggregate_values]
    .groupby(group_values, as_index=False)
    .sum()
)

In [None]:
# Lagged rolling yardage means per offense_team, with rows ordered by game_date.
rolling_offense = (
    game_yards.sort_values(by="game_date")
    .groupby(by="offense_team", as_index=False)
    .apply(get_rolling_value)
)

In [None]:
# Lagged rolling yardage means per defense_team, with rows ordered by game_date.
rolling_defense = (
    game_yards.sort_values(by="game_date")
    .groupby(by="defense_team", as_index=False)
    .apply(get_rolling_value)
)

In [None]:
# One row per distinct combination of the game identity, team and score columns.
game_codes = combined_df.loc[
    :,
    [
        "game_code",
        "home_team_id",
        "away_team_id",
        "home_team_abbrev",
        "away_team_abbrev",
        "home_score",
        "away_score",
        "game_date",
        "season",
    ],
].drop_duplicates()

In [None]:
# Inspect the column labels of game_codes.
game_codes.columns

In [None]:
# Inspect the column labels of rolling_offense.
rolling_offense.columns

In [None]:
# Attach the home team's rolling offense and rolling defense values to each game.
yards_values = ['total_expected_yards_gained', 'total_yards_gained', 'total_yards_added', 'play_count']
games_home_team = (
    game_codes
    .merge(
        rolling_offense[["game_code", "offense_team", *yards_values]],
        left_on=["game_code", "home_team_id"],
        right_on=["game_code", "offense_team"],
        suffixes=["", "_offense"],
    )
    # The yardage columns now exist on both sides, so they pick up the
    # _offense / _defense suffixes in this second join.
    .merge(
        rolling_defense[["game_code", "defense_team", *yards_values]],
        left_on=["game_code", "home_team_id"],
        right_on=["game_code", "defense_team"],
        suffixes=["_offense", "_defense"],
    )
)


In [None]:
# Attach the away team's rolling offense and rolling defense values to each game.
yards_values = ['total_expected_yards_gained', 'total_yards_gained', 'total_yards_added', 'play_count']
games_away_team = (
    game_codes
    .merge(
        rolling_offense[["game_code", "offense_team", *yards_values]],
        left_on=["game_code", "away_team_id"],
        right_on=["game_code", "offense_team"],
        suffixes=["", "_offense"],
    )
    # The yardage columns now exist on both sides, so they pick up the
    # _offense / _defense suffixes in this second join.
    .merge(
        rolling_defense[["game_code", "defense_team", *yards_values]],
        left_on=["game_code", "away_team_id"],
        right_on=["game_code", "defense_team"],
        suffixes=["_offense", "_defense"],
    )
)

In [None]:
# Combine the home-team and away-team feature frames into one row per game;
# the per-team columns receive the _home / _away suffixes.
game_key_columns = [
    'game_code',
    'season',
    'home_team_id',
    'away_team_id',
    'home_team_abbrev',
    'away_team_abbrev',
    'home_score',
    'away_score',
    'game_date',
]
games = games_home_team.merge(games_away_team, on=game_key_columns, suffixes=["_home", "_away"])
# Keep seasons from 2016 onward. The explicit .copy() makes `games` an
# independent frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning on a filtered slice.
games = games[games["season"] >= 2016].copy()


In [None]:
# Model inputs: rolling yards-added for each side's offense and defense,
# ordered offense_home, defense_home, offense_away, defense_away.
features = [
    f"total_yards_added_{unit}_{side}"
    for side in ("home", "away")
    for unit in ("offense", "defense")
]
# Columns holding the final scores.
target = ["home_score", "away_score"]

In [None]:
# Ridge regression (alpha picked by RidgeCV from the three candidates) of home_score on the features.
ridge_model_home = sklearn.linear_model.RidgeCV(alphas=(0.1, 1.0, 10.0)).fit(
    X=games[features], y=games["home_score"]
)
# The score is computed but not shown, because it is not the cell's last expression.
ridge_model_home.score(X=games[features], y=games["home_score"])
# Predictions on the same rows the model was fitted on.
games["x_home_score"] = ridge_model_home.predict(X=games[features])

In [None]:
# Ridge regression (alpha picked by RidgeCV from the three candidates) of away_score on the features.
ridge_model_away = sklearn.linear_model.RidgeCV(alphas=(0.1, 1.0, 10.0)).fit(
    X=games[features], y=games["away_score"]
)
# The score is computed but not shown, because it is not the cell's last expression.
ridge_model_away.score(X=games[features], y=games["away_score"])
# Predictions on the same rows the model was fitted on.
games["x_away_score"] = ridge_model_away.predict(X=games[features])

In [None]:
# Predicted vs. actual home score, with the y = x line from 0 to 60 for reference.
fig, ax = plt.subplots()
ax.scatter(games["x_home_score"], games["home_score"])
ax.plot([0, 60], [0, 60])

In [None]:
# Predicted vs. actual away score, with the y = x line from 0 to 60 for reference.
fig, ax = plt.subplots()
ax.scatter(games["x_away_score"], games["away_score"])
ax.plot([0, 60], [0, 60])

In [None]:

# Sweep the regularisation strength and record the sum of squared residuals
# of the regularised coefficients for each alpha.
X2 = sm.add_constant(games[features])
est = sm.OLS(games["home_score"], X2)
# Unregularised fit; its normalized_cov_params is reused to build result objects below.
results_fu = est.fit()
results_new = []
for x in np.arange(0.005, 0.25, 0.005).tolist():
    # L1_wt=0 selects the pure L2 (ridge) penalty of the elastic net.
    est2 = est.fit_regularized(L1_wt=0, alpha=x)
    # Wrap the regularised parameters in an OLSResults object to read off .ssr.
    results_fr_fit = sm.regression.linear_model.OLSResults(
        est,
        est2.params,
        results_fu.normalized_cov_params,
    )
    results_new.append(results_fr_fit.ssr)


In [None]:
# SSR of the regularised fit as a function of alpha (same grid as the sweep).
alpha_grid = np.arange(0.005, 0.25, 0.005).tolist()
plt.plot(alpha_grid, results_new)

In [None]:
# Plot the SSR values in results_new against the alpha grid (same statement as the preceding cell).
plt.plot(np.arange(0.005, 0.25, 0.005).tolist(), results_new)

In [None]:
# Ridge-penalised fit (L1_wt=0) at alpha=0.1, started from the unregularised coefficients.
results_fr = est.fit_regularized(
    alpha=0.1,
    L1_wt=0,
    start_params=results_fu.params,
)
# Build a full results object around the regularised parameters so a summary table can be printed.
final = sm.regression.linear_model.OLSResults(
    est, results_fr.params, est.normalized_cov_params
)

print(final.summary())


In [None]:
# Copy the games frame to the system clipboard.
games.to_clipboard()

In [None]:
# Divide each rolling yards-added value by the matching rolling play count.
for side in ("home", "away"):
    for unit in ("offense", "defense"):
        games[f"total_yards_added_{unit}_{side}_per_play"] = (
            games[f"total_yards_added_{unit}_{side}"] / games[f"play_count_{unit}_{side}"]
        )


In [None]:
# Inspect the column labels of games.
games.columns

In [None]:

# OLS of away_score on the per-play yards-added features plus the rolling play counts.
new_features = [
    f"total_yards_added_{unit}_{side}_per_play"
    for side in ("home", "away")
    for unit in ("offense", "defense")
] + [
    f"play_count_{unit}_{side}"
    for side in ("home", "away")
    for unit in ("offense", "defense")
]
X2 = sm.add_constant(games[new_features])
est = sm.OLS(endog=games["away_score"], exog=X2)
est2 = est.fit()
print(est2.summary())


In [None]:

# OLS of home_score on the per-play yards-added features plus the rolling play counts.
per_play_columns = [
    f"total_yards_added_{unit}_{side}_per_play"
    for side in ("home", "away")
    for unit in ("offense", "defense")
]
play_count_columns = [
    f"play_count_{unit}_{side}"
    for side in ("home", "away")
    for unit in ("offense", "defense")
]
new_features = per_play_columns + play_count_columns
X2 = sm.add_constant(games[new_features])
est = sm.OLS(endog=games["home_score"], exog=X2)
est2 = est.fit()
print(est2.summary())


In [None]:
# OLS of away_score on the four rolling yards-added features (with intercept).
X2 = sm.add_constant(games[features])
est = sm.OLS(endog=games["away_score"], exog=X2)
est2 = est.fit()
print(est2.summary())


In [None]:
# OLS of home_score on the four rolling yards-added features (with intercept).
X2 = sm.add_constant(games[features])
est = sm.OLS(endog=games["home_score"], exog=X2)
est2 = est.fit()
print(est2.summary())


In [None]:
# 0/1 indicator columns for the three possible game outcomes.
games["home_team_win"] = (games["home_score"] > games["away_score"]).astype(int)
games["away_team_win"] = (games["home_score"] < games["away_score"]).astype(int)
games["tie"] = (games["home_score"] == games["away_score"]).astype(int)

In [None]:
# OLS of away_score on the four rolling yards-added features (same model as the earlier away_score cell).
X2 = sm.add_constant(games[features])
est = sm.OLS(endog=games["away_score"], exog=X2)
est2 = est.fit()
print(est2.summary())


In [None]:
import datetime
# Keep only scheduled games dated after the most recent game present in `games`.
schedule = get_schedule_data(cache=True)
last_played_date = pd.to_datetime(games["game_date"]).max()
schedule = schedule[schedule["game_date"] > last_played_date]


In [None]:
# Logistic regression of the tie indicator on the four features (with intercept).
X2 = sm.add_constant(games[features])
est = sm.Logit(endog=games["tie"], exog=X2)
est2 = est.fit()
# Fitted tie probability for the rows the model was estimated on.
games["x_tie"] = est2.predict(X2)

print(est2.summary())


In [None]:
# Load the odds table and keep one row per game_code (the first occurrence).
odds = pd.read_parquet("data/odds_data.parquet").drop_duplicates("game_code")

In [None]:
# Inner join: games without a row in `odds` are dropped from `games` here.
games = pd.merge(games, odds[["game_code", "cur_spread"]], on="game_code")

In [None]:
# `line` and `x_line` were read here without ever being created, so this cell
# raised KeyError on a fresh run. They are defined as away minus home, the same
# sign convention the notebook uses for predictions["x_line"].
games["line"] = games["away_score"] - games["home_score"]
games["x_line"] = games["x_away_score"] - games["x_home_score"]
# Absolute gap between the realised and the model-implied score difference.
games["line_abs_miss"] = (games["line"] - games["x_line"]).abs()
# Magnitude of the bookmaker spread vs. magnitude of the realised score difference.
plt.scatter(games["cur_spread"].abs(), games["line"].abs(), alpha=.2)
# np.where(games["x_line"]>0, games["x_line"], -games["x_line"])

In [None]:
# Latest available row per team: stack the home and away frames, order newest
# first, and keep the first row seen for each offense_team.
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
current_values = (
    pd.concat([games_home_team, games_away_team])
    .sort_values("game_date", ascending=False)
    .drop_duplicates(["offense_team"])
)
# Attach the division table row that matches the team and the season of that latest game.
current_values = current_values.merge(
    division_data,
    left_on=["season", "offense_team"],
    right_on=["season", "team_id"],
)

In [None]:
# Attach each team's latest rolling offense/defense values to the scheduled games,
# first for the home team and then for the away team.
team_form = current_values[['offense_team', 'total_yards_added_offense', 'total_yards_added_defense']]
predictions = schedule.merge(
    team_form,
    left_on=["home_team_id"],
    right_on=["offense_team"],
    suffixes=["", "_home"],
)
# The three team_form columns now collide, so both copies get a _home / _away suffix.
predictions = predictions.merge(
    team_form,
    left_on=["away_team_id"],
    right_on=["offense_team"],
    suffixes=["_home", "_away"],
)

In [None]:
# Display game_data (loaded with get_game_data and de-duplicated on game_code earlier in the notebook).
game_data

In [None]:
# Logistic regression of the home-win indicator on the four features (with intercept).
X2 = sm.add_constant(games[features])
est = sm.Logit(games["home_team_win"], X2)
est2 = est.fit()
# Fitted home-win probability for the historical games.
games["x_home_team_win"] = est2.predict(X2)
# has_constant="add" always inserts the intercept column. The default ("skip")
# omits it whenever a feature column is constant and non-zero in `predictions`
# (for example when only one game is scheduled), which would leave the design
# matrix one column short of the fitted parameters.
predictions["x_home_team_win"] = est2.predict(
    sm.add_constant(predictions[features], has_constant="add")
)
print(est2.summary())


In [None]:
# Logistic regression of the tie indicator, then derive the away-win probability
# as the remainder after the tie and home-win probabilities.
X2 = sm.add_constant(games[features])
est = sm.Logit(games["tie"], X2)
est2 = est.fit()
games["x_tie_calculated"] = est2.predict(X2)
games["x_away_team_win"] = 1 - games["x_tie_calculated"] - games["x_home_team_win"]
# has_constant="add" always inserts the intercept column. The default ("skip")
# omits it whenever a feature column is constant and non-zero in `predictions`
# (for example when only one game is scheduled), which would leave the design
# matrix one column short of the fitted parameters.
predictions["x_tie_calculated"] = est2.predict(
    sm.add_constant(predictions[features], has_constant="add")
)
predictions["x_away_team_win"] = 1 - predictions["x_tie_calculated"] - predictions["x_home_team_win"]

print(est2.summary())


In [None]:
# Ridge score predictions for the scheduled games and the implied score
# difference, expressed as away minus home.
prediction_inputs = predictions[features]
predictions["x_home_score"] = ridge_model_home.predict(prediction_inputs)
predictions["x_away_score"] = ridge_model_away.predict(prediction_inputs)
predictions["x_line"] = predictions["x_away_score"].sub(predictions["x_home_score"])

In [None]:
import time

# Standard deviation applied to every simulated score difference.
PRED_STD = 13.334230852348464
# Number of simulated outcomes drawn per game.
N_SIMULATIONS = 1
# Fixed seed so that re-running the notebook reproduces the same draws.
SIMULATION_SEED = 42


def simulate_results(pred_mean, pred_std, rng=None):
    """Draw one rounded score difference from a normal distribution.

    Parameters
    ----------
    pred_mean : float
        Centre of the distribution.
    pred_std : float
        Standard deviation of the distribution.
    rng : numpy.random.Generator, optional
        Source of randomness; a fresh unseeded generator is used when omitted.

    Returns
    -------
    int
        ``pred_mean + pred_std * z`` rounded to the nearest integer, where
        ``z`` is a single standard-normal draw.
    """
    if rng is None:
        rng = np.random.default_rng()
    # One draw per call; the earlier version generated 250,000 draws and used only the first.
    return round(rng.normal() * pred_std + pred_mean)


rng = np.random.default_rng(SIMULATION_SEED)
score_dicts = {}
outcome_dicts = {}
init_time = time.time()

for x in predictions["game_code"]:
    print(x)
    # Pull the scalar x_line for this game instead of passing a one-element
    # Series through round()/int(), which pandas has deprecated.
    pred_mean = float(predictions.loc[predictions["game_code"] == x, "x_line"].iloc[0])
    score_diffs = []
    final_result = []
    for simulations in range(N_SIMULATIONS):
        pred = int(simulate_results(pred_mean, PRED_STD, rng))
        score_diffs.append(pred)
        # Negative difference -> 'W', positive -> 'L', zero -> 'T'; stored as
        # plain strings rather than 0-d NumPy arrays.
        final_result.append('W' if pred < 0 else ('L' if pred > 0 else 'T'))

    outcome_dicts[x] = final_result
    score_dicts[x] = score_diffs
    print(time.time() - init_time)



In [None]:
def _perspective_columns(side, role):
    """Map the ``side`` ("home"/"away") column names onto ``role`` ("team"/"opponent") names."""
    return {
        f"{side}_team_id": f"{role}_id",
        f"{side}_team_abbrev": f"{role}_abbrev",
        f"{side}_score": f"{role}_score",
        f"total_yards_added_offense_{side}": f"total_yards_added_offense_{role}",
        f"total_yards_added_defense_{side}": f"total_yards_added_defense_{role}",
    }


def _perspective_frame(team_side, opponent_side, is_home):
    """Return ``games`` re-labelled from one side's point of view, plus an is_home_team flag."""
    # Team columns first, then opponent columns; the dict order fixes the column order.
    renames = {
        **_perspective_columns(team_side, "team"),
        **_perspective_columns(opponent_side, "opponent"),
    }
    frame = games[list(renames)].rename(columns=renames)
    frame["is_home_team"] = is_home
    return frame


# Each game seen from the home team's side (is_home_team = 1) ...
home_features_df = _perspective_frame("home", "away", 1)
# ... and from the away team's side (is_home_team = 0).
away_features_df = _perspective_frame("away", "home", 0)


In [None]:
# Stack both perspectives row-wise; the original row labels are kept, so each label appears twice.
features_df = pd.concat(objs=[home_features_df, away_features_df], axis=0)

In [None]:
# Inputs for the team/opponent-perspective models: the four yards-added columns plus the home flag.
features_new = [
    f"total_yards_added_{unit}_{role}"
    for role in ("team", "opponent")
    for unit in ("offense", "defense")
] + ["is_home_team"]

In [None]:
# Display the range of home-score values: 0 through the largest home_score in games.
range(int(np.max(games["home_score"]+1)))

In [None]:
models = {}
for team_score in range(int(np.max(games["home_score"]+1))):
    for opponent_score in range(int(np.max(games["away_score"]+1))):
        try:
            models[str(team_score) + str(opponent_score)] = sklearn.linear_model.LogisticRegression()
            models[str(team_score) + str(opponent_score)].fit(games[features], np.where((games["home_score"]==team_score)&(games["away_score"]==opponent_score), 1, 0))
            print(str(team_score) + " - " + str(opponent_score), *models[str(team_score) + str(opponent_score)].intercept_, *models[str(team_score) + str(opponent_score)].coef_)
        except:
            print(str(team_score) + " - " + str(opponent_score))


In [None]:
game_prediction_matrix = predictions[["game_code"]+features]
game_outcome_matrix = predictions[["game_code"]+features]
list_of_scores = []
list_of_home_wins = []
list_of_ties = []
list_of_away_wins = []
for team_score in range(int(np.max(games["home_score"]+1))):
    for opponent_score in range(int(np.max(games["away_score"]+1))):
        try:
            game_prediction_matrix["score_" + str(team_score) + "_" + str(opponent_score)] = pd.DataFrame(models[str(team_score) + str(opponent_score)].predict_proba(predictions[features]))[1]
            print("score_" + str(team_score) + "_" + str(opponent_score))
            list_of_scores.append("score_" + str(team_score) + "_" + str(opponent_score))
            if team_score > opponent_score:
                list_of_home_wins.append("score_" + str(team_score) + "_" + str(opponent_score))
            elif team_score == opponent_score:
                list_of_ties.append("score_" + str(team_score) + "_" + str(opponent_score))
            else:
                list_of_away_wins.append("score_" + str(team_score) + "_" + str(opponent_score))

        except:
            game_prediction_matrix["score_" + str(team_score) + "_" + str(opponent_score)] = 0
            list_of_scores.append("score_" + str(team_score) + "_" + str(opponent_score))



In [None]:
# Inspect one game: its predictions row followed by the normalised home-win,
# tie and away-win probabilities.
game_code = 2337531
selected_row = game_prediction_matrix[game_prediction_matrix["game_code"] == game_code]
# Sum over all exact-score columns, used to rescale the probabilities so they add up to 1.
prob_adjustment = np.sum(selected_row[list_of_scores].values[0])
[
    predictions[predictions["game_code"] == game_code],
    np.sum(selected_row[list_of_home_wins].values[0]) / prob_adjustment,
    np.sum(selected_row[list_of_ties].values[0]) / prob_adjustment,
    np.sum(selected_row[list_of_away_wins].values[0]) / prob_adjustment,
]


In [None]:
# One dictionary per scheduled game: team ids, the normalised exact-score
# distribution, and the [home win, tie, away win] probabilities.
game_exact_score_predictions = []
for game_code in predictions["game_code"]:
    schedule_row = predictions[predictions["game_code"] == game_code]
    matrix_row = game_prediction_matrix[game_prediction_matrix["game_code"] == game_code]

    game_prediction = matrix_row[list_of_scores].values[0]
    # Total over all exact scores, used to rescale the probabilities so they add up to 1.
    prob_adjustment = np.sum(game_prediction)

    game_dict = {
        "game_code": game_code,
        "home_team_id": schedule_row["home_team_id"].item(),
        "away_team_id": schedule_row["away_team_id"].item(),
        "pred_exact_score": (game_prediction / np.sum(game_prediction)).tolist(),
        "pred_outcome": [
            np.sum(matrix_row[outcome_columns].values[0]) / prob_adjustment
            for outcome_columns in (list_of_home_wins, list_of_ties, list_of_away_wins)
        ],
        "current_score": [0, 0],
    }
    game_exact_score_predictions.append(game_dict)


In [None]:
# Copy the game_prediction_matrix row(s) for game_code 2337634 to the system clipboard.
game_prediction_matrix[game_prediction_matrix["game_code"]==2337634].to_clipboard()

In [None]:
# Display the list of per-game prediction dictionaries.
game_exact_score_predictions

In [None]:
# Final scores of the 2021-season games as a list of record dictionaries
# (despite its name, results_df is a list, not a DataFrame).
season_2021_results = games.loc[
    games.season == 2021,
    ["game_code", "home_team_id", "away_team_id", "home_score", "away_score"],
]
results_df = season_2021_results.to_dict(orient='records')


In [None]:
# Display the predictions frame.
predictions

In [None]:
# Distinct (id, name) pairs of teams that appear as the home side in 2021, as record dictionaries.
team_names = (
    games.loc[games.season == 2021, ["home_team_id", "home_team_abbrev"]]
    .rename(columns={'home_team_id': 'id', 'home_team_abbrev': 'name'})
    .drop_duplicates()
    .to_dict(orient='records')
)

In [None]:
# for team_score in range(int(np.max(features_df["team_score"]))):
#     for opponent_score in range(int(np.max(features_df["opponent_score"]))):
#         try:
#             print(str(team_score) + "_" + str(opponent_score), *models[str(team_score) + str(opponent_score)].intercept_, *models[str(team_score) + str(opponent_score)].coef_)
#         except:
#             print(str(team_score) + "_" + str(opponent_score))


In [None]:
import json

# Bundle everything the simulation needs and write it to data/simulation_inputs.json.
sim_data = {
    "teams": team_names,
    "results": results_df,
    "predictions": game_exact_score_predictions,
    # Derived from `games` rather than hard-coded, so the values always match the
    # score grid (0..max home score by 0..max away score) used for the exact-score
    # predictions. int() converts the NumPy scalar to a JSON-serialisable Python int.
    "prediction_params": {
        "max_home_score": int(np.max(games["home_score"])),
        "max_away_score": int(np.max(games["away_score"])),
    },
}
data_dir = "data/"
with open(os.path.join(data_dir, 'simulation_inputs.json'), 'w') as f:
    json.dump(sim_data, f)

In [None]:
# Display the largest away_score in games.
np.max(games["away_score"])

In [None]:
# OLS of team_score on the team/opponent-perspective features (with intercept).
X2 = sm.add_constant(features_df[features_new])
est = sm.OLS(endog=features_df["team_score"], exog=X2)
est2 = est.fit()
print(est2.summary())


In [None]:
# OLS of opponent_score on the team/opponent-perspective features (with intercept).
X2 = sm.add_constant(features_df[features_new])
est = sm.OLS(endog=features_df["opponent_score"], exog=X2)
est2 = est.fit()
print(est2.summary())


In [None]:
# Display the largest opponent_score in features_df.
np.max(features_df["opponent_score"])

In [None]:
# Display the team id / abbreviation / score column names followed by the model feature names.
["home_team_id", "away_team_id", "home_team_abbrev", "away_team_abbrev", "home_score", "away_score"]+features