In [1]:
import pandas as pd
import networkx as nx
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
def create_graph(game_data, week_num):
    """Build a directed, weighted graph from a table of game results.

    Every row of ``game_data`` contributes to an edge that points from the
    row's ``loser`` to its ``winner``. Before aggregation the row's ``margin``
    is scaled by ``week / week_num``. When the same (loser, winner) pair
    appears in several rows, the edge weight is the mean of their scaled
    margins.

    Args:
        game_data: DataFrame with ``winner``, ``loser``, ``week`` and
            ``margin`` columns.
        week_num: Denominator of the per-row scaling factor.

    Returns:
        A ``networkx.DiGraph`` whose edges carry a ``weight`` attribute.
    """
    # (loser, winner) -> [running sum of scaled margins, number of games]
    pair_stats = {}

    for _, game in game_data.iterrows():
        winner, loser, week = game['winner'], game['loser'], game['week']
        scaled_margin = game['margin'] * week / week_num

        stats = pair_stats.setdefault((loser, winner), [0.0, 0])
        stats[0] += scaled_margin
        stats[1] += 1

    # One edge per distinct pairing, weighted by the average scaled margin
    graph = nx.DiGraph()
    for (loser, winner), (margin_sum, n_games) in pair_stats.items():
        graph.add_edge(loser, winner, weight=margin_sum / n_games)

    return graph

In [3]:
# Rank every node of the weighted graph by its PageRank score
def calculate_rankings(graph):
    """Return the weighted PageRank score of each node as a DataFrame.

    Args:
        graph: networkx graph; the ``weight`` edge attribute is used as the
            PageRank edge weight.

    Returns:
        DataFrame with one row per node and two columns: ``Team`` (the node)
        and ``Ranking`` (its PageRank score).
    """
    scores = nx.pagerank(graph, weight='weight')
    return pd.DataFrame(scores.items(), columns=['Team', 'Ranking'])

In [6]:
# Load the complete dataset of NFL game results into `df_results`.
# The ranking cells below read its 'season' and 'week' columns together with
# the Winner/Loser/Margin 'Yds' and 'TO' columns.
df_results = pd.read_csv('../data/nfl_games_pfr.csv')

In [12]:
# Offensive rankings: weekly PageRank built from the yardage columns.
# Select those columns and map them onto the generic names
# ('winner', 'loser', 'margin') that create_graph reads.
game_results = df_results[
    ['season', 'week', 'Winner Yds', 'Loser Yds', 'Margin Yds']
].rename(columns={'Winner Yds': 'winner', 'Loser Yds': 'loser', 'Margin Yds': 'margin'})

# Collects one rankings DataFrame per (season, week)
ranking_dfs = []

for season in game_results['season'].unique():
    # Rows belonging to this season only
    season_data = game_results.loc[game_results['season'] == season]

    # Rankings start at week 2 and run through the season's highest week value
    for week in range(2, season_data['week'].max() + 1):
        # Cumulative view: every row up to and including the current week
        filtered_data = season_data.loc[season_data['week'] <= week]

        # Build the weighted graph for this snapshot and rank its nodes
        G = create_graph(filtered_data, week)

        # Round the scores to 5 decimal places and tag each row with its
        # season/week labels (week is zero-padded to two digits)
        rankings = calculate_rankings(G).assign(
            Ranking=lambda scores: scores['Ranking'].round(5),
            SeasonWeek=f"{season}_W{week:02d}",
            Season=season,
            Week=week,
            Type="Offense",
        )

        ranking_dfs.append(rankings)

# Stack every weekly snapshot into a single long DataFrame
final_offense_rankings_df = pd.concat(ranking_dfs)

# Wide view: one row per team, one column per SeasonWeek label
pivot_df = final_offense_rankings_df.pivot(index='Team', columns='SeasonWeek', values='Ranking')

# Preview the long and the wide tables
print(final_offense_rankings_df.head())
print(pivot_df.head())


  Team  Ranking SeasonWeek  Season  Week     Type
0   KC  0.01464   2024_W02    2024     2  Offense
1  BAL  0.03461   2024_W02    2024     2  Offense
2  PHI  0.01464   2024_W02    2024     2  Offense
3   GB  0.02024   2024_W02    2024     2  Offense
4  ATL  0.02501   2024_W02    2024     2  Offense
SeasonWeek  2020_W02  2020_W03  2020_W04  2020_W05  2020_W06  2020_W07  \
Team                                                                     
ARI          0.04911   0.05144   0.03870   0.01496   0.02372   0.02224   
ATL          0.02908   0.01895   0.00865   0.00796   0.00883   0.01052   
BAL          0.05298   0.03218   0.01230   0.02112   0.01494   0.01414   
BUF          0.03548   0.02480   0.01053   0.01885   0.01680   0.02217   
CAR          0.05682   0.06940   0.20568   0.15440   0.18321   0.16836   

SeasonWeek  2020_W08  2020_W09  2020_W10  2020_W11  ...  2023_W12  2023_W13  \
Team                                                ...                       
ARI          0.02923   

In [13]:
# Defensive rankings: weekly PageRank built from the 'TO' columns.
# Select those columns and map them onto the generic names
# ('winner', 'loser', 'margin') that create_graph reads.
game_results = df_results[
    ['season', 'week', 'Winner TO', 'Loser TO', 'Margin TO']
].rename(columns={'Winner TO': 'winner', 'Loser TO': 'loser', 'Margin TO': 'margin'})

# Collects one rankings DataFrame per (season, week)
ranking_dfs = []

for season in game_results['season'].unique():
    # Rows belonging to this season only
    season_data = game_results.loc[game_results['season'] == season]

    # Rankings start at week 2 and run through the season's highest week value
    for week in range(2, season_data['week'].max() + 1):
        # Cumulative view: every row up to and including the current week
        filtered_data = season_data.loc[season_data['week'] <= week]

        # Build the weighted graph for this snapshot and rank its nodes
        G = create_graph(filtered_data, week)

        # Round the scores to 5 decimal places and tag each row with its
        # season/week labels (week is zero-padded to two digits)
        rankings = calculate_rankings(G).assign(
            Ranking=lambda scores: scores['Ranking'].round(5),
            SeasonWeek=f"{season}_W{week:02d}",
            Season=season,
            Week=week,
            Type="Defense",
        )

        ranking_dfs.append(rankings)

# Stack every weekly snapshot into a single long DataFrame
final_defense_rankings_df = pd.concat(ranking_dfs)

# Wide view: one row per team, one column per SeasonWeek label
pivot_df = final_defense_rankings_df.pivot(index='Team', columns='SeasonWeek', values='Ranking')

# Preview the long and the wide tables
print(final_defense_rankings_df.head())
print(pivot_df.head())


  Team  Ranking SeasonWeek  Season  Week     Type
0  BAL  0.01919   2024_W02    2024     2  Defense
1   KC  0.04936   2024_W02    2024     2  Defense
2   GB  0.01919   2024_W02    2024     2  Defense
3  PHI  0.05197   2024_W02    2024     2  Defense
4  PIT  0.01919   2024_W02    2024     2  Defense
SeasonWeek  2020_W02  2020_W03  2020_W04  2020_W05  2020_W06  2020_W07  \
Team                                                                     
ARI          0.04210   0.04854   0.05538   0.05150   0.03561   0.02502   
ATL          0.03028   0.01877   0.01305   0.01040   0.00988   0.01536   
BAL          0.01637   0.01317   0.02406   0.01596   0.01336   0.01167   
BUF          0.04210   0.01849   0.01473   0.01011   0.01944   0.01870   
CAR          0.04210   0.02488   0.01666   0.01258   0.01608   0.02917   

SeasonWeek  2020_W08  2020_W09  2020_W10  2020_W11  ...  2023_W12  2023_W13  \
Team                                                ...                       
ARI          0.02669   

In [17]:
# Stack the offense and defense rankings into one long table; the 'Type'
# column tells the two apart. Row labels are kept as-is, so they repeat.
ranking_frames = [final_offense_rankings_df, final_defense_rankings_df]
combined_rankings = pd.concat(ranking_frames)
print(combined_rankings)


   Team  Ranking SeasonWeek  Season  Week     Type
0    KC  0.01464   2024_W02    2024     2  Offense
1   BAL  0.03461   2024_W02    2024     2  Offense
2   PHI  0.01464   2024_W02    2024     2  Offense
3    GB  0.02024   2024_W02    2024     2  Offense
4   ATL  0.02501   2024_W02    2024     2  Offense
..  ...      ...        ...     ...   ...      ...
27  LAR  0.02852   2020_W17    2020    17  Defense
28  PIT  0.02790   2020_W17    2020    17  Defense
29  NYG  0.04691   2020_W17    2020    17  Defense
30  TEN  0.02279   2020_W17    2020    17  Defense
31  DEN  0.06669   2020_W17    2020    17  Defense

[4480 rows x 6 columns]


In [14]:
# Output the results and pivot table to CSV files.
# NOTE(review): `pivot_df` was last assigned by the defensive-rankings cell, so
# this file holds the defense pivot only — confirm that is intended.
pivot_df.to_csv('../data/nfl_rankings_pivot.csv')
# `final_rankings_df` is not defined by any cell above, so the original line
# raised NameError on a fresh kernel. Export the combined offense + defense
# rankings instead — TODO confirm this is the table the file should contain.
combined_rankings.to_csv('../data/nfl_rankings.csv')

In [None]:
# Generic linear-regression template (from ChatGPT)
df = pd.read_csv('nfl_data.csv')

# Predictors are the four home/away offense/defense columns; target is 'Spread'
feature_columns = ['HomeOffense', 'HomeDefense', 'AwayOffense', 'AwayDefense']
X, y = df[feature_columns], df['Spread']

# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows, then score the predictions on the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print('Coefficients:', model.coef_)
print('Intercept:', model.intercept_)






In [31]:
def get_rank(team, season, week, rankings_df):
    """Look up one team's ranking for a given season and week.

    Args:
        team: Value to match against the ``Team`` column.
        season: Value to match against the ``Season`` column.
        week: Value to match against the ``Week`` column. Week 1 is never
            looked up.
        rankings_df: DataFrame with ``Team``, ``Season``, ``Week`` and
            ``Ranking`` columns.

    Returns:
        The ``Ranking`` value of the first matching row, or ``None`` when
        ``week`` is 1 or no row matches.
    """
    # No ranking data exists for week 1, so skip the lookup entirely
    if week == 1:
        return None

    is_match = (
        rankings_df['Team'].eq(team)
        & rankings_df['Season'].eq(season)
        & rankings_df['Week'].eq(week)
    )
    matching_ranks = rankings_df.loc[is_match, 'Ranking']

    # None signals "no ranking available for this team/season/week"
    return None if matching_ranks.empty else matching_ranks.values[0]

def predict_winner(team_1, team_2, rank_1, rank_2):
    """Pick the team with the strictly higher ranking.

    Args:
        team_1: First team; returned when ``rank_1`` is greater than ``rank_2``.
        team_2: Second team; returned otherwise, including on a tie.
        rank_1: Ranking of ``team_1``, or ``None`` if unavailable.
        rank_2: Ranking of ``team_2``, or ``None`` if unavailable.

    Returns:
        ``team_1`` or ``team_2``, or ``None`` when either ranking is ``None``.
    """
    both_ranked = rank_1 is not None and rank_2 is not None
    if not both_ranked:
        # No prediction can be made without both rankings
        return None

    if rank_1 > rank_2:
        return team_1
    return team_2

In [125]:
# Evaluate the ranking-based predictor against the recorded results.
# NOTE(review): this cell relies on state that no cell above creates:
#   * `final_rankings_df` is not defined by any cell above;
#   * `game_results`, as built by the rankings cells, has no 'Home Team Abbr'
#     or 'Away Team Abbr' column, and its 'winner' column is the renamed
#     'Winner TO' column rather than a game winner.
# Confirm which DataFrames are intended before relying on Restart & Run All.
MIN_WEEK = 4          # only games with week > MIN_WEEK are predicted
MIN_RANK_DIFF = 0.05  # games with a smaller absolute ranking gap are skipped

correct_predictions = 0
total_games = 0

game_results['rank_1'] = None  # Previous-week ranking of the home team
game_results['rank_2'] = None  # Previous-week ranking of the away team
game_results['rank_diff'] = None  # rank_1 - rank_2, rounded to 5 decimals
game_results['predicted_winner'] = None  # Predicted winner
game_results['correct'] = None  # Whether the prediction matched the actual winner

for index, row in game_results.iterrows():
    # Written as `not (>)` so a missing (NaN) week is skipped, as before
    if not (row['week'] > MIN_WEEK):
        continue

    # Use the rankings from the week before the game
    rank_1 = get_rank(row['Home Team Abbr'], row['season'], row['week'] - 1, final_rankings_df)
    rank_2 = get_rank(row['Away Team Abbr'], row['season'], row['week'] - 1, final_rankings_df)

    # Store the rankings in the DataFrame
    game_results.at[index, 'rank_1'] = rank_1
    game_results.at[index, 'rank_2'] = rank_2

    if rank_1 is None or rank_2 is None:
        # No prediction possible without both rankings
        game_results.at[index, 'rank_diff'] = None
        continue

    rank_diff = round(rank_1 - rank_2, 5)
    game_results.at[index, 'rank_diff'] = rank_diff
    if abs(rank_diff) < MIN_RANK_DIFF:
        continue  # Rankings too close to call

    predicted_winner = predict_winner(row['Home Team Abbr'], row['Away Team Abbr'], rank_1, rank_2)
    game_results.at[index, 'predicted_winner'] = predicted_winner
    if predicted_winner is None:
        continue  # Skip this game if no prediction could be made

    # pandas stores missing values as NaN, which `is None` does not detect
    actual_winner = row['winner']
    if pd.isna(actual_winner):
        continue  # Skip this game if the actual winner is unknown

    is_correct = bool(predicted_winner == actual_winner)
    game_results.at[index, 'correct'] = is_correct
    if is_correct:
        correct_predictions += 1
    total_games += 1

# Report NaN instead of raising ZeroDivisionError when no game qualified
accuracy = correct_predictions / total_games if total_games else float('nan')
print(f"Prediction accuracy: {accuracy:.2%}")
print(f"Games predicted: {total_games}")

Prediction accuracy: 70.29%
Games predicted: 441


In [39]:
# Write `game_results`, including the prediction columns added by the
# evaluation cell, to a CSV file. The DataFrame index is written as the first
# column because `index=False` is not passed.
game_results.to_csv('../data/nfl_games_predictions.csv')