In [145]:
import pandas as pd 
import numpy as np 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [89]:
nfl_scores = pd.read_csv('scores_2020.csv')
nfl_scores.head()

Unnamed: 0,week,game_date,away_team_name,away_team_score,home_team_name,home_team_score
0,1,10-Sep-20,Houston Texans,20,Kansas City Chiefs,34
1,1,13-Sep-20,New York Jets,17,Buffalo Bills,27
2,1,13-Sep-20,Seattle Seahawks,38,Atlanta Falcons,25
3,1,13-Sep-20,Philadelphia Eagles,17,Washington Football Team,27
4,1,13-Sep-20,Cleveland Browns,6,Baltimore Ravens,38


Let's start by splitting each game into individual events for the target team we are trying to predict a win outcome for vs. the opponent.

In [95]:
nfl_scores_1 = nfl_scores.rename(columns = {
    'away_team_name' : 'team',
    'home_team_name' : 'opponent',
    'away_team_score' : 'team_score',
    'home_team_score' : 'opponent_score'
})

nfl_scores_2 = nfl_scores.rename(columns = {
    'away_team_name' : 'opponent',
    'home_team_name' : 'team',
    'away_team_score' : 'opponent_score',
    'home_team_score' : 'team_score'
})

nfl_scores = pd.concat((nfl_scores_1, nfl_scores_2), axis = 0)
nfl_scores = nfl_scores.drop(columns=['game_date'])


In [96]:
# Calculate the game outcome, our eventual y predicted outcome
nfl_scores['outcome'] = np.where(nfl_scores['team_score'] > nfl_scores['opponent_score'], 1, 0)

Let's try to predict the Clevland Browns chance of winning on week 11.

In [147]:
# Create some features for the other teams
target_week = 11
scores_before_target_week = nfl_scores[nfl_scores['week'] < target_week]

# Calculate some features from each teams statistics to this point in the season
agg_dict = {
    'outcome' : 'sum',
    'team_score' : 'mean',
    'opponent_score' : 'mean'
}

agg_features = scores_before_target_week.groupby('team').agg(agg_dict).reset_index()

team_features = agg_features.rename(columns={
    'outcome' : 'team_wins',
    'team_score' : 'team_avg_points_for',
    'opponent_score' : 'team_avg_points_against'
})

opponent_features = agg_features.rename(columns={
    'team' : 'opponent',
    'outcome' : 'opponent_wins',
    'team_score' : 'opponent_avg_points_for',
    'opponent_score' : 'opponent_avg_points_against'
})

scores_including_target_week = nfl_scores[nfl_scores['week'] <= target_week]

scores_including_target_week = scores_including_target_week.merge(team_features, how='left', on='team')
scores_including_target_week = scores_including_target_week.merge(opponent_features, how='left', on='opponent')
scores_including_target_week = pd.get_dummies(scores_including_target_week, columns = ['opponent', 'team'])

training_data = scores_including_target_week[scores_including_target_week['week'] < target_week]
testing_data = scores_including_target_week[scores_including_target_week['week'] == target_week]



Unnamed: 0,week,team_score,opponent_score,outcome,team_wins,team_avg_points_for,team_avg_points_against,opponent_wins,opponent_avg_points_for,opponent_avg_points_against,...,team_New Orleans Saints,team_New York Giants,team_New York Jets,team_Philadelphia Eagles,team_Pittsburgh Steelers,team_San Francisco 49ers,team_Seattle Seahawks,team_Tampa Bay Buccaneers,team_Tennessee Titans,team_Washington Football Team
0,1,20,34,0,8,22.222222,28.000000,32,31.777778,20.333333,...,0,0,0,0,0,0,0,0,0,0
1,1,17,27,0,0,13.444444,29.777778,28,27.200000,26.500000,...,0,0,1,0,0,0,0,0,0,0
2,1,38,25,1,24,32.222222,29.555556,12,27.000000,27.888889,...,0,0,0,0,0,0,1,0,0,0
3,1,17,27,0,12,22.555556,25.777778,8,20.000000,24.222222,...,0,0,0,1,0,0,0,0,0,0
4,1,6,38,0,24,24.000000,27.111111,24,27.111111,18.333333,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1269,10,23,16,1,24,24.000000,18.666667,24,32.222222,29.555556,...,0,0,0,0,0,0,0,0,0,0
1270,10,36,10,1,36,30.111111,19.000000,8,22.666667,27.777778,...,0,0,0,0,1,0,0,0,0,0
1271,10,27,13,1,28,30.111111,23.666667,16,23.800000,23.400000,...,1,0,0,0,0,0,0,0,0,0
1272,10,23,17,1,16,21.000000,23.444444,24,27.111111,18.333333,...,0,0,0,0,0,0,0,0,0,0


In [148]:
train_x = training_data.drop(columns=['outcome', 'team_score', 'opponent_score'], axis = 1)
train_y = training_data['outcome']
test_x = testing_data.drop(columns=['outcome', 'team_score', 'opponent_score'], axis = 1)
test_y = testing_data['outcome']

scaler = StandardScaler()
scaled_train_x = scaler.fit_transform(train_x)
lm = LogisticRegression(random_state=0).fit(scaled_train_x, train_y)

scaled_test_x = scaler.fit_transform(test_x)
confusion_matrix(test_y, lm.predict(scaled_test_x))

# # print(predicted_y)
# # print(test_y[:])
# sum(test_y[:] - predicted_y)


array([[24, 32],
       [32, 24]], dtype=int64)