In [91]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import accuracy_score

In [92]:
# Load the datasets
df_2021 = pd.read_csv('2021data.csv')
df_2022 = pd.read_csv('2022data.csv')
df_2023 = pd.read_csv('2023data.csv')

# Combine the datasets
data = pd.concat([df_2021, df_2022, df_2023])

# Save the combined dataset
data.to_csv('combined_dataset.csv', index=False)

In [93]:
data

Unnamed: 0,Week,Team,Day,Date,Time,Link,Result,OT,Rec,H/A,...,Off_RushYd,Off_TO,Def_1stD,Def_TotYd,Def_PassYd,Def_RushYD,Def_TO,Exp_Offense,Exp_Defense,Exp_SpecTms
0,1,49ers,Sun,12-Sep,1:00PM ET,boxscore,W,,Jan-00,@,...,131.0,2.0,31.0,430.0,314.0,116.0,1.0,15.48,-5.16,-1.73
1,2,49ers,Sun,19-Sep,1:00PM ET,boxscore,W,,Feb-00,@,...,117.0,,18.0,328.0,177.0,151.0,,7.86,-5.30,2.68
2,3,49ers,Sun,26-Sep,8:20PM ET,boxscore,L,,1-Feb,,...,67.0,2.0,21.0,353.0,253.0,100.0,,12.19,-15.14,0.95
3,4,49ers,Sun,3-Oct,4:05PM ET,boxscore,L,,2-Feb,,...,143.0,2.0,14.0,234.0,129.0,105.0,,8.61,-2.20,-14.31
4,5,49ers,Sun,10-Oct,4:25PM ET,boxscore,L,,3-Feb,@,...,152.0,1.0,20.0,304.0,210.0,94.0,1.0,-6.53,-1.74,-0.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,14,vikings,Sun,10-Dec,4:05PM ET,boxscore,W,,6-Jul,@,...,132.0,,8.0,202.0,146.0,56.0,3.0,-10.05,20.46,-6.43
612,15,vikings,Sat,16-Dec,1:00PM ET,boxscore,L,OT,7-Jul,@,...,143.0,2.0,22.0,378.0,298.0,80.0,1.0,8.62,-9.29,-1.35
613,16,vikings,Sun,24-Dec,1:00PM ET,boxscore,L,,8-Jul,,...,17.0,4.0,28.0,389.0,246.0,143.0,1.0,11.43,-15.44,-0.61
614,17,vikings,Sun,31-Dec,8:20PM ET,boxscore,L,,9-Jul,,...,67.0,2.0,28.0,470.0,293.0,177.0,1.0,-14.27,-22.54,12.19


In [94]:
data['Result'] = data['Result'].apply(lambda x: 1 if x == 'W' else 0)  # Convert Win to 1 and anything else to 0

#1 if OT, 0 if no OT
data['OT'] = data['OT'].fillna(0).replace('OT', 1)

# 1 is home, 0 is away
data['H/A'] = data['H/A'].fillna(1).replace('@', 0)

data = data.drop(columns = ['Day', 'Date', 'Time', 'Link', 'Rec', 'Week'])

data = data.dropna()

data = data.reset_index()

  data['OT'] = data['OT'].fillna(0).replace('OT', 1)


In [95]:
data.head()


Unnamed: 0,index,Team,Result,OT,H/A,Opp,Score_Tm,Score_Opp,Off_1stD,Off_TotYd,...,Off_RushYd,Off_TO,Def_1stD,Def_TotYd,Def_PassYd,Def_RushYD,Def_TO,Exp_Offense,Exp_Defense,Exp_SpecTms
0,0,49ers,1,0,0,Detroit Lions,41.0,33.0,21.0,442.0,...,131.0,2.0,31.0,430.0,314.0,116.0,1.0,15.48,-5.16,-1.73
1,4,49ers,0,0,0,Arizona Cardinals,10.0,17.0,19.0,338.0,...,152.0,1.0,20.0,304.0,210.0,94.0,1.0,-6.53,-1.74,-0.47
2,6,49ers,0,0,1,Indianapolis Colts,18.0,30.0,13.0,280.0,...,111.0,4.0,17.0,295.0,147.0,148.0,2.0,-15.3,-1.16,4.24
3,11,49ers,1,0,1,Minnesota Vikings,34.0,26.0,23.0,423.0,...,208.0,1.0,17.0,323.0,256.0,67.0,2.0,8.93,4.16,-6.1
4,12,49ers,0,0,0,Seattle Seahawks,23.0,30.0,17.0,365.0,...,71.0,3.0,21.0,327.0,181.0,146.0,3.0,-0.51,-2.27,-3.44


In [96]:
data = data.drop(columns = ['index'])
data['Score_Tm'] = data['Score_Tm'].astype(float)
data = data[~(data['H/A'] == 'N')]
data['H/A'] = data['H/A'].astype(int)
column = data.pop('Opp')
data.insert(1, 'Opp', column)
data.head()

Unnamed: 0,Team,Opp,Result,OT,H/A,Score_Tm,Score_Opp,Off_1stD,Off_TotYd,Off_PassYd,Off_RushYd,Off_TO,Def_1stD,Def_TotYd,Def_PassYd,Def_RushYD,Def_TO,Exp_Offense,Exp_Defense,Exp_SpecTms
0,49ers,Detroit Lions,1,0,0,41.0,33.0,21.0,442.0,311.0,131.0,2.0,31.0,430.0,314.0,116.0,1.0,15.48,-5.16,-1.73
1,49ers,Arizona Cardinals,0,0,0,10.0,17.0,19.0,338.0,186.0,152.0,1.0,20.0,304.0,210.0,94.0,1.0,-6.53,-1.74,-0.47
2,49ers,Indianapolis Colts,0,0,1,18.0,30.0,13.0,280.0,169.0,111.0,4.0,17.0,295.0,147.0,148.0,2.0,-15.3,-1.16,4.24
3,49ers,Minnesota Vikings,1,0,1,34.0,26.0,23.0,423.0,215.0,208.0,1.0,17.0,323.0,256.0,67.0,2.0,8.93,4.16,-6.1
4,49ers,Seattle Seahawks,0,0,0,23.0,30.0,17.0,365.0,294.0,71.0,3.0,21.0,327.0,181.0,146.0,3.0,-0.51,-2.27,-3.44


In [87]:
data['H/A'].value_counts()

H/A
0    416
1    416
Name: count, dtype: int64

In [97]:
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, 3:19], data.iloc[:, 2], test_size = 0.2, random_state = 1)

Score_Tm
17.0    39
20.0    36
27.0    29
10.0    24
31.0    22
        ..
8.0      1
39       1
45.0     1
48       1
49       1
Name: count, Length: 81, dtype: int64

In [98]:
rf = RandomForestClassifier(n_estimators = 100, random_state = 1)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(accuracy)

0.9221556886227545


array([1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0], dtype=int64)

In [13]:
# Filter the dataset for Detroit Lions games
lions_games = data[data['Team'] == 'Lions']

# For simplicity, we will create a "typical" game for the Lions based on their historical data's mean
typical_lions_game = lions_games[numeric_columns].apply(lambda x: x.mean(), axis=0)
typical_lions_game = typical_lions_game.to_frame().transpose()

# We'll need to handle categorical data separately as we can't just take a mean
# For this example, we'll assume the mode (most common category) for categorical attributes
typical_lions_categorical = lions_games[categorical_columns].mode().iloc[0]

# Combine numerical and categorical "typical" game data
typical_lions_game = pd.concat([typical_lions_game, typical_lions_categorical], axis=1)

# We assume that the 'Result' column (our label) is not present in the new data we want to predict
typical_lions_game = typical_lions_game[X_train.columns]

# Now we use our trained model to make a prediction
# Note: This prediction is speculative and based on the "average" past game features
lions_next_game_pred = rf_pipeline.predict(typical_lions_game)

# Convert prediction back to 'W' or 'L'
lions_next_game_result = 'W' if lions_next_game_pred[0] == 1 else 'L'
lions_next_game_result


IndexError: single positional indexer is out-of-bounds

In [12]:
# Check how many Lions games are available in the dataset
lions_games_count = lions_games.shape[0]

# Check for any columns that might have all null values, which could cause the mode to fail
null_columns = lions_games[categorical_columns].isnull().all()

# Display the number of Lions games and any categorical columns that have only null values
(lions_games_count, null_columns)

NameError: name 'lions_games' is not defined