Import Dependencies / Machine Learning

In [1]:
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

Import Dataset

In [3]:
# Load the .csv dataset.
file_path = "../Resources/Use me 2020 NFL 3rd & Long Raw Dataset.csv"
df = pd.read_csv(file_path)
df

Unnamed: 0,GameId,GameDate,Quarter,Minute,Second,OffenseTeam,DefenseTeam,Down,ToGo,Yards,...,IsTwoPointConversion,IsTwoPointConversionSuccessful,RushDirection,YardLineFixed,YardLineDirection,IsPenaltyAccepted,PenaltyTeam,IsNoPlay,PenaltyType,PenaltyYards
0,2020122602,12/26/2020,2,9,44,LV,MIA,2,10,-9,...,0,0,,18,OPP,0,,0,,0
1,2020122602,12/26/2020,2,9,3,LV,MIA,3,19,18,...,0,0,LEFT GUARD,27,OPP,0,,0,,0
2,2020122500,12/25/2020,3,1,17,MIN,NO,3,9,0,...,0,0,,33,OPP,0,NO,1,DEFENSIVE HOLDING,0
3,2020122500,12/25/2020,3,1,12,MIN,NO,1,10,0,...,0,0,,18,OPP,1,NO,1,DEFENSIVE PASS INTERFERENCE,17
4,2020122500,12/25/2020,3,1,6,MIN,NO,1,1,-3,...,0,0,LEFT GUARD,1,OPP,0,,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46184,2020091300,9/13/2020,4,1,38,ATL,SEA,2,9,2,...,0,0,,20,OPP,0,,0,,0
46185,2020091300,9/13/2020,4,5,18,,,0,0,0,...,0,0,,0,OPP,0,,0,,0
46186,2020091300,9/13/2020,4,2,0,ATL,SEA,1,10,1,...,0,0,,21,OPP,0,,0,,0
46187,2020091300,9/13/2020,4,2,0,,,0,0,0,...,0,0,,0,OPP,0,,0,,0


In [4]:
df.columns

Index(['GameId', 'GameDate', 'Quarter', 'Minute', 'Second', 'OffenseTeam',
       'DefenseTeam', 'Down', 'ToGo', 'Yards', 'Yards (+,-)',
       'Successful 3rd Down Y/N', 'YardLine', 'SeriesFirstDown', 'NextScore',
       'Description', 'TeamWin', 'SeasonYear', 'Formation', 'PlayType',
       'IsRush', 'IsPass', 'IsIncomplete', 'IsTouchdown', 'PassType', 'IsSack',
       'IsChallenge', 'IsChallengeReversed', 'Challenger', 'IsMeasurement',
       'IsInterception', 'IsFumble', 'IsPenalty', 'IsTwoPointConversion',
       'IsTwoPointConversionSuccessful', 'RushDirection', 'YardLineFixed',
       'YardLineDirection', 'IsPenaltyAccepted', 'PenaltyTeam', 'IsNoPlay',
       'PenaltyType', 'PenaltyYards'],
      dtype='object')

In [5]:
df["OffenseTeam"].unique()

array(['LV', 'MIN', 'TB', 'ARI', 'DEN', 'MIA', nan, 'NYJ', 'CLE', 'NYG',
       'BAL', 'CIN', 'HOU', 'CHI', 'IND', 'PIT', 'DAL', 'WAS', 'SEA',
       'LA', 'CAR', 'TEN', 'BUF', 'KC', 'GB', 'ATL', 'NE', 'JAX', 'SF',
       'LAC', 'PHI', 'DET', 'NO'], dtype=object)

Intialize dataset for analysis by creating new dataframe using 6 of 43 columns in dataset

In [6]:
#Keep
df = df[['Quarter', 'Down' , 'ToGo', 'Successful 3rd Down Y/N' , 'Formation','PlayType',]]
df.head()

Unnamed: 0,Quarter,Down,ToGo,Successful 3rd Down Y/N,Formation,PlayType
0,2,2,10,4th Down,SHOTGUN,SACK
1,2,3,19,4th Down,SHOTGUN,RUSH
2,3,3,9,4th Down,SHOTGUN,PASS
3,3,1,10,4th Down,UNDER CENTER,PASS
4,3,1,1,4th Down,UNDER CENTER,RUSH


Get count of blank cells
Count now zero

In [7]:
df.isna().sum()

Quarter                       0
Down                          0
ToGo                          0
Successful 3rd Down Y/N       0
Formation                  1034
PlayType                   2131
dtype: int64

In [8]:
df[["Formation", "PlayType"]] = \
    df[["Formation", "PlayType"]].fillna("unk")

Get count of blank cells
Count now zero

In [9]:
df.isna().sum()

Quarter                    0
Down                       0
ToGo                       0
Successful 3rd Down Y/N    0
Formation                  0
PlayType                   0
dtype: int64

In [10]:
#Drop all null values
df = df.dropna()
df.head()

Unnamed: 0,Quarter,Down,ToGo,Successful 3rd Down Y/N,Formation,PlayType
0,2,2,10,4th Down,SHOTGUN,SACK
1,2,3,19,4th Down,SHOTGUN,RUSH
2,3,3,9,4th Down,SHOTGUN,PASS
3,3,1,10,4th Down,UNDER CENTER,PASS
4,3,1,1,4th Down,UNDER CENTER,RUSH


In [11]:
# Get list of unique play types
df['PlayType'].unique().tolist()

['SACK',
 'RUSH',
 'PASS',
 'NO PLAY',
 'FIELD GOAL',
 'TIMEOUT',
 'unk',
 'PUNT',
 'SCRAMBLE',
 'EXTRA POINT',
 'QB KNEEL',
 'TWO-POINT CONVERSION',
 'FUMBLES',
 'CLOCK STOP',
 'KICK OFF',
 'EXCEPTION',
 'PENALTY']

In [12]:
# create list of playtypes resulting in actual play
playlist=df['PlayType'].unique().tolist()
playlist2=['PASS','RUSH','FUMBLE','SACK','SCRAMBLE']

In [13]:
# created new dataframe using loc method to retrieve Quarter 4, 3rd Down, and unique play typles
q4_df=\
df.loc[(df['Quarter']==4)&\
       (df['Down']== 3)&\
      (df['PlayType'].isin(playlist2))\
      ].reset_index()

In [14]:
# Get count of play types
q4_df['PlayType'].value_counts()

PASS        1192
RUSH         434
SACK         137
SCRAMBLE      74
Name: PlayType, dtype: int64

In [15]:
q4_df

Unnamed: 0,index,Quarter,Down,ToGo,Successful 3rd Down Y/N,Formation,PlayType
0,11,4,3,10,N,SHOTGUN,PASS
1,25,4,3,2,Y,UNDER CENTER,RUSH
2,32,4,3,3,Y,SHOTGUN,RUSH
3,55,4,3,7,N,SHOTGUN,PASS
4,67,4,3,10,N,SHOTGUN,PASS
...,...,...,...,...,...,...,...
1832,45977,4,3,3,Y,SHOTGUN,PASS
1833,45984,4,3,9,N,SHOTGUN,PASS
1834,45993,4,3,19,N,SHOTGUN,PASS
1835,46090,4,3,1,N,SHOTGUN,RUSH


In [16]:
# Use loc method to retrieve columns form dataframe
q4_df2 = q4_df.loc[:,"ToGo":"PlayType"]
q4_df2

Unnamed: 0,ToGo,Successful 3rd Down Y/N,Formation,PlayType
0,10,N,SHOTGUN,PASS
1,2,Y,UNDER CENTER,RUSH
2,3,Y,SHOTGUN,RUSH
3,7,N,SHOTGUN,PASS
4,10,N,SHOTGUN,PASS
...,...,...,...,...
1832,3,Y,SHOTGUN,PASS
1833,9,N,SHOTGUN,PASS
1834,19,N,SHOTGUN,PASS
1835,1,N,SHOTGUN,RUSH


# Split Data into Training and Testing

In [17]:
# Create our features
y = q4_df2["Successful 3rd Down Y/N"]
X = q4_df2.drop("Successful 3rd Down Y/N", axis=1)
X = pd.get_dummies(X)

In [18]:
# Create our target
y = y.replace({"N":0, "Y":1})
y.value_counts()

0    1091
1     746
Name: Successful 3rd Down Y/N, dtype: int64

In [19]:
X.describe()

Unnamed: 0,ToGo,Formation_NO HUDDLE,Formation_NO HUDDLE SHOTGUN,Formation_SHOTGUN,Formation_UNDER CENTER,PlayType_PASS,PlayType_RUSH,PlayType_SACK,PlayType_SCRAMBLE
count,1837.0,1837.0,1837.0,1837.0,1837.0,1837.0,1837.0,1837.0,1837.0
mean,6.88024,0.009799,0.085465,0.765378,0.139358,0.648884,0.236255,0.074578,0.040283
std,5.152133,0.098528,0.279649,0.423878,0.346414,0.477449,0.424896,0.262781,0.196676
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,6.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
75%,10.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
max,34.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

Data samples used to train model

In [21]:
X_train

Unnamed: 0,ToGo,Formation_NO HUDDLE,Formation_NO HUDDLE SHOTGUN,Formation_SHOTGUN,Formation_UNDER CENTER,PlayType_PASS,PlayType_RUSH,PlayType_SACK,PlayType_SCRAMBLE
812,13,0,0,1,0,1,0,0,0
1388,1,0,0,0,1,0,1,0,0
1205,19,0,0,1,0,1,0,0,0
1282,9,0,0,1,0,0,1,0,0
957,18,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
905,2,0,0,1,0,1,0,0,0
1791,7,0,0,1,0,1,0,0,0
1096,5,0,0,1,0,1,0,0,0
235,9,0,0,1,0,1,0,0,0


Data Samples used to test model

In [22]:
X_test

Unnamed: 0,ToGo,Formation_NO HUDDLE,Formation_NO HUDDLE SHOTGUN,Formation_SHOTGUN,Formation_UNDER CENTER,PlayType_PASS,PlayType_RUSH,PlayType_SACK,PlayType_SCRAMBLE
556,1,0,0,0,1,0,1,0,0
1406,6,0,0,1,0,1,0,0,0
1451,8,0,0,1,0,1,0,0,0
111,1,0,1,0,0,0,1,0,0
1788,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1047,18,0,0,1,0,0,1,0,0
1615,15,0,0,1,0,1,0,0,0
1092,10,0,0,1,0,1,0,0,0
1493,10,0,0,1,0,1,0,0,0


# Logistic Regression (Binary Classification)

In [23]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

LogisticRegression(random_state=1)

In [24]:
# Train the data
classifier.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [25]:
# Make Predictions
predictions = classifier.predict(X_test)

First three X_test predictions (yes, no, no)

In [26]:
classifier.predict(X_test)

array([1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,

In [27]:
classifier.predict_proba(X_test)

array([[0.39369046, 0.60630954],
       [0.5409057 , 0.4590943 ],
       [0.62232104, 0.37767896],
       [0.35231023, 0.64768977],
       [0.38268928, 0.61731072],
       [0.58217307, 0.41782693],
       [0.49906956, 0.50093044],
       [0.5409057 , 0.4590943 ],
       [0.39369046, 0.60630954],
       [0.59932548, 0.40067452],
       [0.55238525, 0.44761475],
       [0.74603467, 0.25396533],
       [0.33747514, 0.66252486],
       [0.42300783, 0.57699217],
       [0.39369046, 0.60630954],
       [0.62232104, 0.37767896],
       [0.823848  , 0.176152  ],
       [0.41601742, 0.58398258],
       [0.45724643, 0.54275357],
       [0.60030351, 0.39969649],
       [0.41601742, 0.58398258],
       [0.62232104, 0.37767896],
       [0.62232104, 0.37767896],
       [0.88171914, 0.11828086],
       [0.97199364, 0.02800636],
       [0.37593105, 0.62406895],
       [0.45707086, 0.54292914],
       [0.34790909, 0.65209091],
       [0.66085858, 0.33914142],
       [0.41601742, 0.58398258],
       [0.

In [36]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
print ("Accuracy score")
print (accuracy_score(y_test, predictions))

Accuracy score
0.6804347826086956


In [37]:
# Display the confusion matrix
# Calculating the confusion matrix.
cm = confusion_matrix(y_test,predictions)

# Create a DataFrame from the confusion matrix.
cm_df= pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])


In [38]:
#from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Displaying results
print("Confusion Matrix")
display(cm_df)
#print(f"Accuracy Score : {accuracy_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,186,82
Actual 1,65,127


Classification Report
              precision    recall  f1-score   support

           0       0.74      0.69      0.72       268
           1       0.61      0.66      0.63       192

    accuracy                           0.68       460
   macro avg       0.67      0.68      0.68       460
weighted avg       0.69      0.68      0.68       460



# Balanced Random Forest Classifier

In [29]:
# Resample the training data with the BalancedRandomForestClassifier
# Create a random forest classifier.
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.metrics import classification_report_imbalanced
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=78) 
# Fitting the model
brf_model = brf_model.fit(X_train_scaled, y_train)
y_pred = brf_model.predict(X_test_scaled)

In [30]:
# Calculated the balanced accuracy score
# Calculating the accuracy score.
acc_score = balanced_accuracy_score(y_test,y_pred)
acc_score

0.6262049129353233

In [31]:
# Display the confusion matrix
# Calculating the confusion matrix.
cm = confusion_matrix(y_test,y_pred)

# Create a DataFrame from the confusion matrix.
cm_df= pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])


In [32]:
#from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Balanced Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,150,118
Actual 1,59,133


Balanced Accuracy Score : 0.6262049129353233
Classification Report
              precision    recall  f1-score   support

           0       0.72      0.56      0.63       268
           1       0.53      0.69      0.60       192

    accuracy                           0.62       460
   macro avg       0.62      0.63      0.61       460
weighted avg       0.64      0.62      0.62       460



In [33]:
# List the features sorted in descending order by feature importance
importances = brf_model.feature_importances_
sorted(zip(importances, X.columns), reverse=True)

[(0.6627404505289869, 'ToGo'),
 (0.18019640499663878, 'PlayType_SACK'),
 (0.04138976817686553, 'PlayType_PASS'),
 (0.02735116546275124, 'PlayType_RUSH'),
 (0.02553780772936693, 'PlayType_SCRAMBLE'),
 (0.02143842184203999, 'Formation_SHOTGUN'),
 (0.01824763409301369, 'Formation_NO HUDDLE SHOTGUN'),
 (0.018013088110058428, 'Formation_UNDER CENTER'),
 (0.0050852590602785555, 'Formation_NO HUDDLE')]

# Easy Ensemble AdaBoost Classifier

In [34]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
classifier = EasyEnsembleClassifier(n_estimators=100)

classifier.fit(X_train_scaled, y_train)
predictions = classifier.predict(X_test_scaled)

In [51]:
# Calculated the balanced accuracy score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
acc_score = accuracy_score(y_test, y_pred)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.6630434782608695


In [36]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,150,118
Actual 1,59,133


In [37]:
# Print the imbalanced classification report
print("Classification Report")
print(f"Accuracy Score : {acc_score}")
print(classification_report(y_test, y_pred))

Classification Report
Accuracy Score : 0.6152173913043478
              precision    recall  f1-score   support

           0       0.72      0.56      0.63       268
           1       0.53      0.69      0.60       192

    accuracy                           0.62       460
   macro avg       0.62      0.63      0.61       460
weighted avg       0.64      0.62      0.62       460



# Combination (Over and Under)

In [38]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 503, 1: 159})

In [39]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [40]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.6262049129353233

In [41]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[215,  53],
       [102,  90]], dtype=int64)

In [42]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.68      0.80      0.47      0.74      0.61      0.39       268
          1       0.63      0.47      0.80      0.54      0.61      0.36       192

avg / total       0.66      0.66      0.61      0.65      0.61      0.38       460



# SMOTE Oversampling

In [43]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

Counter(y_resampled)

Counter({0: 823, 1: 823})

In [44]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [45]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [46]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[215,  53],
       [102,  90]], dtype=int64)

In [47]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.68      0.80      0.47      0.74      0.61      0.39       268
          1       0.63      0.47      0.80      0.54      0.61      0.36       192

avg / total       0.66      0.66      0.61      0.65      0.61      0.38       460

