### Machine Learning Evaluation
#### Description: 
Raw football data was imported and cleaned utilizing Pandas for the purpose of evaluating different supervised machine learning models with our dummy data. Data was initialized and split into testing and training datasets and was further evaluted with multiple models. Further analysis of accuracy scores and classification_reports will be completed to determine the most effective model to perform predictive modeling on our final dataset. 



### Import Dependencies/Machine Learning 

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split


In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter


### Import Dataset

In [3]:
# Load the .csv dataset.
file_path = "../Resources/final_raw.csv"
df = pd.read_csv(file_path)
df

Unnamed: 0,game_id,vis_team,home_team,vis_score,home_score,WINNER,OT,Roof,Surface,Temperature,...,rush_broken_tackles,rec_air_yds,rec_yac,rec_drops,offense,off_pct,last_300_median_DKP,last_300_median_FDP,last_600_median_DKP,last_600_median_FDP
0,201909050chi,GNB,CHI,10,3,VW,False,outdoors,grass,65,...,0,0.0,0,0,61,100,18.06,18.06,18.61,18.61
1,201909050chi,GNB,CHI,10,3,VW,False,outdoors,grass,65,...,0,63.2,19,0,59,97,23.30,18.60,21.40,17.75
2,201909050chi,GNB,CHI,10,3,VW,False,outdoors,grass,65,...,0,81.0,2,0,41,67,6.80,5.80,8.85,6.60
3,201909050chi,GNB,CHI,10,3,VW,False,outdoors,grass,65,...,1,-1.0,1,0,37,61,27.30,24.80,27.30,24.80
4,201909050chi,GNB,CHI,10,3,VW,False,outdoors,grass,65,...,0,21.0,8,0,33,54,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13933,202101240kan,BUF,KAN,24,38,HW,False,outdoors,astroturf,40,...,0,0.0,0,0,42,65,11.60,8.85,8.95,6.45
13934,202101240kan,BUF,KAN,24,38,HW,False,outdoors,astroturf,40,...,0,0.0,0,0,7,11,,,,
13935,202101240kan,BUF,KAN,24,38,HW,False,outdoors,astroturf,40,...,0,0.0,0,0,7,11,,,,
13936,202101240kan,BUF,KAN,24,38,HW,False,outdoors,astroturf,40,...,0,0.0,0,0,4,6,,,,


###

In [4]:
df.columns

Index(['game_id', 'vis_team', 'home_team', 'vis_score', 'home_score', 'WINNER',
       'OT', 'Roof', 'Surface', 'Temperature', 'Humidity', 'Wind_Speed',
       'Vegas_Line', 'Vegas_Favorite', 'Over_Under', 'game_date.x', 'season',
       'game_date.y', 'player_id', 'pos', 'player', 'team', 'pass_cmp',
       'pass_att', 'pass_yds', 'pass_td', 'pass_int', 'pass_sacked',
       'pass_sacked_yds', 'pass_long', 'pass_rating', 'rush_att', 'rush_yds',
       'rush_td', 'rush_long', 'targets', 'rec', 'rec_yds', 'rec_td',
       'rec_long', 'fumbles_lost', 'rush_scrambles', 'designed_rush_att',
       'comb_pass_rush_play', 'comb_pass_play', 'comb_rush_play',
       'Team_abbrev', 'Opponent_abbrev', 'two_point_conv', 'total_ret_td',
       'offensive_fumble_recovery_td', 'pass_yds_bonus', 'rush_yds_bonus',
       'rec_yds_bonus', 'Total_DKP', 'Off_DKP', 'Total_FDP', 'Off_FDP',
       'pass_target_yds', 'pass_poor_throws', 'pass_blitzed', 'pass_hurried',
       'rush_yds_before_contact', 'rush_

### Initialize Dataset
      

In [5]:
df["Surface"].unique()

array(['grass', 'fieldturf', 'sportturf', 'fieldturf ', 'astroturf',
       'matrixturf'], dtype=object)

In [6]:
#Drop 
#df = df.drop(['',])

#Keep
df = df[['pass_att', 'pass_yds', 'offense', 'Surface', 'OT']]
df.head()


Unnamed: 0,pass_att,pass_yds,offense,Surface,OT
0,30,203,61,grass,False
1,0,0,59,grass,False
2,0,0,41,grass,False
3,0,0,37,grass,False
4,0,0,33,grass,False


In [7]:
#Drop all null values
df = df.dropna()
df.head()

Unnamed: 0,pass_att,pass_yds,offense,Surface,OT
0,30,203,61,grass,False
1,0,0,59,grass,False
2,0,0,41,grass,False
3,0,0,37,grass,False
4,0,0,33,grass,False


### Split Data into Training and Testing

In [8]:
# Create our features and target

y = df['OT']

X = df.drop("OT", axis=1)
X = pd.get_dummies(X)

In [9]:
X.describe()

Unnamed: 0,pass_att,pass_yds,offense,Surface_astroturf,Surface_fieldturf,Surface_fieldturf.1,Surface_grass,Surface_matrixturf,Surface_sportturf
count,13938.0,13938.0,13938.0,13938.0,13938.0,13938.0,13938.0,13938.0,13938.0
mean,2.876812,20.70254,32.824652,0.098866,0.152676,0.060769,0.621108,0.0386,0.027981
std,9.692101,71.17253,21.978545,0.298493,0.359688,0.238915,0.485129,0.192645,0.164924
min,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,31.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,0.0,0.0,51.0,0.0,0.0,0.0,1.0,0.0,0.0
max,68.0,517.0,98.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

LogisticRegression(random_state=1)

In [None]:
# Train the data
classifier.fit(X_train, y_train)

In [None]:
#Make predictions
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

### Balanced Random Forest Classifier


In [None]:
# Resample the training data with the BalancedRandomForestClassifier
# Create a random forest classifier.
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=78) 
# Fitting the model
brf_model = brf_model.fit(X_train_scaled, y_train)
y_pred = brf_model.predict(X_test_scaled)


In [None]:
# Calculated the balanced accuracy score
# Calculating the accuracy score.
acc_score = balanced_accuracy_score(y_test,y_pred)
acc_score

In [None]:
# Display the confusion matrix
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Print the imbalanced classification report
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Balanced Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance
importances = brf_model.feature_importances_
sorted(zip(importances, X.columns), reverse=True)

### Easy Ensemble AdaBoost Classifer

In [None]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
classifier = EasyEnsembleClassifier(n_estimators=100)

classifier.fit(X_train_scaled, y_train)
predictions = classifier.predict(X_test_scaled)

In [None]:
# Calculated the balanced accuracy score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
acc_score = accuracy_score(y_test, y_pred)
print(f"Accuracy Score : {acc_score}")

In [None]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

In [None]:
# Print the imbalanced classification report
print("Classification Report")
print(classification_report(y_test, y_pred))

### Combination (Over and Under) Sampling
In this section, you will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. You will resample the data using the SMOTEENN algorithm and complete the folliowing steps:

1. View the count of the target classes using Counter from the collections library.
2. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the imbalanced_classification_report from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [None]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

### SMOTE Oversampling

In [None]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))