# MLB Predictions

In [1]:
#import required modules
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including any raised by pandas, scikit-learn, or XGBoost in the cells below.
warnings.filterwarnings('ignore')

## Import Mariners 2022 Season Data

In [2]:
# Load the Mariners 2022 feature table (X) and win/loss target (y) for the models
features_csv = Path('./Resources/mariners_2022_X.csv')
target_csv = Path('./Resources/mariners_2022_y.csv')

# The first CSV column becomes the row index of each frame
X = pd.read_csv(features_csv, index_col=[0])
y = pd.read_csv(target_csv, index_col=[0])

In [3]:
# Review the features frame and the target frame, one after the other
display(X, y)

Unnamed: 0,Rank,Home_Away_@,Home_Away_Home,Opp_ATL,Opp_BAL,Opp_BOS,Opp_CHW,Opp_CLE,Opp_DET,Opp_HOU,...,Opp_NYY,Opp_OAK,Opp_PHI,Opp_SDP,Opp_TBR,Opp_TEX,Opp_TOR,Opp_WSN,D/N_D,D/N_N
1,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,3.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
159,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
160,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
161,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Unnamed: 0,W/L
1,1
2,1
3,0
4,0
5,0
...,...
158,0
159,0
160,1
161,1


## Split into Training and Testing Sets Using `train_test_split`

In [4]:
# Randomly hold out 25% of the rows for testing using train_test_split;
# random_state=1 keeps the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [5]:
# Check the class balance of the testing target (number of rows per W/L value)
y_test.value_counts()

W/L
1      23
0      18
dtype: int64

## Normalize Data Using StandardScaler

In [6]:
# Create the StandardScaler instance (it is fitted on the training features in the next cell)
scaler = StandardScaler()

In [7]:
# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the testing features
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Logistic Regression Model

In [8]:
# Instantiate the Logistic Regression Model w/ random_state = 1
model = LogisticRegression(random_state=1)

# Fit the model using the scaled training data. y_train is a one-column
# DataFrame, so flatten it to a 1-D array of labels: scikit-learn expects y of
# shape (n_samples,) and otherwise emits a DataConversionWarning.
lr_model = model.fit(X_train_scaled, y_train.values.ravel())

In [9]:
# Predict the W/L class for each row of the scaled testing features
y_pred = lr_model.predict(X_test_scaled)

### Evaluation Metrics

In [10]:
# Balanced accuracy score (not plain accuracy) of the test-set predictions
print(balanced_accuracy_score(y_test, y_pred))

0.5640096618357487


In [11]:
# Generate confusion matrix (rows: actual class, columns: predicted class)
confusion_matrix(y_test, y_pred)

array([[ 7, 11],
       [ 6, 17]], dtype=int64)

In [12]:
# Print the four cells of the confusion matrix by name.
# labels=[0, 1] pins the matrix to 2x2, so the four-way unpacking below cannot
# fail when one class is missing from both y_test and y_pred.
TN, FP, FN, TP = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
print('True Positive(TP) = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN) = ', TN)
print('False Negative(FN) = ', FN)

True Positive(TP) =  17
False Positive(FP) =  11
True Negative(TN) =  7
False Negative(FN) =  6


In [13]:
# Generate classification report (per-class precision, recall, f1-score, support)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.39      0.45        18
           1       0.61      0.74      0.67        23

    accuracy                           0.59        41
   macro avg       0.57      0.56      0.56        41
weighted avg       0.58      0.59      0.57        41



## XGBoost Model

In [14]:
# Instantiate XGBoost Model w/ random_state = 1
# (this rebinds `model`, which previously held the logistic regression estimator)
model = XGBClassifier(random_state=1)

In [15]:
# Fit the model using the scaled training data; the result of fit() is kept
# as XGBModel and used for prediction in the next cell
XGBModel = model.fit(X_train_scaled, y_train)



In [16]:
# Make predictions for test data
y_pred = XGBModel.predict(X_test_scaled)

# predict() already returns discrete class labels rather than probabilities
# (the metric cells below feed y_pred straight into confusion_matrix and
# classification_report), so no rounding is needed. `predictions` is kept as
# an alias because the balanced accuracy cell reads it.
predictions = y_pred

### Evaluation Metrics

In [17]:
# NOTE: despite the variable name, this is the balanced accuracy score,
# not plain accuracy
accuracy = balanced_accuracy_score(y_test, predictions)
accuracy

0.5483091787439613

In [18]:
# Generate confusion matrix (rows: actual class, columns: predicted class)
confusion_matrix(y_test, y_pred)

array([[ 8, 10],
       [ 8, 15]], dtype=int64)

In [19]:
# Print the four cells of the confusion matrix by name.
# labels=[0, 1] pins the matrix to 2x2, so the four-way unpacking below cannot
# fail when one class is missing from both y_test and y_pred.
TN, FP, FN, TP = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
print('True Positive(TP) = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN) = ', TN)
print('False Negative(FN) = ', FN)

True Positive(TP) =  15
False Positive(FP) =  10
True Negative(TN) =  8
False Negative(FN) =  8


In [20]:
# Generate classification report (per-class precision, recall, f1-score, support)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.44      0.47        18
           1       0.60      0.65      0.63        23

    accuracy                           0.56        41
   macro avg       0.55      0.55      0.55        41
weighted avg       0.56      0.56      0.56        41



## Test Models Using Index-Based (Non-Random) Slicing for Testing and Training Sets

In [21]:
# Split by index label instead of at random: rows labelled up to and including
# 122 form the training set, rows labelled 123 onward form the testing set
# (.loc slices include both end labels).
# NOTE(review): presumably the index follows game order — confirm.
split_label = 122
X_train, y_train = X.loc[:split_label], y.loc[:split_label]
X_test, y_test = X.loc[split_label + 1:], y.loc[split_label + 1:]

In [39]:
# Check the class balance of the testing target (number of rows per W/L value)
y_test.value_counts()

W/L
1      24
0      16
dtype: int64

## Normalize Data Using StandardScaler

In [23]:
# Create a fresh StandardScaler instance for the index-based split
# (it is fitted on the new training features in the next cell)
scaler = StandardScaler()

In [24]:
# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the testing features
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Logistic Regression Model

In [25]:
# Instantiate the Logistic Regression Model w/ random_state = 1
model = LogisticRegression(random_state=1)

# Fit the model using the scaled training data. y_train is a one-column
# DataFrame, so flatten it to a 1-D array of labels: scikit-learn expects y of
# shape (n_samples,) and otherwise emits a DataConversionWarning.
lr_model = model.fit(X_train_scaled, y_train.values.ravel())

In [26]:
# Predict the W/L class for each row of the scaled testing features
y_pred = lr_model.predict(X_test_scaled)

### Evaluation Metrics

In [27]:
# Balanced accuracy score (not plain accuracy) of the test-set predictions
print(balanced_accuracy_score(y_test, y_pred))

0.5416666666666667


In [28]:
# Generate confusion matrix (rows: actual class, columns: predicted class)
confusion_matrix(y_test, y_pred)

array([[ 2, 14],
       [ 1, 23]], dtype=int64)

In [29]:
# Print the four cells of the confusion matrix by name.
# labels=[0, 1] pins the matrix to 2x2, so the four-way unpacking below cannot
# fail when one class is missing from both y_test and y_pred.
TN, FP, FN, TP = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
print('True Positive(TP) = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN) = ', TN)
print('False Negative(FN) = ', FN)

True Positive(TP) =  23
False Positive(FP) =  14
True Negative(TN) =  2
False Negative(FN) =  1


In [30]:
# Generate classification report (per-class precision, recall, f1-score, support)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.12      0.21        16
           1       0.62      0.96      0.75        24

    accuracy                           0.62        40
   macro avg       0.64      0.54      0.48        40
weighted avg       0.64      0.62      0.54        40



## XGBoost Model

In [31]:
# Instantiate XGBoost Model w/ random_state = 1
# (this rebinds `model`, which previously held the logistic regression estimator)
model = XGBClassifier(random_state=1)

In [32]:
# Fit the model using the scaled training data; the result of fit() is kept
# as XGBModel and used for prediction in the next cell
XGBModel = model.fit(X_train_scaled, y_train)



In [33]:
# Make predictions for test data
y_pred = XGBModel.predict(X_test_scaled)

# predict() already returns discrete class labels rather than probabilities
# (the metric cells below feed y_pred straight into confusion_matrix and
# classification_report), so no rounding is needed. `predictions` is kept as
# an alias because the balanced accuracy cell reads it.
predictions = y_pred

### Evaluation Metrics

In [34]:
# NOTE: despite the variable name, this is the balanced accuracy score,
# not plain accuracy
accuracy = balanced_accuracy_score(y_test, predictions)
accuracy

0.4791666666666667

In [35]:
# Generate confusion matrix (rows: actual class, columns: predicted class)
confusion_matrix(y_test, y_pred)

array([[ 0, 16],
       [ 1, 23]], dtype=int64)

In [36]:
# Print the four cells of the confusion matrix by name.
# labels=[0, 1] pins the matrix to 2x2, so the four-way unpacking below cannot
# fail when one class is missing from both y_test and y_pred.
TN, FP, FN, TP = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
print('True Positive(TP) = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN) = ', TN)
print('False Negative(FN) = ', FN)

True Positive(TP) =  23
False Positive(FP) =  16
True Negative(TN) =  0
False Negative(FN) =  1


In [37]:
# Generate classification report (per-class precision, recall, f1-score, support)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.59      0.96      0.73        24

    accuracy                           0.57        40
   macro avg       0.29      0.48      0.37        40
weighted avg       0.35      0.57      0.44        40

