In [1]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
mlb_game_data = pd.read_csv("mlb_2022_training_data.csv")
mlb_game_data.head()

Unnamed: 0,HRs Hit,BA,OBP,SLG,OPS,Opp ERA,Num Pitchers Used,Target HR,Next H/A_H,Next Venue_ARI,...,Next Venue_PIT,Next Venue_SDP,Next Venue_SEA,Next Venue_SFG,Next Venue_STL,Next Venue_TBR,Next Venue_TEX,Next Venue_TOR,Next Venue_WSN,Next Opp Arm_R
0,1,0.115,0.324,0.231,0.554,4.5,5,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0.091,0.219,0.145,0.364,2.12,4,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2,1,0.116,0.216,0.209,0.426,2.08,4,1,1,1,...,0,0,0,0,0,0,0,0,0,1
3,2,0.129,0.257,0.259,0.516,2.83,6,1,1,1,...,0,0,0,0,0,0,0,0,0,1
4,1,0.13,0.257,0.26,0.518,1.64,6,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [5]:
y = mlb_game_data['Target HR']
X = mlb_game_data.drop(columns=['Target HR'])

X_train, X_test, y_train, y_test = train_test_split(X, y)

# # Instantiate a StandardScaler instance
# scaler = StandardScaler()

# # Fit the training data to the standard scaler
# X_scaler = scaler.fit(X_train)

# # Transform the training data using the scaler
# X_train_scaled = X_scaler.transform(X_train)

# # Transform the testing data using the scaler
# X_test_scaled = X_scaler.transform(X_test)

model_all = LogisticRegression(max_iter=500)
model_all.fit(X_train, y_train)

training_predictions = model_all.predict(X_train)
testing_predictions = model_all.predict(X_test)

print("(tn, fp, fn, tp)")
print(confusion_matrix(y_test, testing_predictions).ravel())

# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print("-"*20+"TRAINING"+"-"*20)
print(training_report)

# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print("-"*20+"TEST"+"-"*20)
print(testing_report)

(tn, fp, fn, tp)
[ 20 417  20 751]
--------------------TRAINING--------------------
              precision    recall  f1-score   support

           0       0.54      0.04      0.08      1350
           1       0.63      0.98      0.77      2272

    accuracy                           0.63      3622
   macro avg       0.59      0.51      0.43      3622
weighted avg       0.60      0.63      0.51      3622

--------------------TEST--------------------
              precision    recall  f1-score   support

           0       0.50      0.05      0.08       437
           1       0.64      0.97      0.77       771

    accuracy                           0.64      1208
   macro avg       0.57      0.51      0.43      1208
weighted avg       0.59      0.64      0.52      1208



In [6]:
pickle.dump(model_all, open('model-all.pkl', 'wb'))

In [21]:
pickle_model = pickle.load(open('model-all.pkl', 'rb'))
pickel_testing_predictions = pickle_model.predict(X_test)
print(classification_report(y_test, pickel_testing_predictions))

              precision    recall  f1-score   support

           0       0.53      0.12      0.20       453
           1       0.64      0.94      0.76       755

    accuracy                           0.63      1208
   macro avg       0.58      0.53      0.48      1208
weighted avg       0.60      0.63      0.55      1208



In [11]:
mlb_game_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4830 entries, 0 to 4829
Data columns (total 40 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   HRs Hit            4830 non-null   int64  
 1   BA                 4830 non-null   float64
 2   OBP                4830 non-null   float64
 3   SLG                4830 non-null   float64
 4   OPS                4830 non-null   float64
 5   Opp ERA            4830 non-null   float64
 6   Num Pitchers Used  4830 non-null   int64  
 7   Target HR          4830 non-null   int64  
 8   Next H/A_H         4830 non-null   int64  
 9   Next Venue_ARI     4830 non-null   int64  
 10  Next Venue_ATL     4830 non-null   int64  
 11  Next Venue_BAL     4830 non-null   int64  
 12  Next Venue_BOS     4830 non-null   int64  
 13  Next Venue_CHC     4830 non-null   int64  
 14  Next Venue_CHW     4830 non-null   int64  
 15  Next Venue_CIN     4830 non-null   int64  
 16  Next Venue_CLE     4830 

In [18]:
drop_cols = ['Target HR','Num Pitchers Used','OPS','HRs Hit','OBP','Next Opp Arm_R']

y = mlb_game_data['Target HR']
X = mlb_game_data.drop(columns=drop_cols)

X_train, X_test, y_train, y_test = train_test_split(X, y)

# # Instantiate a StandardScaler instance
# scaler = StandardScaler()

# # Fit the training data to the standard scaler
# X_scaler = scaler.fit(X_train)

# # Transform the training data using the scaler
# X_train_scaled = X_scaler.transform(X_train)

# # Transform the testing data using the scaler
# X_test_scaled = X_scaler.transform(X_test)

model_2 = LogisticRegression(max_iter=200)
model_2.fit(X_train, y_train)

training_predictions = model_2.predict(X_train)
testing_predictions = model_2.predict(X_test)

print("(tn, fp, fn, tp)")
print(confusion_matrix(y_test, testing_predictions).ravel())

# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print("-"*20+"TRAINING"+"-"*20)
print(training_report)

# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print("-"*20+"TEST"+"-"*20)
print(testing_report)

(tn, fp, fn, tp)
[  9 434   8 757]
--------------------TRAINING--------------------
              precision    recall  f1-score   support

           0       0.46      0.02      0.04      1344
           1       0.63      0.99      0.77      2278

    accuracy                           0.63      3622
   macro avg       0.54      0.50      0.40      3622
weighted avg       0.57      0.63      0.50      3622

--------------------TEST--------------------
              precision    recall  f1-score   support

           0       0.53      0.02      0.04       443
           1       0.64      0.99      0.77       765

    accuracy                           0.63      1208
   macro avg       0.58      0.50      0.41      1208
weighted avg       0.60      0.63      0.50      1208

