This Jupyter Notebook analyzes baseball pitches to predict a batter's swing probability using data from three seasons. It processes the data by handling missing values, 
one-hot encoding categorical variables, and ensuring consistency between training and testing datasets.

Key functionalities include:

- Calculating Variance Inflation Factor (VIF) to check for multicollinearity.
- Optimizing a RandomForestClassifier with GridSearchCV based on ROC AUC scores.
- Evaluating model performance with metrics like ROC AUC, accuracy, confusion matrices, and generating ROC curves.
- Visualizing feature importance to highlight influential factors in the model’s predictions.
- Saving the best model and updating the dataset with predicted probabilities for further validation.

#### Import Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import pickle

#### Utility Functions

In [None]:
# Check multicollinearity among the model features.
def calculate_vifs(df):
    """Return the variance inflation factor (VIF) of every column in ``df``.

    An intercept column is added with ``add_constant`` before the VIFs are
    computed, so the result normally also contains a row for that constant.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix. Every column must be convertible to float (numeric
        or boolean, e.g. the output of ``pd.get_dummies``).

    Returns
    -------
    pandas.DataFrame
        One row per column of the augmented design matrix, with the column
        name in ``"Variable"`` and its VIF in ``"VIF"``.
    """
    # A frame mixing bool dummies with numeric columns becomes an object
    # ndarray under ``.values``, which the regression behind
    # variance_inflation_factor cannot process. Cast to float up front.
    design = add_constant(df.astype(float))
    design_values = design.values

    vifs = pd.DataFrame()
    vifs["Variable"] = design.columns
    vifs["VIF"] = [
        variance_inflation_factor(design_values, column_idx)
        for column_idx in range(design_values.shape[1])
    ]

    return vifs

#### Read, preprocess, and split the data into training and validation sets; feature engineering

In [None]:
# Read the three seasons of pitch data from their CSV files.
data_season1, data_season2, data_season3 = (
    pd.read_csv(csv_path) for csv_path in ('year1.csv', 'year2.csv', 'year3.csv')
)
# Seasons 1 and 2 together form the training set. The data set is large, so
# rows containing any NaN value are simply dropped instead of being imputed.
data_train = pd.concat([data_season1, data_season2], ignore_index=True).dropna(axis=0)

In [None]:
# Season 3 preparation. Every row starts with a neutral SwingProbability of
# 0.5; only the complete rows copied into data_season3_clean are scored by the
# model in a later cell, so rows with missing values keep the 0.5 default.
nan_rows = data_season3.isnull().any(axis='columns')  # True where a row has any NaN
data_season3['SwingProbability'] = 0.5
data_season3_clean = data_season3.dropna().copy()

In [None]:
def _relative_pitch_height(pitches):
    """Return plate_z rescaled so that sz_bot maps to 0 and sz_top maps to 1."""
    zone_height = pitches['sz_top'] - pitches['sz_bot']
    return (pitches['plate_z'] - pitches['sz_bot']) / zone_height


# Pitch descriptions that count as the batter having swung.
swing_events = ['foul', 'hit_into_play', 'swinging_strike', 'foul_tip',
                'foul_bunt', 'swinging_strike_blocked', 'missed_bunt', 'bunt_foul_tip','foul_pitchout']

# Binary target: 1 when the description is one of the swing events, else 0.
# A vectorized membership test replaces a per-row Python lambda.
data_train['swing'] = data_train['description'].isin(swing_events).astype('int64')

# Engineered feature, added to both the training data and the season 3 data
# through one shared helper so the two definitions cannot drift apart.
data_train['relative_pitch_height'] = _relative_pitch_height(data_train)
data_season3_clean['relative_pitch_height'] = _relative_pitch_height(data_season3_clean)

# Model inputs; the categorical ones are one-hot encoded in a later cell.
features = ['release_speed', 'plate_x','stand','p_throws','pfx_x', 'pfx_z','balls', 'strikes', 'pitch_type', 'relative_pitch_height']

In [None]:
# One-hot encode the training features. The first level of each categorical
# is dropped and serves as the baseline.
X_train = pd.get_dummies(data_train[features], drop_first=True)

# Encode season 3 with ALL levels and then keep exactly the training columns.
# Dropping the first level here as well would be wrong whenever season 3 is
# missing the training baseline level: a different level would be dropped,
# and the reindex below would zero-fill its column, silently encoding those
# rows as the baseline. Levels unseen in training are discarded by the
# reindex; training columns absent from season 3 are filled with 0.
X_test = pd.get_dummies(data_season3_clean[features]).reindex(
    columns=X_train.columns, fill_value=0
)

# Prepare target
y_train = data_train['swing']

# Hold out 20% of the training data for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=21)

#### Model Development

In [None]:
# Multicollinearity check: VIF table for the training feature matrix.
vif_data = calculate_vifs(df=X_train)
print(vif_data)

In [None]:
# Model training.
# Seed the forest so repeated runs produce the same fitted model; 21 is the
# same seed the train/validation split uses.
model = RandomForestClassifier(random_state=21)

# A broader grid (n_estimators in [50, 100, 200, 300], max_depth in
# [10, 15, 20, 25, 30]) used to be assigned here and was immediately
# overwritten by the single combination below, so it never took effect.
# NOTE(review): presumably these are the winners of that wider search - confirm.
param_grid = {'n_estimators': [300], 'max_depth': [25]}

# 5-fold cross-validated search scored by ROC AUC.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)

# pick the best model
best_model = grid_search.best_estimator_
print(best_model)

#### Evaluations and Predictions

In [None]:
# Score the held-out validation split with the tuned model.
val_class_probabilities = best_model.predict_proba(X_val)
y_pred_proba = val_class_probabilities[:, 1]  # column 1: the positive (swing = 1) class
y_pred = best_model.predict(X_val)  # hard class labels

In [None]:
# Area under the ROC curve, computed from the validation probabilities.
roc_auc = roc_auc_score(y_true=y_val, y_score=y_pred_proba)
print(f'ROC AUC Score: {roc_auc}')

In [None]:
# Share of validation pitches whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

In [None]:
# Confusion matrix and per-class precision/recall report on the validation split.
conf_matrix = confusion_matrix(y_val, y_pred)
print(classification_report(y_val, y_pred))
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
# Plot exactly once. A second bare plot() call would open another figure that
# repeats the matrix with the default colormap and number format.
disp = disp.plot(cmap=plt.cm.Blues, values_format='g')
plt.show()

In [None]:
# ROC curve of the validation predictions, with the chance diagonal for reference.
fpr, tpr, _ = roc_curve(y_val, y_pred_proba)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, color='cornflowerblue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # random-classifier baseline
roc_ax.set_xlim(0.0, 1.0)
roc_ax.set_ylim(0.0, 1.05)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.legend(loc='lower right')
plt.show()

In [None]:
# Feature importance of the tuned forest, plotted from most to least important.
importances = best_model.feature_importances_
indices = np.argsort(importances)[::-1]  # positions sorted by descending importance

# Use a dedicated name for the encoded column labels. Rebinding the global
# `features` list (the raw column names used to build the design matrices)
# would break re-running the encoding cell afterwards.
feature_names = X_train.columns
n_features = len(feature_names)

plt.figure()
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices], align="center", color="navy")
plt.xticks(range(n_features), feature_names[indices], rotation=90)
plt.xlim([-1, n_features])
plt.show()

In [None]:
# Persist the tuned model for later reuse.
with open('best_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Predict swing probability for the complete season 3 rows.
data_season3_clean['SwingProbability'] = best_model.predict_proba(X_test)[:, 1]

# Write the predictions back by index label. Only SwingProbability differs
# between the two frames, so copy just that column instead of calling
# DataFrame.update, which would realign and overwrite every shared column.
# Rows that were dropped for missing values keep their 0.5 default.
data_season3.loc[data_season3_clean.index, 'SwingProbability'] = (
    data_season3_clean['SwingProbability']
)

# Save the validation file
data_season3.to_csv('validation.csv', index=False)