In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import pandas as pd

from decision_tree import DecisionTreeModel
from hierarchical import HierarchicalModel
from neural_network import NeuralNetworkModel
from random_forest import RandomForestModel
from svm import SupportVectorModel
from util.player_name import get_player_name

In [None]:
# Read the input CSV (path is relative to this notebook's working directory).
df = pd.read_csv(filepath_or_buffer="../data/cleaning_3/clean.csv")

# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

In [None]:
# Columns handed to the models as categorical inputs. The zone_* and
# pitch_name_* names look like one-hot indicator columns -- TODO confirm
# against the cleaning step. Note there is no 'zone_10.0' entry.
categorical_features = (
    [f"zone_{zone}.0" for zone in (*range(1, 10), *range(11, 15))]
    + [
        f"pitch_name_{pitch}"
        for pitch in (
            "4-Seam Fastball",
            "Changeup",
            "Curveball",
            "Cutter",
            "Knuckle Curve",
            "Sinker",
            "Slider",
            "Split-Finger",
            "Sweeper",
        )
    ]
    + ["Pitcher Side"]
)

# Columns handed to the models as continuous (numeric) inputs.
continuous_features = [
    # count / game state
    "balls",
    "strikes",
    "outs_when_up",
    # release characteristics
    "release_speed",
    "release_spin_rate",
    "release_extension",
    "release_pos_y",
    "spin_axis",
    # break
    "api_break_z_with_gravity",
    "api_break_x_arm",
    "api_break_x_batter_in",
    # remaining release position components
    "release_pos_x",
    "release_pos_z",
    # velocity and acceleration components
    "vx0",
    "vy0",
    "vz0",
    "ax",
    "ay",
    "az",
]

# Target column(s) the models predict.
outcomes = ["Outcome"]

In [None]:
def _importance_as_dict(model):
    """Return ``model.get_feature_importance()`` as a ``{feature: importance}`` dict.

    Expects the call to return a DataFrame with 'feature' and 'importance' columns.
    """
    return model.get_feature_importance().set_index('feature')['importance'].to_dict()


rows = []             # one record per batter: ID, name and every model's summary metrics
importance_rows = []  # one record per (batter, model) with that model's feature importances
player_dict = {}      # player name -> the five model objects built for that batter

# Visit batters in ascending ID order instead of arbitrary set order, and skip
# missing IDs, so the resulting tables have a stable, meaningful row order.
batter_ids = sorted(df["batter"].dropna().unique().tolist())

for player in batter_ids:
    player_name = get_player_name(player)

    # Every model class takes the same constructor arguments.
    model_args = (df, continuous_features, categorical_features, outcomes, player, player_name)

    svm_model = SupportVectorModel(*model_args)
    svm_metrics = svm_model.get_summary()
    svm_importance = _importance_as_dict(svm_model)

    rf_model = RandomForestModel(*model_args)
    rf_metrics = rf_model.get_summary()
    rf_importance = _importance_as_dict(rf_model)

    nn_model = NeuralNetworkModel(*model_args)
    nn_metrics = nn_model.get_summary()
    # Used as returned: it is unpacked with ** below, so it must already be a mapping.
    nn_importance = nn_model.get_feature_importance()

    dt_model = DecisionTreeModel(*model_args)
    dt_metrics = dt_model.get_summary()
    dt_importance = _importance_as_dict(dt_model)

    # No feature importances are collected for the hierarchical model.
    hi_model = HierarchicalModel(*model_args)
    hi_metrics = hi_model.get_summary()

    # Merge all five metric dicts into a single wide row for this batter.
    rows.append({
        "Player ID": player,
        "Player": player_name,
        **svm_metrics,
        **rf_metrics,
        **nn_metrics,
        **dt_metrics,
        **hi_metrics,
    })

    for model_label, importance in (
        ("SVM", svm_importance),
        ("Random Forest", rf_importance),
        ("Neural Network", nn_importance),
        ("Decision Tree", dt_importance),
    ):
        importance_rows.append(
            {"Player ID": player, "Player": player_name, "Model": model_label, **importance}
        )

    # NOTE(review): keyed by display name, so two batter IDs that resolve to the
    # same name would silently overwrite each other -- confirm names are unique.
    player_dict[player_name] = {
        "SVM Model": svm_model,
        "Random Forest Model": rf_model,
        "Neural Network": nn_model,
        "Decision Tree": dt_model,
        "hierarchical Model": hi_model,
    }

eval_df = pd.DataFrame(rows)
importance_df = pd.DataFrame(importance_rows)

In [None]:
# Display the evaluation table: one row per batter, holding the batter's ID and
# name plus the summary metrics returned by each of the five models.
eval_df

In [None]:
# Display the feature-importance table: one row per (batter, model) for the
# SVM, Random Forest, Neural Network and Decision Tree models.
importance_df

In [None]:
# Persist both result tables. DataFrame.to_csv does not create missing parent
# directories, so make sure 'output/' exists before writing.
output_dir = Path('output')
output_dir.mkdir(parents=True, exist_ok=True)

importance_df.to_csv(output_dir / 'importance.csv')
eval_df.to_csv(output_dir / 'evaluations.csv')

Evaluating Feature Importance

In [None]:
# Feature-importance columns: every numeric column except the batter ID.
# Excluding the ID by name avoids a positional slice that would also discard
# the first real feature.
feature_columns = importance_df.select_dtypes("number").columns.drop("Player ID", errors="ignore")

# For each model: average every feature's importance over batters, then total
# those averages across features (one value per model).
importance_df.groupby("Model")[feature_columns].mean().sum(axis=1)

In [None]:
# Summary statistics of every numeric column, over all (batter, model) rows.
im_df_sum = importance_df.describe()

# Mean of each column; the first numeric column is skipped because it is
# expected to be "Player ID", which is not a feature.
mean_importance = im_df_sum.loc["mean"].iloc[1:]
labels = mean_importance.index.tolist()
values = mean_importance.to_numpy()

fig, ax = plt.subplots(figsize=(18, 6))
ax.bar(labels, values)
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylabel("Importance Value")
ax.set_title("Average Importance Value of Features Across All Models")
fig.tight_layout()
plt.show()

In [None]:
# Summary statistics of the numeric columns, restricted to the SVM rows.
svm_rows = importance_df.loc[importance_df["Model"].eq("SVM")]
im_df_sum = svm_rows.describe()

# Mean of each column; the first numeric column is skipped because it is
# expected to be "Player ID", which is not a feature.
mean_importance = im_df_sum.loc["mean"].iloc[1:]
labels = mean_importance.index.tolist()
values = mean_importance.to_numpy()

fig, ax = plt.subplots(figsize=(18, 6))
ax.bar(labels, values)
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylabel("Importance Value")
ax.set_title("Average Importance Value of Features in SVM Model")
fig.tight_layout()
plt.show()

In [None]:
# Summary statistics of the numeric columns, restricted to the Random Forest rows.
rf_rows = importance_df.loc[importance_df["Model"].eq("Random Forest")]
im_df_sum = rf_rows.describe()

# Mean of each column; the first numeric column is skipped because it is
# expected to be "Player ID", which is not a feature.
mean_importance = im_df_sum.loc["mean"].iloc[1:]
labels = mean_importance.index.tolist()
values = mean_importance.to_numpy()

fig, ax = plt.subplots(figsize=(18, 6))
ax.bar(labels, values)
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylabel("Importance Value")
ax.set_title("Average Importance Value of Features in Random Forest Model")
fig.tight_layout()
plt.show()

In [None]:
# Summary statistics of the numeric columns, restricted to the Neural Network rows.
nn_rows = importance_df.loc[importance_df["Model"].eq("Neural Network")]
im_df_sum = nn_rows.describe()

# Mean of each column; the first numeric column is skipped because it is
# expected to be "Player ID", which is not a feature.
mean_importance = im_df_sum.loc["mean"].iloc[1:]
labels = mean_importance.index.tolist()
values = mean_importance.to_numpy()

fig, ax = plt.subplots(figsize=(18, 6))
ax.bar(labels, values)
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylabel("Importance Value")
ax.set_title("Average Importance Value of Features in Neural Network Model")
fig.tight_layout()
plt.show()

In [None]:
# Summary statistics of the numeric columns, restricted to the Decision Tree rows.
dt_rows = importance_df.loc[importance_df["Model"].eq("Decision Tree")]
im_df_sum = dt_rows.describe()

# Mean of each column; the first numeric column is skipped because it is
# expected to be "Player ID", which is not a feature.
mean_importance = im_df_sum.loc["mean"].iloc[1:]
labels = mean_importance.index.tolist()
values = mean_importance.to_numpy()

fig, ax = plt.subplots(figsize=(18, 6))
ax.bar(labels, values)
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylabel("Importance Value")
ax.set_title("Average Importance Value of Features in Decision Tree Model")
fig.tight_layout()
plt.show()

Evaluating Variance of Feature Importance by Player

In [None]:
# Summary statistics of every numeric column, over all (batter, model) rows.
im_df_sum = importance_df.describe()

# Standard deviation of each column; the first numeric column is skipped
# because it is expected to be "Player ID", which is not a feature.
std_importance = im_df_sum.loc["std"].iloc[1:]
labels = std_importance.index.tolist()
values = std_importance.to_numpy()

fig, ax = plt.subplots(figsize=(18, 6))
ax.bar(labels, values)
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylabel("Std of Importance Value")
ax.set_title("Standard Deviation of Importance Value of Features Across All Models")
fig.tight_layout()
plt.show()

In [None]:
# Summary statistics of the numeric columns, restricted to the SVM rows.
svm_rows = importance_df.loc[importance_df["Model"].eq("SVM")]
im_df_sum = svm_rows.describe()

# Standard deviation of each column; the first numeric column is skipped
# because it is expected to be "Player ID", which is not a feature.
std_importance = im_df_sum.loc["std"].iloc[1:]
labels = std_importance.index.tolist()
values = std_importance.to_numpy()

fig, ax = plt.subplots(figsize=(18, 6))
ax.bar(labels, values)
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylabel("Std of Importance Value")
ax.set_title("Standard Deviation of Importance Value of Features in SVM Model")
fig.tight_layout()
plt.show()

In [None]:
# Summary statistics of the numeric columns, restricted to the Random Forest rows.
rf_rows = importance_df.loc[importance_df["Model"].eq("Random Forest")]
im_df_sum = rf_rows.describe()

# Standard deviation of each column; the first numeric column is skipped
# because it is expected to be "Player ID", which is not a feature.
std_importance = im_df_sum.loc["std"].iloc[1:]
labels = std_importance.index.tolist()
values = std_importance.to_numpy()

fig, ax = plt.subplots(figsize=(18, 6))
ax.bar(labels, values)
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylabel("Std of Importance Value")
ax.set_title("Standard Deviation of Importance Value of Features in Random Forest Model")
fig.tight_layout()
plt.show()

In [None]:
# Summary statistics of the numeric columns, restricted to the Neural Network rows.
nn_rows = importance_df.loc[importance_df["Model"].eq("Neural Network")]
im_df_sum = nn_rows.describe()

# Standard deviation of each column; the first numeric column is skipped
# because it is expected to be "Player ID", which is not a feature.
std_importance = im_df_sum.loc["std"].iloc[1:]
labels = std_importance.index.tolist()
values = std_importance.to_numpy()

fig, ax = plt.subplots(figsize=(18, 6))
ax.bar(labels, values)
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylabel("Std of Importance Value")
ax.set_title("Standard Deviation of Importance Value of Features in Neural Network Model")
fig.tight_layout()
plt.show()

In [None]:
# Summary statistics of the numeric columns, restricted to the Decision Tree rows.
dt_rows = importance_df.loc[importance_df["Model"].eq("Decision Tree")]
im_df_sum = dt_rows.describe()

# Standard deviation of each column; the first numeric column is skipped
# because it is expected to be "Player ID", which is not a feature.
std_importance = im_df_sum.loc["std"].iloc[1:]
labels = std_importance.index.tolist()
values = std_importance.to_numpy()

fig, ax = plt.subplots(figsize=(18, 6))
ax.bar(labels, values)
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylabel("Std of Importance Value")
ax.set_title("Standard Deviation of Importance Value of Features in Decision Tree Model")
fig.tight_layout()
plt.show()

Evaluating Consistency of Models Across Players

In [None]:
# Model labels used as prefixes when building eval_df column names
# (e.g. f"{model} accuracy") in the cells that follow.
models = [
    "SVM",
    "RF",
    "NN",
    "Decision Tree",
    "Hierarchical",
]

# Per-column summary statistics (count, mean, std, ...) of the evaluation
# table, computed across batters.
eval_df_summary = eval_df.describe()

In [None]:
# Mean and std of each model's accuracy column, one row per model column.
accuracy_columns = [f"{model} accuracy" for model in models]
acc_sum_stat = eval_df_summary.loc[["mean", "std"], accuracy_columns].T.reset_index()

fig, ax = plt.subplots(figsize=(18, 6))
ax.bar(acc_sum_stat["index"], acc_sum_stat["mean"])
ax.tick_params(axis="x", labelrotation=0)
ax.set_ylabel("Accuracy")
ax.set_title("Average Accuracy of Models")
fig.tight_layout()
plt.show()

# Show the underlying numbers below the chart.
acc_sum_stat

In [None]:
# Mean and std of each model's precision column, one row per model column.
precision_columns = [f"{model} precision" for model in models]
prec_sum_stat = eval_df_summary.loc[["mean", "std"], precision_columns].T.reset_index()

fig, ax = plt.subplots(figsize=(18, 6))
ax.bar(prec_sum_stat["index"], prec_sum_stat["mean"])
ax.tick_params(axis="x", labelrotation=0)
ax.set_ylabel("Precision")
ax.set_title("Average Precision of Models")
fig.tight_layout()
plt.show()

# Show the underlying numbers below the chart.
prec_sum_stat

In [None]:
# Mean and std of each model's recall column, one row per model column.
recall_columns = [f"{model} recall" for model in models]
recall_sum_stat = eval_df_summary.loc[["mean", "std"], recall_columns].T.reset_index()

fig, ax = plt.subplots(figsize=(18, 6))
ax.bar(recall_sum_stat["index"], recall_sum_stat["mean"])
ax.tick_params(axis="x", labelrotation=0)
ax.set_ylabel("Recall")
ax.set_title("Average Recall of Models")
fig.tight_layout()
plt.show()

# Show the underlying numbers below the chart.
recall_sum_stat

In [None]:
# Serialize the {player name: model objects} mapping to disk in the current
# working directory; the call's return value is shown as the cell output.
joblib.dump(value=player_dict, filename='models.pkl')

In [None]:
# Reload the mapping written by the previous cell.
# NOTE(review): this rebinds `models`, which an earlier cell defines as the list
# of model-label strings used to build metric column names; re-running those
# metric cells after this one will fail. Consider a distinct name here.
# NOTE(review): joblib.load unpickles its input and can execute arbitrary code;
# only load files produced by this notebook or another trusted source.
models = joblib.load(filename='models.pkl')