In [63]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Load the Hall of Fame voting records; the first CSV column (playerID) becomes the index.
file_path = (
    Path("baseballdatabank-2022.2")
    / "baseballdatabank-2022.2"
    / "contrib"
    / "HallOfFame.csv"
)
hof_df = pd.read_csv(file_path, index_col=0)
hof_df.head()

Unnamed: 0_level_0,yearID,votedBy,ballots,needed,votes,inducted,category,needed_note
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
cobbty01,1936,BBWAA,226.0,170.0,222.0,Y,Player,
ruthba01,1936,BBWAA,226.0,170.0,215.0,Y,Player,
wagneho01,1936,BBWAA,226.0,170.0,215.0,Y,Player,
mathech01,1936,BBWAA,226.0,170.0,205.0,Y,Player,
johnswa01,1936,BBWAA,226.0,170.0,189.0,Y,Player,


In [64]:
# Determine the number of unique values in each column.
# Low counts (e.g. 'inducted', 'category') flag columns that behave as categoricals.
hof_df.nunique()

yearID          80
votedBy          9
ballots         74
needed          65
votes          367
inducted         2
category         4
needed_note      2
dtype: int64

In [65]:
# Inspect each column's dtype to see which columns are numeric and which are object-typed.
hof_df.dtypes

yearID           int64
votedBy         object
ballots        float64
needed         float64
votes          float64
inducted        object
category        object
needed_note     object
dtype: object

In [66]:
# Report how many null values each column contains (one vectorised pass over the frame).
for column, null_count in hof_df.isnull().sum().items():
    print(f"Column {column} has {null_count} null values")

Column yearID has 0 null values
Column votedBy has 0 null values
Column ballots has 197 null values
Column needed has 354 null values
Column votes has 197 null values
Column inducted has 0 null values
Column category has 0 null values
Column needed_note has 4034 null values


In [67]:
# Keep only the rows whose category mentions 'Player'; a missing category counts as "not a player".
is_player = hof_df['category'].str.contains('Player', na=False)
hof_df = hof_df[is_player]

In [69]:
# Drop the columns that are not used as features for the classifier: the vote-tally
# columns ('ballots', 'needed', 'votes'), the mostly-null 'needed_note', and
# 'category', which was only needed for the player filter applied earlier.
# Reassign rather than use inplace=True: hof_df is a boolean-filtered slice of the
# loaded frame, and an in-place drop on such a slice can raise SettingWithCopyWarning.
hof_df = hof_df.drop(columns=['needed_note', 'ballots', 'needed', 'votes', 'category'])
hof_df.head()

Unnamed: 0_level_0,yearID,votedBy,inducted
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cobbty01,1936,BBWAA,Y
ruthba01,1936,BBWAA,Y
wagneho01,1936,BBWAA,Y
mathech01,1936,BBWAA,Y
johnswa01,1936,BBWAA,Y


In [71]:
# Preview of the one-hot encoding of the voting body ('votedBy').
# NOTE(review): hof_binary_encoded is not referenced again in the visible cells;
# the feature matrix below is re-encoded directly from hof_df.
hof_binary_encoded = pd.get_dummies(data=hof_df, columns=["votedBy"])
hof_binary_encoded.head(5)

Unnamed: 0_level_0,yearID,inducted,votedBy_BBWAA,votedBy_Final Ballot,votedBy_Negro League,votedBy_Nominating Vote,votedBy_Old Timers,votedBy_Run Off,votedBy_Special Election,votedBy_Veterans
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
cobbty01,1936,Y,1,0,0,0,0,0,0,0
ruthba01,1936,Y,1,0,0,0,0,0,0,0
wagneho01,1936,Y,1,0,0,0,0,0,0,0
mathech01,1936,Y,1,0,0,0,0,0,0,0
johnswa01,1936,Y,1,0,0,0,0,0,0,0


In [72]:
# Target: the induction outcome column.
y = hof_df['inducted']

# Features: every remaining column, with object-typed columns one-hot encoded.
X = pd.get_dummies(hof_df.drop(columns='inducted'))

In [73]:
# Summary statistics of the encoded feature matrix (sanity check of ranges and dummy-column frequencies).
X.describe()

Unnamed: 0,yearID,votedBy_BBWAA,votedBy_Final Ballot,votedBy_Negro League,votedBy_Nominating Vote,votedBy_Old Timers,votedBy_Run Off,votedBy_Special Election,votedBy_Veterans
count,4066.0,4066.0,4066.0,4066.0,4066.0,4066.0,4066.0,4066.0,4066.0
mean,1969.838908,0.909493,0.005165,0.005165,0.018692,0.006149,0.019921,0.000492,0.034924
std,23.469849,0.286942,0.071689,0.071689,0.13545,0.078181,0.139747,0.022176,0.183609
min,1936.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1950.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1966.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1988.75,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2018.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [74]:
# Check the balance of our target values.
# The 'N' class heavily outnumbers 'Y', which motivates the balanced classifier used below.
y.value_counts()

N    3810
Y     256
Name: inducted, dtype: int64

In [75]:
# Hold out a test set; stratifying on y keeps the class ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [76]:
# Train a BalancedRandomForestClassifier on the training split.
# NOTE: despite its name, `resample` holds the fitted classifier, not resampled data.
resample = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [77]:
# Predict on the held-out test set and calculate the balanced accuracy score.
y_pred = resample.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.6709568467995803

In [78]:
# Display the confusion matrix (rows: true class, columns: predicted class).
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

[[713 240]
 [ 26  38]]


In [79]:
# Print the imbalanced classification report for the test-set predictions.
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          N       0.96      0.75      0.59      0.84      0.67      0.45       953
          Y       0.14      0.59      0.75      0.22      0.67      0.44        64

avg / total       0.91      0.74      0.60      0.80      0.67      0.45      1017



In [80]:
# Pair each importance with its feature name, then order from most to least important.
feature = list(zip(resample.feature_importances_, X.columns))
feature.sort(reverse=True)
feature

[(0.5134358573110234, 'yearID'),
 (0.22956882719062563, 'votedBy_BBWAA'),
 (0.11693640103260988, 'votedBy_Veterans'),
 (0.05220595371748306, 'votedBy_Old Timers'),
 (0.03835505740110181, 'votedBy_Negro League'),
 (0.02539939481312574, 'votedBy_Nominating Vote'),
 (0.01538848019693516, 'votedBy_Run Off'),
 (0.004759766721590816, 'votedBy_Final Ballot'),
 (0.003950261615504491, 'votedBy_Special Election')]