In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor as RF

In [47]:
#import data
Batting = pd.read_csv('../projectData/Batting.csv')
People = pd.read_csv('../projectData/People.csv')
HOF = pd.read_csv('../projectData/HallOfFame.csv')
HOF2024 = HOF[(HOF['yearid'] == 2025) & (HOF['inducted'] == 'Y')]

In [48]:
#group all of the hall of famers that were players
HOF_inducted = HOF[(HOF['inducted'] == 'Y') & (HOF['category'] == 'Player')]
HOF_batting = HOF_inducted.merge(Batting[['playerID', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'SO']], on='playerID', how='left')

In [49]:
#aggregate batting statistics per Hall of Fame player
hof_player_stats = HOF_batting.groupby('playerID').agg({
    'G': 'sum',
    'AB': 'sum', 
    'H': 'sum',
    '2B': 'sum',
    '3B': 'sum',
    'HR': 'sum',
    'RBI': 'sum',
    'SB': 'sum',
    'SO': 'sum'
}).reset_index()

#creating the target variable
hof_player_stats['is_hof'] = 1

hof_player_stats

Unnamed: 0,playerID,G,AB,H,2B,3B,HR,RBI,SB,SO,is_hof
0,aaronha01,3298.0,12364.0,3771.0,624.0,98.0,755.0,2297.0,240.0,1383.0,1
1,alexape01,703.0,1810.0,378.0,60.0,13.0,11.0,163.0,3.0,276.0,1
2,allendi01,1749.0,6332.0,1848.0,320.0,79.0,351.0,1119.0,133.0,1556.0,1
3,alomaro01,2379.0,9073.0,2724.0,504.0,80.0,210.0,1134.0,474.0,1140.0,1
4,ansonca01,2524.0,10281.0,3435.0,582.0,142.0,97.0,2075.0,277.0,330.0,1
...,...,...,...,...,...,...,...,...,...,...,...
285,wynnea01,796.0,1704.0,365.0,59.0,5.0,17.0,173.0,1.0,330.0,1
286,yastrca01,3308.0,11988.0,3419.0,646.0,59.0,452.0,1844.0,168.0,1393.0,1
287,youngcy01,918.0,2960.0,623.0,87.0,35.0,18.0,290.0,29.0,381.0,1
288,youngro01,1211.0,4627.0,1491.0,236.0,93.0,42.0,592.0,153.0,390.0,1


In [50]:
#get non hall of fame playerIDs
non_HOF = [player for player in People['playerID'] if player not in HOF_inducted['playerID']]
non_HOF_batting = Batting[~Batting['playerID'].isin(HOF_inducted['playerID'])][['playerID', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'SO']]

In [51]:
#aggregate batting statistics per player
non_hof_player_stats = non_HOF_batting.groupby('playerID').agg({
    'G': 'sum',
    'AB': 'sum', 
    'H': 'sum',
    '2B': 'sum',
    '3B': 'sum',
    'HR': 'sum',
    'RBI': 'sum',
    'SB': 'sum',
    'SO': 'sum'
}).reset_index()

#they aren't in the HOF so they get 0s
non_hof_player_stats['is_hof'] = 0

non_hof_player_stats

Unnamed: 0,playerID,G,AB,H,2B,3B,HR,RBI,SB,SO,is_hof
0,aardsda01,331,4,0,0,0,0,0.0,0.0,2.0,0
1,aaronto01,437,944,216,42,6,13,94.0,9.0,145.0,0
2,aasedo01,448,5,0,0,0,0,0.0,0.0,3.0,0
3,abadan01,15,21,2,0,0,0,0.0,0.0,5.0,0
4,abadfe01,406,9,1,0,0,0,0.0,0.0,5.0,0
...,...,...,...,...,...,...,...,...,...,...,...
20716,zupofr01,16,18,3,1,0,0,0.0,0.0,6.0,0
20717,zuvelpa01,209,491,109,17,2,2,20.0,2.0,50.0,0
20718,zuverge01,266,142,21,2,1,0,7.0,0.0,39.0,0
20719,zwilldu01,366,1280,364,76,15,30,202.0,46.0,155.0,0


In [52]:
#combine Hall of Fame and non-Hall of Fame data
all_player_stats = pd.concat([hof_player_stats, non_hof_player_stats], ignore_index=True)

In [53]:
#select the features for the model
features = ['G', 'AB', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'SO']
x = all_player_stats[features]
y = all_player_stats['is_hof']

In [54]:
#split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=314,
                                                    test_size=0.25,
                                                    shuffle=True) 

In [57]:
#build the random forest model
rf_auto = RF(max_features=6,
             n_estimators=500,
             max_depth=10,
             bootstrap=True,
             random_state=314)
rf_auto.fit(X_train, y_train)

In [58]:
#MSE
y_hat_rf = rf_auto.predict(X_test)
mse_rf = np.mean((y_test - y_hat_rf)**2)
print('test mse: ',mse_rf)

test mse:  0.006931731489424506


In [59]:
#feature importance list
feature_imp = pd.DataFrame(
    {'importance':rf_auto.feature_importances_},
    index=features)
feature_imp.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
H,0.279611
G,0.20574
AB,0.112157
RBI,0.106681
SO,0.101346
3B,0.054664
2B,0.049188
HR,0.04636
SB,0.044252
