In [1]:
import pandas as pd

In [2]:
# Load the raw table. The file is decoded as latin-1 rather than the default
# UTF-8 (presumably because some player names contain accented characters -
# TODO confirm).
df = pd.read_csv("500hits.csv", encoding="latin-1")

In [3]:
# Preview the first five rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA,HOF
0,Ty Cobb,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,178,0.366,1
1,Stan Musial,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,31,0.331,1
2,Tris Speaker,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,129,0.345,1
3,Derek Jeter,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,97,0.31,1
4,Honus Wagner,21,2792,10430,1736,3430,640,252,101,0,963,327,722,15,0.329,1


In [4]:
# Remove the columns that are not used for modelling: PLAYER holds the name
# string shown in the preview; CS is removed too (reason not recorded here).
# NOTE: `df` is rebound, so re-running this cell on its own raises KeyError
# because the columns are already gone - re-run from the load cell instead.
df = df.drop(["PLAYER", "CS"], axis="columns")

In [5]:
# Split features and target by column name instead of by position, so the
# selection stays correct if the set of dropped columns changes upstream.
# The 0/1 HOF column is the target; every other remaining column is a feature.
X = df.drop(columns="HOF")
y = df["HOF"]

In [6]:
# Display the feature matrix (pandas truncates the middle rows in the output).
X

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA
0,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,0.366
1,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,0.331
2,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,0.345
3,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,0.310
4,21,2792,10430,1736,3430,640,252,101,0,963,327,722,0.329
...,...,...,...,...,...,...,...,...,...,...,...,...,...
460,15,1920,6653,1105,1665,285,39,291,964,1224,1427,225,0.250
461,17,1829,6092,900,1664,379,10,275,1065,936,1453,20,0.273
462,15,1834,6499,1062,1661,338,67,210,761,960,1190,315,0.256
463,16,1822,6309,714,1660,254,25,54,593,396,489,74,0.263


In [7]:
# Display the target labels (the HOF column).
y

Unnamed: 0,HOF
0,1
1,1
2,1
3,1
4,1
...,...
460,0
461,0
462,0
463,0


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)

In [10]:
# Sanity-check the (rows, columns) shape of the training and test feature sets.
X_train.shape, X_test.shape

((372, 13), (93, 13))

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Baseline forest with default hyperparameters. The seed is fixed (same value
# as the tuned model below in this notebook's original, 42) so that the fit,
# predictions, score and feature importances are repeatable on every run.
# Outputs recorded from an earlier unseeded run may differ slightly.
rf = RandomForestClassifier(random_state=42)
rf

In [14]:
# Train the baseline forest on the training split only.
rf.fit(X_train, y_train)

In [16]:
# Predict the class label for every held-out row and show the raw predictions.
y_pred = rf.predict(X_test)
y_pred

array([0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0])

In [17]:
# Mean accuracy of the baseline forest on the held-out test set.
rf.score(X_test, y_test)

0.8279569892473119

In [18]:
from sklearn.metrics import classification_report

In [19]:
# Per-class precision / recall / F1 for the baseline predictions. print() is
# used because the report is returned as a preformatted multi-line string.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.92      0.88        61
           1       0.81      0.66      0.72        32

    accuracy                           0.83        93
   macro avg       0.82      0.79      0.80        93
weighted avg       0.83      0.83      0.82        93



In [21]:
# Tabulate the baseline forest's impurity-based feature importances, one row
# per feature column of X. No column name is supplied, so the single value
# column is labelled 0.
features = pd.DataFrame(
    data=rf.feature_importances_,
    index=X.columns,
)
features

Unnamed: 0,0
YRS,0.029853
G,0.087736
AB,0.097158
R,0.12669
H,0.143339
2B,0.064883
3B,0.050068
HR,0.06461
RBI,0.072061
BB,0.043403


In [22]:
# Hyperparameters

In [23]:
# Second forest with hand-picked hyperparameters: 1000 trees, entropy as the
# split criterion, nodes with fewer than 10 samples are not split further, and
# tree depth is capped at 14. The seed makes the fit repeatable.
rf2 = RandomForestClassifier(
    criterion="entropy",
    max_depth=14,
    min_samples_split=10,
    n_estimators=1000,
    random_state=42,
)

In [24]:
# Display the configured (not yet fitted) estimator to confirm its settings.
rf2

In [25]:
# Train the tuned forest on the same training split as the baseline.
rf2.fit(X_train, y_train)

In [26]:
# Predict the held-out rows with the tuned forest and show the raw predictions.
y_pred2 = rf2.predict(X_test)
y_pred2

array([0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0])

In [27]:
# Mean accuracy of the tuned forest on the same held-out test set.
rf2.score(X_test, y_test)

0.8494623655913979

In [28]:
# Per-class precision / recall / F1 for the tuned forest's predictions, for
# comparison with the baseline report.
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89        61
           1       0.85      0.69      0.76        32

    accuracy                           0.85        93
   macro avg       0.85      0.81      0.82        93
weighted avg       0.85      0.85      0.85        93

