# part 1 : Scikit Learn

In [1]:
import pandas as pd 

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('500hits.csv', encoding = 'latin-1')

In [4]:
df.head()

Unnamed: 0,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA,HOF
0,Ty Cobb,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,178,0.366,1
1,Stan Musial,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,31,0.331,1
2,Tris Speaker,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,129,0.345,1
3,Derek Jeter,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,97,0.31,1
4,Honus Wagner,21,2792,10430,1736,3430,640,252,101,0,963,327,722,15,0.329,1


In [5]:
df.columns

Index(['PLAYER', 'YRS', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB',
       'SO', 'SB', 'CS', 'BA', 'HOF'],
      dtype='object')

                                 # Creating Features and Target Datasets

Before we split our dataset into training and testing, we should drop any column that we believe isn’t helpful for predicting. In this case, we drop PLAYER as it’s just a name. A name shouldn’t be a predictor if a baseball player should make the hall of fame.

Additionally we will want to split our dataset into features and target. This often is represented by a capital X and lowercase y. We do not want our target in the feature dataset. So for X we drop the PLAYER and HOF columns.


In [6]:
X = df.drop(columns=['PLAYER', 'HOF'])

We set y equal to the target we want to predict which is if a player makes the baseball hall of fame.

In [7]:
y = df['HOF']

By utilizing head, we can quickly see the first 5 rows of the features dataset.

In [8]:
X.shape

(465, 14)

In [9]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: HOF, dtype: int64

                              # Our First Train Test Split

When we train test split, we will create 4 results. A train and test for X and a train and test for Y.

We set these equal to train_test_split. Inside we have the first two parameters which should be out X and y datasets.

You want to have a random_state selected so that the split can be reproduced again. Set this to an integer.

Next you’ll want to define a test_size. The standard is o.2 which would be 20% of our data. It’s good practice to have an 80% training with 20% test split in practice. This can change though depending on the sample size of your data, but this is a good start.


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11, test_size=0.2)

In [11]:
X_train.shape

(372, 14)

In [12]:
X_test.shape

(93, 14)

In [13]:
X_train.head()

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA
289,18,2028,7298,873,1993,284,40,325,1163,620,1313,20,38,0.273
288,13,1836,6826,856,1996,392,117,43,402,513,360,247,108,0.292
350,14,1799,6373,876,1854,282,64,49,630,844,627,37,51,0.291
196,17,2158,7911,1057,2276,414,92,94,844,690,784,246,138,0.288
184,19,2390,8570,1386,2369,420,65,244,1084,1197,1099,143,75,0.276


In [14]:
X_test.describe().round(3)

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA
count,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0
mean,17.204,2057.409,7452.968,1135.065,2139.258,374.742,80.398,194.677,867.011,797.387,836.065,191.817,54.473,0.287
std,3.154,368.58,1265.371,283.877,415.165,93.929,51.792,151.6,495.127,328.755,552.309,166.926,42.508,0.022
min,11.0,1399.0,5472.0,601.0,1660.0,206.0,14.0,15.0,0.0,266.0,15.0,8.0,0.0,0.248
25%,15.0,1820.0,6622.0,935.0,1818.0,310.0,45.0,78.0,618.0,527.0,359.0,61.0,15.0,0.272
50%,17.0,1997.0,7359.0,1108.0,2054.0,361.0,68.0,151.0,926.0,750.0,745.0,137.0,50.0,0.285
75%,19.0,2282.0,8096.0,1283.0,2256.0,432.0,99.0,291.0,1138.0,937.0,1179.0,271.0,83.0,0.299
max,25.0,2850.0,10876.0,1859.0,3430.0,668.0,252.0,612.0,1922.0,1747.0,2597.0,744.0,173.0,0.34


# part 2 

In [15]:
df.columns

Index(['PLAYER', 'YRS', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB',
       'SO', 'SB', 'CS', 'BA', 'HOF'],
      dtype='object')

In [17]:
df = df.drop(columns=['PLAYER', 'CS'])
print(df)

     YRS     G     AB     R     H   2B   3B   HR   RBI    BB    SO   SB  \
0     24  3035  11434  2246  4189  724  295  117   726  1249   357  892   
1     22  3026  10972  1949  3630  725  177  475  1951  1599   696   78   
2     22  2789  10195  1882  3514  792  222  117   724  1381   220  432   
3     20  2747  11195  1923  3465  544   66  260  1311  1082  1840  358   
4     21  2792  10430  1736  3430  640  252  101     0   963   327  722   
..   ...   ...    ...   ...   ...  ...  ...  ...   ...   ...   ...  ...   
460   15  1920   6653  1105  1665  285   39  291   964  1224  1427  225   
461   17  1829   6092   900  1664  379   10  275  1065   936  1453   20   
462   15  1834   6499  1062  1661  338   67  210   761   960  1190  315   
463   16  1822   6309   714  1660  254   25   54   593   396   489   74   
464   15  1468   5629   785  1660  247   71   61   499   266   471  267   

        BA  HOF  
0    0.366    1  
1    0.331    1  
2    0.345    1  
3    0.310    1  
4    0.32

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 465 entries, 0 to 464
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   YRS     465 non-null    int64  
 1   G       465 non-null    int64  
 2   AB      465 non-null    int64  
 3   R       465 non-null    int64  
 4   H       465 non-null    int64  
 5   2B      465 non-null    int64  
 6   3B      465 non-null    int64  
 7   HR      465 non-null    int64  
 8   RBI     465 non-null    int64  
 9   BB      465 non-null    int64  
 10  SO      465 non-null    int64  
 11  SB      465 non-null    int64  
 12  BA      465 non-null    float64
 13  HOF     465 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 51.0 KB


In [19]:
df.describe().round(3)

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA,HOF
count,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0
mean,17.049,2048.699,7511.456,1150.314,2170.247,380.953,78.555,201.049,894.26,783.561,847.471,195.905,0.289,0.329
std,2.765,354.392,1294.066,289.635,424.191,96.483,49.363,143.623,486.193,327.432,489.224,181.846,0.021,0.475
min,11.0,1331.0,4981.0,601.0,1660.0,177.0,3.0,9.0,0.0,239.0,0.0,7.0,0.246,0.0
25%,15.0,1802.0,6523.0,936.0,1838.0,312.0,41.0,79.0,640.0,535.0,436.0,63.0,0.273,0.0
50%,17.0,1993.0,7241.0,1104.0,2076.0,366.0,67.0,178.0,968.0,736.0,825.0,137.0,0.287,0.0
75%,19.0,2247.0,8180.0,1296.0,2375.0,436.0,107.0,292.0,1206.0,955.0,1226.0,285.0,0.3,1.0
max,26.0,3308.0,12364.0,2295.0,4189.0,792.0,309.0,755.0,2297.0,2190.0,2597.0,1406.0,0.366,2.0


In [20]:
X1 = df.iloc[:, 0:13]

In [21]:
X2 = df.iloc[:, 0:13]

In [23]:
from sklearn.preprocessing import StandardScaler

In [24]:
StandardScaler = StandardScaler()

In [25]:
X1 = StandardScaler.fit_transform(X1)

In [27]:
df.columns

Index(['YRS', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'SO', 'SB',
       'BA', 'HOF'],
      dtype='object')

In [28]:
X1 = pd.DataFrame(X1, columns = ['YRS', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'SO', 'SB', 'BA', 'HOF'])

In [30]:
X1.head()

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA,HOF
0,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,
