In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load the 500-hits CSV; the encoding is passed explicitly rather than relying on the pandas default.
df = pd.read_csv(filepath_or_buffer='500hits.csv', encoding='latin-1')

In [4]:
# Preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA,HOF
0,Ty Cobb,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,178,0.366,1
1,Stan Musial,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,31,0.331,1
2,Tris Speaker,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,129,0.345,1
3,Derek Jeter,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,97,0.31,1
4,Honus Wagner,21,2792,10430,1736,3430,640,252,101,0,963,327,722,15,0.329,1


In [5]:
# Split the frame into the target (y) and the features (X).
# y is the hall of fame flag (HOF); X is everything except HOF and the PLAYER name column.
# NOTE(review): a later df.describe() in this notebook reports a max of 2.0 for HOF,
# so the label may not be strictly 0/1 - verify before modelling.
y = df['HOF']
X = df.drop(['PLAYER', 'HOF'], axis=1)

In [7]:
# Sanity check: X and y must have the same number of rows
(X.shape, y.shape)

((465, 14), (465,))

In [8]:
# Create the train/test split during preprocessing so a model can be fitted on one part
# of the data and evaluated on the other.
# random_state fixes the shuffle, so the split is identical every time train_test_split is run.
# test_size is the held-out fraction (0.2 here); a sensible value depends on how much data you have.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)


In [9]:
# Training features and labels must line up row for row
(X_train.shape, y_train.shape)

((372, 14), (372,))

In [10]:
# Held-out features and labels must line up row for row
(X_test.shape, y_test.shape)

((93, 14), (93,))

In [11]:
# Summary statistics of the training features, rounded to 4 decimals
X_train.describe().round(decimals=4)

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA
count,372.0,372.0,372.0,372.0,372.0,372.0,372.0,372.0,372.0,372.0,372.0,372.0,372.0,372.0
mean,17.0108,2046.5215,7526.078,1154.1263,2177.9946,382.5054,78.0941,202.6425,901.0726,780.1048,850.3226,196.9274,58.9866,0.2892
std,2.6625,351.2329,1302.4058,291.3084,426.6153,97.1734,48.7984,141.7263,484.3703,327.4531,472.918,185.5857,49.3222,0.0211
min,11.0,1331.0,4981.0,651.0,1660.0,177.0,3.0,9.0,0.0,239.0,0.0,7.0,0.0,0.246
25%,15.0,1797.5,6507.5,936.0,1838.0,312.0,41.0,79.75,645.0,536.5,448.0,64.5,23.0,0.274
50%,17.0,1992.0,7237.0,1099.0,2080.5,367.0,67.0,185.5,977.5,719.0,844.0,141.0,52.0,0.288
75%,19.0,2245.5,8198.25,1305.0,2383.75,436.25,108.0,293.25,1218.5,961.25,1234.25,285.5,84.0,0.3002
max,26.0,3308.0,12364.0,2295.0,4189.0,792.0,309.0,755.0,2297.0,2190.0,1936.0,1406.0,335.0,0.366


In [13]:
# Summary statistics of the held-out features, for comparison with the training set
X_test.describe().round(decimals=4)

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA
count,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0
mean,17.2043,2057.4086,7452.9677,1135.0645,2139.2581,374.7419,80.3978,194.6774,867.0108,797.3871,836.0645,191.8172,54.4731,0.2869
std,3.1539,368.5804,1265.3711,283.8771,415.1649,93.9293,51.7918,151.5998,495.1271,328.7546,552.3089,166.926,42.5085,0.0218
min,11.0,1399.0,5472.0,601.0,1660.0,206.0,14.0,15.0,0.0,266.0,15.0,8.0,0.0,0.248
25%,15.0,1820.0,6622.0,935.0,1818.0,310.0,45.0,78.0,618.0,527.0,359.0,61.0,15.0,0.272
50%,17.0,1997.0,7359.0,1108.0,2054.0,361.0,68.0,151.0,926.0,750.0,745.0,137.0,50.0,0.285
75%,19.0,2282.0,8096.0,1283.0,2256.0,432.0,99.0,291.0,1138.0,937.0,1179.0,271.0,83.0,0.299
max,25.0,2850.0,10876.0,1859.0,3430.0,668.0,252.0,612.0,1922.0,1747.0,2597.0,744.0,173.0,0.34


### Scaling data
Use of Normalization vs Standardization:
- Normalization: linearly rescales each feature into a fixed range (0 to 1 by default) using the formula below
- Uses a min-max scaler
  $\frac{X - \min(X)}{\max(X) - \min(X)}$
- Standardization: rescales each feature using its mean and standard deviation, so that the scaled feature has a mean of zero and a standard deviation of one (the same location and scale as a standard normal distribution). The formula is
  $\frac{X - \mathrm{mean}(X)}{\mathrm{std}(X)}$
- Standardization is usually preferred when the data contains outliers: unlike normalization, it does not force every value into the range 0 to 1, so standardized values can fall outside that range

- The `encoding` argument: a decoding error typically arises when you attempt to read a CSV file with an encoding that doesn’t match the file’s own encoding, which is why `encoding='latin-1'` is passed to `pd.read_csv` in this notebook.

In [14]:
import pandas as pd

In [18]:
# Re-read the CSV into df for the scaling examples (explicit encoding, as in the first load)
df = pd.read_csv(filepath_or_buffer='500hits.csv', encoding='latin-1')

In [19]:
# Remove the PLAYER and CS columns from the frame.
# NOTE: df is overwritten, so re-running this cell without reloading raises KeyError
# (the columns are already gone).
df = df.drop(['PLAYER', 'CS'], axis='columns')

In [20]:
# Summary statistics of the remaining columns, rounded to 3 decimals
df.describe().round(decimals=3)

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA,HOF
count,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0
mean,17.049,2048.699,7511.456,1150.314,2170.247,380.953,78.555,201.049,894.26,783.561,847.471,195.905,0.289,0.329
std,2.765,354.392,1294.066,289.635,424.191,96.483,49.363,143.623,486.193,327.432,489.224,181.846,0.021,0.475
min,11.0,1331.0,4981.0,601.0,1660.0,177.0,3.0,9.0,0.0,239.0,0.0,7.0,0.246,0.0
25%,15.0,1802.0,6523.0,936.0,1838.0,312.0,41.0,79.0,640.0,535.0,436.0,63.0,0.273,0.0
50%,17.0,1993.0,7241.0,1104.0,2076.0,366.0,67.0,178.0,968.0,736.0,825.0,137.0,0.287,0.0
75%,19.0,2247.0,8180.0,1296.0,2375.0,436.0,107.0,292.0,1206.0,955.0,1226.0,285.0,0.3,1.0
max,26.0,3308.0,12364.0,2295.0,4189.0,792.0,309.0,755.0,2297.0,2190.0,2597.0,1406.0,0.366,2.0


In [25]:
# Feature matrix for the standard-scaling example: every column except the HOF target.
# Selecting by column name (instead of the positional slice 0:13) keeps the target out of
# the features even if columns are later added, removed, or reordered.
X1 = df.drop(columns=['HOF'])

In [26]:
# Feature matrix for the min-max scaling example: every column except the HOF target.
# Selecting by column name (instead of the positional slice 0:13) keeps the target out of
# the features even if columns are later added, removed, or reordered.
X2 = df.drop(columns=['HOF'])

In [27]:
# standard scaling
from sklearn.preprocessing import StandardScaler

In [28]:
# Standardization: centre each feature on its mean and divide by its standard deviation
# (both switches are the library defaults, spelled out here for the reader)
scalerStandard = StandardScaler(with_mean=True, with_std=True)

In [29]:
# Learn each column's mean/std from X1, then apply the scaling to the same rows.
# NOTE(review): the scaler is fitted on every row; if these features are later evaluated
# on a held-out split, fit on the training rows only.
X1 = scalerStandard.fit(X1).transform(X1)

In [30]:
# Wrap the scaled values in a DataFrame again, reusing the column labels of X2
# (X2 has not been scaled yet at this point, so it is still a labelled frame)
X1 = pd.DataFrame(data=X1, columns=X2.columns)

In [31]:
# First five rows of the standardized features
X1.head(n=5)

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA
0,2.516295,2.786078,3.034442,3.787062,4.764193,3.559333,4.389485,-0.585841,-0.346449,1.423013,-1.003628,3.832067,3.64829
1,1.792237,2.760655,2.677044,2.76053,3.444971,3.569709,1.996457,1.909487,2.175837,2.493089,-0.309948,-0.64908,1.996159
2,1.792237,2.091184,2.075964,2.528955,3.171214,4.264876,2.909053,-0.585841,-0.350567,1.826585,-1.283965,1.299723,2.657012
3,1.06818,1.972543,2.849554,2.670665,3.055576,1.691719,-0.254611,0.410896,0.858071,0.912434,2.030966,0.892346,1.004881
4,1.430208,2.099658,2.257758,2.024329,2.972977,2.68778,3.517449,-0.697364,-1.84129,0.548609,-1.065016,2.896201,1.901752


In [32]:
# After standardization every column should show mean ~0 and std ~1
X1.describe().round(decimals=3)

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA
count,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0
mean,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,0.0
std,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001
min,-2.19,-2.027,-1.958,-1.899,-1.204,-2.116,-1.532,-1.339,-1.841,-1.665,-1.734,-1.04,-2.016
25%,-0.742,-0.697,-0.765,-0.741,-0.784,-0.715,-0.762,-0.851,-0.524,-0.76,-0.842,-0.732,-0.742
50%,-0.018,-0.157,-0.209,-0.16,-0.222,-0.155,-0.234,-0.161,0.152,-0.145,-0.046,-0.324,-0.081
75%,0.706,0.56,0.517,0.504,0.483,0.571,0.577,0.634,0.642,0.524,0.775,0.49,0.533
max,3.24,3.557,3.754,3.956,4.764,4.265,4.673,3.861,2.888,4.3,3.58,6.662,3.648


In [33]:
# Normalization
from sklearn.preprocessing import MinMaxScaler

In [34]:
# feature_range is the (low, high) interval the scaled values are mapped onto;
# any range can be given
scalerMinMax = MinMaxScaler(
    feature_range=(0, 1),
)

In [35]:
# Learn each column's min/max from X2, then rescale the same rows into the feature range
X2 = scalerMinMax.fit(X2).transform(X2)

In [36]:
# Wrap the scaled values in a DataFrame again, reusing the column labels of X1
X2 = pd.DataFrame(data=X2, columns=X1.columns)

In [37]:
# First five rows of the min-max scaled features
X2.head(n=5)

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA
0,0.866667,0.861912,0.874035,0.971074,1.0,0.889431,0.954248,0.144772,0.316064,0.517683,0.137466,0.632595,1.0
1,0.733333,0.85736,0.811459,0.79575,0.778964,0.891057,0.568627,0.624665,0.849369,0.697078,0.268002,0.050751,0.708333
2,0.733333,0.737481,0.706217,0.756198,0.733096,1.0,0.715686,0.144772,0.315194,0.585341,0.084713,0.303788,0.825
3,0.6,0.716237,0.841663,0.780401,0.713721,0.596748,0.205882,0.336461,0.570744,0.432086,0.70851,0.250893,0.533333
4,0.666667,0.738998,0.738047,0.670012,0.699881,0.752846,0.813725,0.123324,0.0,0.371092,0.125915,0.511079,0.691667


In [39]:
# After min-max scaling every column should span exactly 0 to 1
X2.describe().round(decimals=3)

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA
count,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0
mean,0.403,0.363,0.343,0.324,0.202,0.332,0.247,0.257,0.389,0.279,0.326,0.135,0.356
std,0.184,0.179,0.175,0.171,0.168,0.157,0.161,0.193,0.212,0.168,0.188,0.13,0.177
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.267,0.238,0.209,0.198,0.07,0.22,0.124,0.094,0.279,0.152,0.168,0.04,0.225
50%,0.4,0.335,0.306,0.297,0.164,0.307,0.209,0.227,0.421,0.255,0.318,0.093,0.342
75%,0.533,0.463,0.433,0.41,0.283,0.421,0.34,0.379,0.525,0.367,0.472,0.199,0.45
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Encoding
- Categorical data has to be turned into numbers before it can be analysed alongside everything else; this conversion is called encoding
- One encoding method is one-hot encoding, which is meant for nominal data (categorical data with no numeric significance and no hierarchy, unlike ordinal data such as Small, Medium, Big)

In [40]:
import pandas as pd

In [41]:
# Toy sales data: one numeric column plus two categorical columns
# (city has no inherent order; size does)
d = {
    'sales': [
        100000, 222000, 1000000, 522000,
        111111, 222222, 1111111, 20000,
        75000, 90000, 1000000, 10000,
    ],
    'city': [
        'Tampa', 'Tampa', 'Orlando', 'Jacksonville',
        'Miami', 'Jacksonville', 'Miami', 'Miami',
        'Orlando', 'Orlando', 'Orlando', 'Orlando',
    ],
    'size': [
        'Small', 'Medium', 'Large', 'Large',
        'Small', 'Medium', 'Large', 'Small',
        'Medium', 'Medium', 'Medium', 'Small',
    ],
}

In [42]:
# Build the example frame from the dict; each key becomes a column
df = pd.DataFrame(d)

In [43]:
# Preview the first five rows of the example frame
df.head(n=5)

Unnamed: 0,sales,city,size
0,100000,Tampa,Small
1,222000,Tampa,Medium
2,1000000,Orlando,Large
3,522000,Jacksonville,Large
4,111111,Miami,Small


In [44]:
# Distinct city values - each one becomes its own column after one-hot encoding
df.loc[:, 'city'].unique()

array(['Tampa', 'Orlando', 'Jacksonville', 'Miami'], dtype=object)

In [45]:
from sklearn.preprocessing import OneHotEncoder

In [46]:
# handle_unknown='ignore': categories not seen during fit do not raise at transform time.
# sparse_output=False requests a dense result instead of a sparse matrix.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# Have transform/fit_transform return a pandas DataFrame (set_output returns the encoder itself)
ohe.set_output(transform='pandas')

In [47]:
# Fit the encoder on the city column and encode it in one step.
# The list selector ['city'] keeps the input two-dimensional (a one-column frame).
ohe_transform = ohe.fit_transform(df.loc[:, ['city']])

In [48]:
# Display the encoded frame: one indicator column per city
ohe_transform

Unnamed: 0,city_Jacksonville,city_Miami,city_Orlando,city_Tampa
0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0
5,1.0,0.0,0.0,0.0
6,0.0,1.0,0.0,0.0
7,0.0,1.0,0.0,0.0
8,0.0,0.0,1.0,0.0
9,0.0,0.0,1.0,0.0


In [49]:
# Replace the raw city column with its one-hot columns: drop 'city' first, then append
# the encoded columns side by side (rows are matched on the shared index).
# NOTE: df is overwritten, so re-running this cell without rebuilding df raises KeyError.
df = pd.concat([df.drop(columns=['city']), ohe_transform], axis='columns')

In [50]:
# Display the frame with the city column replaced by its one-hot columns
df

Unnamed: 0,sales,size,city_Jacksonville,city_Miami,city_Orlando,city_Tampa
0,100000,Small,0.0,0.0,0.0,1.0
1,222000,Medium,0.0,0.0,0.0,1.0
2,1000000,Large,0.0,0.0,1.0,0.0
3,522000,Large,1.0,0.0,0.0,0.0
4,111111,Small,0.0,1.0,0.0,0.0
5,222222,Medium,1.0,0.0,0.0,0.0
6,1111111,Large,0.0,1.0,0.0,0.0
7,20000,Small,0.0,1.0,0.0,0.0
8,75000,Medium,0.0,0.0,1.0,0.0
9,90000,Medium,0.0,0.0,1.0,0.0
