In [1]:
import pandas as pd
import numpy as np

In [2]:
# Simulate 500 pitchers: integer fastball speeds drawn uniformly from 90..105.
np.random.seed(42)
fastball_speed = np.random.randint(low=90, high=106, size=500)

# Speeds of 96 or below are always labelled 0; faster pitchers get a label
# drawn as 1 with probability 0.7 and 0 with probability 0.3.
tommy_john = np.where(
    fastball_speed <= 96,
    0,
    np.random.choice([0, 1], size=500, p=[0.3, 0.7]),
)

In [3]:
# Collect the two simulated arrays into one frame; the keyword names become
# the column names.
d = dict(fastball_speed=fastball_speed, tommy_john=tommy_john)
df = pd.DataFrame(d)
df

Unnamed: 0,fastball_speed,tommy_john
0,96,0
1,93,0
2,102,1
3,104,1
4,100,0
...,...,...
495,104,1
496,92,0
497,101,1
498,90,0


In [4]:
# List selectors keep both the feature and the target as one-column DataFrames.
X = df.loc[:, ['fastball_speed']]
y = df.loc[:, ['tommy_john']]

In [6]:
# Plain hold-out split: 30% of the rows are set aside for testing, with a
# fixed seed so the partition is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [10]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings. The target is a one-column
# frame, so it is flattened to a 1-D array before fitting.
lr = LogisticRegression()
lr.fit(X_train, y_train.to_numpy().ravel())

In [11]:
# Score the fitted model on the held-out test split (mean accuracy for a classifier).
lr.score(X_test, y_test)

0.7666666666666667

In [13]:
# Basic cross-validation: ten folds over the full data set, using the
# estimator's default scorer.
from sklearn.model_selection import cross_val_score

CVS = cross_val_score(estimator=lr, X=X, y=y.to_numpy().ravel(), cv=10)
CVS

array([0.68, 0.78, 0.72, 0.72, 0.78, 0.82, 0.74, 0.74, 0.76, 0.82])

In [14]:
# Mean of the ten per-fold scores.
CVS.mean()

np.float64(0.756)

In [15]:
# Spread (population standard deviation) of the ten per-fold scores.
CVS.std()

np.float64(0.04270831300812523)

In [20]:
# K-fold: 16 shuffled folds with a fixed seed, scored on both F1 and accuracy.
from sklearn.model_selection import KFold, cross_validate

kf = KFold(n_splits=16, shuffle=True, random_state=42)

# Evaluate both metrics in one pass so each fold's model is fitted once
# instead of once per metric. The splitter and estimator are unchanged, so the
# per-fold arrays are the same as two separate cross_val_score calls would give.
kf_results = cross_validate(
    lr,
    X,
    y.values.ravel(),
    cv=kf,
    scoring=['f1', 'accuracy'],
)
kfscore = kf_results['test_f1']          # per-fold F1
kfscore2 = kf_results['test_accuracy']   # per-fold accuracy

In [21]:
# Show the per-fold F1 scores from the 16-split shuffled K-fold run.
kfscore

array([0.63636364, 0.88      , 0.63636364, 0.78787879, 0.60606061,
       0.7       , 0.84615385, 0.75      , 0.63157895, 0.72      ,
       0.52631579, 0.66666667, 0.6       , 0.69230769, 0.66666667,
       0.57142857])

In [22]:
# Mean F1 across the K-fold splits.
kfscore.mean()

np.float64(0.6823615529207634)

In [23]:
# Fold-to-fold spread (population standard deviation) of the F1 scores.
kfscore.std()

np.float64(0.09353783520595774)

In [24]:
# Show the per-fold accuracy scores from the same 16-split K-fold run.
kfscore2

array([0.75      , 0.90625   , 0.75      , 0.78125   , 0.58064516,
       0.80645161, 0.87096774, 0.80645161, 0.77419355, 0.77419355,
       0.70967742, 0.70967742, 0.74193548, 0.74193548, 0.77419355,
       0.61290323])

In [25]:
# Mean accuracy across the K-fold splits.
kfscore2.mean()

np.float64(0.7556703629032258)

In [27]:
# Fold-to-fold spread (population standard deviation) of the accuracy scores.
kfscore2.std()

np.float64(0.07846074043763407)

In [29]:
# Stratified K-fold: ten shuffled folds with a fixed seed.
from sklearn.model_selection import StratifiedKFold

SKF = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [30]:
# Score the classifier on each stratified fold with the default scorer.
SKFscore = cross_val_score(estimator=lr, X=X, y=y.to_numpy().ravel(), cv=SKF)

In [31]:
# Show the per-fold scores from the stratified 10-fold run.
SKFscore

array([0.8 , 0.84, 0.64, 0.8 , 0.74, 0.72, 0.78, 0.8 , 0.7 , 0.74])

In [32]:
# Mean score across the stratified folds.
SKFscore.mean()

np.float64(0.756)

In [33]:
# Fold-to-fold spread (population standard deviation) of the stratified scores.
SKFscore.std()

np.float64(0.05642694391866354)

In [39]:
# Pipeline: standardize the feature, then fit logistic regression.

from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scale = StandardScaler()
# Give the pipeline its own unfitted copy of the classifier (same
# hyperparameters as `lr`). Pipeline.fit fits its final step in place, so
# passing `lr` itself would refit the standalone model on standardized inputs
# and silently change what `lr` returns for raw feature values.
pipe = make_pipeline(scale, clone(lr))
pipe.fit(X_train, y_train.values.ravel())


In [42]:
# Ten-fold cross-validation of the whole scaling + classifier pipeline.
scorepipe = cross_val_score(estimator=pipe, X=X, y=y.to_numpy().ravel(), cv=10)
scorepipe

array([0.68, 0.78, 0.72, 0.72, 0.78, 0.82, 0.74, 0.74, 0.76, 0.82])

In [45]:
# Mean of the pipeline's per-fold scores.
scorepipe.mean()

np.float64(0.756)

In [46]:
# Fold-to-fold spread (population standard deviation) of the pipeline's scores.
scorepipe.std()

np.float64(0.04270831300812523)