## References

- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html#sklearn.model_selection.train_test_split
- https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
- https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html#sklearn.linear_model.RidgeClassifier
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler

In [1]:
%run ../common.py
import numpy as np

In [2]:
# Load the training CSV. open_dataset is not defined in this notebook —
# presumably it is provided by ../common.py via the %run above; TODO confirm.
titanic=open_dataset("../data/train.csv")

In [3]:
# Show the loaded dataset to inspect its columns and a sample of rows.
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.2500,1.0
1,2,1,1,0,38.0,1,0,71.2833,2.0
2,3,1,3,0,26.0,0,0,7.9250,1.0
3,4,1,1,0,35.0,1,0,53.1000,1.0
4,5,0,3,1,35.0,0,0,8.0500,1.0
...,...,...,...,...,...,...,...,...,...
886,887,0,2,1,27.0,0,0,13.0000,1.0
887,888,1,1,0,19.0,0,0,30.0000,1.0
888,889,0,3,0,,1,2,23.4500,1.0
889,890,1,1,1,26.0,0,0,30.0000,2.0


## Prepare Data

In [4]:
# Split the frame into the predictor columns and the prediction target.
features = titanic.drop(columns=["Survived"])
labels = titanic["Survived"]

In [5]:
# Peek at the first five target values (head() defaults to n=5).
labels.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [6]:
# Peek at the first five predictor rows (head() defaults to n=5).
features.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,1,22.0,1,0,7.25,1.0
1,2,1,0,38.0,1,0,71.2833,2.0
2,3,3,0,26.0,0,0,7.925,1.0
3,4,1,0,35.0,1,0,53.1,1.0
4,5,3,1,35.0,0,0,8.05,1.0


In [7]:
from sklearn.impute import SimpleImputer

In [8]:
# Fill missing ages with the median age of the column.
# NOTE(review): the median is learned from every row, including the rows that
# train_test_split later holds out as the test set — confirm this is acceptable.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
# The imputer expects a 2-D input, hence the reshape to a single column.
age = imp_median.fit_transform(features["Age"].to_numpy().reshape(-1, 1))
features["Age"] = age
# Inspect the distinct values after imputation.
features["Age"].unique()

array([22.  , 38.  , 26.  , 35.  , 28.  , 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  ,  8.  ,
       19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  , 49.  ,
       29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  , 16.  ,
       25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  , 71.  ,
       37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 , 51.  ,
       55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  , 45.5 ,
       20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  , 60.  ,
       10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  , 70.  ,
       24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [9]:
# List the distinct Embarked values to check for missing entries.
features["Embarked"].unique()

array([ 1.,  2.,  3., nan])

In [10]:
# Replace missing Embarked codes with the constant 1.
imp_constant = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=1)
# The imputer expects a 2-D input, hence the reshape to a single column.
embark = imp_constant.fit_transform(features["Embarked"].to_numpy().reshape(-1, 1))
features["Embarked"] = embark
# Inspect the distinct values after imputation.
features["Embarked"].unique()

array([1., 2., 3.])

In [11]:
from sklearn.model_selection import train_test_split

# Convert to plain arrays, then hold out 25% of the rows for testing.
# The fixed random_state keeps the split reproducible across runs.
X_full = features.to_numpy()
y_full = labels.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full, test_size=0.25, random_state=42
)

## Train Algorithms

In [12]:
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Toggle feature standardisation on or off.
normalize = True

if not normalize:
    X = X_train
else:
    # Diagnostic only: fit a scaler on the whole training split and report the
    # aggregate mean/std (taken over every element of the matrix) before and
    # after scaling. X keeps the scaled training matrix for later use.
    scaler = StandardScaler()
    print("X train mean", X_train.mean())
    print("X train std", X_train.std())
    scaler.fit(X_train)
    X = scaler.transform(X_train)
    print("X scaled mean", X.mean())
    print("X scaled std", X.std())
y = y_train

classifier = RidgeClassifier()

# The scaler must be learned inside each CV fold. Passing the pre-scaled X to
# cross_val_score would let every fold's validation rows influence the scaling
# statistics (data leakage), so the raw X_train is cross-validated through a
# pipeline instead. cross_val_score clones the estimator it is given, so
# `classifier` itself stays unfitted here.
cv_estimator = make_pipeline(StandardScaler(), classifier) if normalize else classifier
cval_score = cross_val_score(cv_estimator, X_train, y, cv=3, scoring="accuracy")

X train mean 64.25461770209581
X train std 172.62364320075324
X scaled mean -1.7949713571783968e-17
X scaled std 1.0000000000000004


In [13]:
# Per-fold accuracy from the 3-fold cross-validation above.
cval_score

array([0.8161435 , 0.77578475, 0.8018018 ])