In [1]:
import pandas as pd
import os
import sklearn
from sklearn import tree 
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import impute
from sklearn import pipeline
from sklearn import compose
import numpy as np
# Notebook-wide scikit-learn display option: show estimators using the
# "diagram" representation when they are the output of a cell.
sklearn.set_config(display="diagram")

In [3]:
# Folder holding the Titanic CSV files. The original hard-coded location stays
# the default; set the TITANIC_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
# NOTE: `dir` shadows the builtin of the same name. The name is kept because
# the later cells that read test.csv and write the submission / model files
# reference it.
dir = os.environ.get("TITANIC_DATA_DIR", "C:/Users/pc/Downloads/ai-level1/titanic")
titanic_train = pd.read_csv(os.path.join(dir, "train.csv"))
# A bare `.shape` expression in the middle of a cell is never displayed,
# so print it explicitly.
print(titanic_train.shape)
# Column dtypes and non-null counts (shows which columns need imputation).
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Numeric branch of the preprocessing: impute missing values, then standardise.
# The step names ('cont_imp', 'scaler') are part of the parameter paths used
# by the grid search, so they must not be renamed.
cont_features = ['SibSp', 'Parch', 'Age']
steps = [
    ('cont_imp', impute.SimpleImputer()),
    ('scaler', preprocessing.StandardScaler()),
]
cont_pipe = pipeline.Pipeline(steps=steps)
# Uncomment to preview the transformed numeric columns / display the pipeline:
#cont_pipe.fit_transform(titanic_train[cont_features])
#cont_pipe

array([[ 0.43279337, -0.47367361, -0.5924806 ],
       [ 0.43279337, -0.47367361,  0.63878901],
       [-0.4745452 , -0.47367361, -0.2846632 ],
       ...,
       [ 0.43279337,  2.00893337,  0.        ],
       [-0.4745452 , -0.47367361, -0.2846632 ],
       [-0.4745452 , -0.47367361,  0.17706291]])

In [7]:
# Categorical branch of the preprocessing: fill gaps with the most frequent
# value of each column, then map every category to an int32 code.
cat_features = ['Embarked', 'Sex', 'Pclass']
steps = [
    ('cat_imp', impute.SimpleImputer(strategy="most_frequent")),
    ('enc', preprocessing.OrdinalEncoder(dtype=np.int32)),
]
cat_pipe = pipeline.Pipeline(steps=steps)
# Uncomment to preview the encoded categorical columns / display the pipeline:
#cat_pipe.fit_transform(titanic_train[cat_features])
#cat_pipe

array([[2, 1, 2],
       [0, 0, 0],
       [2, 0, 2],
       ...,
       [2, 0, 2],
       [0, 1, 0],
       [1, 1, 2]])

In [8]:
# Combine both branches: each sub-pipeline only sees its own column list.
# The transformer names ("categorical", "numerical") appear in the grid-search
# parameter paths, so they are kept as-is.
pre_pipe = compose.ColumnTransformer(
    transformers=[
        ("categorical", cat_pipe, cat_features),
        ("numerical", cont_pipe, cont_features),
    ]
)
# Uncomment to preview the combined feature matrix / display the transformer:
#pre_pipe.fit_transform(titanic_train)
#pre_pipe

array([[ 2.        ,  1.        ,  2.        ,  0.43279337, -0.47367361,
        -0.5924806 ],
       [ 0.        ,  0.        ,  0.        ,  0.43279337, -0.47367361,
         0.63878901],
       [ 2.        ,  0.        ,  2.        , -0.4745452 , -0.47367361,
        -0.2846632 ],
       ...,
       [ 2.        ,  0.        ,  2.        ,  0.43279337,  2.00893337,
         0.        ],
       [ 0.        ,  1.        ,  0.        , -0.4745452 , -0.47367361,
        -0.2846632 ],
       [ 1.        ,  1.        ,  2.        , -0.4745452 , -0.47367361,
         0.17706291]])

In [9]:
# End-to-end model: column preprocessing followed by a decision tree.
# random_state is pinned because the tree breaks ties between equally good
# splits at random; without a seed the fitted tree (and therefore the
# cross-validation scores) can differ between runs of the notebook.
pipe = pipeline.Pipeline(
    steps=[
        ("pre", pre_pipe),
        ("dt", tree.DecisionTreeClassifier(random_state=42)),
    ]
)
# Last expression of the cell: displays the assembled pipeline.
pipe

In [13]:
# Features and target. The target column is removed from the feature frame
# explicitly so the model can never see it, instead of relying on the
# preprocessing step to discard columns it was not told about.
X_train = titanic_train.drop(columns=['Survived'])
y_train = titanic_train['Survived']

# Hyper-parameter grid. Keys are "<step>__<param>" paths into `pipe`:
# the imputation strategy of the numeric branch and two tree-size controls.
pipe_grid = {
    'pre__numerical__cont_imp__strategy': ['mean', 'median'],
    'dt__max_depth': list(range(1, 10)),
    'dt__min_samples_split': [2, 5, 10],
}

# 10-fold cross-validated exhaustive search, scored by accuracy.
cv = model_selection.KFold(10)
clf = model_selection.GridSearchCV(
    pipe,
    pipe_grid,
    cv=cv,
    scoring='accuracy',
    return_train_score=True,
)
clf.fit(X_train, y_train)

# Summary of the winning configuration.
print(clf.best_params_)
print(clf.best_score_)
print(clf.best_index_)
print(clf.best_estimator_)

# Per-candidate results: parameters, a few individual fold scores and the mean.
df = pd.DataFrame(clf.cv_results_)
columns = ['params', 'split0_test_score', 'split1_test_score', 'split9_test_score', 'mean_test_score']
print(df[columns])

{'dt__max_depth': 4, 'dt__min_samples_split': 10, 'pre__numerical__cont_imp__strategy': 'median'}
0.8283021223470662
23
Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('cat_imp',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('enc',
                                                                   OrdinalEncoder(dtype=<class 'numpy.int32'>))]),
                                                  ['Embarked', 'Sex',
                                                   'Pclass']),
                                                 ('numerical',
                                                  Pipeline(steps=[('cont_imp',
                                                                   SimpleImputer(strategy='median')),
                                      

In [14]:
# Read the test split from the same data folder, report its dimensions
# and preview the first rows.
test_csv_path = os.path.join(dir, "test.csv")
titanic_test = pd.read_csv(test_csv_path)
print(titanic_test.shape)
titanic_test.head()

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [15]:
# Build the feature frame as a separate object that never contains a
# 'Survived' column. Previously X_test was just another name for
# titanic_test, so storing the predictions also changed X_test, and
# re-running this cell passed the earlier predictions back in as input.
# errors='ignore' makes the drop a no-op on the first run, when the column
# does not exist yet.
X_test = titanic_test.drop(columns=['Survived'], errors='ignore')
# Store the predictions on titanic_test; the submission cell reads them from there.
titanic_test['Survived'] = clf.predict(X_test)
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0


In [16]:
# Write the submission file: passenger id plus predicted label, without the
# DataFrame index.
submission_path = os.path.join(dir, "submission1.csv")
titanic_test[["PassengerId", "Survived"]].to_csv(submission_path, index=False)

In [17]:
import joblib

In [18]:
# Save the fitted grid-search object to disk with joblib. It is wrapped in a
# dict under the key 'clf', so loading the file returns that dict.
objects = dict(clf=clf)
model_path = os.path.join(dir, "titanic_v2.pkl")
# Kept as the last expression so the cell still displays joblib.dump's
# return value (the list of written file names).
joblib.dump(objects, model_path)

['C:/Users/pc/Downloads/ai-level1/titanic\\titanic_v2.pkl']