In [19]:
import sklearn
import pandas as pd
import numpy as np
import os
from sklearn import preprocessing
from sklearn import impute
from sklearn import pipeline
from sklearn import compose
from sklearn import model_selection
from sklearn import tree, neighbors

In [20]:
# Directory that holds the Titanic CSV files.
# NOTE: `dir` shadows the Python built-in of the same name; the name is kept
# because later cells build their file paths from it.
dir = "."
titanic_train = pd.read_csv(os.path.join(dir, "train.csv"))

# info() reports the number of rows and columns together with each column's
# dtype and non-null count, which is enough to spot the columns that need
# imputation.
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [21]:
### Pipeline for the continuous features (its imputation strategy is tuned by the grid search)
#### Impute missing values, then standardize so that all features share one scale for better learning

cont_features = ['SibSp', 'Parch', 'Age', 'Fare']
cont_pipe = pipeline.Pipeline(
    steps=[
        # Imputer is left at its default here; the grid search overrides its strategy.
        ('cont_imp', impute.SimpleImputer()),
        ('scaler', preprocessing.StandardScaler()),
    ]
)
cont_pipe

In [22]:
#### Pipeline for the categorical features
### Fill missing values with the most frequent category, then ordinal-encode the categories

cat_features = ['Embarked', 'Sex', 'Pclass']
cat_imputer = impute.SimpleImputer(strategy="most_frequent")
cat_encoder = preprocessing.OrdinalEncoder(dtype=np.int32)  # encoded output is int32
steps = [('cat_imp', cat_imputer), ('enc', cat_encoder)]
cat_pipe = pipeline.Pipeline(steps=steps)
cat_pipe

In [23]:
### Combine the two pipelines above into a single preprocessing step
# Each entry is (name, pipeline, columns it is applied to). The names are part
# of the grid-search parameter keys, so their spelling must not change.
column_pipelines = [
    ("categorical", cat_pipe, cat_features),
    ("continous", cont_pipe, cont_features),
]
pre_pipe = compose.ColumnTransformer(transformers=column_pipelines)
pre_pipe

In [24]:
### The preprocessing output must be fed to the model,
### so both are chained as consecutive steps of one pipeline.

stages = [
    ('pre', pre_pipe),                          # preprocessing of both feature groups
    ("knn", neighbors.KNeighborsClassifier()),  # classifier trained on the preprocessed features
]
pipe = pipeline.Pipeline(steps=stages)
pipe


In [25]:
### Hyper-parameter tuning with a cross-validated grid search

# Separate the label from the features. The preprocessing step only selects
# the columns listed in cat_features / cont_features, so removing 'Survived'
# does not change what the model sees; it guarantees the label can never reach
# the model as a feature and gives X_train the same columns as the test data.
y_train = titanic_train['Survived']
X_train = titanic_train.drop(columns=['Survived'])

# Keys address nested parameters as <step>__<sub-step>__<parameter>:
#   - imputation strategy of the 'cont_imp' step inside the 'continous' pipeline
#   - number of neighbours of the 'knn' classifier
pipe_grid = {
    'pre__continous__cont_imp__strategy': ['mean', 'median'],
    'knn__n_neighbors': [5, 7, 9],
}
cv = model_selection.KFold(10)  # 10-fold cross-validation
clf = model_selection.GridSearchCV(
    pipe,
    pipe_grid,
    cv=cv,
    scoring='accuracy',
    return_train_score=True,
)
clf.fit(X_train, y_train)

# Report the winning parameter combination, its mean cross-validated accuracy,
# its index in the search results, and the fitted best pipeline.
print(clf.best_params_)
print(clf.best_score_)
print(clf.best_index_)
print(clf.best_estimator_)

{'knn__n_neighbors': 7, 'pre__continous__cont_imp__strategy': 'mean'}
0.8002496878901374
2
Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('cat_imp',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('enc',
                                                                   OrdinalEncoder(dtype=<class 'numpy.int32'>))]),
                                                  ['Embarked', 'Sex',
                                                   'Pclass']),
                                                 ('continous',
                                                  Pipeline(steps=[('cont_imp',
                                                                   SimpleImputer()),
                                                                  ('scaler',
       

In [26]:
# Load the unlabeled test set and report its size.
titanic_test = pd.read_csv(os.path.join(dir, "test.csv"))
print(titanic_test.shape)

# X_test is the same object as titanic_test (no copy is made). The prediction
# on the right-hand side is computed before the 'Survived' column is added, so
# the model receives the frame exactly as it was read from disk.
X_test = titanic_test
titanic_test['Survived'] = clf.predict(X_test)

# Write the submission file: only the passenger id and the predicted label,
# without the DataFrame index.
titanic_test.to_csv(os.path.join(dir, "submission1.csv"), columns=["PassengerId", "Survived"], index=False)

(418, 11)
