In [40]:
%cd /content/drive/My Drive/Datasets/titanic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler,FunctionTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

pd.set_option("display.max_columns",500)

/content/drive/My Drive/Datasets/titanic


In [41]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_ans = pd.read_csv("gender_submission.csv")

display(df_train.info())
display(df_train.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


None

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [42]:
col_drop = ['Cabin','PassengerId','Name','Ticket']
col_dummies = ['Sex','Embarked']

df_train.drop(columns=col_drop,inplace = True)
df_train = df_train[df_train['Embarked'].notna()]
X = df_train.drop('Survived',axis = 1)
y = pd.DataFrame(data=df_train['Survived'])

X = X.fillna(np.nan)
X = pd.get_dummies(X,drop_first = True)

df_test.drop(columns=col_drop, inplace = True)
X_test = df_test[df_test['Embarked'].notna()]

X['Age'] = X.Age.apply(lambda x: X.Age.mean() if np.isnan(x) else x)
X_test['Age'] = X_test.Age.apply(lambda x: X_test.Age.mean() if np.isnan(x) else x)
X_test['Fare'] = X_test.Fare.apply(lambda x: X_test.Fare.mean() if np.isnan(x) else x)
X_test = pd.get_dummies(X_test, drop_first = True)
y_test = df_ans.drop('PassengerId', axis = 1)

display(X.head())
display(y.head())
display(X_test.head())
display(y_test.head())

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,1,0,1
1,1,38.0,1,0,71.2833,0,0,0
2,3,26.0,0,0,7.925,0,0,1
3,1,35.0,1,0,53.1,0,0,1
4,3,35.0,0,0,8.05,1,0,1


Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3,34.5,0,0,7.8292,1,1,0
1,3,47.0,1,0,7.0,0,0,1
2,2,62.0,0,0,9.6875,1,1,0
3,3,27.0,0,0,8.6625,1,0,1
4,3,22.0,1,1,12.2875,0,0,1


Unnamed: 0,Survived
0,0
1,1
2,0
3,0
4,1


In [43]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

knn = KNeighborsClassifier()
logreg = LogisticRegression(solver='liblinear')
svc = SVC()
dt = RandomForestClassifier(min_samples_leaf = 0.1)

cat_col = ['Pclass','Sex_male','Embarked_Q','Embarked_S']
num_col = [i for i in X.columns if i not in cat_col]

In [44]:
feature_union = FeatureUnion([
      ('category', FunctionTransformer(lambda x: x[cat_col])),
      ('numeric', Pipeline([
        ('select', FunctionTransformer(lambda x: x[num_col])),
        ('scale', StandardScaler()),
        ('PCA', PCA())
        ])
      )
])

pca_grid = '__feature_select__numeric__PCA__n_components'
pca_grid_value = [1,2,3,4]

knn_union = Pipeline([('feature_select',feature_union),('KNN',knn)])
logreg_union = Pipeline([('feature_select',feature_union), ('LOGREG', logreg)])
svc_union = Pipeline([('feature_select',feature_union), ('SVC',svc)])
vote = VotingClassifier(estimators=[('knn',knn_union),('log',logreg_union),('svc',svc_union), ('dt', dt)])
params = {'knn__KNN__n_neighbors': [2,3,4,5,6], 'log__LOGREG__penalty': ['l1','l2'], 'log__LOGREG__C': [0.001, 0.01, 0.1, 1]
  ,'svc__SVC__C': [0.001, 0.01, 0.1, 1], 'knn'+pca_grid: pca_grid_value, 'log'+pca_grid: pca_grid_value, 'svc'+pca_grid: pca_grid_value
  ,'dt__max_features': ['sqrt','log2'], 'dt__n_estimators': [50,100,150,200], 'dt__criterion': ['gini','entropy']
}

In [45]:
model_vote = RandomizedSearchCV(vote, params, n_jobs = -1)
model_vote.fit(X, y.values.ravel())

y_train_pred = model_vote.predict(X)
y_test_pred = model_vote.predict(X_test)

display("Parameter : ",model_vote.best_params_)
print('\nTraining Score: {}, Test Score: {}'.format(model_vote.score(X, y), model_vote.score(X_test, y_test)))
print('\nClassification_report:\n',classification_report(y_test, y_test_pred))
print('\nConfusion_matrix:\n',confusion_matrix(y_test, y_test_pred))

'Parameter : '

{'dt__criterion': 'entropy',
 'dt__max_features': 'log2',
 'dt__n_estimators': 50,
 'knn__KNN__n_neighbors': 4,
 'knn__feature_select__numeric__PCA__n_components': 2,
 'log__LOGREG__C': 0.1,
 'log__LOGREG__penalty': 'l1',
 'log__feature_select__numeric__PCA__n_components': 1,
 'svc__SVC__C': 1,
 'svc__feature_select__numeric__PCA__n_components': 1}


Training Score: 0.8256467941507312, Test Score: 0.9043062200956937

Classification_report:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93       266
           1       1.00      0.74      0.85       152

    accuracy                           0.90       418
   macro avg       0.93      0.87      0.89       418
weighted avg       0.92      0.90      0.90       418


Confusion_matrix:
 [[266   0]
 [ 40 112]]
