In [184]:
import pandas as pd
import numpy as np
import seaborn as sb
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import MultinomialNB

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.neighbors import KNeighborsClassifier as knnc
from sklearn.naive_bayes import GaussianNB as gnb

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import warnings
warnings.filterwarnings('ignore')


In [185]:
train=pd.read_csv('titanic_train.csv',header=0)

In [186]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [187]:
cols=['Name','Ticket','Cabin']
train = train.drop(cols,axis=1)

In [188]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 62.8+ KB


In [189]:
train=train.dropna()

In [190]:
dummies = []
cols = ['Pclass','Sex','Embarked']
for col in cols:
    dummies.append(pd.get_dummies(train[col]))

In [191]:
train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Sex          712 non-null    object 
 4   Age          712 non-null    float64
 5   SibSp        712 non-null    int64  
 6   Parch        712 non-null    int64  
 7   Fare         712 non-null    float64
 8   Embarked     712 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 55.6+ KB


In [192]:
titanic_dummies = pd.concat(dummies, axis=1)

In [193]:
train = pd.concat((train,titanic_dummies),axis=1)

In [194]:
train = train.drop(['Pclass','Sex','Embarked'],axis=1)

In [195]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Age          712 non-null    float64
 3   SibSp        712 non-null    int64  
 4   Parch        712 non-null    int64  
 5   Fare         712 non-null    float64
 6   1            712 non-null    uint8  
 7   2            712 non-null    uint8  
 8   3            712 non-null    uint8  
 9   female       712 non-null    uint8  
 10  male         712 non-null    uint8  
 11  C            712 non-null    uint8  
 12  Q            712 non-null    uint8  
 13  S            712 non-null    uint8  
dtypes: float64(2), int64(4), uint8(8)
memory usage: 44.5 KB


In [196]:
train['Age'] = train['Age'].interpolate()

In [197]:
X = train.values
y = train['Survived'].values

In [198]:
X = np.delete(X,1,axis=1)

In [203]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [204]:
classifiers = [
    knnc(),
    dtc(),
    SVC(),
    SVC(kernel='linear'),
    gnb()
]
classifier_names = [
    'K nearest neighbors',
    'Decision Tree Classifier',
    'SVM classifier with RBF kernel',
    'SVM classifier with linear kernel',
    'Gaussian Naive Bayes'
]    

In [205]:
for clf, clf_name in zip(classifiers, classifier_names):
    cv_scores = cross_val_score(clf, X_train, y_train, cv=10)
    
    print(clf_name, ' mean accuracy: ', round(cv_scores.mean()*100, 3), '% std: ', round(cv_scores.var()*100, 3),'%')

K nearest neighbors  mean accuracy:  64.269 % std:  0.375 %
Decision Tree Classifier  mean accuracy:  75.706 % std:  0.236 %
SVM classifier with RBF kernel  mean accuracy:  64.461 % std:  0.037 %
SVM classifier with linear kernel  mean accuracy:  77.918 % std:  0.133 %
Gaussian Naive Bayes  mean accuracy:  78.718 % std:  0.144 %


In [210]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=5)
clf.fit(X_train,y_train)
clf.score(X_test,y_test)


0.7616822429906542

In [211]:
clf.feature_importances_

array([0.05195205, 0.12199133, 0.06941619, 0.00571734, 0.08521236,
       0.06113314, 0.        , 0.13903877, 0.        , 0.46553881,
       0.        , 0.        , 0.        ])

In [212]:
from sklearn import ensemble
clf = ensemble.RandomForestClassifier(n_estimators=100)
clf.fit (X_train, y_train)
clf.score (X_test, y_test)

0.7757009345794392

In [215]:
clf = ensemble.GradientBoostingClassifier()
clf.fit (X_train, y_train)
clf.score (X_test, y_test)

0.8037383177570093

In [222]:
clf = ensemble.GradientBoostingClassifier(n_estimators=100)
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.8037383177570093