In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv('titanic.csv')

In [3]:
dataset.shape

(891, 12)

In [4]:
dataset.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
X = dataset.iloc[:,[2,4,5,6,9]].values
y = dataset.iloc[:,1].values 

In [6]:
from sklearn.impute import SimpleImputer

si = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:,[0,2,3,4]] = si.fit_transform(X[:,[0,2,3,4]])

In [7]:
from sklearn.preprocessing import LabelEncoder

X[:, 1] = LabelEncoder().fit_transform(X[:, 1])

In [8]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

X[:, [2,4]] = sc.fit_transform(X[:, [2,4]])

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)

In [10]:
X_train.shape

(712, 5)

In [11]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

X_train[:,] = sc.fit_transform(X_train[:,])
X_test[:,] = sc.transform(X_test[:,])

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

## Logistic regression

In [13]:
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(random_state=43)
logistic_model.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=43, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
acc = cross_val_score(logistic_model, X_train, y_train, cv=10)

In [15]:
acc.mean()*100

79.36032863849765

In [16]:
params = [{'C': [0.61, 0.62, 0.63, 0.64, 0.65, 0.7, 1], 'solver': ['liblinear', 'lbfgs', 'saga'], 'max_iter': [100, 200, 300]}]

In [17]:
logistic_grid = GridSearchCV(logistic_model, params, scoring='accuracy', cv=10, n_jobs=-1)

logistic_grid.fit(X_train, y_train)
print(logistic_grid.best_score_)
print(logistic_grid.best_params_)

0.7936032863849765
{'C': 0.61, 'max_iter': 100, 'solver': 'liblinear'}


In [18]:
logistic_model = LogisticRegression(C=0.61, solver='liblinear', random_state=43)

In [19]:
logistic_model.fit(X_train,y_train)

LogisticRegression(C=0.61, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=43, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
logistic_pred = logistic_model.predict(X_test)

In [21]:
from sklearn.metrics import confusion_matrix, accuracy_score

c1 = confusion_matrix(y_test,logistic_pred)
print(c1)
accuracy_score(y_test,logistic_pred)

[[94 16]
 [21 48]]


0.7932960893854749

## K-Nearest Neighbours

In [22]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_model.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [23]:
acc = cross_val_score(knn_model, X_train, y_train, cv=10)

In [24]:
acc.mean()*100

78.79890453834116

In [25]:
params = [{'n_neighbors': [8,9,10,11,12], 'metric': ['euclidean', 'minkowski'], 'p': [2,3,4]}]

In [26]:
knn_grid = GridSearchCV(knn_model, params, scoring='accuracy', cv=10, n_jobs=-1)

knn_grid.fit(X_train, y_train)
print(knn_grid.best_score_)
print(knn_grid.best_params_)

0.8202660406885759
{'metric': 'minkowski', 'n_neighbors': 9, 'p': 3}


In [27]:
knn_model = KNeighborsClassifier(metric='minkowski', n_neighbors=9, n_jobs=-1, p=3)
knn_model.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=9, p=3,
                     weights='uniform')

In [28]:
knn_pred = knn_model.predict(X_test)

In [29]:
c2 = confusion_matrix(y_test,knn_pred)
print(c2)
accuracy_score(y_test,knn_pred)

[[97 13]
 [21 48]]


0.8100558659217877

## SVM

In [30]:
from sklearn.svm import SVC

svm_model = SVC(kernel='rbf',random_state=0)
svm_model.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [31]:
acc = cross_val_score(svm_model, X_train, y_train, cv=10)

In [32]:
acc.mean()*100

82.31220657276994

In [33]:
params = [{'C': [1,2,3], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'degree': [2,3,4,5]}]

In [34]:
svm_grid = GridSearchCV(svm_model, params, scoring='accuracy', cv=10, n_jobs=-1)

svm_grid.fit(X_train, y_train)
print(svm_grid.best_score_)
print(svm_grid.best_params_)

0.8231220657276996
{'C': 3, 'degree': 2, 'kernel': 'rbf'}


In [35]:
svm_model = SVC(kernel='rbf',random_state=0,C=3,degree=2)
svm_model.fit(X_train, y_train)

SVC(C=3, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=2, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [36]:
svm_pred = svm_model.predict(X_test)

In [37]:
c3 = confusion_matrix(y_test,svm_pred)
print(c3)
accuracy_score(y_test,svm_pred)

[[98 12]
 [23 46]]


0.8044692737430168

## Naive Bayes

In [38]:
from sklearn.naive_bayes import GaussianNB

naive_model = GaussianNB()
naive_model.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [39]:
acc = cross_val_score(naive_model, X_train, y_train, cv=10)

In [40]:
acc.mean()*100

78.80672926447573

In [41]:
naive_pred = naive_model.predict(X_test)

In [42]:
c5 = confusion_matrix(y_test,naive_pred)
print(c5)
accuracy_score(y_test,naive_pred)

[[90 20]
 [17 52]]


0.7932960893854749

## Decision tree

In [43]:
from sklearn.tree import DecisionTreeClassifier

decision_tree_model = DecisionTreeClassifier(criterion='entropy',random_state=0)
decision_tree_model.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [44]:
acc = cross_val_score(decision_tree_model, X_train, y_train, cv=10)

In [45]:
acc.mean()*100

77.37284820031299

In [46]:
params = [{'criterion': ['gini', 'entropy'], 'max_features': ['auto', 'log2', 'sqrt']}]

In [47]:
decision_tree_grid = GridSearchCV(decision_tree_model, params, scoring='accuracy', cv=10, n_jobs=-1)

decision_tree_grid.fit(X_train, y_train)
print(decision_tree_grid.best_score_)
print(decision_tree_grid.best_params_)

0.7906885758998434
{'criterion': 'gini', 'max_features': 'auto'}


In [48]:
decision_tree_model = DecisionTreeClassifier(criterion='gini',random_state=0,max_features='auto')
decision_tree_model.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [49]:
decision_tree_pred = decision_tree_model.predict(X_test)

In [50]:
c6 = confusion_matrix(y_test,decision_tree_pred)
print(c6)
accuracy_score(y_test,decision_tree_pred)

[[96 14]
 [20 49]]


0.8100558659217877

In [51]:
from sklearn import tree
import graphviz

dot_data = tree.export_graphviz(decision_tree_model, out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("visual_tree") 

'visual_tree.pdf'

## Random forest classifier

In [52]:
from sklearn.ensemble import RandomForestClassifier

random_forest_model = RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=0)
random_forest_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [53]:
acc = cross_val_score(random_forest_model, X_train, y_train, cv=10)

In [54]:
acc.mean()*100

81.87206572769954

In [55]:
params = [{'n_estimators':[8,9,10,11,12,15], 'criterion': ['gini', 'entropy'], 'max_features': ['auto', 'log2', 'sqrt']}]

In [56]:
random_forest_grid = GridSearchCV(random_forest_model, params, scoring='accuracy', cv=15, n_jobs=-1)

random_forest_grid.fit(X_train, y_train)
print(random_forest_grid.best_score_)
print(random_forest_grid.best_params_)

0.8271572104018912
{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 12}


In [57]:
random_forest_model = RandomForestClassifier(n_estimators=11,criterion='entropy',random_state=0,max_features='auto')
random_forest_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=11,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [58]:
random_forest_pred = random_forest_model.predict(X_test)

In [59]:
c7 = confusion_matrix(y_test,random_forest_pred)
print(c7)
accuracy_score(y_test,random_forest_pred)

[[99 11]
 [20 49]]


0.8268156424581006