# Importing dataset and pre-processing

Import the libraries

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Import the dataset

In [3]:
# Read the pre-formatted passenger table from the working directory into a
# DataFrame, then show it as the cell output.
dataset = pd.read_csv(filepath_or_buffer="train_formatted.csv")
dataset

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,unknown,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,unknown,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,unknown,S
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,unknown,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,unknown,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C,C


Extracting matrix of features and dependent variable

In [0]:
# Select features and target by column NAME instead of by position, so the
# extraction cannot silently shift if a column (for example a saved index) is
# added to or removed from the CSV.
# "Name" is deliberately left out; "Ticket" is kept here and removed in a
# later step, after the categorical columns have been label-encoded.
FEATURE_COLUMNS = ["Pclass", "Sex", "Age", "SibSp", "Parch",
                   "Ticket", "Fare", "Cabin", "Embarked"]
X = dataset[FEATURE_COLUMNS].values   # feature matrix (mixed types -> object array)
y = dataset["Survived"].values        # dependent variable

In [13]:
# Inspect the freshly extracted feature matrix (still holds raw strings and NaN ages).
X

array([[3, 'male', 22.0, ..., 7.25, 'unknown', 'S'],
       [1, 'female', 38.0, ..., 71.2833, 'C', 'C'],
       [3, 'female', 26.0, ..., 7.925, 'unknown', 'S'],
       ...,
       [3, 'female', nan, ..., 23.45, 'unknown', 'S'],
       [1, 'male', 26.0, ..., 30.0, 'C', 'C'],
       [3, 'male', 32.0, ..., 7.75, 'unknown', 'Q']], dtype=object)

Dealing with Missing Values

In [0]:
# Fill missing ages (feature column 2) with the mean age.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
# The 2:3 slice keeps the column two-dimensional, which is the shape the
# imputer works with; fit_transform learns the mean and applies it in one call.
# NOTE(review): the mean is computed over every row, including rows that are
# later held out as the test set -- consider fitting on the training rows only.
X[:, 2:3] = imputer.fit_transform(X[:, 2:3])

In [15]:
# Re-display the feature matrix to confirm the NaN ages were replaced by the mean.
X

array([[3, 'male', 22.0, ..., 7.25, 'unknown', 'S'],
       [1, 'female', 38.0, ..., 71.2833, 'C', 'C'],
       [3, 'female', 26.0, ..., 7.925, 'unknown', 'S'],
       ...,
       [3, 'female', 29.69911764705882, ..., 23.45, 'unknown', 'S'],
       [1, 'male', 26.0, ..., 30.0, 'C', 'C'],
       [3, 'male', 32.0, ..., 7.75, 'unknown', 'Q']], dtype=object)

Encoding categorical variables

In [0]:
# Turn the categorical feature columns into integer codes.
# OneHotEncoder is imported here because the dummy-coding cells below use it.
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
labelencoder_X = LabelEncoder()
# Feature columns at this point: 0 = Pclass, 1 = Sex, 7 = Cabin, 8 = Embarked.
# The same encoder object is re-fitted for each column in turn.
for column_index in (0, 1, 7, 8):
    X[:, column_index] = labelencoder_X.fit_transform(X[:, column_index])

In [0]:
# Drop the ticket column (index 5); every other feature column is kept in order.
X = np.delete(X, 5, axis=1)

Dummy coding categorical variables with more than 2 categories

In [0]:
# Dummy-code column 0 (the label-encoded passenger class) with OneHotEncoder.
from sklearn.compose import ColumnTransformer

transformer = ColumnTransformer(
    transformers=[
        ("OneHot",                     # Just a name
         OneHotEncoder(drop='first'),  # Drop the first dummy to avoid the dummy variable trap
         [0]                           # The column(s) to be applied on.
         )
    ],
    remainder='passthrough',  # do not apply anything to the remaining columns
    # Always return a dense ndarray. With the default threshold the result
    # type depends on how sparse the data happens to be, but the following
    # cells call X.tolist() and scale X, which require a dense array.
    sparse_threshold=0
)
# The encoded dummies come first, followed by the untouched columns.
X = transformer.fit_transform(X.tolist())
X = X.astype('float64')

In [0]:
# Dummy-code column 7 with OneHotEncoder.
# NOTE(review): index 7 is presumably the label-encoded Cabin column; that
# holds only if the previous encoding step left two class dummies in front of
# the original columns -- verify if the data changes.
from sklearn.compose import ColumnTransformer

transformer = ColumnTransformer(
    transformers=[
        ("OneHot",                     # Just a name
         OneHotEncoder(drop='first'),  # Drop the first dummy to avoid the dummy variable trap
         [7]                           # The column(s) to be applied on.
         )
    ],
    remainder='passthrough',  # do not apply anything to the remaining columns
    # Always return a dense ndarray. With the default threshold the result
    # type depends on how sparse the data happens to be, but the following
    # cells call X.tolist() and scale X, which require a dense array.
    sparse_threshold=0
)
# The encoded dummies come first, followed by the untouched columns.
X = transformer.fit_transform(X.tolist())
X = X.astype('float64')

In [0]:
# Dummy-code column 15 with OneHotEncoder.
# NOTE(review): index 15 is presumably the label-encoded Embarked column (the
# last column after the two earlier encoding steps); it depends on how many
# categories those steps produced -- verify if the data changes.
from sklearn.compose import ColumnTransformer

transformer = ColumnTransformer(
    transformers=[
        ("OneHot",                     # Just a name
         OneHotEncoder(drop='first'),  # Drop the first dummy to avoid the dummy variable trap
         [15]                          # The column(s) to be applied on.
         )
    ],
    remainder='passthrough',  # do not apply anything to the remaining columns
    # Always return a dense ndarray. With the default threshold the result
    # type depends on how sparse the data happens to be, but the split and
    # scaling steps that follow expect a dense array.
    sparse_threshold=0
)
# The encoded dummies come first, followed by the untouched columns.
X = transformer.fit_transform(X.tolist())
X = X.astype('float64')

In [21]:
# Spot-check one fully encoded row (row index 1) of the numeric feature matrix.
X[1,:]

array([ 0.    ,  0.    ,  0.    ,  0.    ,  1.    ,  0.    ,  0.    ,
        0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
       38.    ,  1.    ,  0.    , 71.2833])

Splitting the dataset into training set and test set

In [0]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# partition the same on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=0,
    test_size=0.2,
)

Applying feature scaling

In [0]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training rows only, then apply the
# same statistics to both partitions. Every column is scaled, including the
# dummy variables.
sc_X2 = StandardScaler().fit(X_train)
X_train = sc_X2.transform(X_train)
X_test = sc_X2.transform(X_test)

In [24]:
# Spot-check one standardized training row (row index 1).
X_train[1,:]

array([-0.31426968,  0.61414657, -0.05307449, -0.23076923, -0.25664813,
       -0.19468147, -0.20973381, -0.10660036, -0.06504853, -0.03750293,
        0.54488848,  1.96893685, -1.11944833,  0.72882288,  0.09662937,
       -0.46445234, -0.47741019, -0.42640542])

In [25]:
# (rows, features) of the training matrix; the feature count is the network's input width.
X_train.shape

(712, 18)

# Building the Artificial Neural Network

In [0]:
# Import Keras through the public `tensorflow.keras` API. The
# `tensorflow.python.*` namespace is private and unsupported, and it is not
# guaranteed to exist or behave the same across TensorFlow releases.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

Initialising the ANN

In [0]:
# Start from an empty Sequential model; layers are appended in the cells below.
classifier=Sequential()

Adding 1st Layer

In [0]:
# First hidden layer: 10 ReLU units. The input width is read from the training
# matrix instead of being hard-coded, so the model still builds if the number
# of dummy columns produced by the encoding steps changes.
# NOTE(review): only the bias uses the 'uniform' initializer here; the kernel
# keeps the Dense default -- confirm kernel_initializer was not intended.
classifier.add(Dense(10,bias_initializer='uniform',activation='relu',input_shape=(X_train.shape[1],)))

Adding 2nd layer

In [0]:
# Second hidden layer: 6 ReLU units, biases drawn from the 'uniform' initializer.
classifier.add(
    Dense(units=6, activation='relu', bias_initializer='uniform')
)

Adding output layer

In [0]:
# Output layer: a single sigmoid unit, so the model emits one value in [0, 1] per row.
classifier.add(
    Dense(units=1, activation='sigmoid', bias_initializer='uniform')
)

In [0]:
# Configure training: Adam optimizer with binary cross-entropy loss, tracking
# accuracy. `metrics` is passed as a list, which is the documented form; a
# bare string is rejected by newer Keras releases.
classifier.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

Fitting the ANN

In [120]:
# Train on the scaled training set: 500 passes over the data in mini-batches
# of 5 rows. No validation data is passed, so the per-epoch log reflects
# training performance only. The returned History object is the cell output.
classifier.fit(
    x=X_train,
    y=y_train,
    epochs=500,
    batch_size=5,
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fcc88211c50>

# Making predictions and testing accuracy

In [121]:
# Score the held-out rows. Each prediction is the sigmoid output of the final
# layer, returned as a one-column array; show it as the cell output.
y_pred = classifier.predict(x=X_test)
y_pred

array([[1.21390305e-01],
       [1.24525696e-01],
       [3.93905211e-03],
       [1.00000000e+00],
       [4.48016077e-01],
       [1.61443979e-01],
       [9.99991179e-01],
       [1.33347362e-01],
       [1.94655068e-03],
       [3.08910012e-01],
       [4.10678424e-02],
       [9.32498455e-01],
       [1.20133758e-01],
       [1.00000000e+00],
       [9.91907954e-01],
       [9.28246975e-01],
       [1.13862731e-01],
       [1.41599402e-01],
       [1.37223795e-01],
       [3.76252353e-01],
       [9.23151001e-02],
       [9.99930978e-01],
       [1.20023452e-01],
       [5.54353893e-01],
       [5.36996484e-01],
       [1.00000000e+00],
       [1.31727085e-01],
       [5.16747177e-01],
       [9.99886513e-01],
       [2.07434416e-01],
       [1.00073382e-01],
       [9.68404472e-01],
       [1.32228881e-01],
       [1.28248036e-01],
       [3.36023010e-02],
       [1.44432914e-02],
       [1.40544161e-01],
       [5.40850237e-02],
       [2.23091953e-02],
       [3.57126771e-03],


In [0]:
# Convert the predicted values to boolean class labels: True when strictly above 0.5.
y_pred = np.greater(y_pred, 0.5)

Making the confusion matrix

In [123]:
from sklearn.metrics import confusion_matrix
# Cross-tabulate the true labels against the thresholded predictions.
cm = confusion_matrix(y_test, y_pred)
# Diagonal cells are agreements, off-diagonal cells are disagreements.
pred_true = cm[0, 0] + cm[1, 1]
pred_false = cm[0, 1] + cm[1, 0]
print(f"correct predictions:{pred_true}")
print(f"incorrect predictions:{pred_false}")

correct predictions:137
incorrect predictions:42


In [124]:
# Show the full confusion matrix (rows come from y_test, columns from y_pred).
cm

array([[96, 14],
       [28, 41]])