In [21]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Read the CSV and Perform Basic Data Cleaning

In [22]:
# Load the three input CSVs from the local data/ directory, in the same order
# as the names on the left-hand side.
gender_df, test_df, train_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/gender_submission.csv",
        "data/test.csv",
        "data/train.csv",
    )
)

In [23]:
# Check for null values: info() lists the non-null count and dtype of every column.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [25]:
# Detailed look at what is actually missing in the training data:
# per-column null count and the share of rows it represents.
null_flags = train_df.isnull()

# Absolute number of nulls per column, largest first.
total = null_flags.sum().sort_values(ascending=False)

# The same counts as a percentage of all rows; the boolean mask has no NaNs,
# so count() on it is the number of rows.
percent_1 = null_flags.sum() / null_flags.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)

# Put both Series side by side (aligned on column name) and show the top five.
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(5)

Unnamed: 0,Total,%
Cabin,687,77.1
Age,177,19.9
Embarked,2,0.2
PassengerId,0,0.0
Survived,0,0.0


In [26]:
# Merge the test set with gender_df on PassengerId so each test row also
# carries a Survived value (default inner join).
test_merged_df = pd.merge(test_df, gender_df, on='PassengerId')

Dropped Columns:

In [27]:
# Columns that are not needed: remove Ticket from both frames.
train_df = train_df.drop(columns=['Ticket'])
test_merged_df = test_merged_df.drop(columns=['Ticket'])

In [28]:
# Remove Cabin from both frames.
train_df = train_df.drop(columns=['Cabin'])
test_merged_df = test_merged_df.drop(columns=['Cabin'])

In [29]:
# Remove Name from both frames.
train_df = train_df.drop(columns=['Name'])
test_merged_df = test_merged_df.drop(columns=['Name'])

In [30]:
# Drop the rows whose Age is missing (the filter is on Age, not on names).
# Each mask is built from the frame it filters: indexing test_merged_df with a
# mask taken from test_df only selects the right rows while both frames happen
# to share the same index and row order.
# .copy() makes the results independent frames, so the column assignments done
# in later cells do not raise pandas' SettingWithCopyWarning.
train_df = train_df[train_df['Age'].notna()].copy()
test_merged_df = test_merged_df[test_merged_df['Age'].notna()].copy()

In [31]:
# Remove Embarked from both frames.
train_df = train_df.drop(columns=['Embarked'])
test_merged_df = test_merged_df.drop(columns=['Embarked'])

Checking dataframes after dropping columns and merging

In [32]:
# Inspect the remaining columns, dtypes and non-null counts of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  714 non-null    int64  
 1   Survived     714 non-null    int64  
 2   Pclass       714 non-null    int64  
 3   Sex          714 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        714 non-null    int64  
 6   Parch        714 non-null    int64  
 7   Fare         714 non-null    float64
dtypes: float64(2), int64(5), object(1)
memory usage: 50.2+ KB


In [33]:
# Inspect the remaining columns, dtypes and non-null counts of the merged test frame.
test_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 332 entries, 0 to 415
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  332 non-null    int64  
 1   Pclass       332 non-null    int64  
 2   Sex          332 non-null    object 
 3   Age          332 non-null    float64
 4   SibSp        332 non-null    int64  
 5   Parch        332 non-null    int64  
 6   Fare         331 non-null    float64
 7   Survived     332 non-null    int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 23.3+ KB


In [34]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,male,22.0,1,0,7.25
1,2,1,1,female,38.0,1,0,71.2833
2,3,1,3,female,26.0,0,0,7.925
3,4,1,1,female,35.0,1,0,53.1
4,5,0,3,male,35.0,0,0,8.05


In [35]:
# Preview the first rows of the merged test frame.
test_merged_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
0,892,3,male,34.5,0,0,7.8292,0
1,893,3,female,47.0,1,0,7.0,1
2,894,2,male,62.0,0,0,9.6875,0
3,895,3,male,27.0,0,0,8.6625,0
4,896,3,female,22.0,1,1,12.2875,1


In [36]:
# Replace missing Fare values with 0, then cast the column from float64 to
# integer (the cast drops the fractional part).
data = [train_df, test_merged_df]

for dataset in data:
    dataset['Fare'] = dataset['Fare'].fillna(0).astype(int)

In [37]:
# Display both frames held in `data`.
# NOTE(review): this renders the two DataFrames in full; a .head() per frame would keep the output short.
data

[     PassengerId  Survived  Pclass     Sex   Age  SibSp  Parch  Fare
 0              1         0       3    male  22.0      1      0     7
 1              2         1       1  female  38.0      1      0    71
 2              3         1       3  female  26.0      0      0     7
 3              4         1       1  female  35.0      1      0    53
 4              5         0       3    male  35.0      0      0     8
 ..           ...       ...     ...     ...   ...    ...    ...   ...
 885          886         0       3  female  39.0      0      5    29
 886          887         0       2    male  27.0      0      0    13
 887          888         1       1  female  19.0      0      0    30
 889          890         1       1    male  26.0      0      0    30
 890          891         0       3    male  32.0      0      0     7
 
 [714 rows x 8 columns],
      PassengerId  Pclass     Sex   Age  SibSp  Parch  Fare  Survived
 0            892       3    male  34.5      0      0     7    

In [38]:
# Make Sex binary: male -> 0, female -> 1.
genders = {"male": 0, "female": 1}
data = [train_df, test_merged_df]

for dataset in data:
    # Skip frames whose Sex column already holds only the 0/1 codes: mapping
    # those codes through `genders` a second time (e.g. when the cell is
    # re-run) would silently turn every value into NaN.
    already_encoded = dataset['Sex'].isin(list(genders.values())).all()
    if not already_encoded:
        dataset['Sex'] = dataset['Sex'].map(genders)

In [39]:
# Split each frame into a feature matrix (X) and the Survived target (Y).
# PassengerId and Survived are excluded from the features.
non_feature_cols = ["PassengerId", "Survived"]

X_train = train_df.drop(columns=non_feature_cols)
Y_train = train_df["Survived"]
X_test = test_merged_df.drop(columns=non_feature_cols).copy()
Y_test = test_merged_df["Survived"]

# Show the four shapes as a quick consistency check.
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((714, 6), (714,), (332, 6), (332,))

In [40]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the test features.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

# Artificial Neural Network

In [50]:
#importing libraries and packages for keras
import tensorflow.keras
from tensorflow.keras.models import Sequential          #initialize neural network
from tensorflow.keras.layers import Dense               #building layers of ANN

In [51]:
# Initialise the ANN as an empty Sequential model.
model= Sequential()

In [52]:
# Add the input layer and the first hidden layer (4 units, ReLU activation).
# The input width is read from the training feature matrix instead of being
# hard-coded to 6, so the model stays in sync if the feature set changes.
model.add(
    Dense(
        4,
        kernel_initializer='uniform',
        activation='relu',
        input_dim=X_train.shape[1],
    )
)

In [53]:
# Add the second hidden layer: 4 units, ReLU activation, 'uniform' initializer.
model.add(
    Dense(
        4,
        activation='relu',
        kernel_initializer='uniform',
    )
)

In [54]:
# Add the output layer: a single sigmoid unit with the 'uniform' initializer.
model.add(
    Dense(
        1,
        activation='sigmoid',
        kernel_initializer='uniform',
    )
)

In [55]:
# Compile the ANN: Adam optimizer, binary cross-entropy loss, and accuracy as
# the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [56]:
# Fit the ANN to the training set: batches of 10 samples, 100 epochs.
model.fit(
    X_train,
    Y_train,
    batch_size=10,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7fd3eb73c4c0>

In [59]:
# Predict on the test features. A model output strictly greater than 0.5 is
# considered survival (1); anything else is 0.
y_prediction = model.predict(X_test)

# Threshold, convert the booleans to integers and flatten to one label per test row.
above_threshold = y_prediction > 0.5
y_final = above_threshold.astype(int).reshape(X_test.shape[0])

y_prediction

array([[0.07970029],
       [0.32039252],
       [0.05361819],
       [0.11565906],
       [0.5245962 ],
       [0.20936829],
       [0.5653595 ],
       [0.28933746],
       [0.66135365],
       [0.08188626],
       [0.31209698],
       [0.850971  ],
       [0.0405407 ],
       [0.850971  ],
       [0.8277471 ],
       [0.19874552],
       [0.15101248],
       [0.48134568],
       [0.4390435 ],
       [0.21211848],
       [0.29303575],
       [0.62136084],
       [0.81827974],
       [0.02566791],
       [0.850971  ],
       [0.14104307],
       [0.35978413],
       [0.07791805],
       [0.18644774],
       [0.42551076],
       [0.46062493],
       [0.1689066 ],
       [0.6411915 ],
       [0.12839827],
       [0.08655873],
       [0.05770531],
       [0.8273572 ],
       [0.850971  ],
       [0.12565449],
       [0.32246017],
       [0.80676997],
       [0.523301  ],
       [0.5154539 ],
       [0.28281873],
       [0.71870875],
       [0.8435075 ],
       [0.08214217],
       [0.077

In [64]:
# Pair each remaining test PassengerId with its predicted Survived label.
# The frame keeps the index of test_merged_df; y_final is attached by position.
output = test_merged_df[['PassengerId']].assign(Survived=y_final)
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
409,1301,1
411,1303,1
412,1304,1
414,1306,1
