# Import dependencies

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt

# Loading data

In [66]:
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [67]:
train.shape, test.shape

((891, 12), (418, 11))

In [68]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [69]:
y_train = train['Survived']

In [70]:
x_train = train.drop(['Survived'], axis = 1)

In [71]:
x_train.shape, y_train.shape, x_train.columns

((891, 11),
 (891,),
 Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
        'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object'))

## Imbalanced dataset

In [106]:
y_train.value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

# Exploring and Pre-processing the dataset

In [72]:
x_train.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [73]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [74]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [97]:
#x_train['Parch'].value_counts()
#x_train['SibSp'].value_counts()
x_train['Embarked'].value_counts()

Embarked
S                    644
C                    168
Q                     77
29.69911764705882      2
Name: count, dtype: int64

In [98]:
#test['Parch'].value_counts()
#test['SibSp'].value_counts()
test['Embarked'].value_counts()

Embarked
S    270
C    102
Q     46
Name: count, dtype: int64

In [75]:
#passenger_train = x_train['PassengerId']
x_train.drop(['PassengerId','Ticket','Cabin','Name'], axis = 1, inplace = True)
passenger_test = test['PassengerId']
test.drop(['PassengerId','Ticket','Cabin','Name'], axis = 1, inplace = True)

In [79]:
mean_age_train = x_train['Age'].mean()
mean_age_test = test['Age'].mean()

In [80]:
mean_age_train, mean_age_test

(29.69911764705882, 30.272590361445783)

In [81]:
x_train.fillna(mean_age_train,inplace = True)
test.fillna(mean_age_test,inplace = True)

In [84]:
x_train['Sex'] = [0 if x == 'female' else 1 for x in x_train['Sex']]
test['Sex'] = [0 if x == 'female' else 1 for x in test['Sex']]

In [101]:
x_train['Embarked'] = [0 if x == 'S' else 1 if x == 'C' else 2 for x in x_train['Embarked']]
test['Embarked'] = [0 if x == 'S' else 1 if x == 'C' else 2 for x in test['Embarked']]

In [102]:
x_train.shape, test.shape

((891, 7), (418, 7))

In [103]:
x_train.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,0
1,1,0,38.0,1,0,71.2833,1
2,3,0,26.0,0,0,7.925,0
3,1,0,35.0,1,0,53.1,0
4,3,1,35.0,0,0,8.05,0
5,3,1,29.699118,0,0,8.4583,2
6,1,1,54.0,0,0,51.8625,0
7,3,1,2.0,3,1,21.075,0
8,3,0,27.0,0,2,11.1333,0
9,2,0,14.0,1,0,30.0708,1


In [104]:
test.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.5,0,0,7.8292,2
1,3,0,47.0,1,0,7.0,0
2,2,1,62.0,0,0,9.6875,2
3,3,1,27.0,0,0,8.6625,0
4,3,0,22.0,1,1,12.2875,0
5,3,1,14.0,0,0,9.225,0
6,3,0,30.0,0,0,7.6292,2
7,2,1,26.0,1,1,29.0,0
8,3,0,18.0,0,0,7.2292,1
9,3,1,21.0,2,0,24.15,0


Survived
0    549
1    342
Name: count, dtype: int64