<a href="https://www.kaggle.com/code/mrsalty/titanic-machine-learning-from-disaster?scriptVersionId=119149199" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Imports

In [1]:
import numpy as np 
import pandas as pd 
import os
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Dropout, Activation, BatchNormalization, Input
from keras.models import Model, Sequential 
from keras.callbacks import EarlyStopping

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


## Load data

In [2]:
# load data from files
X_train = pd.read_csv("/kaggle/input/titanic/train.csv")
X_submission = pd.read_csv("/kaggle/input/titanic/test.csv")
y_train = X_train['Survived']

# remove y column
X_train = X_train.drop(columns='Survived')

## Data dictionary


|Variable|Definition|Key
|----|----|----
|survival|Survival|0 = No, 1 = Yes
|pclass|Ticket class|1 = 1st, 2 = 2nd, 3 = 3rd
|sex|Sex|
|Age|Age in years|
|sibsp|# of siblings / spouses aboard the Titanic
|parch|# of parents / children aboard the Titanic
|ticket|Ticket number
|fare|Passenger fare
|cabin|Cabin number
|embarked|Port of Embarkation|C = Cherbourg, Q = Queenstown, S = Southampto

## Data analysis

In [3]:
X_train.shape

(891, 11)

In [4]:
X_submission.shape

(418, 11)

In [5]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
X_submission.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [7]:
# count of nan values per column
for col in X_train:
    nan_count = X_train[col].isna().sum()
    if nan_count>0:
        print(f'{col}:{nan_count}')

Age:177
Cabin:687
Embarked:2


In [8]:
# count of unique values per column
for col in X_train:
        print(f'{col}:{X_train[col].nunique()}')

PassengerId:891
Pclass:3
Name:891
Sex:2
Age:88
SibSp:7
Parch:7
Ticket:681
Fare:248
Cabin:147
Embarked:3


## Preprocessing 

Fill nan with 0

In [9]:
for col in X_train:
    nan_count = X_train[col].isna().sum()
    if nan_count>0:
        print(f'{col}:{nan_count}')
        X_train[col] = X_train[col].fillna(0)
        X_submission[col] = X_train[col].fillna(0)

Age:177
Cabin:687
Embarked:2


remove column not relevant for training

In [10]:
X_train = X_train.drop(columns='Name')
X_train = X_train.drop(columns='Ticket')
X_train = X_train.drop(columns='PassengerId')
X_submission = X_submission.drop(columns='Name')
X_submission = X_submission.drop(columns='Ticket')
X_submission_passengerId = X_submission['PassengerId']
X_submission = X_submission.drop(columns='PassengerId')


In [11]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.25,0,S
1,1,female,38.0,1,0,71.2833,C85,C
2,3,female,26.0,0,0,7.925,0,S
3,1,female,35.0,1,0,53.1,C123,S
4,3,male,35.0,0,0,8.05,0,S


ordinal encode categorical features

In [12]:
cols_to_encode = ["Sex","Cabin","Embarked"]

enc = OrdinalEncoder()
for col in cols_to_encode:
    X_train[col] = X_train[col].astype('str')
    X_submission[col] = X_train[col].astype('str') 
    
enc.fit(X_train[cols_to_encode])
enc.categories_
X_train[cols_to_encode] = enc.transform(X_train[cols_to_encode])
X_submission[cols_to_encode] = enc.transform(X_submission[cols_to_encode])
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,1.0,22.0,1,0,7.25,0.0,3.0
1,1,0.0,38.0,1,0,71.2833,82.0,1.0
2,3,0.0,26.0,0,0,7.925,0.0,3.0
3,1,0.0,35.0,1,0,53.1,56.0,3.0
4,3,1.0,35.0,0,0,8.05,0.0,3.0


TODO: scale columns?

In [13]:
scaler = MinMaxScaler()
 
scaled_ds = scaler.fit_transform(X_train.to_numpy())
scaled_ds = pd.DataFrame(X_train, columns=['Pclass', 'Sex', 'Age', 'Fare', 'Cabin'])
 
scaled_ds.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin
0,3,1.0,22.0,7.25,0.0
1,1,0.0,38.0,71.2833,82.0
2,3,0.0,26.0,7.925,0.0
3,1,0.0,35.0,53.1,56.0
4,3,1.0,35.0,8.05,0.0


## Split train and test data

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

## Train model

In [15]:
input = Input(shape=(8,))
x = Dense(64, activation='relu')(input)
x = BatchNormalization()(x)
x = Dense(128, activation='relu',)(x)
x = BatchNormalization()(x)
x = Dense(64, activation='relu',)(x)
x = BatchNormalization()(x)
x = Dense(1, activation='sigmoid')(x)
model = Model(inputs=input, outputs=x)

2023-02-14 13:29:24.431490: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [16]:
early_stopping_callback = EarlyStopping(monitor='val_loss', patience = 3, verbose=1)

model.compile(optimizer='adam'
                    , loss='binary_crossentropy'
                    , metrics=['accuracy'])

In [17]:
NUM_EPOCHS = 50
BATCH_SIZE = 128

history = model.fit(X_train
                    , y_train
                    , epochs=NUM_EPOCHS
                    , batch_size=BATCH_SIZE
                    , validation_data=(X_val, y_val)
                    , verbose=1
                    , callbacks=[early_stopping_callback])


2023-02-14 13:29:24.721425: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 00018: early stopping


In [18]:
X_submission_passengerId.shape

(418,)

In [19]:
y_pred = model.predict(X_submission)
y_pred = (y_pred > 0.5).astype(int).ravel()
print(y_pred)

[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1
 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0
 1 0 0 0 1 0 0 1 0 0 0]


In [20]:
final_df = pd.DataFrame({'PassengerId': X_submission_passengerId, 'Survived': y_pred.astype(int).tolist()})
final_df.head(20)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,0
9,901,0


In [21]:
final_df.to_csv('submission.csv', index=False)
print(" Submission saved")

 Submission saved
