In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
dftrain = pd.read_csv('train.csv')
dftest = pd.read_csv('test.csv')

In [3]:
dftrain.shape

(891, 12)

In [4]:
dftrain.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
dftrain.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
dftrain.describe(include='O')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [7]:
dftest.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [8]:
#Look, there are a lot of blank values for Age and one for F, let's fill the gaps!

age_mean = pd.concat([dftrain['Age'], dftest['Age']], ignore_index=True).mean()
fare_mean = pd.concat([dftrain['Fare'], dftrain['Fare']], ignore_index=True).mean()

#Now, let's select and adjust some features for the training set:
features = ['Sex']+['Pclass']+['Age']+['SibSp']+['Parch']+['Fare']+['Embarked']

train = dftrain[['Survived']+features].copy()
train['Sex'] = train['Sex'].map({'male': 1, 'female': 0}).astype(int)
train['Age'] = train['Age'].fillna(age_mean)
train = train.dropna()
train['Embarked'] = train['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

train.value_counts()

Survived  Sex  Pclass  Age        SibSp  Parch  Fare    Embarked
0         1    3       29.881138  0      0      7.8958  0           13
                                                8.0500  0           12
                                                7.7500  2            9
1         0    3       29.881138  0      0      7.7500  2            7
0         1    2       29.881138  0      0      0.0000  0            6
                                                                    ..
               3       19.000000  0      0      6.7500  2            1
                                                7.6500  0            1
                                                7.7750  0            1
                                                8.0500  0            1
1         1    3       45.000000  0      0      8.0500  0            1
Length: 778, dtype: int64

In [9]:
train.describe()

Unnamed: 0,Survived,Sex,Pclass,Age,SibSp,Parch,Fare,Embarked
count,889.0,889.0,889.0,889.0,889.0,889.0,889.0,889.0
mean,0.382452,0.649044,2.311586,29.689687,0.524184,0.382452,32.096681,0.362205
std,0.48626,0.477538,0.8347,12.968698,1.103705,0.806761,49.697504,0.636157
min,0.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,0.0,2.0,22.0,0.0,0.0,7.8958,0.0
50%,0.0,1.0,3.0,29.881138,0.0,0.0,14.4542,0.0
75%,1.0,1.0,3.0,35.0,1.0,0.0,31.0,1.0
max,1.0,1.0,3.0,80.0,8.0,6.0,512.3292,2.0


In [10]:
#Now, let's select and adjust some features for the testing set:
test = dftest[['Sex']+['Pclass']+['Age']+['SibSp']+['Parch']+['Fare']+['Embarked']].copy()
test['Sex'] = test['Sex'].map({'male': 1, 'female': 0}).astype(int)
test['Age'] = test['Age'].fillna(age_mean)
test['Fare'] = test['Fare'].fillna(fare_mean)
test['Embarked'] = test['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

test.describe()

Unnamed: 0,Sex,Pclass,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,0.636364,2.26555,30.192052,0.447368,0.392344,35.619,0.464115
std,0.481622,0.841838,12.635527,0.89676,0.981429,55.840751,0.685516
min,0.0,1.0,0.17,0.0,0.0,0.0,0.0
25%,0.0,1.0,23.0,0.0,0.0,7.8958,0.0
50%,1.0,3.0,29.881138,0.0,0.0,14.4542,0.0
75%,1.0,3.0,35.75,1.0,0.0,31.5,1.0
max,1.0,3.0,76.0,8.0,9.0,512.3292,2.0


In [11]:
#Reescaling the features:
for f in features:
  max_value = pd.concat([train[f], test[f]], ignore_index=True).max()
  min_value = pd.concat([train[f], test[f]], ignore_index=True).min()
  train[f] = (train[f] - min_value) / (max_value - min_value)
  test[f] = (test[f] - min_value) / (max_value - min_value)

In [12]:
y_train = train.pop('Survived')
train.describe()

Unnamed: 0,Sex,Pclass,Age,SibSp,Parch,Fare,Embarked
count,889.0,889.0,889.0,889.0,889.0,889.0,889.0
mean,0.649044,0.655793,0.369782,0.065523,0.042495,0.062649,0.181102
std,0.477538,0.41735,0.162454,0.137963,0.08964,0.097003,0.318079
min,0.0,0.0,0.003132,0.0,0.0,0.0,0.0
25%,0.0,0.5,0.273456,0.0,0.0,0.015412,0.0
50%,1.0,1.0,0.37218,0.0,0.0,0.028213,0.0
75%,1.0,1.0,0.436302,0.125,0.0,0.060508,0.5
max,1.0,1.0,1.0,1.0,0.666667,1.0,1.0


In [13]:
test.head()

Unnamed: 0,Sex,Pclass,Age,SibSp,Parch,Fare,Embarked
0,1.0,1.0,0.430039,0.0,0.0,0.015282,1.0
1,0.0,1.0,0.586622,0.125,0.0,0.013663,0.0
2,1.0,0.5,0.774521,0.0,0.0,0.018909,1.0
3,1.0,1.0,0.336089,0.0,0.0,0.016908,0.0
4,0.0,1.0,0.273456,0.125,0.111111,0.023984,0.0


In [14]:
#Bulding model
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(7,)),
    keras.layers.Dense(4, activation='relu', use_bias=True),
    keras.layers.Dense(2, activation='sigmoid', use_bias=True)
])

In [15]:
#Compiling the model
model.compile(optimizer='Adam', 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])

In [16]:
#Training the model
model.fit(train, y_train, epochs=28)

Epoch 1/28
Epoch 2/28
Epoch 3/28
Epoch 4/28
Epoch 5/28
Epoch 6/28
Epoch 7/28
Epoch 8/28
Epoch 9/28
Epoch 10/28
Epoch 11/28
Epoch 12/28
Epoch 13/28
Epoch 14/28
Epoch 15/28
Epoch 16/28
Epoch 17/28
Epoch 18/28
Epoch 19/28
Epoch 20/28
Epoch 21/28
Epoch 22/28
Epoch 23/28
Epoch 24/28
Epoch 25/28
Epoch 26/28
Epoch 27/28
Epoch 28/28


<keras.callbacks.History at 0x7f78101fd6a0>

In [17]:
#Now, let's predict for the test values
predictions = model.predict(test)
predictions_surv=[]
#And create a binary list for the predictions
for i in range(len(predictions)):
  if predictions[i][0] > predictions[i][1]:
    predictions_surv.append(int(0))
  else:
    predictions_surv.append(int(1))
print(predictions_surv)

[0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 

In [18]:
# It's time to create a submission csv
dfsubmission = dftest[['PassengerId']].copy()
dfsubmission['Survived'] = predictions_surv
dfsubmission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [19]:
dfsubmission.to_csv('submission_pedro.csv', index=False)