# titaNNic

Tomé la competencia de [Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic) para practicar con Redes Neuronales.

## Importamos

In [1]:
# stdlib
import os

# pandas
import pandas as pd

# numpy
import numpy as np

# keras
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.models import load_model
from keras.utils import to_categorical ## Util para convertir en categoricas las variables target

# matplotlib
import matplotlib.pyplot as plt

# Set de Notebook de Jupyter
%matplotlib inline

# Notebook display settings: show at most 10 rows per DataFrame, format floats
# with thousands separators and two decimals, and use a 16x12 default figure size
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:,.2f}'.format)
plt.rcParams.update({'figure.figsize': (16, 12)})

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Leemos y cargamos el Dataset con Pandas

In [2]:
# Directory that holds the Kaggle Titanic CSV files. Set the TITANIC_DATA_DIR
# environment variable to point at your own copy; when it is not set, the
# original location is used, so existing setups keep working unchanged.
DATA_DIR = os.environ.get(
    'TITANIC_DATA_DIR',
    '/Users/martin/Documents/GitHub/practica-dma/redes_neuronales/dataset_titanic',
)

# Full paths of the two CSV files
path_test = os.path.join(DATA_DIR, 'test.csv')
path_train = os.path.join(DATA_DIR, 'train.csv')

# Read both datasets
titanic_test = pd.read_csv(path_test)
titanic_train = pd.read_csv(path_train)

In [3]:
# Use PassengerId as the row index of the test set
titanic_test = titanic_test.set_index('PassengerId')

In [4]:
# Use PassengerId as the row index of the training set
titanic_train = titanic_train.set_index('PassengerId')

In [5]:
# Number of rows in the training DataFrame
titanic_shape = titanic_train.shape[0]

# Hold out 10% of the PassengerIds (sampled without replacement) for validation;
# the remaining rows are used for training.
# A dedicated, seeded generator makes the split identical on every run without
# touching NumPy's global random state.
split_rng = np.random.RandomState(42)
valid_index = split_rng.choice(titanic_train.index, int(titanic_shape * 0.10), replace=False)
valid_index

array([282, 601, 623,  74, 611, 112,  78, 109, 720, 332, 514, 676, 370,
       733, 243, 485, 214, 558, 235, 262,  45, 454, 450, 443, 844,  30,
        83, 357,  99, 345,  97, 349, 797, 155, 609,  16,  55,  90, 760,
       344, 882, 688,  34, 319, 610, 254, 281, 836, 432, 742, 195,  76,
        17, 133, 600, 180, 635, 294, 452, 388, 500,  47, 662,  92, 183,
        21, 624, 548, 825, 773, 118, 680, 834, 444, 426, 338, 565, 405,
       700, 372, 872, 771, 255, 380,  68, 481, 318, 328,  65])

In [6]:
# Display the training DataFrame
titanic_train

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.00,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00,1,0,PC 17599,71.28,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.00,0,0,STON/O2. 3101282,7.92,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00,1,0,113803,53.10,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.00,0,0,373450,8.05,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.00,0,0,211536,13.00,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.00,0,0,112053,30.00,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.00,0,0,111369,30.00,C148,C


## Exploracion del dataset

### Tipos de Datos en el dataset

In [7]:
# Data type of each column in the dataset
tipo_datos = titanic_train.dtypes

In [8]:
# Keep only the columns whose dtype is object
tipo_datos[tipo_datos == 'object']

Name        object
Sex         object
Ticket      object
Cabin       object
Embarked    object
dtype: object

### Exploramos si hay nulos en el dataset

In [9]:
# Count the missing values per column and keep only the columns that have any
nulos = titanic_train.isnull().sum().loc[lambda conteo: conteo > 0]
print(nulos)

Age         177
Cabin       687
Embarked      2
dtype: int64


#### Edad

In [10]:
# Flag (1/0) the records whose Age is null
titanic_train['Age_ISNULL'] = titanic_train['Age'].isnull().astype(int)

In [11]:
# Fill the missing ages with the mean age
age_mean = titanic_train['Age'].mean()
titanic_train['Age'] = titanic_train['Age'].fillna(value=age_mean)

#### Embarked

In [12]:
# Show the rows where Embarked is null
titanic_train.loc[titanic_train['Embarked'].isnull()]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_ISNULL
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,0
830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,0


In [13]:
# Working assumption: passengers of the same class who paid the same fare embarked at the same port.
# Count the ports among passengers whose fare is between 70 and 90 (inclusive); null ports are counted as -1.
# Note that the filter below only looks at the fare, not at the class.
titanic_train.loc[titanic_train['Fare'].between(70, 90), 'Embarked'].fillna(-1).value_counts()

S     25
C     19
-1     2
Q      2
Name: Embarked, dtype: int64

In [14]:
# Flag (1/0) the records whose Embarked is null
titanic_train['Embarked_ISNULL'] = titanic_train['Embarked'].isnull().astype(int)

In [15]:
# Assign the null ports to category 'S'
titanic_train['Embarked'] = titanic_train['Embarked'].fillna(value='S')

# Check the two passengers that had a null port
titanic_train.loc[[62, 830], ['Embarked']]

Unnamed: 0_level_0,Embarked
PassengerId,Unnamed: 1_level_1
62,S
830,S


#### Cabin

In [16]:
# Replace null cabins with -1 and label-encode the column.
# factorize returns a (codes, uniques) pair; only the integer codes are kept.
titanic_train['Cabin'] = pd.factorize(titanic_train['Cabin'].fillna(-1))[0]

In [17]:
# Display the training DataFrame
titanic_train

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_ISNULL,Embarked_ISNULL
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.00,1,0,A/5 21171,7.25,0,S,0,0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00,1,0,PC 17599,71.28,1,C,0,0
3,1,3,"Heikkinen, Miss. Laina",female,26.00,0,0,STON/O2. 3101282,7.92,0,S,0,0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00,1,0,113803,53.10,2,S,0,0
5,0,3,"Allen, Mr. William Henry",male,35.00,0,0,373450,8.05,0,S,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.00,0,0,211536,13.00,0,S,0,0
888,1,1,"Graham, Miss. Margaret Edith",female,19.00,0,0,112053,30.00,146,S,0,0
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.70,1,2,W./C. 6607,23.45,0,S,1,0
890,1,1,"Behr, Mr. Karl Howell",male,26.00,0,0,111369,30.00,147,C,0,0


### Conversion de Tipo de Datos Objeto

Los tipos de datos que son objeto, son los siguientes:

- Sex
- Name
- Embarked
- Ticket

Usamos diferentes tecnicas para llevar esos datos a numeros.

#### Sex

Lo que vamos a hacer con esta variable es convertirla en binaria.

In [18]:
# Binary-encode Sex: male -> 0, female -> 1.
# The mapping is defined once and applied to the whole column in a single vectorized step.
sex_codes = {'male': 0, 'female': 1}
sex_encoded = titanic_train['Sex'].map(sex_codes)

# Values that are not keys of the mapping come out of .map as NaN;
# raise a KeyError for them instead of silently writing NaN into the column.
unmapped = titanic_train.loc[sex_encoded.isnull(), 'Sex'].unique()
if len(unmapped) > 0:
    raise KeyError('Unexpected Sex values: {}'.format(list(unmapped)))

titanic_train['Sex'] = sex_encoded.astype(int)

In [19]:
# Display the encoded Sex column
titanic_train[['Sex']]

Unnamed: 0_level_0,Sex
PassengerId,Unnamed: 1_level_1
1,0
2,1
3,1
4,1
5,0
...,...
887,0
888,1
889,1
890,0


#### Embarked

En este caso, lo que haremos es hacer One Hot Encoding.

In [20]:
# Obtenemos las dummies para el campo Embarked con 'get_dummies'
dummies_embarked = pd.get_dummies(titanic_train.Embarked)
print(dummies_embarked)

             C  Q  S
PassengerId         
1            0  0  1
2            1  0  0
3            0  0  1
4            0  0  1
5            0  0  1
...         .. .. ..
887          0  0  1
888          0  0  1
889          0  0  1
890          1  0  0
891          0  1  0

[891 rows x 3 columns]


In [21]:
# Drop the original 'Embarked' column and join the dummy columns
# to titanic_train on the shared index
titanic_train = titanic_train.drop('Embarked', axis=1).join(dummies_embarked)

In [22]:
# See how it turned out
titanic_train

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Age_ISNULL,Embarked_ISNULL,C,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,3,"Braund, Mr. Owen Harris",0,22.00,1,0,A/5 21171,7.25,0,0,0,0,0,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.00,1,0,PC 17599,71.28,1,0,0,1,0,0
3,1,3,"Heikkinen, Miss. Laina",1,26.00,0,0,STON/O2. 3101282,7.92,0,0,0,0,0,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.00,1,0,113803,53.10,2,0,0,0,0,1
5,0,3,"Allen, Mr. William Henry",0,35.00,0,0,373450,8.05,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",0,27.00,0,0,211536,13.00,0,0,0,0,0,1
888,1,1,"Graham, Miss. Margaret Edith",1,19.00,0,0,112053,30.00,146,0,0,0,0,1
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,29.70,1,2,W./C. 6607,23.45,0,1,0,0,0,1
890,1,1,"Behr, Mr. Karl Howell",0,26.00,0,0,111369,30.00,147,0,0,1,0,0


#### Ticket

In [23]:
# Label-encode Ticket: keep the integer codes returned by factorize
titanic_train['Ticket'] = pd.factorize(titanic_train['Ticket'])[0]

In [24]:
# Drop the Name variable, the only one still left as object
titanic_train = titanic_train.drop('Name', axis=1)

In [25]:
# See how it turned out
titanic_train

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Age_ISNULL,Embarked_ISNULL,C,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,3,0,22.00,1,0,0,7.25,0,0,0,0,0,1
2,1,1,1,38.00,1,0,1,71.28,1,0,0,1,0,0
3,1,3,1,26.00,0,0,2,7.92,0,0,0,0,0,1
4,1,1,1,35.00,1,0,3,53.10,2,0,0,0,0,1
5,0,3,0,35.00,0,0,4,8.05,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,0,27.00,0,0,677,13.00,0,0,0,0,0,1
888,1,1,1,19.00,0,0,678,30.00,146,0,0,0,0,1
889,0,3,1,29.70,1,2,614,23.45,0,1,0,0,0,1
890,1,1,0,26.00,0,0,679,30.00,147,0,0,1,0,0


## Preparando el terreno para aplicar Redes

### Normalizacion

In [29]:
# Standardize (z-score) every column except the target 'Survived' and the binary columns
no_norm = ['Survived', 'Embarked_ISNULL', 'Age_ISNULL', 'C', 'Q', 'S', 'Sex']

for column in titanic_train:
    if column in no_norm:
        continue  # listed in no_norm: left untouched

    # Compute mean and standard deviation once per column, then normalize the
    # whole column in a single vectorized operation.
    column_mean = titanic_train[column].mean()
    column_std = titanic_train[column].std()
    titanic_train[column] = (titanic_train[column] - column_mean) / column_std

# Show the resulting dataset
titanic_train

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Age_ISNULL,Embarked_ISNULL,C,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,0.83,0,-0.59,0.43,-0.47,-1.56,-0.50,-0.44,0,0,0,0,1
2,1,-1.57,1,0.64,0.43,-0.47,-1.55,0.79,-0.42,0,0,1,0,0
3,1,0.83,1,-0.28,-0.47,-0.47,-1.55,-0.49,-0.44,0,0,0,0,1
4,1,-1.57,1,0.41,0.43,-0.47,-1.54,0.42,-0.39,0,0,0,0,1
5,0,0.83,0,0.41,-0.47,-0.47,-1.54,-0.49,-0.44,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,-0.37,0,-0.21,-0.47,-0.47,1.88,-0.39,-0.44,0,0,0,0,1
888,1,-1.57,1,-0.82,-0.47,-0.47,1.88,-0.04,3.73,0,0,0,0,1
889,0,0.83,1,-0.00,0.43,2.01,1.56,-0.18,-0.44,1,0,0,0,1
890,1,-1.57,0,-0.28,-0.47,-0.47,1.89,-0.04,3.76,0,0,1,0,0


### Separando el dataset en Entrenamiento y Validacion

In [35]:
# Split the dataset into training (train) and validation (valid) using 'valid_index':
# - valid keeps the rows whose index is in 'valid_index'
# - train keeps everything else (those same indices are dropped)
valid = titanic_train.loc[valid_index]
train = titanic_train.drop(labels=valid_index, axis='index')

Para un problema de clasificación con Redes Neuronales, hay que convertir la variable target en categórica, de manera que quede en dos columnas: una para los casos 0 y otra para los casos 1.

La Capa de Salida (output_layer) tiene 2 neuronas.

Usamos `to_categorical` (de `keras.utils`) para transformar el target `Survived` en categórica.

In [64]:
# Feature matrices as NumPy arrays (every column except the target 'Survived')
# and categorical targets built with to_categorical.
# .values is used because DataFrame.as_matrix() is deprecated and was removed in pandas 1.0.
X_train, target_train = train.drop('Survived', axis = 1).values, to_categorical(train.Survived)
X_valid, target_valid = valid.drop('Survived', axis = 1).values, to_categorical(valid.Survived)

### Armo la red

In [77]:
# Number of input columns (features)
n_cols = X_train.shape[1]

# Network definition: a 36-unit ReLU hidden layer followed by a 2-unit softmax output layer
model = Sequential([
    Dense(36, activation='relu', input_shape=(n_cols,)),
    # Dense(16, activation='relu'),
    Dense(2, activation='softmax'),
])

# Compile the network - optimizer: Stochastic Gradient Descent (SGD)
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit the network for 10 epochs
model.fit(X_train, target_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x18235650f0>