# Kaggle Titanic competition in Tensorflow

* Get data 
* Preprocess data
* Split data
* Build model
* Train model 
* Test loss and accuracy (Training and validation set)
* Plot results 
* Apply model on test set (Test set)
* Apply model on the final test set

## Get the data & Exploration

First, we'll get the raw data from the train.csv file (along with test.csv and gender_submission.csv).

In [15]:
data_dir = "./data"
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


raw_train_data = pd.read_csv("data/train.csv")
raw_test_data = pd.read_csv("data/test.csv")
raw_label_data = pd.read_csv("data/gender_submission.csv")

We're going to keep Survived, Pclass, Sex, Age, and Fare for now, and we'll drop the rest.

In [16]:
# Preview the first five rows of the raw training data.
raw_train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
# Preview the first five rows of the raw test data.
raw_test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [18]:
# Preview the first five PassengerId/Survived rows of the label file.
raw_label_data.head(n=5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [19]:
def drop_fields(data):
    """Return a copy of ``data`` without the columns that are not used.

    Removes the ``PassengerId``, ``Name``, ``SibSp``, ``Parch``, ``Ticket``,
    ``Cabin`` and ``Embarked`` columns and leaves every other column as is.
    The frame passed in is not modified. A ``KeyError`` is raised when any of
    the listed columns is missing from ``data``.
    """
    unused_columns = [
        "PassengerId",
        "Name",
        "SibSp",
        "Parch",
        "Ticket",
        "Cabin",
        "Embarked",
    ]
    return data.drop(columns=unused_columns)

In [20]:
# Strip the unused columns from the training data, then preview the result.
raw_train_data = raw_train_data.pipe(drop_fields)
raw_train_data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [21]:
# Collect the Survived value that raw_label_data lists for each test
# passenger, in the same row order as raw_test_data. This has to run before
# drop_fields() removes the PassengerId column used for the lookup.
y_test = (
    raw_label_data.set_index("PassengerId")
    .loc[raw_test_data["PassengerId"], "Survived"]
    .tolist()
)
raw_test_data = drop_fields(raw_test_data)
raw_test_data.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,34.5,7.8292
1,3,female,47.0,7.0
2,2,male,62.0,9.6875
3,3,male,27.0,8.6625
4,3,female,22.0,12.2875


## Preprocess data
Preprocess the data by one-hot encoding the `Sex` column, filling missing values with the column mean, and standardizing the continuous features to a mean of zero and a standard deviation of one.

In [None]:
from sklearn import preprocessing

# One-hot encode the categorical field and append the new columns to the data.
# dtype=float keeps the whole frame numeric whatever the pandas default dummy
# dtype is, so it can later be fed to the model as a plain float array.
dummy_field = 'Sex'
dummies = pd.get_dummies(
    raw_train_data[dummy_field],
    prefix=dummy_field,
    drop_first=False,
    dtype=float,
)
preprocessed_data = pd.concat([raw_train_data, dummies], axis=1)
# Drop the original categorical column now that it has been encoded.
preprocessed_data = preprocessed_data.drop([dummy_field], axis=1)
# Replace missing values with the column mean. The result is assigned back
# because an in-place fillna on a temporary pd.DataFrame(...) wrapper is not
# guaranteed to modify preprocessed_data itself.
preprocessed_data = preprocessed_data.fillna(preprocessed_data.mean())
print(preprocessed_data.head())

# Standardize each continuous feature to zero mean and unit standard
# deviation, recording its [mean, std] under the feature name.
features_to_scale = ['Pclass', 'Age', 'Fare']
scaled_features = {}
for feature in features_to_scale:
    column = preprocessed_data[feature]
    mean, std = column.mean(), column.std()
    scaled_features[feature] = [mean, std]
    # Replace the whole column rather than writing through .loc, so an
    # integer column such as Pclass can become float without dtype conflicts.
    preprocessed_data[feature] = (column - mean) / std

print(preprocessed_data[:5])


## Split data

Split the data **randomly** into an 80% training set and a 20% set for validation and testing.

In [None]:
from sklearn.model_selection import train_test_split
# Separate the label column from the feature columns.
y = preprocessed_data.pop('Survived')
X = preprocessed_data
# Hold out a random 10% of the rows as the test set. The validation set is
# taken from the remaining rows by validation_split when the model is fit.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)
# NOTE(review): this cell previously reset X_train/y_train to the full data
# and left "X_test =" / "y_test =" without a value, which is a syntax error
# and would have put the held-out rows back into the training set. If the
# intent was to evaluate on the preprocessed Kaggle test file instead, that
# preprocessing still needs to be written.

In [None]:
# Print the number of rows used for training, validation and testing.
# model.fit() is called with validation_split=0.2, so 20% of X_train is held
# out for validation and only the remainder is actually trained on. Keep the
# 0.2 below in sync with the value passed to model.fit().
num_fit_rows = int(len(X_train) * (1 - 0.2))
num_validation_rows = len(X_train) - num_fit_rows
print('Number of training data is :{}'.format(num_fit_rows))
print('Number of validation data is :{}'.format(num_validation_rows))
print('Number of testing data is :{}'.format(len(X_test)))

## Simple Logistic Regression
We'll be using simple logistic regression and recursive feature elimination (RFE).

## Save preprocessed data as a checkpoint

In [None]:
import pickle

# NOTE(review): normalize, one_hot_encode, features, labels and filename are
# not defined anywhere in the visible notebook; confirm where they come from
# before running this cell.
features = normalize(features)
labels = one_hot_encode(labels)
# Write through a context manager so the checkpoint file is flushed and
# closed even if pickling fails.
with open(filename, 'wb') as checkpoint_file:
    pickle.dump((features, labels), checkpoint_file)

## Model Architecture 

Here we have a simple binary classification problem: predicting whether a passenger survived.

In [None]:
# Report the dimensions of the training features.
shape_message = 'Training data shape is {}'
print(shape_message.format(X_train.shape))

In [None]:
# Show the first five rows of the training features.
first_rows = X_train[:5]
print(first_rows)

In [None]:
########## Keras imports ###########
from keras.layers import Dense, Lambda, Dropout, Input
from keras.models import Sequential, load_model, Model

# Dropout rate applied after each hidden layer.
dropout = 0.2

# Keras model for binary classification: two hidden layers with dropout,
# followed by a single sigmoid output unit.
model = Sequential()
# fc_1: the input width is taken from the training features instead of being
# hard-coded. ReLU is set explicitly because Dense applies no activation by
# default, and stacked Dense layers without one collapse into a single
# linear map.
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
# dropout
model.add(Dropout(dropout))
# fc_2
model.add(Dense(32, activation='relu'))
# dropout
model.add(Dropout(dropout))
# output: the sigmoid yields a value in (0, 1), matching the
# binary_crossentropy loss the model is compiled with.
model.add(Dense(1, activation='sigmoid'))

## Training the Model

In [None]:
# Configure the loss, optimizer and reported metric.
compile_options = {
    'loss': 'binary_crossentropy',
    'optimizer': 'adam',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)
# Train for 500 epochs, holding out 20% of the training data for validation.
model_history = model.fit(
    X_train,
    y_train,
    epochs=500,
    validation_split=0.2,
    shuffle=True,
    verbose=2,
)

In [None]:
import matplotlib.pyplot as plt
# Visualizations will be shown in the notebook.
%matplotlib inline
def _plot_history_metric(history, metric, title, ylabel):
    """Plot the training and validation curves of one metric per epoch.

    history: dict mapping metric names to lists of per-epoch values.
    metric:  key of the training metric; the validation curve is read from
             the same key prefixed with 'val_'.
    title:   title of the figure.
    ylabel:  label of the y axis.
    """
    plt.plot(history[metric])
    plt.plot(history['val_' + metric])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('epoch')
    plt.legend(['train', 'valid'], loc='upper left')
    plt.show()


recorded_history = model_history.history
# Depending on the Keras release, accuracy is recorded as 'acc' or as
# 'accuracy'; use whichever key is present so the lookup does not fail.
accuracy_key = 'acc' if 'acc' in recorded_history else 'accuracy'
# summarize history for accuracy
_plot_history_metric(recorded_history, accuracy_key, 'model accuracy', 'accuracy')
# summarize history for loss
_plot_history_metric(recorded_history, 'loss', 'model loss', 'loss')