In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
train_dataset = pd.read_csv('data/train.csv')
test_dataset = pd.read_csv('data/test.csv')

### Data preprocessing & Feature Engineering
Its always a good practice to go through the following checklists to preprocess our dataset
- Split the dataset into X (attributes) and y (label)
- Check if contains missing values
- Check if continuous attribute(s) contain any outliers
- Some misc properties of the dataset

In [3]:
train_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_dataset.info()
print('_')
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
_
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null flo

From the info object information, we can see that indeed the dataset contains missing values at the following column(s):
- Age
- Cabin

We will:
- apply **Imputer** with mean strategy to handle missing Age
- drop Cabin column since it is intuitively not useful in our analysis (as well as modeling)

In [94]:
X_train = train_dataset[[
    'Sex',
    'Pclass', 
    'Age', 
    'SibSp',
    'Parch',
    'Fare'
]].values
y_train = train_dataset[['Survived']].values

mean_fare = train_dataset.Fare.mean()
X_test = test_dataset[[
    'Sex',
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare'
]]
X_test['Fare'] = X_test['Fare'].fillna(mean_fare)
X_test = X_test.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [95]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
train_imputer_col = imputer.fit_transform(X_train[:, 2:3])
test_imputer_col = imputer.transform(X_test[:, 2:3])
X_train[:, 2:3] = train_imputer_col
X_test[:, 2:3] = test_imputer_col

In [96]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

sex_encoder = LabelEncoder()

X_train[:, 0] = sex_encoder.fit_transform(X_train[:, 0])
X_test[:, 0] = sex_encoder.transform(X_test[:, 0])

onehotencoder = OneHotEncoder(categorical_features = [0])
X_train = onehotencoder.fit_transform(X_train).toarray()
X_test = onehotencoder.transform(X_test).toarray()

# # Avoid dummy variables trap
X_train = X_train[:, 1:]
X_test = X_test[:, 1:]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [98]:
# Prior to feature scaling
print(X_train[0])

# Apply feature scaling (a must if using neural network)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

[ 1.    3.   22.    1.    0.    7.25]


In [99]:
# After feature scaling
X_train[0]

array([ 0.73769513,  0.82737724, -0.5924806 ,  0.43279337, -0.47367361,
       -0.50244517])

### Training model with artificial neural network (ANN), with automatic feature engineering (using library)

In [100]:
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.
  return f(*args, **kwds)


In [101]:
classifier = Sequential()
classifier.add(Dense(units=8, input_shape=(6,), activation='relu', kernel_initializer='uniform'))
classifier.add(Dense(units=16, activation='relu', kernel_initializer='uniform'))
classifier.add(Dense(units=1, activation='sigmoid', kernel_initializer='uniform'))

classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [102]:
classifier.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 8)                 56        
_________________________________________________________________
dense_2 (Dense)              (None, 16)                144       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 217
Trainable params: 217
Non-trainable params: 0
_________________________________________________________________


In [103]:
classifier.fit(X_train, y_train, batch_size=10, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x114b632b0>

### Validate the accuracy on testing dataset