In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the training split from the local data directory.
train_dataset = pd.read_csv('data/train.csv')

### Data preprocessing & Feature Engineering
It's always good practice to go through the following checklist when preprocessing a dataset:
- Split the dataset into X (attributes) and y (label)
- Check whether it contains missing values
- Check whether any continuous attribute(s) contain outliers
- Note any other miscellaneous properties of the dataset

In [3]:
# Preview the first rows of the training data.
train_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Summarise column dtypes and non-null counts to spot missing values.
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


From the `info()` output, we can see that the dataset contains missing values in the following columns:
- Age
- Cabin
- Embarked

We will:
- apply **SimpleImputer** with the mean strategy to handle missing Age
- drop the Cabin column since it is intuitively not useful in our analysis (or modeling)
- fill the missing Embarked values with the most common port of embarkation

In [5]:
from collections import Counter

# Most frequent port of embarkation, used later to fill the missing entries.
# Missing values are dropped before counting so that NaN itself can never be
# selected as the "most common" value.
embarked_counts = Counter(train_dataset['Embarked'].dropna().tolist())
most_common_embark = embarked_counts.most_common(1)[0][0]

In [6]:
# Show the value that will be used to fill missing Embarked entries.
most_common_embark

'S'

In [7]:
# Fill the missing Embarked entries with the most frequent port.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: that in-place form is
# deprecated in recent pandas and has no effect under copy-on-write.
train_dataset['Embarked'] = train_dataset['Embarked'].fillna(most_common_embark)

# Feature matrix. The column order matters because later cells address the
# columns by position (0 = Sex, 2 = Age, 5 = Fare, 6 = Embarked).
X = train_dataset[[
    'Sex',
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked'
]].values
# Label, selected with double brackets so it stays a 2-D (n, 1) column vector.
y = train_dataset[['Survived']].values

In [8]:
from sklearn.impute import SimpleImputer

# Replace missing ages (column 2 of X) with the mean of the observed ages.
# The fitted imputer is kept so the same mean can be applied to other data later.
age_imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
age_imputer_col = age_imputer.fit(X[:, 2:3]).transform(X[:, 2:3])
X[:, 2:3] = age_imputer_col

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

sex_encoder = LabelEncoder()
embarked_encoder = LabelEncoder()

# Map the string categories to integer codes (column 0 = Sex, column 6 = Embarked).
X[:, 0] = sex_encoder.fit_transform(X[:, 0])
X[:, 6] = embarked_encoder.fit_transform(X[:, 6])

# One-hot encode columns 0 and 6 and pass the remaining columns through unchanged.
# OneHotEncoder's `categorical_features` argument has been removed from
# scikit-learn, so the column selection is done with a ColumnTransformer.
# The encoded columns come first in the output, followed by the passthrough
# columns, which matches the layout the rest of the notebook relies on.
# sparse_threshold=1.0 keeps the output sparse so that `.toarray()` keeps working
# here and wherever `onehotencoder.transform(...)` is reused.
onehotencoder = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(categories='auto'), [0, 6])],
    remainder='passthrough',
    sparse_threshold=1.0,
)
X = onehotencoder.fit_transform(X).toarray()

# Drop the first dummy column (one of the two Sex indicators).
# Note: all Embarked indicators are kept, so only Sex is protected from the
# dummy variable trap.
X = X[:, 1:]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [10]:
from sklearn.preprocessing import StandardScaler

# First sample prior to feature scaling
print(X[0])

# Standardise every feature (a must if using a neural network).
# The fitted scaler is kept in `sc` so the same scaling can be reused later.
sc = StandardScaler()
X = sc.fit(X).transform(X)

[ 1.    0.    0.    1.    3.   22.    1.    0.    7.25]


In [11]:
# After feature scaling: the same first sample, now standardised
X[0]

array([ 0.73769513, -0.48204268, -0.30756234,  0.61583843,  0.82737724,
       -0.5924806 ,  0.43279337, -0.47367361, -0.50244517])

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

### Training model with Artificial Neural Network (ANN)
Summary:<br>
Accuracy: ~84%

In [13]:
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.
  return f(*args, **kwds)


In [34]:
# Small fully connected binary classifier: two hidden ReLU layers and a sigmoid output.
# The input width is read from the training matrix instead of being hard-coded,
# so the model follows any change in the number of engineered features.
n_features = X_train.shape[1]

ann = Sequential()
ann.add(Dense(units=8, input_shape=(n_features,), activation='relu', kernel_initializer='uniform'))
ann.add(Dense(units=16, activation='relu', kernel_initializer='uniform'))
ann.add(Dense(units=1, activation='sigmoid', kernel_initializer='uniform'))

# Binary cross-entropy matches the single sigmoid output unit.
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [35]:
# Print the layer-by-layer architecture and parameter counts.
ann.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 8)                 80        
_________________________________________________________________
dense_5 (Dense)              (None, 16)                144       
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 17        
Total params: 241
Trainable params: 241
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Train for 100 epochs in mini-batches of 10.
# The held-out split (X_test, y_test) is passed as validation data.
ann.fit(X_train, y_train, batch_size=10, epochs=100, validation_data=(X_test, y_test))

Train on 712 samples, validate on 179 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100


Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1195e6710>

In [17]:
# Threshold the sigmoid outputs at 0.5 to obtain boolean class predictions.
y_pred = ann.predict(X_test) > 0.5

In [18]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the ANN predictions on the held-out split.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Per-class precision / recall / F1.
report = classification_report(y_test, y_pred)
print(report)

[[99 12]
 [21 47]]
              precision    recall  f1-score   support

           0       0.82      0.89      0.86       111
           1       0.80      0.69      0.74        68

   micro avg       0.82      0.82      0.82       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.81      0.82      0.81       179



### Train model with random forest

In [39]:
from sklearn.ensemble import RandomForestClassifier

# random_state pins the forest's randomness so repeated runs build the same model.
random_forest = RandomForestClassifier(n_estimators=10, random_state=5)
# y_train is an (n, 1) column vector; scikit-learn expects a 1-D label array and
# emits a DataConversionWarning otherwise, so it is flattened for fitting.
random_forest.fit(X_train, y_train.ravel())

y_pred = random_forest.predict(X_test)

  after removing the cwd from sys.path.


In [40]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix and per-class report for whatever predictions y_pred currently holds.
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(cm)
print(report)

[[100  11]
 [ 21  47]]
              precision    recall  f1-score   support

           0       0.83      0.90      0.86       111
           1       0.81      0.69      0.75        68

   micro avg       0.82      0.82      0.82       179
   macro avg       0.82      0.80      0.80       179
weighted avg       0.82      0.82      0.82       179



### Save the ANN model (keras)

In [21]:
import os

from keras.models import model_from_json

# Make sure the output directory exists; open(..., "w") fails if it is missing.
os.makedirs("model", exist_ok=True)

# serialize model architecture to JSON
model_json = ann.to_json()
with open("model/arch.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
ann.save_weights("model/weights.h5")
print("Saved model to disk")

# To restore the model later (paths match the files written above):
# with open("model/arch.json", "r") as json_file:
#     loaded_model = model_from_json(json_file.read())
# # load weights into new model
# loaded_model.load_weights("model/weights.h5")
# print("Loaded model from disk")

Saved model to disk


### Classify the testing set for submission to Kaggle
The ANN appears to perform better, so we will use it to predict survival for the test set.

In [22]:
# Load the Kaggle test split and preview its first rows.
validation_dataset = pd.read_csv('data/test.csv')
validation_dataset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [23]:
# Check dtypes and non-null counts of the test split.
validation_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


#### Transform input data into the format for model prediction

In [24]:
# Same feature columns, in the same order, as used for training.
X_valid = validation_dataset[[
    'Sex',
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked'
]].values

# Age: reuse the imputer fitted on the training data (no refit).
age_imputer_col = age_imputer.transform(X_valid[:, 2:3])
X_valid[:, 2:3] = age_imputer_col

# Fare: this split has a missing Fare. The fill value is learned from the
# training fares rather than from this data, so that, as with Age, every
# preprocessing statistic comes from the training set only.
fare_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
fare_imputer.fit(train_dataset[['Fare']].values)
fare_imputer_col = fare_imputer.transform(X_valid[:, 5:6])
X_valid[:, 5:6] = fare_imputer_col

# Apply the encoders fitted on the training data.
X_valid[:, 0] = sex_encoder.transform(X_valid[:, 0])
X_valid[:, 6] = embarked_encoder.transform(X_valid[:, 6])
X_valid = onehotencoder.transform(X_valid).toarray()

# Drop the same first dummy column as for the training matrix, then apply the
# scaler fitted on the training features.
X_valid = X_valid[:, 1:]
X_valid = sc.transform(X_valid)

In [29]:
def map_sigmoid(prob):
    """Convert a sigmoid output into a class label.

    Returns 0 when `prob` is below 0.5 and 1 otherwise.
    """
    return 0 if prob < 0.5 else 1


# Element-wise version for arrays. Declaring the output type lets it accept
# empty arrays as well; without `otypes`, np.vectorize raises on size-0 input.
mapSigmoid = np.vectorize(map_sigmoid, otypes=[int])

# Predict with the ANN, the model this section says it uses for the submission.
# Its sigmoid outputs are probabilities, which is what the 0.5 threshold in
# mapSigmoid is for; the random forest used here before returns class labels,
# for which that thresholding did nothing.
y_valid = ann.predict(X_valid)
y_valid = mapSigmoid(y_valid)

In [30]:
x_valid_passengerId = validation_dataset.PassengerId
# Flatten the predictions to 1-D without hard-coding the number of test rows.
y_valid_series = pd.Series(y_valid.reshape(-1), name='Survived')

In [31]:
# Put PassengerId and the predicted Survived label side by side as two columns.
final_df = pd.concat([x_valid_passengerId, y_valid_series], axis=1)

In [32]:
# Write the submission file without the DataFrame index column.
final_df.to_csv('data/submission_v3.csv', index=False)