In [33]:
import pickle
import xgboost
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load saved model

In [29]:
# Load the trained XGBoost model that was saved with pickle.
# A context manager is used so the file handle is closed deterministically
# instead of being left for the garbage collector.
# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# 'xgboost-model' must come from a trusted source.
with open('xgboost-model', 'rb') as model_file:
    model = pickle.load(model_file)

In [30]:
!ls

gender_submission.csv    test.csv                 train.csv
iris.ipynb               test_xgboost.csv         train_xgboost.csv
model.tar.gz             titanic-prediction.ipynb validation_xgboost.csv
program.py               titanic.ipynb            xgboost-model


# Feature Engineering

In [51]:
# Prepare the test set: read it, drop unused columns, fill in and encode the
# remaining features so they are all numeric.
test_data = pd.read_csv('test.csv')

# Discard the columns that are not used as features.
test_data = test_data.drop(columns=['Name', 'Ticket', 'Fare'])

# Keep only the first character of each cabin value. A missing cabin is NaN,
# whose string form is 'nan', so missing entries collapse to the letter 'n'.
test_data['Cabin'] = test_data['Cabin'].map(lambda cabin: str(cabin)[0])

# Fill missing ages with the mean age of this file.
# NOTE(review): the mean is computed from the test file itself; confirm this
# matches the value used when the model was trained.
mean_age = test_data['Age'].mean()
test_data['Age'] = test_data['Age'].fillna(mean_age)

# Truncate ages to whole numbers (int() drops the fractional part, it does not round).
test_data['Age'] = test_data['Age'].map(int)

# Replace the string categories with integer codes, one column at a time.
# NOTE(review): the encoder is re-fitted on the test values of each column, so
# the codes only agree with training if both saw the same category sets - verify.
label_encoder = LabelEncoder()
for column in ('Sex', 'Cabin', 'Embarked'):
    test_data[column] = label_encoder.fit_transform(test_data[column])

# print(test_data.isnull().sum())
print(test_data.head(10))

   PassengerId  Pclass  Sex  Age  SibSp  Parch  Cabin  Embarked
0          892       3    1   34      0      0      7         1
1          893       3    0   47      1      0      7         2
2          894       2    1   62      0      0      7         1
3          895       3    1   27      0      0      7         2
4          896       3    0   22      1      1      7         2
5          897       3    1   14      0      0      7         2
6          898       3    0   30      0      0      7         1
7          899       2    1   26      1      1      7         2
8          900       3    0   18      0      0      7         0
9          901       3    1   21      2      0      7         2


In [53]:
# input = np.loadtxt(open("test_submission_input.csv", "rb"), dtype='int', delimiter=",", skiprows=1)

# Predict

In [76]:
# Build the feature matrix in a fixed column order and score it with the model.
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0; selecting
# the columns and taking `.values` yields the same NumPy array on old and new
# pandas versions alike.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin', 'Embarked']
test_X = test_data[feature_columns].values
test_X = xgboost.DMatrix(test_X)
predictions = model.predict(test_X)

def transform(val, threshold=0.9):
    """Convert a model score into a binary class label.

    Parameters
    ----------
    val : float
        Score produced by the model for one row.
    threshold : float, optional
        Cutoff that ``val`` must strictly exceed to be labelled 1.
        Defaults to 0.9, the value previously hard-coded here.

    Returns
    -------
    int
        1 if ``val > threshold``, otherwise 0 (including when ``val`` equals
        the threshold or is NaN, since the comparison is then false).
    """
    return 1 if val > threshold else 0
        
# Turn every raw score into a 0/1 label and show the resulting list.
predictions = [transform(score) for score in predictions]
print(predictions)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

  """Entry point for launching an IPython kernel.


# Create submission file

In [77]:
# Pair each passenger id with its predicted label and write the two columns
# to submission.csv without the DataFrame index.
submission_columns = {
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)