In [108]:
import pickle
import xgboost
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load saved model

In [109]:
# Load the pickled xgboost model from the local file 'xgboost-model'.
# The `with` block guarantees the file handle is closed after reading.
# NOTE(security): unpickling can execute arbitrary code, so only load
# model files that come from a trusted source.
with open('xgboost-model', 'rb') as model_file:
    model = pickle.load(model_file)

In [110]:
!ls

gender_submission.csv     program.py                titanic-train.ipynb
iris.ipynb                submission.csv            train.csv
model (1).tar.gz          test.csv                  train_xgboost.csv
model (2).tar.gz          test_submission_input.csv validation_xgboost.csv
model (3).tar.gz          test_xgboost.csv          xgboost-model
model.tar.gz              titanic-prediction.ipynb


# Feature Engineering

In [111]:
# feature engineering
test_data = pd.read_csv('test.csv')

# report how many values are missing per column before any cleaning
print(test_data.isnull().sum())

# remove irrelevant feature data (returns a new frame instead of mutating
# in place, so re-running the cell from the top stays idempotent)
test_data = test_data.drop(labels=['Name', 'Ticket'], axis=1)

# keep only the first character of the Cabin value; a missing cabin is NaN,
# and str(nan) == 'nan', so missing values become the placeholder 'n'
test_data['Cabin'] = test_data['Cabin'].apply(lambda x: str(x)[0])

# replace age NaN with mean age
mean_age = test_data['Age'].mean()
test_data['Age'] = test_data['Age'].fillna(mean_age)

# replace fare NaN with mean fare
# (previously the mean *age* was used here by mistake)
mean_fare = test_data['Fare'].mean()
test_data['Fare'] = test_data['Fare'].fillna(mean_fare)

# convert age and fare to integers; this truncates toward zero
# (like int(x)), it does not round to the nearest integer
test_data['Age'] = test_data['Age'].astype(int)
test_data['Fare'] = test_data['Fare'].astype(int)

# label encode sex, cabin and embarked
# NOTE(review): the encoder is re-fit on the test data for every column, so
# the integer codes only match the ones seen during training if both data
# sets contain the same set of categories -- confirm against the training
# notebook.
label_encoder = LabelEncoder()
for column in ['Sex', 'Cabin', 'Embarked']:
    test_data[column] = label_encoder.fit_transform(test_data[column])

print(test_data.head(10))

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
   PassengerId  Pclass  Sex  Age  SibSp  Parch  Fare  Cabin  Embarked
0          892       3    1   34      0      0     7      7         1
1          893       3    0   47      1      0     7      7         2
2          894       2    1   62      0      0     9      7         1
3          895       3    1   27      0      0     8      7         2
4          896       3    0   22      1      1    12      7         2
5          897       3    1   14      0      0     9      7         2
6          898       3    0   30      0      0     7      7         1
7          899       2    1   26      1      1    29      7         2
8          900       3    0   18      0      0     7      7         0
9          901       3    1   21      2      0    24      7         2


In [112]:
# input = np.loadtxt(open("test_submission_input.csv", "rb"), dtype='int', delimiter=",", skiprows=1)

# Predict

In [115]:
# Feature columns passed to the model, in the order used to build the matrix.
FEATURE_COLUMNS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

# A score must be strictly greater than this cut-off to be labelled 1.
SURVIVAL_THRESHOLD = 0.7


def transform(val, threshold=SURVIVAL_THRESHOLD):
    """Convert a single model score into a 0/1 label.

    Args:
        val: score produced by the model for one row.
        threshold: cut-off; defaults to SURVIVAL_THRESHOLD (0.7).

    Returns:
        1 if ``val`` is strictly greater than ``threshold``, otherwise 0.
    """
    return 1 if val > threshold else 0


# DataFrame.as_matrix() is deprecated (and removed in later pandas releases);
# selecting the columns and taking `.values` yields the same ndarray.
test_X = test_data[FEATURE_COLUMNS].values
test_X = xgboost.DMatrix(test_X)
predictions = model.predict(test_X)

# raw model scores
print(predictions)

# thresholded 0/1 labels, one per row of test_data
predictions = [transform(score) for score in predictions]
print(predictions)

[0.31566215 0.13372622 0.2725478  0.35959724 0.42197648 0.33560735
 0.7636277  0.30485174 0.73407435 0.31263408 0.27872428 0.6608438
 0.9385764  0.2822264  0.9156228  0.840545   0.3778072  0.32946458
 0.39828128 0.32844648 0.28030038 0.7868943  0.9414752  0.34922308
 0.8502118  0.2725478  0.9385764  0.32946458 0.4703792  0.43790224
 0.2822264  0.30501717 0.71748716 0.26700372 0.48883057 0.3742515
 0.5116903  0.4770357  0.3243136  0.5176841  0.3903571  0.6664828
 0.35607746 0.9344967  0.9156228  0.37165523 0.4332735  0.27872428
 0.92436177 0.68386996 0.4027541  0.48194265 0.8982416  0.8937806
 0.45053646 0.21577169 0.31566215 0.37165523 0.33106878 0.9374084
 0.3193708  0.57934326 0.3193708  0.74620646 0.5032693  0.9344967
 0.74620646 0.30573773 0.4321408  0.86472446 0.74620646 0.27823132
 0.50940424 0.49867374 0.9374084  0.3074426  0.33106878 0.865608
 0.33106878 0.74620646 0.88059014 0.24328718 0.6240067  0.27872428
 0.33106878 0.45118973 0.7636277  0.4770357  0.7636277  0.87291497
 0.

  """Entry point for launching an IPython kernel.


# Create submission file

In [116]:
# Pair every passenger id with its predicted label and write the result to
# submission.csv, leaving the DataFrame index out of the file.
submission = pd.DataFrame(
    data={
        'PassengerId': test_data['PassengerId'],
        'Survived': predictions,
    }
)
submission.to_csv("submission.csv", index=False)