In [1]:
# import libs
import pandas as pd
import numpy as np
import seaborn as sns

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
# Load in the train and test datasets
def get_file_location(filename, data_dir="./data"):
    """Return the path of ``filename`` inside the data directory.

    Parameters
    ----------
    filename : str
        Name of the file, e.g. ``"train.csv"``.
    data_dir : str, optional
        Directory that holds the data files.  Defaults to ``"./data"``, so
        existing one-argument calls keep producing ``"./data/<filename>"``.

    Returns
    -------
    str
        ``data_dir`` and ``filename`` joined with a forward slash.
    """
    return "{}/{}".format(data_dir, filename)

# Read the training split first, then the test split, from the data directory.
train, test = (
    pd.read_csv(get_file_location(csv_name))
    for csv_name in ("train.csv", "test.csv")
)

In [3]:
def create_feature_title(df):
    """Add an integer-coded ``Title`` column derived from the ``Name`` column.

    The title is the first run of ASCII letters that follows a space and is
    terminated by a period (e.g. ``"Mr"`` in ``"Braund, Mr. Owen"``).
    "Mlle" and "Ms" are treated as "Miss", and "Mme" as "Mrs".  Titles are
    encoded as Mr=1, Miss=2, Mrs=3, Master=4; every other title, and any name
    in which no title is found, is encoded as 5.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Name`` column.  It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``Title`` column.
    """
    title_codes = {
        "Mr": 1,
        "Miss": 2, "Mlle": 2, "Ms": 2,
        "Mrs": 3, "Mme": 3,
        "Master": 4,
    }
    # Raw string: "\." inside a plain string literal is an invalid escape
    # sequence, which newer Python versions warn about.
    extracted = df["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)
    # map() yields NaN for titles absent from the table (and for names with
    # no title at all); those all fall into the catch-all code 5.
    df["Title"] = extracted.map(title_codes).fillna(5).astype(int)
    return df


def convert_feature_age(df):
    """Replace ``Age`` with an integer age-band code, in place.

    Bands (upper bound inclusive): <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2,
    (48, 64] -> 3, >64 -> 4.  Missing ages match no band and make the final
    integer cast fail, so they have to be filled beforehand.

    Returns the same ``df`` object.
    """
    age = df["Age"]
    # All masks are evaluated on the untouched ages before any band code is
    # written back; position in the tuple is the band code.
    band_masks = (
        age <= 16,
        (age > 16) & (age <= 32),
        (age > 32) & (age <= 48),
        (age > 48) & (age <= 64),
        age > 64,
    )
    for band_code, in_band in enumerate(band_masks):
        df.loc[in_band, "Age"] = band_code
    df["Age"] = df["Age"].astype(int)
    return df


def convert_feature_fare(df):
    """Replace ``Fare`` with an integer fare-band code, in place.

    Missing fares are first filled with the mean of the column.  Bands
    (upper bound inclusive): <=7.91 -> 0, (7.91, 14.454] -> 1,
    (14.454, 31] -> 2, (31, 99] -> 3, (99, 250] -> 4, >250 -> 5.

    Returns the same ``df`` object.
    """
    df["Fare"] = df["Fare"].fillna(df["Fare"].mean())
    fare = df["Fare"]

    upper_edges = (7.91, 14.454, 31, 99, 250)
    # Build every band mask from the filled fares before writing any code
    # back; the position of a mask in the list is its band code.
    band_masks = [fare <= upper_edges[0]]
    band_masks.extend(
        (fare > low) & (fare <= high)
        for low, high in zip(upper_edges, upper_edges[1:])
    )
    band_masks.append(fare > upper_edges[-1])

    for band_code, in_band in enumerate(band_masks):
        df.loc[in_band, "Fare"] = band_code
    df["Fare"] = df["Fare"].astype(int)
    return df


def apply_feature_engineering(df):
    """Turn a raw passenger frame into the numeric feature frame for the model.

    Steps, in order:
      * ``FamilySize`` = ``SibSp`` + ``Parch`` + 1, and ``IsAlone`` (1 when
        ``FamilySize`` is 1, else 0);
      * ``Sex`` mapped to male=0 / female=1;
      * ``Embarked`` mapped to S=0 / C=1 / Q=2;
      * ``Title`` added, ``Age`` and ``Fare`` replaced by band codes;
      * ``Cabin``, ``PassengerId``, ``Name``, ``Ticket``, ``Parch`` and
        ``SibSp`` dropped.

    ``Sex`` and ``Embarked`` must contain only the mapped values: anything
    else (including a missing value) becomes NaN in ``map`` and makes the
    integer cast fail, so missing values have to be filled beforehand.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the engineered features.
    """
    # Work on a copy: the result is a new frame anyway (drop() returns one),
    # so mutating the input would only leave the caller's frame
    # half-transformed and unusable for a second call.
    df = df.copy()
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = np.where(df["FamilySize"] == 1, 1, 0)
    df["Sex"] = df["Sex"].map({"male": 0, "female": 1}).astype(int)
    df["Embarked"] = df["Embarked"].map({"S": 0, "C": 1, "Q": 2}).astype(int)
    df = create_feature_title(df)
    df = convert_feature_age(df)
    df = convert_feature_fare(df)
    df = df.drop(["Cabin", "PassengerId", "Name", "Ticket", "Parch", "SibSp"], axis=1)
    return df


def fill_missing_embarked(df):
    """Fill missing ``Embarked`` entries with the most frequent value.

    The frame is modified in place and also returned.
    """
    most_common_port = df["Embarked"].value_counts().idxmax()
    df["Embarked"] = df["Embarked"].fillna(most_common_port)
    return df


def fill_missing_age(df, random_state=None):
    """Fill missing ``Age`` values with random integers close to the mean age.

    Each missing age is replaced by an integer drawn uniformly from
    ``[int(mean - std), int(mean + std))``, where ``mean`` and ``std`` are
    computed from the ages that are present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``Age`` column.  It is modified in place.
    random_state : int, optional
        Seed for a private ``numpy.random.RandomState`` so the imputation is
        reproducible.  When omitted, numpy's global random state is used,
        exactly as before.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Raises
    ------
    ValueError
        If ages are missing but no age at all is present to estimate from.
    """
    is_missing = df["Age"].isnull()
    null_age_count = int(is_missing.sum())
    if null_age_count == 0:
        # Nothing to impute; also avoids validating a range we never use.
        return df

    mean_age = df["Age"].mean()
    std_age = df["Age"].std()
    if np.isnan(mean_age):
        raise ValueError("cannot impute Age: no observed ages to estimate from")
    if np.isnan(std_age):
        # A single observed age has no sample standard deviation.
        std_age = 0.0

    # randint needs integer bounds with low < high; truncate explicitly and
    # widen a degenerate range so a zero spread still yields a valid draw.
    low = int(mean_age - std_age)
    high = int(mean_age + std_age)
    if high <= low:
        high = low + 1

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    df.loc[is_missing, "Age"] = rng.randint(low, high, size=null_age_count)
    return df


def fill_missing_values(df):
    """Run the missing-value fillers on ``df``: ``Embarked`` first, then ``Age``.

    Returns whatever the last filler returns.
    """
    for impute in (fill_missing_embarked, fill_missing_age):
        df = impute(df)
    return df

In [4]:
# Prepare data: fill missing values, then engineer features, for both splits.
# The age imputation draws random numbers, so seed numpy's global RNG first to
# make the prepared frames (and everything downstream) reproducible.
# Note: this cell reassigns `train` and `test`; re-running it without first
# reloading the CSVs fails because the raw columns are already gone.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

train = fill_missing_values(train)
train = apply_feature_engineering(train)
test = fill_missing_values(test)
# Keep the ids for the submission file; feature engineering drops the column.
passenger_ids = test["PassengerId"]
test = apply_feature_engineering(test)

In [None]:
# Model: separate the features from the target column, then fit the classifier.
X = train.drop("Survived", axis=1)
y = train["Survived"]
model = XGBClassifier(learning_rate=0.05, n_estimators=1000)
model.fit(X, y)

In [None]:
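# Optional 10-fold cross-validation of the model (disabled; uncomment to run).
# KFold only uses random_state when shuffle=True; recent scikit-learn releases
# raise a ValueError if random_state is given without it.

# kfold = KFold(n_splits=10, shuffle=True, random_state=7)
# results = cross_val_score(model, X, y, cv=kfold, n_jobs=-1)
# print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))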

In [None]:
# Predict on the engineered test frame and write the submission file
# (passenger id + predicted label) into the data directory.
predictions = model.predict(test)
submission = pd.DataFrame(
    {"PassengerId": passenger_ids, "Survived": predictions},
    columns=["PassengerId", "Survived"],
)
submission_path = get_file_location("gender_submission.csv")
submission.to_csv(submission_path, index=False)