In [11]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

In [12]:
def od_ordinal(X, object_cols):
    """Takes in X dataframe and a list of column names with object
       data type and returns an ordinal encoded X dataframe."""

    ordinal = OrdinalEncoder()
    X[object_cols] = pd.DataFrame(ordinal.fit_transform(X[object_cols]))
    return X

In [13]:
def normalize(x):
    return ' '.join([a.strip(',()[].\'"') for a in x.split(' ')])

def ticket_number(x):
    return x.split(' ')[-1]

def ticket_object(x):
    if len(x) == 1:
        return 'NONE'
    return '_'.join(x.split(' ')[:-1])

In [14]:
train_set = pd.read_csv('train.csv')

X = train_set.drop('Survived', axis=1)
y = train_set.Survived
X_test = pd.read_csv('test.csv')

X.Name = X.Name.apply(normalize)
X['Ticket_Number'] = X.Ticket.apply(ticket_number)
X.loc[X['Ticket_Number'] == 'LINE', 'Ticket_Number'] = np.nan
X['Ticket'] = X.Ticket.apply(ticket_object)

X_test.Name = X_test.Name.apply(normalize)
X_test['Ticket_Number'] = X_test.Ticket.apply(ticket_number)
X_test.loc[X_test['Ticket_Number'] == 'LINE', 'Ticket_Number'] = np.nan
X_test['Ticket'] = X_test.Ticket.apply(ticket_object)

object_cols = [col for col in X.columns if X[col].dtype=='object']

X = od_ordinal(X, object_cols)
X_test = od_ordinal(X_test, object_cols)

In [15]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

model = XGBRegressor(n_estimators=1000, learning_rate=0.05, early_stopping_rounds=10)
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], verbose=False)
yhat_valid = np.where(model.predict(X_valid) >= 0.5, 1, 0) 

cost = accuracy_score(y_valid, yhat_valid)

print(cost, model.best_ntree_limit)

0.8430493273542601 79


In [16]:
# whole train set

model_2 = XGBRegressor(n_estimators=69, learning_rate=0.05)

model_2.fit(X, y)

yhat_test = np.where(model_2.predict(X_test) >= 0.5, 1, 0) 

output = pd.DataFrame({'PassengerId': X_test.PassengerId,
                       'Survived': yhat_test,})

output.to_csv('submission2.csv', index=False)