In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

In [3]:
# Read data from Titanic dataset.

train_df = pd.read_csv('titanic_train.csv')
test_df = pd.read_csv('titanic_test.csv')

In [11]:
# We will train our classifier with the following features:
# Numeric Features:
# - age: float.
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['survived','Age','SibSp','parch','fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['PassengerId','Pclass','Name','Sex','Ticket', 'Cabin','Embarked']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

Using the prediction pipeline in a grid search
##############################################################################
 Grid search can also be performed on the different preprocessing steps
 defined in the ``ColumnTransformer`` object, together with the classifier's
 hyperparameters as part of the ``Pipeline``.
 We will search for both the imputer strategy of the numeric preprocessing
 and the regularization parameter of the logistic regression using
 :class:`sklearn.model_selection.GridSearchCV`.



In [5]:
# The columns that are inputted into our model (and later used to make predictions) are called "features." 
# In our case, those would be the columns used to determine the home price. Sometimes, you will use all columns 
# except the target as features. Other times you'll be better off with fewer features. For now, we'll build a model with 
# only a few features. Later on you'll see how to iterate and compare models built with different features.
y = train_df.Survived
titanic_features = ["Pclass","Sex","Age","SibSp","Parch","Ticket","Fare","Cabin","Embarked"]
X = train_df[titanic_features]

In [6]:
# Let's quickly review the data we'll be using to predict survival using the describe method and the head method, 
# which shows the top few rows.
X.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,891.0,714.0,891.0,891.0,891.0
mean,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,2.0,20.125,0.0,0.0,7.9104
50%,3.0,28.0,0.0,0.0,14.4542
75%,3.0,38.0,1.0,0.0,31.0
max,3.0,80.0,8.0,6.0,512.3292


In [7]:
# Visually checking your data with these commands is an important part of a data scientist's job. 
# You'll frequently find surprises in the dataset that deserve further inspection.
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,female,35.0,1,0,113803,53.1,C123,S
4,3,male,35.0,0,0,373450,8.05,,S


# Decision Trees

In [8]:
# Pandas assigns a data type (called a dtype) to each column or Series. 
# Let's see a random sample of dtypes from our prediction data:
titanic_target = train_df.Survived
titanic_predictors = train_df.drop(['Survived'], axis=1)

test_predictors = titanic_target
train_predictors = titanic_predictors

train_predictors.dtypes.sample(10)

Age            float64
Cabin           object
Name            object
Embarked        object
Parch            int64
Pclass           int64
Ticket          object
Fare           float64
Sex             object
PassengerId      int64
dtype: object

In [9]:
# Scikit-learn is sensitive to the ordering of columns, so if the training dataset and test datasets get misaligned, your 
# results will be nonsense. This could happen if a categorical had a different number of values in the training data vs the 
# test data. Ensure the test data is encoded in the same manner as the training data with the align command:
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)
one_hot_encoded_test_predictors = pd.get_dummies(test_df)
final_train, final_test = one_hot_encoded_training_predictors.align(one_hot_encoded_test_predictors,
                                                                    join='left', 
                                                                    axis=1)

In [10]:
# We build and fit a model just as we would in scikit-learn.
from xgboost import XGBRegressor

my_model = XGBRegressor()
# Add silent=True to avoid printing out updates with each cycle
my_model.fit(final_train, titanic_target, verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)