# Prediction for Survival of the Titanic disaster with XGBoost Model
(Feature Engineering (new feature Cabin_Prefix) and Hyperparameter tuning XGBoost (subsample decreased, reg_alpha increased))

## 1. Importing the libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from xgboost import XGBClassifier

## 2. Reading the datasets 

In [2]:
train = pd.read_csv('../00_data/train.csv')
test = pd.read_csv('../00_data/test.csv')

## 3. Exploring the datasets

### 3.1 Training set

In [3]:
print(train.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [4]:
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


##### Explore missing data 

In [5]:
print(train.isna().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


##### Missing data in Age

In [6]:
train_missing_age = train[train['Age'].isna()]
print(train_missing_age)

     PassengerId  Survived  Pclass                                      Name  \
5              6         0       3                          Moran, Mr. James   
17            18         1       2              Williams, Mr. Charles Eugene   
19            20         1       3                   Masselmani, Mrs. Fatima   
26            27         0       3                   Emir, Mr. Farred Chehab   
28            29         1       3             O'Dwyer, Miss. Ellen "Nellie"   
..           ...       ...     ...                                       ...   
859          860         0       3                          Razi, Mr. Raihed   
863          864         0       3         Sage, Miss. Dorothy Edith "Dolly"   
868          869         0       3               van Melkebeke, Mr. Philemon   
878          879         0       3                        Laleff, Mr. Kristo   
888          889         0       3  Johnston, Miss. Catherine Helen "Carrie"   

        Sex  Age  SibSp  Parch      Tic

##### Missing data in Cabin

In [7]:
train_missing_cabin = train[train['Cabin'].isna()]
print(train_missing_cabin)

     PassengerId  Survived  Pclass                                      Name  \
0              1         0       3                   Braund, Mr. Owen Harris   
2              3         1       3                    Heikkinen, Miss. Laina   
4              5         0       3                  Allen, Mr. William Henry   
5              6         0       3                          Moran, Mr. James   
7              8         0       3            Palsson, Master. Gosta Leonard   
..           ...       ...     ...                                       ...   
884          885         0       3                    Sutehall, Mr. Henry Jr   
885          886         0       3      Rice, Mrs. William (Margaret Norton)   
886          887         0       2                     Montvila, Rev. Juozas   
888          889         0       3  Johnston, Miss. Catherine Helen "Carrie"   
890          891         0       3                       Dooley, Mr. Patrick   

        Sex   Age  SibSp  Parch        

##### Missing data in Cabin grouped by Pclass

In [8]:
print(train_missing_cabin.groupby(['Pclass']).count()['PassengerId'])

Pclass
1     40
2    168
3    479
Name: PassengerId, dtype: int64


It seems that a missing value in Cabin number is correlated with Pclass as it occurs more often for Pclass 3 and 2.

##### Missing data in Cabin and survival

In [9]:
print(train_missing_cabin.groupby(['Survived']).count()['PassengerId'])

Survived
0    481
1    206
Name: PassengerId, dtype: int64


### 3.2 Test set

In [10]:
print(test.head())

   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  


In [11]:
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
None


In [12]:
print(test.isna().sum())

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


## 4. Preprocessing

### 4.1 Feature Engineering

In [13]:
def drop_columns(df_list, col_list):
    for df in df_list:
        print(f'drop {col_list} from {df.to_string}')
        df.drop(columns=col_list, inplace=True)

#### 4.1.1 Drop unnecessary columns on Training set

In [14]:
print(train['Ticket'])
print(train['Ticket'].nunique())

0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
             ...       
886              211536
887              112053
888          W./C. 6607
889              111369
890              370376
Name: Ticket, Length: 891, dtype: object
681


Drop PassengerId, Name, Ticket and Embarked as these features don't supply useful information for the prediction of survival.

In [15]:
drop_columns([train], ['PassengerId', 'Name', 'Ticket', 'Embarked'])

drop ['PassengerId', 'Name', 'Ticket', 'Embarked'] from <bound method DataFrame.to_string of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry 

#### 4.1.2 Drop unnecessary columns on Test set

For the test set we keep the PassengerId, as we need it for the construction of the result dataframe containing the predictions for survival of a passenger made by the model.

In [16]:
drop_columns([test], ['Name', 'Ticket', 'Embarked'])

drop ['Name', 'Ticket', 'Embarked'] from <bound method DataFrame.to_string of      PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3                      Peter, Master. Michael J   

        Sex   Age  SibSp  Parch  

#### 4.1.3 Cabin - Add a new column Cabin_Prefix to Training Set and Test Set

In [17]:
def insert_cabin_prefix(*df_list):
    for df in df_list:
        print(len(df.columns))
        #df.insert(len(df.columns), 'Cabin_Prefix', [get_cabin_prefix(val) for val in df['Cabin']])
        df.insert(len(df.columns), 'Cabin_Prefix', df['Cabin'].apply(lambda s: s[0] if isinstance(s, str) and len(s) > 0 else '_'))
        print(len(df.columns))
        print(df)

In [18]:
insert_cabin_prefix(train, test)

8
9
     Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin Cabin_Prefix
0           0       3    male  22.0      1      0   7.2500   NaN            _
1           1       1  female  38.0      1      0  71.2833   C85            C
2           1       3  female  26.0      0      0   7.9250   NaN            _
3           1       1  female  35.0      1      0  53.1000  C123            C
4           0       3    male  35.0      0      0   8.0500   NaN            _
..        ...     ...     ...   ...    ...    ...      ...   ...          ...
886         0       2    male  27.0      0      0  13.0000   NaN            _
887         1       1  female  19.0      0      0  30.0000   B42            B
888         0       3  female   NaN      1      2  23.4500   NaN            _
889         1       1    male  26.0      0      0  30.0000  C148            C
890         0       3    male  32.0      0      0   7.7500   NaN            _

[891 rows x 9 columns]
8
9
     PassengerId  Pclass     Sex

In [19]:
print(train.groupby('Cabin_Prefix').count()['Pclass'])

Cabin_Prefix
A     15
B     47
C     59
D     33
E     32
F     13
G      4
T      1
_    687
Name: Pclass, dtype: int64


#### 4.1.4 Cabin - Drop column Cabin

In [20]:
drop_columns([train, test], ['Cabin'])

drop ['Cabin'] from <bound method DataFrame.to_string of      Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin Cabin_Prefix
0           0       3    male  22.0      1      0   7.2500   NaN            _
1           1       1  female  38.0      1      0  71.2833   C85            C
2           1       3  female  26.0      0      0   7.9250   NaN            _
3           1       1  female  35.0      1      0  53.1000  C123            C
4           0       3    male  35.0      0      0   8.0500   NaN            _
..        ...     ...     ...   ...    ...    ...      ...   ...          ...
886         0       2    male  27.0      0      0  13.0000   NaN            _
887         1       1  female  19.0      0      0  30.0000   B42            B
888         0       3  female   NaN      1      2  23.4500   NaN            _
889         1       1    male  26.0      0      0  30.0000  C148            C
890         0       3    male  32.0      0      0   7.7500   NaN            _

[891 r

In [21]:
print(train.head())
print(test.head())

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin_Prefix
0         0       3    male  22.0      1      0   7.2500            _
1         1       1  female  38.0      1      0  71.2833            C
2         1       3  female  26.0      0      0   7.9250            _
3         1       1  female  35.0      1      0  53.1000            C
4         0       3    male  35.0      0      0   8.0500            _
   PassengerId  Pclass     Sex   Age  SibSp  Parch     Fare Cabin_Prefix
0          892       3    male  34.5      0      0   7.8292            _
1          893       3  female  47.0      1      0   7.0000            _
2          894       2    male  62.0      0      0   9.6875            _
3          895       3    male  27.0      0      0   8.6625            _
4          896       3  female  22.0      1      1  12.2875            _


### 4.2 Splitting the Training set in train and test data

In [22]:
train_train, train_test = train_test_split(train, test_size=0.2, random_state=0)

### 4.3 Creating X and y for model development

In [23]:
train_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_Prefix
140,0,3,female,,0,2,15.2458,_
439,0,2,male,31.0,0,0,10.5,_
817,0,2,male,31.0,1,1,37.0042,_
378,0,3,male,20.0,0,0,4.0125,_
491,0,3,male,21.0,0,0,7.25,_


In [24]:
X_train = train_train.iloc[:, 1:].values
y_train = train_train.iloc[:, 0].values

In [25]:
print(X_train)
print(y_train)

[[3 'female' nan ... 2 15.2458 '_']
 [2 'male' 31.0 ... 0 10.5 '_']
 [2 'male' 31.0 ... 1 37.0042 '_']
 ...
 [3 'male' nan ... 0 7.7333 '_']
 [3 'female' 36.0 ... 0 17.4 '_']
 [2 'male' 60.0 ... 1 39.0 '_']]
[0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 1 1 0 1 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0
 0 1 0 1 0 1 1 0 1 0 0 1 1 0 0 1 1 0 0 0 1 1 1 0 0 1 0 0 1 1 0 0 0 1 0 0 0
 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0
 1 0 1 1 1 0 0 0 1 0 1 0 0 1 1 1 0 0 1 0 1 0 0 0 0 1 0 1 0 1 0 1 1 1 0 0 0
 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0 1 0 1 1 1 0 0 0 1 1 0 0 1 0 1 0 0 0 0 1 1 1
 0 1 0 1 0 1 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0
 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 1 0 1 0 1
 0 0 0 1 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 1 1 1 0 1 0
 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0
 1 0 1 0 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 1 0 0 0 0 1 1 0
 0 1 0 1 1 0 0 0 0 1 1 0 1 0 1 0 1 0 0 0 1

In [26]:
X_test = train_test.iloc[:, 1:].values
y_test = train_test.iloc[:, 0].values

In [27]:
print(X_test)
print(y_test)

[[3 'male' nan ... 0 14.4583 '_']
 [3 'male' nan ... 0 7.55 '_']
 [3 'male' 7.0 ... 1 29.125 '_']
 ...
 [1 'female' 31.0 ... 0 113.275 'D']
 [3 'male' 23.0 ... 0 7.8542 '_']
 [3 'male' 19.0 ... 0 8.05 '_']]
[0 0 0 1 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 1 0 1 0 1 0
 0 0 0 1 0 0 0 1 0 0 1 0 0 1 1 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0 1 1 1 1 0 0
 0 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0
 1 1 0 1 1 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1
 1 0 0 1 0 0 1 0 0 1 0 1 0 1 1 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0]


### 4.4 Creating X as input for prediction

In [28]:
print(test.head())

   PassengerId  Pclass     Sex   Age  SibSp  Parch     Fare Cabin_Prefix
0          892       3    male  34.5      0      0   7.8292            _
1          893       3  female  47.0      1      0   7.0000            _
2          894       2    male  62.0      0      0   9.6875            _
3          895       3    male  27.0      0      0   8.6625            _
4          896       3  female  22.0      1      1  12.2875            _


In [29]:
X_pred = test.iloc[:, 1:].values

In [30]:
print(X_pred)

[[3 'male' 34.5 ... 0 7.8292 '_']
 [3 'female' 47.0 ... 0 7.0 '_']
 [2 'male' 62.0 ... 0 9.6875 '_']
 ...
 [3 'male' 38.5 ... 0 7.25 '_']
 [3 'male' nan ... 0 8.05 '_']
 [3 'male' nan ... 1 22.3583 '_']]


### 4.5 Handle missing data 

#### 4.5.1 Training set (model development)

In [31]:
print(train_train.isna().sum())

Survived          0
Pclass            0
Sex               0
Age             141
SibSp             0
Parch             0
Fare              0
Cabin_Prefix      0
dtype: int64


##### Calculate treshold for missing values

In [32]:
treshold = len(train_train) * 0.05
print(treshold)
print(len(train_train))

35.6
712


As it is already obvious, the number of missing values in Age is far above the treshold so we cannot simply drop them. 

We have now two options: 
- drop feature Age as there are many missing values in it
- handle missing data by imputing mean e.g.

First we try building the model with feature Age and imputed values. 

##### Keep Age and impute missing values

In [33]:
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(np.reshape(X_train[:, 2], (len(X_train[:, 2]), 1)))
X_train[:, 2] = imputer.transform(np.reshape(X_train[:, 2], (len(X_train[:, 2]), 1))).reshape(-1)

In [34]:
print(X_train)

[[3 'female' 24.0 ... 2 15.2458 '_']
 [2 'male' 31.0 ... 0 10.5 '_']
 [2 'male' 31.0 ... 1 37.0042 '_']
 ...
 [3 'male' 24.0 ... 0 7.7333 '_']
 [3 'female' 36.0 ... 0 17.4 '_']
 [2 'male' 60.0 ... 1 39.0 '_']]


#### 4.5.2 Test set (model development)

In [35]:
print(train_test.isna().sum())

Survived         0
Pclass           0
Sex              0
Age             36
SibSp            0
Parch            0
Fare             0
Cabin_Prefix     0
dtype: int64


In [36]:
X_test[:, 2] = imputer.transform(np.reshape(X_test[:, 2], (len(X_test[:, 2]), 1))).reshape(-1)

In [37]:
print(X_test)

[[3 'male' 24.0 ... 0 14.4583 '_']
 [3 'male' 24.0 ... 0 7.55 '_']
 [3 'male' 7.0 ... 1 29.125 '_']
 ...
 [1 'female' 31.0 ... 0 113.275 'D']
 [3 'male' 23.0 ... 0 7.8542 '_']
 [3 'male' 19.0 ... 0 8.05 '_']]


#### 4.5.3 Test set (test set to predict)

In [38]:
print(test.isna().sum())

PassengerId      0
Pclass           0
Sex              0
Age             86
SibSp            0
Parch            0
Fare             1
Cabin_Prefix     0
dtype: int64


Here we have one additional column with missing values in Fare.

##### Calculate treshold for missing values

In [39]:
treshold = len(test) * 0.05
print(treshold)
print(len(test))

20.900000000000002
418


The missing value in Fare is below the treshold and the observation could be dropped. But as this observation belongs to the test set to predict, we have to deal with it and impute the missing value instead. The missing values in Age are filled by the imputer used for the training set.

##### Impute missing values for Age

In [40]:
X_pred[:, 2] = imputer.transform(np.reshape(X_pred[:, 2], (len(X_pred[:, 2]), 1))).reshape(-1)

In [41]:
print(X_pred)

[[3 'male' 34.5 ... 0 7.8292 '_']
 [3 'female' 47.0 ... 0 7.0 '_']
 [2 'male' 62.0 ... 0 9.6875 '_']
 ...
 [3 'male' 38.5 ... 0 7.25 '_']
 [3 'male' 24.0 ... 0 8.05 '_']
 [3 'male' 24.0 ... 1 22.3583 '_']]


##### Impute missing values for Fare

In [42]:
imputer_fare = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer_fare.fit(np.reshape(X_pred[:, 5], (len(X_pred[:, 5]), 1)))
X_pred[:, 5] = imputer_fare.transform(np.reshape(X_pred[:, 5], (len(X_pred[:, 5]), 1))).reshape(-1)

### 4.6 Encode categorical values

#### 4.6.1 Training set (model development)

In [43]:
print(X_train)
print(np.shape(X_train))
print(X_train[0])

[[3 'female' 24.0 ... 2 15.2458 '_']
 [2 'male' 31.0 ... 0 10.5 '_']
 [2 'male' 31.0 ... 1 37.0042 '_']
 ...
 [3 'male' 24.0 ... 0 7.7333 '_']
 [3 'female' 36.0 ... 0 17.4 '_']
 [2 'male' 60.0 ... 1 39.0 '_']]
(712, 7)
[3 'female' 24.0 0 2 15.2458 '_']


Encode values in PClass (col 0), Sex (col 1) and Cabin_Prefix (col 6)

In [44]:
ct_X = ColumnTransformer(transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), [0, 1, 6])], remainder='passthrough')
ct_X.fit(X_train)
X_train = ct_X.transform(X_train)

In [45]:
print(X_train)
print(np.shape(X_train))
print(X_train[0])

[[0.0 0.0 1.0 ... 0 2 15.2458]
 [0.0 1.0 0.0 ... 0 0 10.5]
 [0.0 1.0 0.0 ... 1 1 37.0042]
 ...
 [0.0 0.0 1.0 ... 0 0 7.7333]
 [0.0 0.0 1.0 ... 1 0 17.4]
 [0.0 1.0 0.0 ... 1 1 39.0]]
(712, 18)
[0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 24.0 0 2 15.2458]


#### 4.6.2 Test set (model input)

In [46]:
X_test = ct_X.transform(X_test)

In [47]:
print(X_test)

[[0.0 0.0 1.0 ... 0 0 14.4583]
 [0.0 0.0 1.0 ... 0 0 7.55]
 [0.0 0.0 1.0 ... 4 1 29.125]
 ...
 [1.0 0.0 0.0 ... 1 0 113.275]
 [0.0 0.0 1.0 ... 0 0 7.8542]
 [0.0 0.0 1.0 ... 0 0 8.05]]


#### 4.6.3 Test set (test set to predict)

In [48]:
X_pred = ct_X.transform(X_pred)

In [49]:
print(X_pred)

[[0.0 0.0 1.0 ... 0 0 7.8292]
 [0.0 0.0 1.0 ... 1 0 7.0]
 [0.0 1.0 0.0 ... 0 0 9.6875]
 ...
 [0.0 0.0 1.0 ... 0 0 7.25]
 [0.0 0.0 1.0 ... 0 0 8.05]
 [0.0 0.0 1.0 ... 1 1 22.3583]]


### 4.7 Feature Scaling

#### 4.7.1 Training set (model development)

In [50]:
sc = StandardScaler()
X_train[:, -5:-1] = sc.fit_transform(X_train[:, -5:-1])

In [51]:
print(X_train)

[[0.0 0.0 1.0 ... -0.46445233851359824 1.9592640285249252 15.2458]
 [0.0 1.0 0.0 ... -0.46445233851359824 -0.4774101868632787 10.5]
 [0.0 1.0 0.0 ... 0.41270963767123453 0.7409269208308232 37.0042]
 ...
 [0.0 0.0 1.0 ... -0.46445233851359824 -0.4774101868632787 7.7333]
 [0.0 0.0 1.0 ... 0.41270963767123453 -0.4774101868632787 17.4]
 [0.0 1.0 0.0 ... 0.41270963767123453 0.7409269208308232 39.0]]


#### 4.7.2 Test set (model development)

In [52]:
X_test[:, -5:-1] = sc.transform(X_test[:, -5:-1])

In [53]:
print(X_test)

[[0.0 0.0 1.0 ... -0.46445233851359824 -0.4774101868632787 14.4583]
 [0.0 0.0 1.0 ... -0.46445233851359824 -0.4774101868632787 7.55]
 [0.0 0.0 1.0 ... 3.0441955662257327 0.7409269208308232 29.125]
 ...
 [1.0 0.0 0.0 ... 0.41270963767123453 -0.4774101868632787 113.275]
 [0.0 0.0 1.0 ... -0.46445233851359824 -0.4774101868632787 7.8542]
 [0.0 0.0 1.0 ... -0.46445233851359824 -0.4774101868632787 8.05]]


#### 4.7.3 Test set (test set to predict)

In [54]:
X_pred[:, -5:-1] = sc.transform(X_pred[:, -5:-1])

In [55]:
print(X_pred)

[[0.0 0.0 1.0 ... -0.46445233851359824 -0.4774101868632787 7.8292]
 [0.0 0.0 1.0 ... 0.41270963767123453 -0.4774101868632787 7.0]
 [0.0 1.0 0.0 ... -0.46445233851359824 -0.4774101868632787 9.6875]
 ...
 [0.0 0.0 1.0 ... -0.46445233851359824 -0.4774101868632787 7.25]
 [0.0 0.0 1.0 ... -0.46445233851359824 -0.4774101868632787 8.05]
 [0.0 0.0 1.0 ... 0.41270963767123453 0.7409269208308232 22.3583]]


### 4.8 Build the XGBoost model

In [56]:
classifier = XGBClassifier(n_estimators=1000, learning_rate=0.05, subsample=0.3, reg_alpha=3)
classifier.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)])

[0]	validation_0-logloss:0.67383
[1]	validation_0-logloss:0.65604
[2]	validation_0-logloss:0.63862
[3]	validation_0-logloss:0.62659
[4]	validation_0-logloss:0.61152
[5]	validation_0-logloss:0.60060
[6]	validation_0-logloss:0.58616
[7]	validation_0-logloss:0.57432
[8]	validation_0-logloss:0.56463
[9]	validation_0-logloss:0.55593
[10]	validation_0-logloss:0.54760
[11]	validation_0-logloss:0.53728
[12]	validation_0-logloss:0.53154
[13]	validation_0-logloss:0.52508
[14]	validation_0-logloss:0.51873
[15]	validation_0-logloss:0.51303
[16]	validation_0-logloss:0.50626
[17]	validation_0-logloss:0.50111
[18]	validation_0-logloss:0.49493
[19]	validation_0-logloss:0.49015
[20]	validation_0-logloss:0.48393
[21]	validation_0-logloss:0.47921
[22]	validation_0-logloss:0.47470
[23]	validation_0-logloss:0.47143
[24]	validation_0-logloss:0.46838
[25]	validation_0-logloss:0.46611
[26]	validation_0-logloss:0.46339
[27]	validation_0-logloss:0.45960
[28]	validation_0-logloss:0.45723
[29]	validation_0-loglos



[33]	validation_0-logloss:0.44349
[34]	validation_0-logloss:0.44023
[35]	validation_0-logloss:0.43846
[36]	validation_0-logloss:0.43554
[37]	validation_0-logloss:0.43395
[38]	validation_0-logloss:0.43308
[39]	validation_0-logloss:0.43087
[40]	validation_0-logloss:0.42911
[41]	validation_0-logloss:0.42817
[42]	validation_0-logloss:0.42741
[43]	validation_0-logloss:0.42549
[44]	validation_0-logloss:0.42478
[45]	validation_0-logloss:0.42304
[46]	validation_0-logloss:0.42292
[47]	validation_0-logloss:0.42044
[48]	validation_0-logloss:0.42061
[49]	validation_0-logloss:0.42076
[50]	validation_0-logloss:0.42043
[51]	validation_0-logloss:0.41991
[52]	validation_0-logloss:0.41929
[53]	validation_0-logloss:0.41740
[54]	validation_0-logloss:0.41605
[55]	validation_0-logloss:0.41352
[56]	validation_0-logloss:0.41314
[57]	validation_0-logloss:0.41189
[58]	validation_0-logloss:0.41096
[59]	validation_0-logloss:0.41103
[60]	validation_0-logloss:0.41042
[61]	validation_0-logloss:0.40929
[62]	validatio

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=1000, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

### 4.9. Predict Test set (model development )

In [57]:
y_model_pred = classifier.predict(X_test)

In [58]:
print(np.concatenate((y_model_pred.reshape(len(y_model_pred),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [0 1]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]

### 4.10. Print Confusion Matrix

In [59]:
print(confusion_matrix(y_test, y_model_pred))

[[100  10]
 [ 21  48]]


In [60]:
print(accuracy_score(y_test, y_model_pred) * 100)

82.68156424581005


### 4.11. Applying k-Fold Cross Validation

In [61]:
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 83.58 %
Standard Deviation: 2.01 %


### 4.11. Predict Test set (with unknown results)

In [62]:
y_pred = classifier.predict(X_pred)

In [63]:
print(y_pred)

[0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 1 1 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]


### 4.12. Create result dataframe and print it to csv

In [64]:
print(len(y_pred))

418


In [65]:
result = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': y_pred})
print(result)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]


In [66]:
result.to_csv('../00_data/xgboost_9.csv', index=False)