# Predicting Survival of the Titanic Disaster with a Random Forest Model

## 1. Importing the libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

## 2. Reading the datasets 

In [2]:
# Load the labelled training data and the unlabelled data to predict.
train_path, test_path = '../00_data/train.csv', '../00_data/test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

## 3. Exploring the datasets

### 3.1 Training set

In [3]:
print(train.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [4]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [5]:
print(train.isna().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


### 3.2 Test set

In [6]:
print(test.head())

   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  


In [7]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
None


In [8]:
print(test.isna().sum())

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


## 4. Preprocessing

### 4.1 Drop unnecessary columns

#### 4.1.1 Training set

In [9]:
print(train['Ticket'])
print(train['Ticket'].nunique())

0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
             ...       
886              211536
887              112053
888          W./C. 6607
889              111369
890              370376
Name: Ticket, Length: 891, dtype: object
681


Drop PassengerId, Name, Ticket and Embarked as these features don't supply useful information for the prediction of survival.
Drop Cabin, too, as there are too many missing values.

In [10]:
# Remove the columns that are not used as model features (see the note above).
unused_train_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']
train = train.drop(columns=unused_train_columns)

#### 4.1.2 Test set

For the test set we keep the PassengerId, as we need it for the construction of the result dataframe containing the predictions for survival of a passenger made by the model.

In [11]:
# Same feature selection as for the training set, but PassengerId is kept
# because it is needed later to build the result dataframe.
unused_test_columns = ['Name', 'Ticket', 'Cabin', 'Embarked']
test = test.drop(columns=unused_test_columns)

### 4.2 Splitting the Training set in train and test data

In [12]:
# Hold out 20 % of the labelled data for evaluating the model;
# the fixed random_state keeps the split reproducible.
train_train, train_test = train_test_split(
    train,
    test_size=0.2,
    random_state=0,
)

### 4.3 Creating X and y for model development

In [13]:
train_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
140,0,3,female,,0,2,15.2458
439,0,2,male,31.0,0,0,10.5
817,0,2,male,31.0,1,1,37.0042
378,0,3,male,20.0,0,0,4.0125
491,0,3,male,21.0,0,0,7.25


In [14]:
# Survived (first column) is the label; all remaining columns are features.
features_train = train_train.drop(columns='Survived')
X_train = features_train.to_numpy()
y_train = train_train['Survived'].to_numpy()

In [15]:
print(X_train)
print(y_train)

[[3 'female' nan 0 2 15.2458]
 [2 'male' 31.0 0 0 10.5]
 [2 'male' 31.0 1 1 37.0042]
 ...
 [3 'male' nan 0 0 7.7333]
 [3 'female' 36.0 1 0 17.4]
 [2 'male' 60.0 1 1 39.0]]
[0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 1 1 0 1 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0
 0 1 0 1 0 1 1 0 1 0 0 1 1 0 0 1 1 0 0 0 1 1 1 0 0 1 0 0 1 1 0 0 0 1 0 0 0
 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0
 1 0 1 1 1 0 0 0 1 0 1 0 0 1 1 1 0 0 1 0 1 0 0 0 0 1 0 1 0 1 0 1 1 1 0 0 0
 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0 1 0 1 1 1 0 0 0 1 1 0 0 1 0 1 0 0 0 0 1 1 1
 0 1 0 1 0 1 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0
 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 1 0 1 0 1
 0 0 0 1 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 1 1 1 0 1 0
 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0
 1 0 1 0 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 1 0 0 0 0 1 1 0
 0 1 0 1 1 0 0 0 0 1 1 0 1 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 1
 1 

In [16]:
# Same label/feature separation for the held-out part of the training data.
features_test = train_test.drop(columns='Survived')
X_test = features_test.to_numpy()
y_test = train_test['Survived'].to_numpy()

In [17]:
print(X_test)
print(y_test)

[[3 'male' nan 0 0 14.4583]
 [3 'male' nan 0 0 7.55]
 [3 'male' 7.0 4 1 29.125]
 ...
 [1 'female' 31.0 1 0 113.275]
 [3 'male' 23.0 0 0 7.8542]
 [3 'male' 19.0 0 0 8.05]]
[0 0 0 1 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 1 0 1 0 1 0
 0 0 0 1 0 0 0 1 0 0 1 0 0 1 1 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0 1 1 1 1 0 0
 0 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0
 1 1 0 1 1 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1
 1 0 0 1 0 0 1 0 0 1 0 1 0 1 1 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0]


### 4.4 Creating X as input for prediction

In [18]:
print(test.head())

   PassengerId  Pclass     Sex   Age  SibSp  Parch     Fare
0          892       3    male  34.5      0      0   7.8292
1          893       3  female  47.0      1      0   7.0000
2          894       2    male  62.0      0      0   9.6875
3          895       3    male  27.0      0      0   8.6625
4          896       3  female  22.0      1      1  12.2875


In [19]:
# PassengerId (first column) is only an identifier, so it is excluded from the model input.
X_pred = test.drop(columns='PassengerId').to_numpy()

In [20]:
print(X_pred)

[[3 'male' 34.5 0 0 7.8292]
 [3 'female' 47.0 1 0 7.0]
 [2 'male' 62.0 0 0 9.6875]
 ...
 [3 'male' 38.5 0 0 7.25]
 [3 'male' nan 0 0 8.05]
 [3 'male' nan 1 1 22.3583]]


### 4.5 Handle missing data 

#### 4.5.1 Training set (model development)

In [21]:
print(train_train.isna().sum())

Survived      0
Pclass        0
Sex           0
Age         141
SibSp         0
Parch         0
Fare          0
dtype: int64


##### Calculate threshold for missing values

In [22]:
# Rule of thumb used here: rows with missing values may be dropped
# only if they make up at most 5 % of the observations.
n_train_rows = len(train_train)
treshold = n_train_rows * 0.05
print(treshold)
print(n_train_rows)

35.6
712


The number of missing values in Age (141) is far above the threshold (35.6), so we cannot simply drop the affected rows.

We now have two options:
- drop feature Age as there are many missing values in it
- handle the missing data by imputing a replacement value (e.g. the mean or the most frequent value)

First we try building the model with feature Age and imputed values. 

##### Keep Age and impute missing values

In [23]:
# Replace missing Age values (column 2) with the most frequent age of the training split.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
age_train = X_train[:, [2]]  # scikit-learn expects a 2-D (n_samples, 1) array
imputer.fit(age_train)
X_train[:, 2] = imputer.transform(age_train).ravel()

In [24]:
print(X_train)

[[3 'female' 24.0 0 2 15.2458]
 [2 'male' 31.0 0 0 10.5]
 [2 'male' 31.0 1 1 37.0042]
 ...
 [3 'male' 24.0 0 0 7.7333]
 [3 'female' 36.0 1 0 17.4]
 [2 'male' 60.0 1 1 39.0]]


#### 4.5.2 Test set (model development)

In [25]:
print(train_test.isna().sum())

Survived     0
Pclass       0
Sex          0
Age         36
SibSp        0
Parch        0
Fare         0
dtype: int64


In [26]:
# Reuse the imputer fitted on the training split for the Age column (column 2).
age_test = X_test[:, [2]]  # 2-D (n_samples, 1) array for scikit-learn
X_test[:, 2] = imputer.transform(age_test).ravel()

In [27]:
print(X_test)

[[3 'male' 24.0 0 0 14.4583]
 [3 'male' 24.0 0 0 7.55]
 [3 'male' 7.0 4 1 29.125]
 ...
 [1 'female' 31.0 1 0 113.275]
 [3 'male' 23.0 0 0 7.8542]
 [3 'male' 19.0 0 0 8.05]]


#### 4.5.3 Test set (test set to predict)

In [28]:
print(test.isna().sum())

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Fare            1
dtype: int64


Here one additional column, Fare, contains a missing value.

##### Calculate threshold for missing values

In [29]:
# Same 5 % rule of thumb, now relative to the size of the set to predict.
n_pred_rows = len(test)
treshold = n_pred_rows * 0.05
print(treshold)
print(n_pred_rows)

20.900000000000002
418


The single missing value in Fare is below the threshold, so the observation could in principle be dropped. But as this observation belongs to the test set to predict, we have to keep it and impute the missing value instead. The missing values in Age are filled by the imputer that was fitted on the training set.

##### Impute missing values for Age

In [30]:
# Reuse the imputer fitted on the training split for the Age column (column 2).
age_pred = X_pred[:, [2]]  # 2-D (n_samples, 1) array for scikit-learn
X_pred[:, 2] = imputer.transform(age_pred).ravel()

In [31]:
print(X_pred)

[[3 'male' 34.5 0 0 7.8292]
 [3 'female' 47.0 1 0 7.0]
 [2 'male' 62.0 0 0 9.6875]
 ...
 [3 'male' 38.5 0 0 7.25]
 [3 'male' 24.0 0 0 8.05]
 [3 'male' 24.0 1 1 22.3583]]


In [32]:
##### Impute missing values for Fare

In [33]:
# Impute the missing Fare value (column 5) of the set to predict.
# The imputer is fitted on the Fare column of the training split, mirroring how
# the Age imputer is handled, so the fill value is learned from training data only.
imputer_fare = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer_fare.fit(X_train[:, [5]])
# Apply the Fare imputer here. The Age imputer (`imputer`) must not be used for
# this column: it would fill the gap with the most frequent *age*, not a fare.
X_pred[:, 5] = imputer_fare.transform(X_pred[:, [5]]).ravel()

### 4.6 Encode categorical values

#### 4.6.1 Training set (model development)

In [34]:
print(X_train)

[[3 'female' 24.0 0 2 15.2458]
 [2 'male' 31.0 0 0 10.5]
 [2 'male' 31.0 1 1 37.0042]
 ...
 [3 'male' 24.0 0 0 7.7333]
 [3 'female' 36.0 1 0 17.4]
 [2 'male' 60.0 1 1 39.0]]


Encode values in PClass (col 0) and Sex (col 1).

In [35]:
# One-hot encode Pclass (col 0) and Sex (col 1); all other columns pass through unchanged.
# Categories unseen during fitting are ignored instead of raising an error.
one_hot = OneHotEncoder(handle_unknown='ignore')
ct_X = ColumnTransformer(
    transformers=[('encoder', one_hot, [0, 1])],
    remainder='passthrough',
)
X_train = ct_X.fit_transform(X_train)

In [36]:
print(X_train)

[[0.0 0.0 1.0 ... 0 2 15.2458]
 [0.0 1.0 0.0 ... 0 0 10.5]
 [0.0 1.0 0.0 ... 1 1 37.0042]
 ...
 [0.0 0.0 1.0 ... 0 0 7.7333]
 [0.0 0.0 1.0 ... 1 0 17.4]
 [0.0 1.0 0.0 ... 1 1 39.0]]


#### 4.6.2 Test set (model development)

In [37]:
# Apply the encoder fitted on the training split (no re-fitting on held-out data).
X_test = ct_X.transform(X_test)

In [38]:
print(X_test)

[[0.0 0.0 1.0 ... 0 0 14.4583]
 [0.0 0.0 1.0 ... 0 0 7.55]
 [0.0 0.0 1.0 ... 4 1 29.125]
 ...
 [1.0 0.0 0.0 ... 1 0 113.275]
 [0.0 0.0 1.0 ... 0 0 7.8542]
 [0.0 0.0 1.0 ... 0 0 8.05]]


#### 4.6.3 Test set (test set to predict)

In [39]:
# Apply the encoder fitted on the training split to the set to predict.
X_pred = ct_X.transform(X_pred)

In [40]:
print(X_pred)

[[0.0 0.0 1.0 ... 0 0 7.8292]
 [0.0 0.0 1.0 ... 1 0 7.0]
 [0.0 1.0 0.0 ... 0 0 9.6875]
 ...
 [0.0 0.0 1.0 ... 0 0 7.25]
 [0.0 0.0 1.0 ... 0 0 8.05]
 [0.0 0.0 1.0 ... 1 1 22.3583]]


### 4.7 Feature Scaling

#### 4.7.1 Training set (model development)

In [41]:
# Standardise the features with statistics learned from the training split.
# NOTE(review): after the one-hot step the first five columns appear to be dummy
# variables (three for Pclass, two for Sex), so the slice `2:` also rescales three
# of them -- presumably `5:` (Age, SibSp, Parch, Fare) was intended; confirm.
# If the slice is changed, the later sc.transform(...) calls on X_test and X_pred
# must use the same slice.
sc = StandardScaler()
X_train[:, 2:] = sc.fit_transform(X_train[:, 2:])

In [42]:
print(X_train)

[[0.0 0.0 0.8932971498777942 ... -0.46445233851359824 1.9592640285249252
  -0.3316790433256366]
 [0.0 1.0 -1.119448327062046 ... -0.46445233851359824 -0.4774101868632787
  -0.4264054204191258]
 [0.0 1.0 -1.119448327062046 ... 0.41270963767123453 0.7409269208308232
  0.10261957931985559]
 ...
 [0.0 0.0 0.8932971498777942 ... -0.46445233851359824 -0.4774101868632787
  -0.4816288743161625]
 [0.0 0.0 0.8932971498777942 ... 0.41270963767123453 -0.4774101868632787
  -0.2886811164145163]
 [0.0 1.0 -1.119448327062046 ... 0.41270963767123453 0.7409269208308232
  0.1424558352520875]]


#### 4.7.2 Test set (model development)

In [43]:
# Reuse the scaler fitted on the training split; the column slice must be the
# same one that was passed to sc.fit_transform.
X_test[:, 2:] = sc.transform(X_test[:, 2:])

In [44]:
print(X_test)

[[0.0 0.0 0.8932971498777942 ... -0.46445233851359824 -0.4774101868632787
  -0.3473975780218148]
 [0.0 0.0 0.8932971498777942 ... -0.46445233851359824 -0.4774101868632787
  -0.485287550392111]
 [0.0 0.0 0.8932971498777942 ... 3.0441955662257327 0.7409269208308232
  -0.05464959982697327]
 ...
 [1.0 0.0 -1.119448327062046 ... 0.41270963767123453 -0.4774101868632787
  1.6249881077075041]
 [0.0 0.0 0.8932971498777942 ... -0.46445233851359824 -0.4774101868632787
  -0.4792157049894731]
 [0.0 0.0 0.8932971498777942 ... -0.46445233851359824 -0.4774101868632787
  -0.4753075283627915]]


#### 4.7.3 Test set (test set to predict)

In [45]:
# Reuse the scaler fitted on the training split; the column slice must be the
# same one that was passed to sc.fit_transform.
X_pred[:, 2:] = sc.transform(X_pred[:, 2:])

In [46]:
print(X_pred)

[[0.0 0.0 0.8932971498777942 ... -0.46445233851359824 -0.4774101868632787
  -0.479714706090939]
 [0.0 0.0 0.8932971498777942 ... 0.41270963767123453 -0.4774101868632787
  -0.49626557462436255]
 [0.0 1.0 -1.119448327062046 ... -0.46445233851359824 -0.4774101868632787
  -0.44262295621677006]
 ...
 [0.0 0.0 0.8932971498777942 ... -0.46445233851359824 -0.4774101868632787
  -0.49127556360970276]
 [0.0 0.0 0.8932971498777942 ... -0.46445233851359824 -0.4774101868632787
  -0.4753075283627915]
 [0.0 0.0 0.8932971498777942 ... 0.41270963767123453 0.7409269208308232
  -0.1897132299585662]]


### 4.8 Build the Random Forest model

In [47]:
# Forest of 100 trees using the entropy split criterion; the seed is fixed
# so that repeated runs build the same forest.
classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=1000,
)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', random_state=1000)

### 4.9. Predict Test set (model development)

In [48]:
# Predict survival for the held-out part of the training data (true labels are in y_test).
y_model_pred = classifier.predict(X_test)

In [49]:
# Side-by-side comparison: left column = predicted label, right column = true label.
print(np.column_stack((y_model_pred, y_test)))

[[0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]

### 4.10. Print Confusion Matrix

In [50]:
print(confusion_matrix(y_test, y_model_pred))

[[100  10]
 [ 18  51]]


In [51]:
print(accuracy_score(y_test, y_model_pred) * 100)

84.35754189944134


### 4.11. Predict Test set (with unknown results)

In [52]:
# Predict survival for the passengers of the unlabelled test set.
y_pred = classifier.predict(X_pred)

In [53]:
print(y_pred)

[0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 0 0 0 0 1 0 0
 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 1
 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 0 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 1 0 0 0 0 0 1 1 1 0 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0
 1 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0
 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 1 1 0 0 0 1 0 1 0 0 0 0 1 1 1 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 0 0 1 0 0 0]


### 4.12. Create result dataframe and print it to csv

In [54]:
print(len(y_pred))

418


In [55]:
# Pair every PassengerId of the test set with its predicted Survived label.
result = test[['PassengerId']].assign(Survived=y_pred)
print(result)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         1
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]


In [56]:
# Write the predictions to CSV; index=False keeps the row index out of the file,
# so it contains only the PassengerId and Survived columns.
result.to_csv('../00_data/pred_rand_forest.csv', index=False)