# Libraries used

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import sys
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import ydf
import math

# Data import

In [13]:
ds_test = pd.read_csv("C:\\Users\\Matheus Poletto\\Desktop\\Cientista de Dados\\KAGGLE\\TITANIC\\test.csv")
ds_train = pd.read_csv("C:\\Users\\Matheus Poletto\\Desktop\\Cientista de Dados\\KAGGLE\\TITANIC\\train.csv")
ds_complete = pd.concat([ds_test, ds_train])

In [14]:
ds_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Data treatment

## Verifing duplicates

In [15]:
ds_train.duplicated().sum()

0

## Verifing NA

In [16]:
ds_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Replacing NA AGE

In [17]:
# As we have NA in age, we'll replace it with the mean, taking in consideration the pclass, as the older passenger tend to be in higher classes
ds_train_pclass1 = ds_train[ds_train['Pclass'] == 1]
ds_train_pclass1_mean_age = ds_train_pclass1['Age'].mean().round(2)

ds_train_pclass2 = ds_train[ds_train['Pclass'] == 2]
ds_train_pclass2_mean_age = ds_train_pclass2['Age'].mean().round(2)

ds_train_pclass3 = ds_train[ds_train['Pclass'] == 3]
ds_train_pclass3_mean_age = ds_train_pclass3['Age'].mean().round(2)

print(ds_train_pclass1_mean_age)
print(ds_train_pclass2_mean_age)
print(ds_train_pclass3_mean_age)


38.23
29.88
25.14


In [18]:
# Changing Ages with NA to mean
ds_train.loc[(ds_train['Pclass'] == 1) & (ds_train['Age'].isna()), 'Age'] = ds_train_pclass1_mean_age
ds_train.loc[(ds_train['Pclass'] == 2) & (ds_train['Age'].isna()), 'Age'] = ds_train_pclass2_mean_age
ds_train.loc[(ds_train['Pclass'] == 3) & (ds_train['Age'].isna()), 'Age'] = ds_train_pclass3_mean_age



In [19]:
ds_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Droping irrelevant data

In [20]:
# We're dropping cabin since there's no pattern and there are a lot of NA
ds_train.drop('Cabin', axis=1, inplace=True)
ds_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.00,1,0,A/5 21171,7.2500,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.00,0,0,STON/O2. 3101282,7.9250,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00,1,0,113803,53.1000,S
4,5,0,3,"Allen, Mr. William Henry",male,35.00,0,0,373450,8.0500,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.00,0,0,211536,13.0000,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.00,0,0,112053,30.0000,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,25.14,1,2,W./C. 6607,23.4500,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.00,0,0,111369,30.0000,C


## Replacing Embarked with mode

In [21]:
# We're changing Embarked NA to the mode
mode = ds_train['Embarked'].mode()[0]
ds_train['Embarked'].fillna(mode, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ds_train['Embarked'].fillna(mode, inplace=True)


In [22]:
# There were no changing in the correlation after the treatment

# Yggdrasil Decision Forests

In [23]:
ydf_model = ydf.GradientBoostedTreesLearner(
    label="Survived"
    
    ).train(ds_train)


Train model on 891 examples
Model trained in 0:00:00.166999


In [24]:
ydf_model.describe()

In [25]:
ydf_model.analyze(ds_train)

In [26]:
accuracy_ydf = ydf_model.evaluate(ds_train)

# Show the full evaluation report
print("Full evaluation report:")
accuracy_ydf

Full evaluation report:


Label \ Pred,0,1
0,523,72
1,26,270


In [27]:
ydf_pred = ydf_model.predict(ds_test)
ydf_pred = ydf_pred.round(0).astype(int)
ydf_pred

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [28]:
# Export the result
output = pd.DataFrame({'PassengerId': ds_test['PassengerId'], 'Survived': ydf_pred})
#output.to_csv('submissionrev21.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
