In [195]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import KNNImputer

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import sklearn
sklearn.set_config(display='diagram')

In [196]:
df = pd.read_csv('../data/titanic.csv',usecols=['Survived','Age','Fare','Sex','Pclass'])
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [197]:
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
dtype: int64

In [198]:
col_transf = make_column_transformer(
    (KNNImputer(),['Age']),
    (OrdinalEncoder(),['Sex']),
    remainder='passthrough'
)

col_transf

In [199]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.2500
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.9250
3,1,1,female,35.0,53.1000
4,0,3,male,35.0,8.0500
...,...,...,...,...,...
886,0,2,male,27.0,13.0000
887,1,1,female,19.0,30.0000
888,0,3,female,,23.4500
889,1,1,male,26.0,30.0000


In [200]:
col_transf.fit_transform(df)

array([[22.        ,  1.        ,  0.        ,  3.        ,  7.25      ],
       [38.        ,  0.        ,  1.        ,  1.        , 71.2833    ],
       [26.        ,  0.        ,  1.        ,  3.        ,  7.925     ],
       ...,
       [29.69911765,  0.        ,  0.        ,  3.        , 23.45      ],
       [26.        ,  1.        ,  1.        ,  1.        , 30.        ],
       [32.        ,  1.        ,  0.        ,  3.        ,  7.75      ]])

In [201]:
pipe_dt = make_pipeline(col_transf,DecisionTreeClassifier(criterion='entropy'))
pipe_dt

In [202]:
pipe_rf = make_pipeline(col_transf,RandomForestClassifier(n_estimators=500,criterion='entropy'))
pipe_rf

## Modelagem

In [209]:
x = df.drop('Survived',axis=1)
y = df.Survived

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [210]:
## decision tree
pipe_dt.fit(x_train,y_train)
y_pred = pipe_dt.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.76      0.73      0.75       105
           1       0.64      0.68      0.66        74

    accuracy                           0.71       179
   macro avg       0.70      0.70      0.70       179
weighted avg       0.71      0.71      0.71       179



In [211]:
## decision tree
pipe_rf.fit(x_train,y_train)
y_pred = pipe_rf.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.82      0.84      0.83       105
           1       0.76      0.74      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



## RandomForest

In [212]:
## quantidade de estimadores

len(pipe_rf['randomforestclassifier'].estimators_)

500

In [213]:
pipe_rf['randomforestclassifier'].estimators_[0]

In [283]:
pessoa_teste = pd.DataFrame(x_train.iloc[15,:]).T
pessoa_teste

Unnamed: 0,Pclass,Sex,Age,Fare
772,2,female,57.0,10.5


In [284]:
pipe_rf['columntransformer'].transform(pessoa_teste)

array([[57.0, 0.0, 2, 10.5]], dtype=object)

In [285]:
pipe_rf['randomforestclassifier'].estimators_[0].predict([[57.0, 0.0, 2, 10.5]])

array([1.])

In [290]:
pipe_rf.predict_proba(pessoa_teste)

array([[0.642, 0.358]])

In [286]:
results = []

for i in range(len(pipe_rf['randomforestclassifier'].estimators_)):
    result = pipe_rf['randomforestclassifier'].estimators_[i].predict([[57.0, 0.0, 2, 10.5]])[0]
    results.append(result)
else:
    results = pd.Series(results)

In [287]:
results.value_counts(normalize=True)

0.0    0.642
1.0    0.358
dtype: float64