In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import KNNImputer

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import sklearn
sklearn.set_config(display='diagram')

In [2]:
df = pd.read_csv('../data/titanic.csv',usecols=['Survived','Age','Fare','Sex','Pclass'])
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [3]:
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
dtype: int64

In [4]:
col_transf = make_column_transformer(
    (KNNImputer(),['Age']),
    (OrdinalEncoder(),['Sex']),
    remainder='passthrough'
)

col_transf

In [5]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.2500
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.9250
3,1,1,female,35.0,53.1000
4,0,3,male,35.0,8.0500
...,...,...,...,...,...
886,0,2,male,27.0,13.0000
887,1,1,female,19.0,30.0000
888,0,3,female,,23.4500
889,1,1,male,26.0,30.0000


In [6]:
col_transf.fit_transform(df)

array([[22.        ,  1.        ,  0.        ,  3.        ,  7.25      ],
       [38.        ,  0.        ,  1.        ,  1.        , 71.2833    ],
       [26.        ,  0.        ,  1.        ,  3.        ,  7.925     ],
       ...,
       [29.69911765,  0.        ,  0.        ,  3.        , 23.45      ],
       [26.        ,  1.        ,  1.        ,  1.        , 30.        ],
       [32.        ,  1.        ,  0.        ,  3.        ,  7.75      ]])

In [7]:
pipe_dt = make_pipeline(col_transf,DecisionTreeClassifier(criterion='entropy'))
pipe_dt

In [8]:
pipe_rf = make_pipeline(col_transf,RandomForestClassifier(n_estimators=500,criterion='entropy'))
pipe_rf

## Modelagem

In [24]:
x = df.drop('Survived',axis=1)
y = df.Survived

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [25]:
## decision tree
pipe_dt.fit(x_train,y_train)
y_pred = pipe_dt.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.89      0.82      0.85       106
           1       0.77      0.85      0.81        73

    accuracy                           0.83       179
   macro avg       0.83      0.84      0.83       179
weighted avg       0.84      0.83      0.83       179



In [11]:
## random forest
pipe_rf.fit(x_train,y_train)
y_pred = pipe_rf.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.86      0.83       105
           1       0.78      0.70      0.74        74

    accuracy                           0.79       179
   macro avg       0.79      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179



## RandomForest

In [27]:
## quantidade de estimadores

len(pipe_rf['randomforestclassifier'].estimators_)

500

In [25]:
pipe_rf

In [28]:
pipe_rf['randomforestclassifier'].estimators_[0]

In [29]:
pessoa_teste = pd.DataFrame(x_train.iloc[15,:]).T
pessoa_teste

Unnamed: 0,Pclass,Sex,Age,Fare
259,2,female,50.0,26.0


In [30]:
pipe_rf['columntransformer'].transform(pessoa_teste)

array([[50.0, 0.0, 2, 26.0]], dtype=object)

In [41]:
pipe_rf['randomforestclassifier'].estimators_[0].predict([[50.0, 0.0, 2, 26.0]])

array([1.])

In [42]:
pipe_rf.predict_proba(pessoa_teste)

array([[0.034, 0.966]])

In [48]:
pipe_rf.predict(pessoa_teste)

array([1])

In [43]:
results = []

for i in range(len(pipe_rf['randomforestclassifier'].estimators_)):
    result = pipe_rf['randomforestclassifier'].estimators_[i].predict([[50.0, 0.0, 2, 26.0]])[0]
    results.append(result)
else:
    results = pd.Series(results)

In [49]:
results.value_counts(normalize=True)

1.0    0.966
0.0    0.034
dtype: float64

In [65]:
pipe_rf['randomforestclassifier'].feature_importances_

array([0.31049808, 0.23323592, 0.09649357, 0.35977244])