# Aprendizado de Máquina - Trabalho 2

Nome: Matheus dos Santos Moura <br>
Matrícula: 2410003MCICMA

URL do vídeo: ...

| Questão | Tempo |
|---------|-------|
| 1       | 00:16 |
| 2       | 03:01 |
| 3       | 05:33 |
| 4       | 11:55 |
| 5       | 15:39 |

In [55]:
from pprint import pprint
import category_encoders
import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler

SEED = 42
sklearn.set_config(transform_output="pandas")

## 1 - Engenharia de Features

- Considere novamente o mesmo problema da questão 1 do T1.
- Utilize os mesmos conjuntos de dados, mas investigue outras técnicas de codificação.
- Utilize apenas um dos algoritmos de aprendizado de máquina.
- Realize uma análise comparativa deste experimento frente aos resultados do T1.

In [56]:
# Loading datasets

q1_column_names = [
    "ESCT",
    "NDEP",
    "RENDA",
    "TIPOR",
    "VBEM",
    "NPARC",
    "VPARC",
    "TEL",
    "IDADE",
    "RESMS",
    "ENTRADA",
    "CLASSE"
]

q1_df_train = pd.read_csv("../data/credtrain.txt", sep="\t", names=q1_column_names)
q1_df_test = pd.read_csv("../data/credtest.txt", sep="\t", names=q1_column_names)

q1_df_train.head()

Unnamed: 0,ESCT,NDEP,RENDA,TIPOR,VBEM,NPARC,VPARC,TEL,IDADE,RESMS,ENTRADA,CLASSE
0,1,0,360,0,313,9,52,0,25,48,0,1
1,0,0,350,1,468,10,65,0,33,6,0,1
2,0,0,1100,0,829,9,125,0,56,48,0,1
3,0,0,3000,0,552,12,76,1,31,60,0,1
4,1,0,1000,0,809,12,111,0,24,7,0,1


In [57]:
# Defining feature encoders

q1_transformers_list = [
    (
        'glmm',
        category_encoders.GLMMEncoder(),
        ['ESCT']
    ),
    (
        'woee',
        category_encoders.WOEEncoder(drop_invariant=True),
        ['TIPOR', 'TEL']
    ),
    (
        'tget',
        category_encoders.TargetEncoder(),
        ['NDEP']
    )
]

q1_fe_pipeline = ColumnTransformer(
    transformers=q1_transformers_list,
    remainder="passthrough",
    verbose_feature_names_out=False,
)

In [58]:
q1_df_train['ESCT'] = q1_df_train['ESCT'].astype(str)
q1_df_train['TIPOR'] = q1_df_train['TIPOR'].astype(str)
q1_df_train['TEL'] = q1_df_train['TEL'].astype(str)
q1_df_train['NDEP'] = q1_df_train['NDEP'].astype(str)
q1_df_train.head()

Unnamed: 0,ESCT,NDEP,RENDA,TIPOR,VBEM,NPARC,VPARC,TEL,IDADE,RESMS,ENTRADA,CLASSE
0,1,0,360,0,313,9,52,0,25,48,0,1
1,0,0,350,1,468,10,65,0,33,6,0,1
2,0,0,1100,0,829,9,125,0,56,48,0,1
3,0,0,3000,0,552,12,76,1,31,60,0,1
4,1,0,1000,0,809,12,111,0,24,7,0,1


In [59]:
# Encoding features
q1_encoded_df_train = q1_fe_pipeline.fit_transform(q1_df_train, q1_df_train.CLASSE.values)
q1_encoded_df_train.head()

Unnamed: 0,ESCT,TIPOR,TEL,NDEP,RENDA,VBEM,NPARC,VPARC,IDADE,RESMS,ENTRADA,CLASSE
0,0.379425,0.924514,-0.10581,0.48687,360,313,9,52,25,48,0,1
1,-0.267613,-1.274589,-0.10581,0.48687,350,468,10,65,33,6,0,1
2,-0.267613,0.924514,-0.10581,0.48687,1100,829,9,125,56,48,0,1
3,-0.267613,0.924514,0.691684,0.48687,3000,552,12,76,31,60,0,1
4,0.379425,0.924514,-0.10581,0.48687,1000,809,12,111,24,7,0,1


In [60]:
# Separating features matrix and target column
q1_X_train = q1_encoded_df_train.drop(["CLASSE"], axis = 1)
q1_y_train = q1_encoded_df_train.CLASSE.values
q1_X_train.head()

Unnamed: 0,ESCT,TIPOR,TEL,NDEP,RENDA,VBEM,NPARC,VPARC,IDADE,RESMS,ENTRADA
0,0.379425,0.924514,-0.10581,0.48687,360,313,9,52,25,48,0
1,-0.267613,-1.274589,-0.10581,0.48687,350,468,10,65,33,6,0
2,-0.267613,0.924514,-0.10581,0.48687,1100,829,9,125,56,48,0
3,-0.267613,0.924514,0.691684,0.48687,3000,552,12,76,31,60,0
4,0.379425,0.924514,-0.10581,0.48687,1000,809,12,111,24,7,0


In [61]:
# Scaling data
q1_encoded_column_names_backup = q1_X_train.columns
q1_scaler = MinMaxScaler()
q1_scaled_X_train = pd.DataFrame(q1_scaler.fit_transform(q1_X_train), 
                                 columns = q1_encoded_column_names_backup)
q1_scaled_X_train.head()

Unnamed: 0,ESCT,TIPOR,TEL,NDEP,RENDA,VBEM,NPARC,VPARC,IDADE,RESMS,ENTRADA
0,1.0,1.0,0.0,1.0,0.007792,0.003514,0.347826,0.003026,0.117647,0.114286,0.0
1,0.0,0.0,0.0,1.0,0.006494,0.045405,0.391304,0.022693,0.27451,0.014286,0.0
2,0.0,1.0,0.0,1.0,0.103896,0.142973,0.347826,0.113464,0.72549,0.114286,0.0
3,0.0,1.0,1.0,1.0,0.350649,0.068108,0.478261,0.039334,0.235294,0.142857,0.0
4,1.0,1.0,0.0,1.0,0.090909,0.137568,0.478261,0.092284,0.098039,0.016667,0.0


In [62]:
# Model training
q1_models = [
    GradientBoostingClassifier(random_state=SEED),
]

q1_training_accuracies = {}

for model in q1_models:
    model.fit(q1_scaled_X_train, q1_y_train)
    q1_training_accuracies[model.__class__.__name__] = model.score(
        q1_scaled_X_train, q1_y_train)

print("Training accuracies")
pprint(q1_training_accuracies, sort_dicts = False)

Training accuracies
{'GradientBoostingClassifier': 0.936}


In [63]:
# Processing test dataset

q1_df_test['ESCT'] = q1_df_test['ESCT'].astype(str)
q1_df_test['TIPOR'] = q1_df_test['TIPOR'].astype(str)
q1_df_test['TEL'] = q1_df_test['TEL'].astype(str)
q1_df_test['NDEP'] = q1_df_test['NDEP'].astype(str)

q1_encoded_df_test = q1_fe_pipeline.fit_transform(q1_df_test, q1_df_test.CLASSE.values)
q1_X_test = q1_encoded_df_test.drop(["CLASSE"], axis = 1)
q1_y_test = q1_encoded_df_test.CLASSE.values
q1_scaled_X_test = pd.DataFrame(q1_scaler.transform(q1_X_test),
                                columns = q1_encoded_column_names_backup)

q1_scaled_X_test.head()

Unnamed: 0,ESCT,TIPOR,TEL,NDEP,RENDA,VBEM,NPARC,VPARC,IDADE,RESMS,ENTRADA
0,-0.285248,-0.002455,0.049056,0.533025,0.025974,0.085946,0.391304,0.05295,0.333333,0.014286,0.0
1,0.95625,1.008897,0.049056,0.943545,0.066623,0.068108,0.130435,0.104387,0.470588,0.114286,0.091538
2,0.650785,1.008897,0.049056,0.943545,0.006494,0.050811,0.478261,0.024206,0.470588,0.0,0.0
3,0.95625,1.008897,0.049056,0.943545,0.15974,0.021892,0.0,0.526475,0.176471,0.114286,0.0
4,-0.285248,-0.002455,0.049056,0.943545,0.05039,0.025946,0.391304,0.015129,0.588235,0.171429,0.0


In [64]:
def evaluate_model(model, X_test, y_test):
    predicted_y = model.predict(X_test)
    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(
        y_test, predicted_y).ravel()
    pprint({
        "Model": model.__class__.__name__,
        "acc": sklearn.metrics.accuracy_score(y_test, predicted_y),
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "Classification Report": sklearn.metrics.classification_report(
            y_test, predicted_y, output_dict = True)
    }, sort_dicts = False)

In [65]:
evaluate_model(q1_models[0], q1_scaled_X_test, q1_y_test)

{'Model': 'GradientBoostingClassifier',
 'acc': 0.9046793760831889,
 'tp': 225,
 'tn': 297,
 'fp': 9,
 'fn': 46,
 'Classification Report': {'0': {'precision': 0.8658892128279884,
                                 'recall': 0.9705882352941176,
                                 'f1-score': 0.9152542372881356,
                                 'support': 306.0},
                           '1': {'precision': 0.9615384615384616,
                                 'recall': 0.8302583025830258,
                                 'f1-score': 0.8910891089108911,
                                 'support': 271.0},
                           'accuracy': 0.9046793760831889,
                           'macro avg': {'precision': 0.913713837183225,
                                         'recall': 0.9004232689385717,
                                         'f1-score': 0.9031716730995134,
                                         'support': 577.0},
                           'weighted avg': {'precision': 0.

### Comparação dos Resultados

| Métricas |   T1   |   T2   |
|----------|--------|--------|
| Acurácia | 0.9012 | 0.9047 |
| Precisão | 0.9573 | 0.9615 |
| Recall   | 0.8266 | 0.8302 |
| F1       | 0.8871 | 0.8911 |