# Aprendizado de Máquina - Trabalho 1

Nome: Matheus dos Santos Moura

Matrícula: 2410003MCICMA

URL do vídeo: 

Tempo das questões:

In [84]:
# Imports

from pprint import pprint
import category_encoders
import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [85]:
# Configs
SEED = 42
sklearn.set_config(transform_output="pandas")

## 1 - Predição de pagamento de empréstimos

- Criação de um modelo de classificação para inferir se uma requisição de empréstimo resultará em calote.
- É necessário realizar o pré-processamento dos dados (encoding e scaling).
- Devem ser utilizados os modelos de LR, DTC, RF, KNN e GradientBoosting.
- Os valores default dos hiperparâmetros podem ser mantidos.
- Gere a matriz de confusão e o `classification_report` de cada modelo.


In [86]:
# Creating dataframes

q1_column_names = [
    "ESCT",
    "NDEP",
    "RENDA",
    "TIPOR",
    "VBEM",
    "NPARC",
    "VPARC",
    "TEL",
    "IDADE",
    "RESMS",
    "ENTRADA",
    "CLASSE"
]

q1_df_train = pd.read_csv("./data/credtrain.txt", sep="\t", names=q1_column_names)
q1_df_test = pd.read_csv("./data/credtest.txt", sep="\t", names=q1_column_names)

q1_df_train.head()

Unnamed: 0,ESCT,NDEP,RENDA,TIPOR,VBEM,NPARC,VPARC,TEL,IDADE,RESMS,ENTRADA,CLASSE
0,1,0,360,0,313,9,52,0,25,48,0,1
1,0,0,350,1,468,10,65,0,33,6,0,1
2,0,0,1100,0,829,9,125,0,56,48,0,1
3,0,0,3000,0,552,12,76,1,31,60,0,1
4,1,0,1000,0,809,12,111,0,24,7,0,1


In [87]:
# Bugfix: Columns being detected as non categoricals
# q1_df_train['ESCT'] = q1_df_train['ESCT'].astype('category')
# q1_df_train['TIPOR'] = q1_df_train['TIPOR'].astype('category')
# q1_df_train['TEL'] = q1_df_train['TEL'].astype('category')
# q1_df_train['NDEP'] = q1_df_train['NDEP'].astype('category')

# q1_df_test['ESCT'] = q1_df_test['ESCT'].astype('category')
# q1_df_test['TIPOR'] = q1_df_test['TIPOR'].astype('category')
# q1_df_test['TEL'] = q1_df_test['TEL'].astype('category')
# q1_df_test['NDEP'] = q1_df_test['NDEP'].astype('category')

In [88]:
# Train data statistics
q1_df_train.describe()

Unnamed: 0,ESCT,NDEP,RENDA,TIPOR,VBEM,NPARC,VPARC,TEL,IDADE,RESMS,ENTRADA,CLASSE
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,0.512,0.126667,984.512,0.445333,562.031333,8.392667,102.247333,0.134,41.210667,31.528667,32.255333,0.476667
std,0.659909,0.572865,905.53292,0.497168,265.690646,3.609131,62.987325,0.340766,13.202266,52.662099,94.307614,0.499622
min,0.0,0.0,300.0,0.0,300.0,1.0,50.0,0.0,19.0,0.0,0.0,0.0
25%,0.0,0.0,467.5,0.0,410.0,6.0,66.0,0.0,31.0,6.0,0.0,0.0
50%,0.0,0.0,650.0,0.0,490.0,10.0,83.0,0.0,39.0,6.0,0.0,0.0
75%,1.0,0.0,1200.0,1.0,618.0,10.0,118.0,0.0,52.0,48.0,0.0,1.0
max,3.0,7.0,8000.0,1.0,4000.0,24.0,711.0,1.0,70.0,420.0,1300.0,1.0


In [89]:
# Defining Feature Encoding Pipeline

q1_transformers_list = [
    (
        "Categorical",
        category_encoders.OneHotEncoder(handle_unknown="error"),
        ["ESCT", "TIPOR", "TEL"]
    ),
    (
        "Ordinal",
        category_encoders.OrdinalEncoder(
            mapping = [
                {
                    "col": "NDEP",
                    "mapping": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7},
                }
            ]
        ),
        ["NDEP"]
    )
]

q1_transformers_list = [
    (
        'Categorical',
        OneHotEncoder(sparse_output = False, drop = "first"),
        ["ESCT", "TIPOR", "TEL"]
    ),
    (
        "Ordinal",
        OrdinalEncoder(categories=[[0, 1, 2, 3, 4, 5, 6, 7]]),
        ["NDEP"]
    )
]

q1_fe_pipeline = ColumnTransformer(
    transformers=q1_transformers_list,
    remainder="passthrough",
    verbose_feature_names_out=False,
)

In [90]:
q1_df_train.head()

Unnamed: 0,ESCT,NDEP,RENDA,TIPOR,VBEM,NPARC,VPARC,TEL,IDADE,RESMS,ENTRADA,CLASSE
0,1,0,360,0,313,9,52,0,25,48,0,1
1,0,0,350,1,468,10,65,0,33,6,0,1
2,0,0,1100,0,829,9,125,0,56,48,0,1
3,0,0,3000,0,552,12,76,1,31,60,0,1
4,1,0,1000,0,809,12,111,0,24,7,0,1


In [91]:
# Encoding datasets
q1_encoded_df_train = q1_fe_pipeline.fit_transform(q1_df_train)
q1_encoded_df_train.head()

Unnamed: 0,ESCT_1,ESCT_2,ESCT_3,TIPOR_1,TEL_1,NDEP,RENDA,VBEM,NPARC,VPARC,IDADE,RESMS,ENTRADA,CLASSE
0,1.0,0.0,0.0,0.0,0.0,0.0,360,313,9,52,25,48,0,1
1,0.0,0.0,0.0,1.0,0.0,0.0,350,468,10,65,33,6,0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,1100,829,9,125,56,48,0,1
3,0.0,0.0,0.0,0.0,1.0,0.0,3000,552,12,76,31,60,0,1
4,1.0,0.0,0.0,0.0,0.0,0.0,1000,809,12,111,24,7,0,1


In [92]:
# Splitting class
q1_X_train = q1_encoded_df_train.drop(["CLASSE"], axis = 1)
q1_y_train = q1_encoded_df_train.CLASSE.values
q1_X_train.head()

Unnamed: 0,ESCT_1,ESCT_2,ESCT_3,TIPOR_1,TEL_1,NDEP,RENDA,VBEM,NPARC,VPARC,IDADE,RESMS,ENTRADA
0,1.0,0.0,0.0,0.0,0.0,0.0,360,313,9,52,25,48,0
1,0.0,0.0,0.0,1.0,0.0,0.0,350,468,10,65,33,6,0
2,0.0,0.0,0.0,0.0,0.0,0.0,1100,829,9,125,56,48,0
3,0.0,0.0,0.0,0.0,1.0,0.0,3000,552,12,76,31,60,0
4,1.0,0.0,0.0,0.0,0.0,0.0,1000,809,12,111,24,7,0


In [93]:
# Scaling data
q1_encoded_column_names_backup = q1_X_train.columns
q1_scaler = MinMaxScaler()
q1_scaled_X_train = pd.DataFrame(q1_scaler.fit_transform(q1_X_train), 
                                 columns = q1_encoded_column_names_backup)

In [94]:
q1_scaled_X_train.head()

Unnamed: 0,ESCT_1,ESCT_2,ESCT_3,TIPOR_1,TEL_1,NDEP,RENDA,VBEM,NPARC,VPARC,IDADE,RESMS,ENTRADA
0,1.0,0.0,0.0,0.0,0.0,0.0,0.007792,0.003514,0.347826,0.003026,0.117647,0.114286,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.006494,0.045405,0.391304,0.022693,0.27451,0.014286,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.103896,0.142973,0.347826,0.113464,0.72549,0.114286,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.350649,0.068108,0.478261,0.039334,0.235294,0.142857,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.137568,0.478261,0.092284,0.098039,0.016667,0.0


In [95]:
# Model training
q1_models = [
    LogisticRegression(random_state=SEED),
    DecisionTreeClassifier(random_state=SEED),
    RandomForestClassifier(random_state=SEED),
    KNeighborsClassifier(),
    GradientBoostingClassifier(random_state=SEED),
]

q1_training_accuracies = {}

for model in q1_models:
    model.fit(q1_scaled_X_train, q1_y_train)
    q1_training_accuracies[model.__class__.__name__] = model.score(
        q1_scaled_X_train, q1_y_train)

pprint(q1_training_accuracies, sort_dicts = False)

{'LogisticRegression': 0.8933333333333333,
 'DecisionTreeClassifier': 1.0,
 'RandomForestClassifier': 1.0,
 'KNeighborsClassifier': 0.9066666666666666,
 'GradientBoostingClassifier': 0.9353333333333333}


### Model Evaluation

In [96]:
q1_encoded_column_names_backup

Index(['ESCT_1', 'ESCT_2', 'ESCT_3', 'TIPOR_1', 'TEL_1', 'NDEP', 'RENDA',
       'VBEM', 'NPARC', 'VPARC', 'IDADE', 'RESMS', 'ENTRADA'],
      dtype='object')

In [97]:
# Pre-processing test data

q1_encoded_df_test = q1_fe_pipeline.fit_transform(q1_df_test)
q1_X_test = q1_encoded_df_test.drop(["CLASSE"], axis = 1)
q1_y_test = q1_encoded_df_test.CLASSE.values
q1_scaled_X_test = pd.DataFrame(q1_scaler.transform(q1_X_test),
                                columns = q1_encoded_column_names_backup)

q1_scaled_X_test.head()

Unnamed: 0,ESCT_1,ESCT_2,ESCT_3,TIPOR_1,TEL_1,NDEP,RENDA,VBEM,NPARC,VPARC,IDADE,RESMS,ENTRADA
0,0.0,0.0,0.0,1.0,0.0,0.285714,0.025974,0.085946,0.391304,0.05295,0.333333,0.014286,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.066623,0.068108,0.130435,0.104387,0.470588,0.114286,0.091538
2,0.0,0.0,1.0,0.0,0.0,0.0,0.006494,0.050811,0.478261,0.024206,0.470588,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.15974,0.021892,0.0,0.526475,0.176471,0.114286,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.05039,0.025946,0.391304,0.015129,0.588235,0.171429,0.0


In [98]:
q1_scaler.transform(q1_X_test)

Unnamed: 0,ESCT_1,ESCT_2,ESCT_3,TIPOR_1,TEL_1,NDEP,RENDA,VBEM,NPARC,VPARC,IDADE,RESMS,ENTRADA
0,0.0,0.0,0.0,1.0,0.0,0.285714,0.025974,0.085946,0.391304,0.052950,0.333333,0.014286,0.000000
1,1.0,0.0,0.0,0.0,0.0,0.000000,0.066623,0.068108,0.130435,0.104387,0.470588,0.114286,0.091538
2,0.0,0.0,1.0,0.0,0.0,0.000000,0.006494,0.050811,0.478261,0.024206,0.470588,0.000000,0.000000
3,1.0,0.0,0.0,0.0,0.0,0.000000,0.159740,0.021892,0.000000,0.526475,0.176471,0.114286,0.000000
4,0.0,0.0,0.0,1.0,0.0,0.000000,0.050390,0.025946,0.391304,0.015129,0.588235,0.171429,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
572,0.0,0.0,0.0,1.0,0.0,0.000000,0.023377,0.026486,0.217391,0.059002,0.274510,0.200000,0.000000
573,1.0,0.0,0.0,0.0,0.0,0.000000,0.061429,0.107838,0.217391,0.122542,0.862745,0.114286,0.000000
574,0.0,0.0,0.0,0.0,0.0,0.428571,0.116883,0.052703,0.130435,0.143722,0.392157,0.014286,0.000000
575,1.0,0.0,0.0,0.0,0.0,0.000000,0.038961,0.073514,0.608696,0.027231,0.274510,0.014286,0.000000


In [99]:
def evaluate_model(model, X_test, y_test):
    predicted_y = model.predict(X_test)
    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(
        y_test, predicted_y).ravel()
    pprint({
        "Model": model.__class__.__name__,
        "acc": sklearn.metrics.accuracy_score(y_test, predicted_y),
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "Classification Report": sklearn.metrics.classification_report(
            y_test, predicted_y, output_dict = True)
    }, sort_dicts = False)

In [100]:
# Evaluation

for model in q1_models:
    evaluate_model(model, q1_scaled_X_test, q1_y_test)

{'Model': 'LogisticRegression',
 'acc': 0.8942807625649913,
 'tp': 219,
 'tn': 297,
 'fp': 9,
 'fn': 52,
 'Classification Report': {'0': {'precision': 0.8510028653295129,
                                 'recall': 0.9705882352941176,
                                 'f1-score': 0.9068702290076336,
                                 'support': 306.0},
                           '1': {'precision': 0.9605263157894737,
                                 'recall': 0.8081180811808119,
                                 'f1-score': 0.8777555110220441,
                                 'support': 271.0},
                           'accuracy': 0.8942807625649913,
                           'macro avg': {'precision': 0.9057645905594933,
                                         'recall': 0.8893531582374647,
                                         'f1-score': 0.8923128700148388,
                                         'support': 577.0},
                           'weighted avg': {'precision': 0.9024428

## 2 - Predição de Preços de Diamantes

- Construir modelos de regressão para predição do preço no dataset `diamonds.csv`.
- Realize o spliting dos dados na relação 80/20.
- Realize o encoding e scaling dos dados.
- Os seguintes modelos deverão ser utilizados: LR, Lasso, DecisionTreeRegressor, RF Regressor, KNN e GB.
- Os valores dos hiperparâmetros podem ser default.
- Como métricas de avaliação utilize MSE e $R^2$.

### Colunas do dataset:
- **price**: US dollar `[$326, $18,823]`
- **carat**: Weight of the diamond `[0.2, 5.01]`
- **cut**: Quality of the cut `{Fair, Good, Very Good, Premium, Ideal}`
- **color**: Diamond colour `[J (worst), D (best)]`
- **clarity**: How clear the diamond is `{I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best)}`
- **x**: Length `(0mm, 10.74mm)`
- **y**: Width `(0mm, 58.9mm)`
- **z**: Depth `(0mm, 31.8mm)`
- **depth**: Total depth percentage (z / mean(x, y)) `[43, 79]`
- **table**: Width of top of diamond relative to widest point `[43, 95]`

In [101]:
# Loading dataset

q2_df = pd.read_csv("./data/diamonds.csv", index_col=0)
q2_X_df = q2_df.drop(["price"], axis=1)
q2_y_df = q2_df.price.values
q2_X_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75


In [102]:
# Defining Feature Encoding Pipeline
# Cat nominais:
# Cat ordinais: cut, color, clarity

q2_transformers_list = [
    (
        "Ordinal",
        category_encoders.OrdinalEncoder(
            mapping=[
                {
                    "col": "cut",
                    "mapping": {
                        "Fair": 1,
                        "Good": 2,
                        "Very Good": 3,
                        "Premium": 4,
                        "Ideal": 5,
                    }
                },
                {
                    "col": "color",
                    "mapping": {
                        "J": 1,
                        "I": 2,
                        "H": 3,
                        "G": 4,
                        "F": 5,
                        "E": 6,
                        "D": 7,
                    }
                },
                {
                    "col": "clarity",
                    "mapping": {
                        "I1": 1,
                        "SI2": 2,
                        "SI1": 3,
                        "VS2": 4,
                        "VS1": 5,
                        "VVS2": 6
                    }
                }
            ]
        ),
        ["cut", "color", "clarity"]
    )
]

q2_fe_pipeline = ColumnTransformer(
    transformers = q2_transformers_list,
    remainder = "passthrough",
    verbose_feature_names_out = False
)

In [103]:
# Executing feature encoding
q2_encoded_X = q2_fe_pipeline.fit_transform(q2_X_df)
q2_encoded_X.head()

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z
1,5,6,2.0,0.23,61.5,55.0,3.95,3.98,2.43
2,4,6,3.0,0.21,59.8,61.0,3.89,3.84,2.31
3,2,6,5.0,0.23,56.9,65.0,4.05,4.07,2.31
4,4,2,4.0,0.29,62.4,58.0,4.2,4.23,2.63
5,2,1,2.0,0.31,63.3,58.0,4.34,4.35,2.75


In [104]:
# Creating volume feature (polynomial feature)
q2_encoded_X["volume"] = q2_encoded_X["x"] * q2_encoded_X["y"] * q2_encoded_X["z"]
q2_encoded_X.head()

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z,volume
1,5,6,2.0,0.23,61.5,55.0,3.95,3.98,2.43,38.20203
2,4,6,3.0,0.21,59.8,61.0,3.89,3.84,2.31,34.505856
3,2,6,5.0,0.23,56.9,65.0,4.05,4.07,2.31,38.076885
4,4,2,4.0,0.29,62.4,58.0,4.2,4.23,2.63,46.72458
5,2,1,2.0,0.31,63.3,58.0,4.34,4.35,2.75,51.91725


In [105]:
# Splitting data

q2_encoded_X_train, q2_encoded_X_test, q2_y_train, q2_y_test = sklearn.model_selection.train_test_split(
    q2_encoded_X, q2_y_df, test_size=0.2, random_state=SEED)
q2_encoded_X_train.describe()

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z,volume
count,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0
mean,3.905682,4.408927,3.208866,0.798233,61.743046,57.459548,5.732607,5.736434,3.539256,129.902611
std,1.115122,1.698757,1.864799,0.473342,1.428243,2.229325,1.120196,1.1475,0.708062,78.562955
min,1.0,1.0,-1.0,0.2,43.0,43.0,0.0,0.0,0.0,0.0
25%,3.0,3.0,2.0,0.4,61.0,56.0,4.72,4.73,2.91,65.30356
50%,4.0,4.0,3.0,0.7,61.8,57.0,5.7,5.71,3.53,114.808572
75%,5.0,6.0,4.0,1.04,62.5,59.0,6.54,6.54,4.04,170.917635
max,5.0,7.0,6.0,5.01,79.0,95.0,10.74,58.9,31.8,3840.59806


In [106]:
# Scaling data
q2_encoded_column_names_backup = q2_encoded_X_train.columns
q2_scaler = MinMaxScaler()
q2_scaled_X_train = pd.DataFrame(q2_scaler.fit_transform(q2_encoded_X_train),
                                 columns = q2_encoded_column_names_backup)
q2_scaled_X_test = pd.DataFrame(q2_scaler.transform(q2_encoded_X_test),
                                 columns = q2_encoded_column_names_backup)
q2_scaled_X_train.head()

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z,volume
26547,0.25,0.666667,0.428571,0.376299,0.419444,0.403846,0.766294,0.139049,0.15,0.083715
9160,0.5,0.833333,0.428571,0.168399,0.472222,0.326923,0.611732,0.110187,0.12327,0.043521
14132,0.75,0.333333,0.714286,0.18711,0.541667,0.288462,0.613594,0.111036,0.128931,0.04601
15758,0.25,0.833333,0.428571,0.27027,0.513889,0.423077,0.671322,0.121732,0.138994,0.059495
24633,0.5,0.5,0.857143,0.274428,0.530556,0.269231,0.676909,0.124278,0.142453,0.062769


In [107]:
# Model training

q2_models = [
    LinearRegression(),
    Lasso(random_state=SEED),
    DecisionTreeRegressor(random_state=SEED),
    RandomForestRegressor(random_state=SEED),
    KNeighborsRegressor(),
    GradientBoostingRegressor(random_state=SEED)
]

q2_training_r2 = {}

for model in q2_models:
    model.fit(q2_scaled_X_train, q2_y_train)
    q2_training_r2[model.__class__.__name__] = model.score(
        q2_scaled_X_train, q2_y_train)

pprint(q2_training_r2, sort_dicts = False)

{'LinearRegression': 0.8746685161030123,
 'Lasso': 0.8736740987541102,
 'DecisionTreeRegressor': 0.9999947185273901,
 'RandomForestRegressor': 0.9973551333929837,
 'KNeighborsRegressor': 0.9794734490119786,
 'GradientBoostingRegressor': 0.9746423314302498}


### Model Evaluation

In [108]:
def evaluate_regressor(model, X_test, y_test):
    pred_y = model.predict(X_test)
    pprint({
        "Model": model.__class__.__name__,
        "MSE": sklearn.metrics.mean_squared_error(y_test, pred_y),
        "RMSE": sklearn.metrics.root_mean_squared_error(y_test, pred_y),
        "R^2": sklearn.metrics.r2_score(y_test, pred_y)
    }, sort_dicts = False)

In [109]:
for model in q2_models:
    evaluate_regressor(model, q2_scaled_X_test, q2_y_test)

{'Model': 'LinearRegression',
 'MSE': 1997700.6201057788,
 'RMSE': 1413.4003750196823,
 'R^2': 0.8743333297017382}
{'Model': 'Lasso',
 'MSE': 2015437.029741819,
 'RMSE': 1419.6608854729425,
 'R^2': 0.8732176091980877}
{'Model': 'DecisionTreeRegressor',
 'MSE': 511056.762815165,
 'RMSE': 714.8823419382836,
 'R^2': 0.967851638493765}


{'Model': 'RandomForestRegressor',
 'MSE': 290007.17983093625,
 'RMSE': 538.5231469778586,
 'R^2': 0.9817569077742924}
{'Model': 'KNeighborsRegressor',
 'MSE': 483213.8833333333,
 'RMSE': 695.135874008336,
 'R^2': 0.9696031131245394}
{'Model': 'GradientBoostingRegressor',
 'MSE': 415875.84530671925,
 'RMSE': 644.8843658414423,
 'R^2': 0.9738390566578474}


## 3 - Conjuntos desbalanceados (parte 1)