# Aprendizado de Máquina - Trabalho 1

Nome: Matheus dos Santos Moura

Matrícula: 2410003MCICMA

URL do vídeo: 

Tempo das questões:

In [2]:
# Imports

from pprint import pprint
import category_encoders
import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [3]:
# Configs
SEED = 42
sklearn.set_config(transform_output="pandas")

## 1 - Predição de pagamento de empréstimos

- Criação de um modelo de classificação para inferir se uma requisição de empréstimo resultará em calote.
- É necessário realizar o pré-processamento dos dados (encoding e scaling).
- Devem ser utilizados os modelos de LR, DTC, RF, KNN e GradientBoosting.
- Os valores default dos hiperparâmetros podem ser mantidos.
- Gere a matriz de confusão e o `classification_report` de cada modelo.


In [4]:
# Creating dataframes

q1_column_names = [
    "ESCT",
    "NDEP",
    "RENDA",
    "TIPOR",
    "VBEM",
    "NPARC",
    "VPARC",
    "TEL",
    "IDADE",
    "RESMS",
    "ENTRADA",
    "CLASSE"
]

q1_df_train = pd.read_csv("./data/credtrain.txt", sep="\t", names=q1_column_names)
q1_df_test = pd.read_csv("./data/credtest.txt", sep="\t", names=q1_column_names)

q1_df_train.head()

Unnamed: 0,ESCT,NDEP,RENDA,TIPOR,VBEM,NPARC,VPARC,TEL,IDADE,RESMS,ENTRADA,CLASSE
0,1,0,360,0,313,9,52,0,25,48,0,1
1,0,0,350,1,468,10,65,0,33,6,0,1
2,0,0,1100,0,829,9,125,0,56,48,0,1
3,0,0,3000,0,552,12,76,1,31,60,0,1
4,1,0,1000,0,809,12,111,0,24,7,0,1


In [8]:
# Bugfix: Columns being detected as non categoricals
# q1_df_train['ESCT'] = q1_df_train['ESCT'].astype('category')
# q1_df_train['TIPOR'] = q1_df_train['TIPOR'].astype('category')
# q1_df_train['TEL'] = q1_df_train['TEL'].astype('category')
# q1_df_train['NDEP'] = q1_df_train['NDEP'].astype('category')

# q1_df_test['ESCT'] = q1_df_test['ESCT'].astype('category')
# q1_df_test['TIPOR'] = q1_df_test['TIPOR'].astype('category')
# q1_df_test['TEL'] = q1_df_test['TEL'].astype('category')
# q1_df_test['NDEP'] = q1_df_test['NDEP'].astype('category')

In [5]:
# Train data statistics
q1_df_train.describe()

Unnamed: 0,ESCT,NDEP,RENDA,TIPOR,VBEM,NPARC,VPARC,TEL,IDADE,RESMS,ENTRADA,CLASSE
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,0.512,0.126667,984.512,0.445333,562.031333,8.392667,102.247333,0.134,41.210667,31.528667,32.255333,0.476667
std,0.659909,0.572865,905.53292,0.497168,265.690646,3.609131,62.987325,0.340766,13.202266,52.662099,94.307614,0.499622
min,0.0,0.0,300.0,0.0,300.0,1.0,50.0,0.0,19.0,0.0,0.0,0.0
25%,0.0,0.0,467.5,0.0,410.0,6.0,66.0,0.0,31.0,6.0,0.0,0.0
50%,0.0,0.0,650.0,0.0,490.0,10.0,83.0,0.0,39.0,6.0,0.0,0.0
75%,1.0,0.0,1200.0,1.0,618.0,10.0,118.0,0.0,52.0,48.0,0.0,1.0
max,3.0,7.0,8000.0,1.0,4000.0,24.0,711.0,1.0,70.0,420.0,1300.0,1.0


In [6]:
# Defining Feature Encoding Pipeline

q1_transformers_list = [
    (
        "Categorical",
        category_encoders.OneHotEncoder(handle_unknown="error"),
        ["ESCT", "TIPOR", "TEL"]
    ),
    (
        "Ordinal",
        category_encoders.OrdinalEncoder(
            mapping = [
                {
                    "col": "NDEP",
                    "mapping": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7},
                }
            ]
        ),
        ["NDEP"]
    )
]

q1_transformers_list = [
    (
        'Categorical',
        OneHotEncoder(sparse_output = False, drop = "first"),
        ["ESCT", "TIPOR", "TEL"]
    ),
    (
        "Ordinal",
        OrdinalEncoder(categories=[[0, 1, 2, 3, 4, 5, 6, 7]]),
        ["NDEP"]
    )
]

q1_fe_pipeline = ColumnTransformer(
    transformers=q1_transformers_list,
    remainder="passthrough",
    verbose_feature_names_out=False,
)

In [7]:
q1_df_train.head()

Unnamed: 0,ESCT,NDEP,RENDA,TIPOR,VBEM,NPARC,VPARC,TEL,IDADE,RESMS,ENTRADA,CLASSE
0,1,0,360,0,313,9,52,0,25,48,0,1
1,0,0,350,1,468,10,65,0,33,6,0,1
2,0,0,1100,0,829,9,125,0,56,48,0,1
3,0,0,3000,0,552,12,76,1,31,60,0,1
4,1,0,1000,0,809,12,111,0,24,7,0,1


In [8]:
# Encoding datasets
q1_encoded_df_train = q1_fe_pipeline.fit_transform(q1_df_train)
q1_encoded_df_train.head()

Unnamed: 0,ESCT_1,ESCT_2,ESCT_3,TIPOR_1,TEL_1,NDEP,RENDA,VBEM,NPARC,VPARC,IDADE,RESMS,ENTRADA,CLASSE
0,1.0,0.0,0.0,0.0,0.0,0.0,360,313,9,52,25,48,0,1
1,0.0,0.0,0.0,1.0,0.0,0.0,350,468,10,65,33,6,0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,1100,829,9,125,56,48,0,1
3,0.0,0.0,0.0,0.0,1.0,0.0,3000,552,12,76,31,60,0,1
4,1.0,0.0,0.0,0.0,0.0,0.0,1000,809,12,111,24,7,0,1


In [9]:
# Splitting class
q1_X_train = q1_encoded_df_train.drop(["CLASSE"], axis = 1)
q1_y_train = q1_encoded_df_train.CLASSE.values
q1_X_train.head()

Unnamed: 0,ESCT_1,ESCT_2,ESCT_3,TIPOR_1,TEL_1,NDEP,RENDA,VBEM,NPARC,VPARC,IDADE,RESMS,ENTRADA
0,1.0,0.0,0.0,0.0,0.0,0.0,360,313,9,52,25,48,0
1,0.0,0.0,0.0,1.0,0.0,0.0,350,468,10,65,33,6,0
2,0.0,0.0,0.0,0.0,0.0,0.0,1100,829,9,125,56,48,0
3,0.0,0.0,0.0,0.0,1.0,0.0,3000,552,12,76,31,60,0
4,1.0,0.0,0.0,0.0,0.0,0.0,1000,809,12,111,24,7,0


In [10]:
# Scaling data
q1_encoded_column_names_backup = q1_X_train.columns
q1_scaler = MinMaxScaler()
q1_scaled_X_train = pd.DataFrame(q1_scaler.fit_transform(q1_X_train), 
                                 columns = q1_encoded_column_names_backup)

In [11]:
q1_scaled_X_train.head()

Unnamed: 0,ESCT_1,ESCT_2,ESCT_3,TIPOR_1,TEL_1,NDEP,RENDA,VBEM,NPARC,VPARC,IDADE,RESMS,ENTRADA
0,1.0,0.0,0.0,0.0,0.0,0.0,0.007792,0.003514,0.347826,0.003026,0.117647,0.114286,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.006494,0.045405,0.391304,0.022693,0.27451,0.014286,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.103896,0.142973,0.347826,0.113464,0.72549,0.114286,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.350649,0.068108,0.478261,0.039334,0.235294,0.142857,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.137568,0.478261,0.092284,0.098039,0.016667,0.0


In [12]:
# Model training
q1_models = [
    LogisticRegression(random_state=SEED),
    DecisionTreeClassifier(random_state=SEED),
    RandomForestClassifier(random_state=SEED),
    KNeighborsClassifier(),
    GradientBoostingClassifier(random_state=SEED),
]

q1_training_accuracies = {}

for model in q1_models:
    model.fit(q1_scaled_X_train, q1_y_train)
    q1_training_accuracies[model.__class__.__name__] = model.score(
        q1_scaled_X_train, q1_y_train)

pprint(q1_training_accuracies, sort_dicts = False)

{'LogisticRegression': 0.8933333333333333,
 'DecisionTreeClassifier': 1.0,
 'RandomForestClassifier': 1.0,
 'KNeighborsClassifier': 0.9066666666666666,
 'GradientBoostingClassifier': 0.9353333333333333}


### Model Evaluation

In [13]:
q1_encoded_column_names_backup

Index(['ESCT_1', 'ESCT_2', 'ESCT_3', 'TIPOR_1', 'TEL_1', 'NDEP', 'RENDA',
       'VBEM', 'NPARC', 'VPARC', 'IDADE', 'RESMS', 'ENTRADA'],
      dtype='object')

In [14]:
# Pre-processing test data

q1_encoded_df_test = q1_fe_pipeline.fit_transform(q1_df_test)
q1_X_test = q1_encoded_df_test.drop(["CLASSE"], axis = 1)
q1_y_test = q1_encoded_df_test.CLASSE.values
q1_scaled_X_test = pd.DataFrame(q1_scaler.transform(q1_X_test),
                                columns = q1_encoded_column_names_backup)

q1_scaled_X_test.head()

Unnamed: 0,ESCT_1,ESCT_2,ESCT_3,TIPOR_1,TEL_1,NDEP,RENDA,VBEM,NPARC,VPARC,IDADE,RESMS,ENTRADA
0,0.0,0.0,0.0,1.0,0.0,0.285714,0.025974,0.085946,0.391304,0.05295,0.333333,0.014286,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.066623,0.068108,0.130435,0.104387,0.470588,0.114286,0.091538
2,0.0,0.0,1.0,0.0,0.0,0.0,0.006494,0.050811,0.478261,0.024206,0.470588,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.15974,0.021892,0.0,0.526475,0.176471,0.114286,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.05039,0.025946,0.391304,0.015129,0.588235,0.171429,0.0


In [15]:
q1_scaler.transform(q1_X_test)

Unnamed: 0,ESCT_1,ESCT_2,ESCT_3,TIPOR_1,TEL_1,NDEP,RENDA,VBEM,NPARC,VPARC,IDADE,RESMS,ENTRADA
0,0.0,0.0,0.0,1.0,0.0,0.285714,0.025974,0.085946,0.391304,0.052950,0.333333,0.014286,0.000000
1,1.0,0.0,0.0,0.0,0.0,0.000000,0.066623,0.068108,0.130435,0.104387,0.470588,0.114286,0.091538
2,0.0,0.0,1.0,0.0,0.0,0.000000,0.006494,0.050811,0.478261,0.024206,0.470588,0.000000,0.000000
3,1.0,0.0,0.0,0.0,0.0,0.000000,0.159740,0.021892,0.000000,0.526475,0.176471,0.114286,0.000000
4,0.0,0.0,0.0,1.0,0.0,0.000000,0.050390,0.025946,0.391304,0.015129,0.588235,0.171429,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
572,0.0,0.0,0.0,1.0,0.0,0.000000,0.023377,0.026486,0.217391,0.059002,0.274510,0.200000,0.000000
573,1.0,0.0,0.0,0.0,0.0,0.000000,0.061429,0.107838,0.217391,0.122542,0.862745,0.114286,0.000000
574,0.0,0.0,0.0,0.0,0.0,0.428571,0.116883,0.052703,0.130435,0.143722,0.392157,0.014286,0.000000
575,1.0,0.0,0.0,0.0,0.0,0.000000,0.038961,0.073514,0.608696,0.027231,0.274510,0.014286,0.000000


In [16]:
def evaluate_model(model, X_test, y_test):
    predicted_y = model.predict(X_test)
    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(
        y_test, predicted_y).ravel()
    pprint({
        "Model": model.__class__.__name__,
        "acc": sklearn.metrics.accuracy_score(y_test, predicted_y),
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "Classification Report": sklearn.metrics.classification_report(
            y_test, predicted_y, output_dict = True)
    }, sort_dicts = False)

In [17]:
# Evaluation

for model in q1_models:
    evaluate_model(model, q1_scaled_X_test, q1_y_test)

{'Model': 'LogisticRegression',
 'acc': 0.8942807625649913,
 'tp': 219,
 'tn': 297,
 'fp': 9,
 'fn': 52,
 'Classification Report': {'0': {'precision': 0.8510028653295129,
                                 'recall': 0.9705882352941176,
                                 'f1-score': 0.9068702290076336,
                                 'support': 306.0},
                           '1': {'precision': 0.9605263157894737,
                                 'recall': 0.8081180811808119,
                                 'f1-score': 0.8777555110220441,
                                 'support': 271.0},
                           'accuracy': 0.8942807625649913,
                           'macro avg': {'precision': 0.9057645905594933,
                                         'recall': 0.8893531582374647,
                                         'f1-score': 0.8923128700148388,
                                         'support': 577.0},
                           'weighted avg': {'precision': 0.9024428

## 2 - Predição de Preços de Diamantes

- Construir modelos de regressão para predição do preço no dataset `diamonds.csv`.
- Realize o spliting dos dados na relação 80/20.
- Realize o encoding e scaling dos dados.
- Os seguintes modelos deverão ser utilizados: LR, Lasso, DecisionTreeRegressor, RF Regressor, KNN e GB.
- Os valores dos hiperparâmetros podem ser default.
- Como métricas de avaliação utilize MSE e $R^2$.

### Colunas do dataset:
- **price**: US dollar `[$326, $18,823]`
- **carat**: Weight of the diamond `[0.2, 5.01]`
- **cut**: Quality of the cut `{Fair, Good, Very Good, Premium, Ideal}`
- **color**: Diamond colour `[J (worst), D (best)]`
- **clarity**: How clear the diamond is `{I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best)}`
- **x**: Length `(0mm, 10.74mm)`
- **y**: Width `(0mm, 58.9mm)`
- **z**: Depth `(0mm, 31.8mm)`
- **depth**: Total depth percentage (z / mean(x, y)) `[43, 79]`
- **table**: Width of top of diamond relative to widest point `[43, 95]`

In [18]:
# Loading dataset

q2_df = pd.read_csv("./data/diamonds.csv", index_col=0)
q2_X_df = q2_df.drop(["price"], axis=1)
q2_y_df = q2_df.price.values
q2_X_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75


In [20]:
# Defining Feature Encoding Pipeline
# Cat nominais:
# Cat ordinais: cut, color, clarity

q2_transformers_list = [
    (
        "Ordinal",
        category_encoders.OrdinalEncoder(
            mapping=[
                {
                    "col": "cut",
                    "mapping": {
                        "Fair": 1,
                        "Good": 2,
                        "Very Good": 3,
                        "Premium": 4,
                        "Ideal": 5,
                    }
                },
                {
                    "col": "color",
                    "mapping": {
                        "J": 1,
                        "I": 2,
                        "H": 3,
                        "G": 4,
                        "F": 5,
                        "E": 6,
                        "D": 7,
                    }
                },
                {
                    "col": "clarity",
                    "mapping": {
                        "I1": 1,
                        "SI2": 2,
                        "SI1": 3,
                        "VS2": 4,
                        "VS1": 5,
                        "VVS2": 6
                    }
                }
            ]
        ),
        ["cut", "color", "clarity"]
    )
]

q2_fe_pipeline = ColumnTransformer(
    transformers = q2_transformers_list,
    remainder = "passthrough",
    verbose_feature_names_out = False
)

In [21]:
# Executing feature encoding
q2_encoded_X = q2_fe_pipeline.fit_transform(q2_X_df)
q2_encoded_X.head()

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z
1,5,6,2.0,0.23,61.5,55.0,3.95,3.98,2.43
2,4,6,3.0,0.21,59.8,61.0,3.89,3.84,2.31
3,2,6,5.0,0.23,56.9,65.0,4.05,4.07,2.31
4,4,2,4.0,0.29,62.4,58.0,4.2,4.23,2.63
5,2,1,2.0,0.31,63.3,58.0,4.34,4.35,2.75


In [22]:
# Creating volume feature (polynomial feature)
q2_encoded_X["volume"] = q2_encoded_X["x"] * q2_encoded_X["y"] * q2_encoded_X["z"]
q2_encoded_X.head()

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z,volume
1,5,6,2.0,0.23,61.5,55.0,3.95,3.98,2.43,38.20203
2,4,6,3.0,0.21,59.8,61.0,3.89,3.84,2.31,34.505856
3,2,6,5.0,0.23,56.9,65.0,4.05,4.07,2.31,38.076885
4,4,2,4.0,0.29,62.4,58.0,4.2,4.23,2.63,46.72458
5,2,1,2.0,0.31,63.3,58.0,4.34,4.35,2.75,51.91725


In [23]:
# Splitting data

q2_encoded_X_train, q2_encoded_X_test, q2_y_train, q2_y_test = sklearn.model_selection.train_test_split(
    q2_encoded_X, q2_y_df, test_size=0.2, random_state=SEED)
q2_encoded_X_train.describe()

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z,volume
count,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0
mean,3.905682,4.408927,3.208866,0.798233,61.743046,57.459548,5.732607,5.736434,3.539256,129.902611
std,1.115122,1.698757,1.864799,0.473342,1.428243,2.229325,1.120196,1.1475,0.708062,78.562955
min,1.0,1.0,-1.0,0.2,43.0,43.0,0.0,0.0,0.0,0.0
25%,3.0,3.0,2.0,0.4,61.0,56.0,4.72,4.73,2.91,65.30356
50%,4.0,4.0,3.0,0.7,61.8,57.0,5.7,5.71,3.53,114.808572
75%,5.0,6.0,4.0,1.04,62.5,59.0,6.54,6.54,4.04,170.917635
max,5.0,7.0,6.0,5.01,79.0,95.0,10.74,58.9,31.8,3840.59806


In [24]:
# Scaling data
q2_encoded_column_names_backup = q2_encoded_X_train.columns
q2_scaler = MinMaxScaler()
q2_scaled_X_train = pd.DataFrame(q2_scaler.fit_transform(q2_encoded_X_train),
                                 columns = q2_encoded_column_names_backup)
q2_scaled_X_test = pd.DataFrame(q2_scaler.transform(q2_encoded_X_test),
                                 columns = q2_encoded_column_names_backup)
q2_scaled_X_train.head()

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z,volume
26547,0.25,0.666667,0.428571,0.376299,0.419444,0.403846,0.766294,0.139049,0.15,0.083715
9160,0.5,0.833333,0.428571,0.168399,0.472222,0.326923,0.611732,0.110187,0.12327,0.043521
14132,0.75,0.333333,0.714286,0.18711,0.541667,0.288462,0.613594,0.111036,0.128931,0.04601
15758,0.25,0.833333,0.428571,0.27027,0.513889,0.423077,0.671322,0.121732,0.138994,0.059495
24633,0.5,0.5,0.857143,0.274428,0.530556,0.269231,0.676909,0.124278,0.142453,0.062769


In [25]:
# Model training

q2_models = [
    LinearRegression(),
    Lasso(random_state=SEED),
    DecisionTreeRegressor(random_state=SEED),
    RandomForestRegressor(random_state=SEED),
    KNeighborsRegressor(),
    GradientBoostingRegressor(random_state=SEED)
]

q2_training_r2 = {}

for model in q2_models:
    model.fit(q2_scaled_X_train, q2_y_train)
    q2_training_r2[model.__class__.__name__] = model.score(
        q2_scaled_X_train, q2_y_train)

pprint(q2_training_r2, sort_dicts = False)

{'LinearRegression': 0.8746685161030123,
 'Lasso': 0.8736740987541102,
 'DecisionTreeRegressor': 0.9999947185273901,
 'RandomForestRegressor': 0.9973551333929837,
 'KNeighborsRegressor': 0.9794734490119786,
 'GradientBoostingRegressor': 0.9746423314302498}


### Model Evaluation

In [26]:
def evaluate_regressor(model, X_test, y_test):
    pred_y = model.predict(X_test)
    pprint({
        "Model": model.__class__.__name__,
        "MSE": sklearn.metrics.mean_squared_error(y_test, pred_y),
        "RMSE": sklearn.metrics.root_mean_squared_error(y_test, pred_y),
        "R^2": sklearn.metrics.r2_score(y_test, pred_y)
    }, sort_dicts = False)

In [27]:
for model in q2_models:
    evaluate_regressor(model, q2_scaled_X_test, q2_y_test)

{'Model': 'LinearRegression',
 'MSE': 1997700.6201057788,
 'RMSE': 1413.4003750196823,
 'R^2': 0.8743333297017382}
{'Model': 'Lasso',
 'MSE': 2015437.029741819,
 'RMSE': 1419.6608854729425,
 'R^2': 0.8732176091980877}
{'Model': 'DecisionTreeRegressor',
 'MSE': 511056.762815165,
 'RMSE': 714.8823419382836,
 'R^2': 0.967851638493765}
{'Model': 'RandomForestRegressor',
 'MSE': 290007.17983093625,
 'RMSE': 538.5231469778586,
 'R^2': 0.9817569077742924}
{'Model': 'KNeighborsRegressor',
 'MSE': 483213.8833333333,
 'RMSE': 695.135874008336,
 'R^2': 0.9696031131245394}
{'Model': 'GradientBoostingRegressor',
 'MSE': 415875.84530671925,
 'RMSE': 644.8843658414423,
 'R^2': 0.9738390566578474}


## 3 - Conjuntos desbalanceados (parte 1)

- Realize os passos desta questão utilizando os conjuntos de dados: A602, A621, A627, A636 e A652.
- Os arquivos no formato `.parquet` já estão divididos em treino, teste e validação.
- Altere os valores dos dados para um problema de classificação binária: `1 if x != 0 else 0`.
- Mostre o quão desbalanceados os datasets são.
- Investigue se alguma técnica de balanceamento é efetiva para produzir um modelo de classificação com maior desempenho preditivo. Ou seja, se alguma técnica consegue melhorar o desempenho de algum dos modelos.
- As três alternativas devem ser testadas: undersampling, oversampling e threshold.
- Utilize apenas o algoritmo GradientBoostingClassifier.
- Para cada análise forneça a matriz de confusão e o relatório de classificação.

In [35]:
def get_atmoseer_dataframes(file_name):
    FILE_PATH = "./data/atmoseer/"
    df_train = pd.read_parquet(FILE_PATH + file_name + "_train.parquet.gzip")
    df_val = pd.read_parquet(FILE_PATH + file_name + "_val.parquet.gzip")
    df_test = pd.read_parquet(FILE_PATH + file_name + "_test.parquet.gzip")
    return df_train, df_val, df_test

In [32]:
# Loading dataset
file_name = "A602"
q3_df_train, q3_df_val, q3_df_test = get_atmoseer_dataframes(file_name)
q3_df_train.head()

Unnamed: 0,temperature,barometric_pressure,relative_humidity,wind_direction_u,wind_direction_v,hour_sin,hour_cos,precipitation
2019-12-04 00:00:00,0.373832,0.480114,0.686869,0.403547,0.44729,0.5,1.0,0.0
2019-12-04 01:00:00,0.370717,0.485795,0.686869,0.383545,0.444438,0.62941,0.982963,0.0
2019-12-04 02:00:00,0.376947,0.494318,0.686869,0.382324,0.431581,0.75,0.933013,0.0
2019-12-04 03:00:00,0.373832,0.491477,0.707071,0.431636,0.432161,0.853553,0.853553,0.0
2019-12-04 04:00:00,0.35514,0.477273,0.737374,0.468213,0.456114,0.933013,0.75,0.0


In [33]:
# Quantity of days with precipitation > 0.0
precipitation_data = q3_df_train[q3_df_train["precipitation"] > 0]
print(f"Percentage of samples with precipitation bigger than 0: {round(100 * len(precipitation_data) / len(q3_df_train), 2)}%")
precipitation_data.head()

Percentage of samples with precipitation bigger than 0: 8.89%


Unnamed: 0,temperature,barometric_pressure,relative_humidity,wind_direction_u,wind_direction_v,hour_sin,hour_cos,precipitation
2019-12-05 22:00:00,0.380062,0.25,0.89899,0.605141,0.450034,0.25,0.933013,1.0
2019-12-05 23:00:00,0.426791,0.269886,0.909091,0.492336,0.352343,0.37059,0.982963,4.2
2019-12-06 00:00:00,0.417445,0.269886,0.89899,0.502436,0.418591,0.5,1.0,1.2
2019-12-06 12:00:00,0.380062,0.3125,0.929293,0.643449,0.47675,0.5,0.0,1.6
2019-12-06 13:00:00,0.345794,0.318182,0.939394,0.604558,0.493127,0.37059,0.017037,4.4


In [34]:
# Binarizing class
COL_TO_BINARIZE = "precipitation"
q3_df_train[COL_TO_BINARIZE] = q3_df_train[COL_TO_BINARIZE].map(lambda x: 1 if x != 0 else 0)
q3_df_val[COL_TO_BINARIZE] = q3_df_val[COL_TO_BINARIZE].map(lambda x: 1 if x != 0 else 0)
q3_df_test[COL_TO_BINARIZE] = q3_df_test[COL_TO_BINARIZE].map(lambda x: 1 if x != 0 else 0)
q3_df_train[q3_df_train[COL_TO_BINARIZE] > 0].head()

Unnamed: 0,temperature,barometric_pressure,relative_humidity,wind_direction_u,wind_direction_v,hour_sin,hour_cos,precipitation
2019-12-05 22:00:00,0.380062,0.25,0.89899,0.605141,0.450034,0.25,0.933013,1
2019-12-05 23:00:00,0.426791,0.269886,0.909091,0.492336,0.352343,0.37059,0.982963,1
2019-12-06 00:00:00,0.417445,0.269886,0.89899,0.502436,0.418591,0.5,1.0,1
2019-12-06 12:00:00,0.380062,0.3125,0.929293,0.643449,0.47675,0.5,0.0,1
2019-12-06 13:00:00,0.345794,0.318182,0.939394,0.604558,0.493127,0.37059,0.017037,1


In [40]:
# Q3 eval
def _q3_split_class(df):
    return df.drop("precipitation", axis = 1), df.precipitation.values

def q3_evaluation(model, df_test, experiment_name = ""):
    X, y = _q3_split_class(df_test)
    y_pred = model.predict(X)

    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(
        y, y_pred).ravel()

    return {
        "Model": model.__class__.__name__ + " " + experiment_name,
        "acc": sklearn.metrics.accuracy_score(y, y_pred),
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "Classification Report": sklearn.metrics.classification_report(
            y, y_pred, output_dict = True)
    }

def train_model(model, df_train):
    X, y = _q3_split_class(df_train)
    return model.fit(X, y)

In [36]:
# Vanilla model
q3_df_train_val_concat = pd.concat([q3_df_train, q3_df_val])
q3_gbc = train_model(GradientBoostingClassifier(random_state=SEED),
                     q3_df_train_val_concat)
pprint(q3_evaluation(q3_gbc, q3_df_test, "VANILLA"), sort_dicts = False)

{'Model': 'GradientBoostingClassifier VANILLA',
 'acc': 0.87109375,
 'tp': 31,
 'tn': 638,
 'fp': 38,
 'fn': 61,
 'Classification Report': {'0': {'precision': 0.9127324749642346,
                                 'recall': 0.9437869822485208,
                                 'f1-score': 0.928,
                                 'support': 676.0},
                           '1': {'precision': 0.4492753623188406,
                                 'recall': 0.33695652173913043,
                                 'f1-score': 0.38509316770186336,
                                 'support': 92.0},
                           'accuracy': 0.87109375,
                           'macro avg': {'precision': 0.6810039186415375,
                                         'recall': 0.6403717519938256,
                                         'f1-score': 0.6565465838509317,
                                         'support': 768.0},
                           'weighted avg': {'precision': 0.8572141750119218,
 

### Oversampling

In [37]:
q3_y_zeros = q3_df_train_val_concat[q3_df_train_val_concat.precipitation == 0]
q3_y_ones = q3_df_train_val_concat[q3_df_train_val_concat.precipitation != 0]

In [38]:
q3_upsampled_ones = sklearn.utils.resample(
    q3_y_ones, replace=True, n_samples=len(q3_y_zeros), random_state=SEED)

q3_upsampled_df_train = pd.concat([q3_y_zeros, q3_upsampled_ones])
q3_gbc = train_model(GradientBoostingClassifier(random_state=SEED),
                     q3_upsampled_df_train)
pprint(q3_evaluation(q3_gbc, q3_df_test, "OVERSAMPLING"), sort_dicts = False)

{'Model': 'GradientBoostingClassifier OVERSAMPLING',
 'acc': 0.5924479166666666,
 'tp': 82,
 'tn': 373,
 'fp': 303,
 'fn': 10,
 'Classification Report': {'0': {'precision': 0.9738903394255874,
                                 'recall': 0.5517751479289941,
                                 'f1-score': 0.704438149197356,
                                 'support': 676.0},
                           '1': {'precision': 0.21298701298701297,
                                 'recall': 0.8913043478260869,
                                 'f1-score': 0.3438155136268344,
                                 'support': 92.0},
                           'accuracy': 0.5924479166666666,
                           'macro avg': {'precision': 0.5934386762063002,
                                         'recall': 0.7215397478775405,
                                         'f1-score': 0.5241268314120953,
                                         'support': 768.0},
                           'weighted avg': {'

### Undersampling

In [39]:
q3_downsampled_zeros = sklearn.utils.resample(
    q3_y_zeros, replace=False, n_samples=len(q3_y_ones), random_state=SEED)

q3_downsampled_df_train = pd.concat([q3_downsampled_zeros, q3_y_ones])
q3_gbc = train_model(GradientBoostingClassifier(random_state=SEED),
                     q3_downsampled_df_train)
pprint(q3_evaluation(q3_gbc, q3_df_test, "UNDERSAMPLING"), sort_dicts = False)

{'Model': 'GradientBoostingClassifier UNDERSAMPLING',
 'acc': 0.5677083333333334,
 'tp': 84,
 'tn': 352,
 'fp': 324,
 'fn': 8,
 'Classification Report': {'0': {'precision': 0.9777777777777777,
                                 'recall': 0.5207100591715976,
                                 'f1-score': 0.6795366795366795,
                                 'support': 676.0},
                           '1': {'precision': 0.20588235294117646,
                                 'recall': 0.9130434782608695,
                                 'f1-score': 0.336,
                                 'support': 92.0},
                           'accuracy': 0.5677083333333334,
                           'macro avg': {'precision': 0.5918300653594771,
                                         'recall': 0.7168767687162336,
                                         'f1-score': 0.5077683397683398,
                                         'support': 768.0},
                           'weighted avg': {'precision': 

### Ajuste de Limiar

In [40]:
q3_gbc = train_model(GradientBoostingClassifier(random_state=SEED),
                     q3_df_train)
thresholds = [i/20 for i in range(1, 20)]
threshold_results = {}
best_threshold_f1 = 0.0

for t in thresholds:
    pred_y = (q3_gbc.predict_proba(q3_df_val.drop("precipitation", axis=1))[:,1]>=t).astype(int)
    f1_score = sklearn.metrics.f1_score(q3_df_val.precipitation.values, pred_y, average='binary')
    threshold_results[t] = f1_score

    if f1_score > best_threshold_f1:
        best_threshold_f1 = f1_score

pred_y = (q3_gbc.predict_proba(q3_df_test.drop("precipitation", axis=1))[:,1]>=best_threshold_f1).astype(int)
tn, fp, fn, tp = sklearn.metrics.confusion_matrix(
        q3_df_test.precipitation.values, pred_y).ravel()

pprint({
        "Model": model.__class__.__name__ + " THRESHOLD",
        "acc": sklearn.metrics.accuracy_score(q3_df_test.precipitation.values, pred_y),
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "Classification Report": sklearn.metrics.classification_report(
            q3_df_test.precipitation.values, pred_y, output_dict = True)
}, sort_dicts=False)

pprint(threshold_results, sort_dicts=False)


{'Model': 'GradientBoostingRegressor THRESHOLD',
 'acc': 0.8307291666666666,
 'tp': 44,
 'tn': 594,
 'fp': 82,
 'fn': 48,
 'Classification Report': {'0': {'precision': 0.9252336448598131,
                                 'recall': 0.878698224852071,
                                 'f1-score': 0.9013657056145675,
                                 'support': 676.0},
                           '1': {'precision': 0.3492063492063492,
                                 'recall': 0.4782608695652174,
                                 'f1-score': 0.4036697247706422,
                                 'support': 92.0},
                           'accuracy': 0.8307291666666666,
                           'macro avg': {'precision': 0.6372199970330812,
                                         'recall': 0.6784795472086442,
                                         'f1-score': 0.6525177151926049,
                                         'support': 768.0},
                           'weighted avg': {'precis

## 4 - Conjuntos desbalanceados - parte II
- Dado os mesmos conjuntos de dados da questão anterior.
- Realize novamente o procedimento de binarização.
- Para cada conjunto de dados binários:
    1. Treine um modelo `C` de classificação binária.
    2. Treine um modelo `R` de regressão apenas nos pontos de dados que foram classificados como `1` por `C`.
    3. Para obter a predição de regressão para novos exemplos, primeiro compute a classficação do exemplo usando `C`. Em seguida, compute a regressão caso a classificação tenha resultado em `1`, nos outros casos retorne zero.
- Pode escolher o algoritmo que quiser.
- Reporte o resultado para os cinco conjuntos.
- Responda se o modelo produziu melhores resultados preditivos. (Comparados com o que?)

In [41]:
def q4_load_datasets(file_name):
    FILE_PATH = "./data/atmoseer/"
    return {
        "train": pd.read_parquet(FILE_PATH + file_name + "_train.parquet.gzip"),
        "validation": pd.read_parquet(FILE_PATH + file_name + "_val.parquet.gzip"),
        "test": pd.read_parquet(FILE_PATH + file_name + "_test.parquet.gzip")
    }

In [42]:
def q4_fit_regressor(datasets, regressor):
    df_train = pd.concat([datasets["train"], datasets["validation"]])
    X, y = _q3_split_class(df_train)
    regressor.fit(X, y)
    return regressor

In [43]:
def q4_evaluate_regressor(datasets, regressor, classifier = None):
    df_test = datasets["test"]
    X, y_test = _q3_split_class(df_test)
    pred_y = regressor.predict(X)

    if classifier is not None:
        c_pred_y = classifier.predict(X)
        pred_y = np.where(c_pred_y == 0, c_pred_y, pred_y)

    pprint({
        "Model": regressor.__class__.__name__,
        "MSE": sklearn.metrics.mean_squared_error(y_test, pred_y),
        "RMSE": sklearn.metrics.root_mean_squared_error(y_test, pred_y),
        "R^2": sklearn.metrics.r2_score(y_test, pred_y)
    }, sort_dicts = False)

In [44]:
def q4_binarize(datasets):
    COL_TO_BINARIZE = "precipitation"
    result = {k: v.copy() for k, v in datasets.items()}
    for df in result.values():
        df[COL_TO_BINARIZE] = df[COL_TO_BINARIZE].map(lambda x: 1 if x != 0 else 0)
    return result

In [45]:
def q4_fit_classifier(datasets, classifier):
    df_train = pd.concat([datasets["train"], datasets["validation"]])
    # y_zeros = df_train[df_train.precipitation == 0]
    # y_ones = df_train[df_train.precipitation != 0]
    # upsampled_ones = sklearn.utils.resample(
    #     y_ones, replace=True, n_samples=len(y_zeros), random_state=SEED)
    # upsampled_df_train = pd.concat([y_zeros, upsampled_ones])
    # X, y = _q3_split_class(upsampled_df_train)
    X, y = _q3_split_class(df_train)
    classifier.fit(X, y)
    return classifier

In [46]:
def q4_subset_datasets(datasets, classifier):
    partial_result = {k: v.copy() for k, v in datasets.items()}
    for df in partial_result.values():
        X, y = _q3_split_class(df)
        pred_y = classifier.predict(X)
        df["precipitation"] = pred_y
    return {k: v[v["precipitation"] == 1] for k, v in datasets.items()}

In [47]:
# Main function

def q4_main(dataset_name, classifier, regressor):
    datasets = q4_load_datasets(dataset_name)

    vanilla_regressor = q4_fit_regressor(datasets, regressor)
    q4_evaluate_regressor(datasets, vanilla_regressor)

    binarized_datasets = q4_binarize(datasets)

    classifier = q4_fit_classifier(binarized_datasets, classifier)
    subdatasets = q4_subset_datasets(datasets, classifier)

    regressor = q4_fit_regressor(subdatasets, regressor)
    q4_evaluate_regressor(datasets, regressor, classifier = classifier)


In [48]:
q4_main("A652",
        RandomForestClassifier(random_state=SEED), LinearRegression())

{'Model': 'LinearRegression',
 'MSE': 1.9652086726465654,
 'RMSE': 1.4018590059797615,
 'R^2': 0.010151392032235584}


{'Model': 'LinearRegression',
 'MSE': 1.948792995622264,
 'RMSE': 1.3959917605853782,
 'R^2': 0.018419743010694378}


## 5 - Calibração de modelos
- Considere o arquivo A652.
- Trasforme-o num problema de classificação binária.
- Ajuste um modelo `GradientBoostingClassifier`.
- Aplique alguma técnica de calibração de hiperparâmetros.
- Apresente gráficos para ilustrar o grau de calibração dos modelos antes e depois.
- Apresente uma análise dos resultados.

In [38]:
q5_df_train, q5_df_val, q5_df_test = get_atmoseer_dataframes("A652")
q5_df_train.head()

Unnamed: 0,temperature,barometric_pressure,relative_humidity,wind_direction_u,wind_direction_v,hour_sin,hour_cos,precipitation
2019-12-04 00:00:00,0.271654,0.508824,0.815217,0.363002,0.467453,0.5,1.0,0.0
2019-12-04 01:00:00,0.271654,0.514706,0.847826,0.3221,0.512614,0.62941,0.982963,0.0
2019-12-04 02:00:00,0.259843,0.520588,0.869565,0.38847,0.492906,0.75,0.933013,0.0
2019-12-04 03:00:00,0.271654,0.520588,0.869565,0.394502,0.478593,0.853553,0.853553,0.0
2019-12-04 04:00:00,0.271654,0.505882,0.836957,0.406453,0.495001,0.933013,0.75,0.0


In [42]:
# Binarizing class
COL_TO_BINARIZE = "precipitation"
q5_df_train[COL_TO_BINARIZE] = q5_df_train[COL_TO_BINARIZE].map(lambda x: 1 if x != 0 else 0)
q5_df_val[COL_TO_BINARIZE] = q5_df_val[COL_TO_BINARIZE].map(lambda x: 1 if x != 0 else 0)
q5_df_test[COL_TO_BINARIZE] = q5_df_test[COL_TO_BINARIZE].map(lambda x: 1 if x != 0 else 0)
q5_df_train[q5_df_train[COL_TO_BINARIZE] > 0].head()

Unnamed: 0,temperature,barometric_pressure,relative_humidity,wind_direction_u,wind_direction_v,hour_sin,hour_cos,precipitation
2019-12-05 23:00:00,0.519685,0.285294,0.945652,0.426757,0.472526,0.37059,0.982963,1
2019-12-06 00:00:00,0.393701,0.291176,1.0,0.403776,0.434751,0.5,1.0,1
2019-12-06 01:00:00,0.42126,0.279412,1.0,0.348247,0.466162,0.62941,0.982963,1
2019-12-06 14:00:00,0.30315,0.326471,0.891304,0.564284,0.471635,0.25,0.066987,1
2019-12-06 15:00:00,0.291339,0.355882,0.896739,0.523524,0.48255,0.146447,0.146447,1


In [44]:
q5_df_cat = pd.concat([q5_df_train, q5_df_val])
q5_gbc_vanilla = train_model(
    GradientBoostingClassifier(random_state=SEED),
    q5_df_cat)
pprint(q3_evaluation(q5_gbc_vanilla, q5_df_test, "Default Hyperparams"),
       sort_dicts = False)

{'Model': 'GradientBoostingClassifier Default Hyperparams',
 'acc': 0.9244319366270586,
 'tp': 157,
 'tn': 8712,
 'fp': 69,
 'fn': 656,
 'Classification Report': {'0': {'precision': 0.9299743808710503,
                                 'recall': 0.9921421250427058,
                                 'f1-score': 0.9600528954763348,
                                 'support': 8781.0},
                           '1': {'precision': 0.6946902654867256,
                                 'recall': 0.1931119311193112,
                                 'f1-score': 0.30221366698748797,
                                 'support': 813.0},
                           'accuracy': 0.9244319366270586,
                           'macro avg': {'precision': 0.8123323231788879,
                                         'recall': 0.5926270280810085,
                                         'f1-score': 0.6311332812319114,
                                         'support': 9594.0},
                           'weig

In [45]:
q5_X_cat, q5_y_cat = _q3_split_class(q5_df_cat)

In [48]:
q5_grid = {
        "loss": ['log_loss', 'exponential'],
        "learning_rate": [0.1, 0.05, 0.01],
        "n_estimators": [80, 100, 120]
    }
q5_grid_cv = sklearn.model_selection.GridSearchCV(
    estimator=q5_gbc_vanilla, param_grid=q5_grid, cv=5, scoring="f1")
q5_grid_cv.fit(q5_X_cat, q5_y_cat)
print(q5_grid_cv.best_params_)

{'learning_rate': 0.1, 'loss': 'log_loss', 'n_estimators': 120}


In [49]:
q5_gbc_vanilla = train_model(
    GradientBoostingClassifier(
        random_state=SEED, learning_rate=0.1, loss="log_loss", n_estimators=120
    ),
    q5_df_cat
)
pprint(
    q3_evaluation(q5_gbc_vanilla, q5_df_test, "Ajusted Hyperparams"), sort_dicts=False
)

{'Model': 'GradientBoostingClassifier Ajusted Hyperparams',
 'acc': 0.9238065457577652,
 'tp': 164,
 'tn': 8699,
 'fp': 82,
 'fn': 649,
 'Classification Report': {'0': {'precision': 0.9305733846812152,
                                 'recall': 0.9906616558478534,
                                 'f1-score': 0.9596778641954878,
                                 'support': 8781.0},
                           '1': {'precision': 0.6666666666666666,
                                 'recall': 0.2017220172201722,
                                 'f1-score': 0.3097261567516525,
                                 'support': 813.0},
                           'accuracy': 0.9238065457577652,
                           'macro avg': {'precision': 0.7986200256739409,
                                         'recall': 0.5961918365340128,
                                         'f1-score': 0.6347020104735701,
                                         'support': 9594.0},
                           'weigh

## 6 - Validação cruzada aninhada
- Considere novamente o conjunto de dados `Diamond` apresentado na parte 2.
- Realize a validação cruzada aninhada para encontrar uma boa combinação de hiperparâmetros para o modelo e um bom modelo para o conjunto de dados.
- Escolha dois dos algoritmos listados na parte 1?
- Estude a documentação do scikit-learn para decidir quais hiperparâmetros irá explorar.
- Você é livre para escolher entre usar GridSearch ou RandomSearch.
- Apresente uma análise dos resultados encontrados.

In [28]:
q6_X_train = q2_scaled_X_train
q6_y_train = q2_y_train
q6_X_test = q2_scaled_X_test
q6_y_test = q2_y_test

q6_X_train.head()

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z,volume
26547,0.25,0.666667,0.428571,0.376299,0.419444,0.403846,0.766294,0.139049,0.15,0.083715
9160,0.5,0.833333,0.428571,0.168399,0.472222,0.326923,0.611732,0.110187,0.12327,0.043521
14132,0.75,0.333333,0.714286,0.18711,0.541667,0.288462,0.613594,0.111036,0.128931,0.04601
15758,0.25,0.833333,0.428571,0.27027,0.513889,0.423077,0.671322,0.121732,0.138994,0.059495
24633,0.5,0.5,0.857143,0.274428,0.530556,0.269231,0.676909,0.124278,0.142453,0.062769


In [29]:
q6_models = [KNeighborsRegressor(), GradientBoostingRegressor(random_state=SEED)]
q6_grids = [{"n_neighbors": [4, 6, 8]}, {"n_estimators": [80, 100, 120]}]
# q6_grids = [
#     {"n_neighbors": [4, 6, 8], "weights": ["uniform", "distance"]},
#     {
#         "loss": ["squared_error", "absolute_error", "huber", "quantile"],
#         "learning_rate": [0.1, 0.05, 0.01],
#         "n_estimators": [80, 100, 120]
#     }
# ]

In [30]:
NUMBER_OF_FOLDS = 3
inner_cv = sklearn.model_selection.KFold(
    n_splits=NUMBER_OF_FOLDS, shuffle=True, random_state=SEED
)
outer_cv = sklearn.model_selection.KFold(
    n_splits=NUMBER_OF_FOLDS, shuffle=True, random_state=SEED
)

nested_scores = {}

for model, grid in zip(q6_models, q6_grids):
    regressor = sklearn.model_selection.GridSearchCV(
        estimator=model, param_grid=grid, cv=inner_cv, scoring="neg_mean_squared_error"
    )
    nested_score = sklearn.model_selection.cross_val_score(
        regressor,
        X=q6_X_train,
        y=q6_y_train,
        cv=outer_cv,
        scoring="neg_mean_squared_error",
    )
    nested_scores[model.__class__.__name__] = nested_score.mean()

pprint(nested_scores)

{'GradientBoostingRegressor': -415140.47990796325,
 'KNeighborsRegressor': -570309.5462927558}


In [53]:
pprint(nested_scores)

{'GradientBoostingRegressor': 0.9737439310181987,
 'KNeighborsRegressor': 0.9676180965171909}


In [34]:
best_model_index = list(nested_scores.values()).index(max(nested_scores.values()))
best_model = q6_models[best_model_index]
final_regressor = sklearn.model_selection.GridSearchCV(
    estimator=best_model, param_grid=q6_grids[best_model_index], cv=inner_cv
)
final_regressor.fit(q6_X_train, q6_y_train)

print(f"Best Model: {best_model.__class__.__name__}")
print(f"Generalization Error (negative MSE): {max(nested_scores.values())}")
print(f"Best hyperparameters choice: {final_regressor.best_params_}")
print(f"* According to {inner_cv}")

Best Model: GradientBoostingRegressor
Generalization Error (negative MSE): -415140.47990796325
Best hyperparameters choice: {'n_estimators': 120}
* According to KFold(n_splits=3, random_state=42, shuffle=True)
