# Importações e configurações

In [74]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.impute import KNNImputer

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import BatchNormalization

from tpot import TPOTClassifier

#import matplotlib.pyplot as plt

import warnings

In [75]:
warnings.filterwarnings('ignore')

In [76]:
RANDOM_SEED = 42

# Carregando os dados

In [77]:
df = pd.read_excel("../data/cleaned_datas/cleaned_data.xlsx")

In [78]:
df

Unnamed: 0,abastecimento de água_fonte/água não tratada,abastecimento de água_outros,abastecimento de água_poço/água não tratada,abastecimento de água_rede geral/água tratada,alimentação no ambiente escolar,aluno contemplado com bolsa?,como você acessa os serviços de saúde_plano de saúde (privado),como você acessa os serviços de saúde_público e privado,como você acessa os serviços de saúde_sistema público de saúde,como você acessa/acessou à educação básica (ensino médio)_bolsista de escola privada,...,sexo_feminino,sexo_masculino,turno_integral,turno_manhã,turno_noite,turno_tarde,você possui filhos entre 0 e 6 anos,você possui filhos maiores que 6 anos,você é chefe de família ou responsável pela própria subsistência?,"é cotista por renda inferior a 1,5 salário mínimo?"
0,False,False,True,False,70.0,False,False,False,True,False,...,True,False,False,True,False,False,0,0,False,False
1,False,False,False,True,25.0,True,False,False,True,False,...,True,False,False,False,False,True,0,0,False,False
2,False,False,True,False,180.0,True,False,False,True,False,...,True,False,False,True,False,False,0,0,False,True
3,False,False,False,True,150.0,False,False,False,True,False,...,True,False,False,True,False,False,0,0,False,False
4,False,False,False,True,50.0,True,False,False,True,False,...,False,True,True,False,False,False,0,0,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14533,False,False,False,True,45.0,True,True,False,False,False,...,False,True,False,False,False,True,0,0,False,False
14534,False,False,False,True,0.0,True,False,False,True,False,...,True,False,False,False,False,True,0,0,False,True
14535,False,True,False,False,10.0,True,False,False,True,False,...,True,False,False,False,True,False,0,2,True,True
14536,False,False,False,True,40.0,True,False,False,True,False,...,False,True,False,False,True,False,0,0,True,True


# Pré-processamento dos dados

## Dropando a coluna Relatos de vida (TEMPORÁRIO)

In [79]:
df = df.drop(columns=["relato de vida"])

## Transformando float64 em float

In [80]:
df.dtypes

abastecimento de água_fonte/água não tratada                            bool
abastecimento de água_outros                                            bool
abastecimento de água_poço/água não tratada                             bool
abastecimento de água_rede geral/água tratada                           bool
alimentação no ambiente escolar                                      float64
                                                                      ...   
turno_tarde                                                             bool
você possui filhos entre 0 e 6 anos                                    int64
você possui filhos maiores que 6 anos                                  int64
você é chefe de família ou responsável pela própria subsistência?       bool
é cotista por renda inferior a 1,5 salário mínimo?                      bool
Length: 83, dtype: object

## Analisando correlação entre features

Por enquanto, vou manter sem o drop destas colunas correlacionadas.

In [81]:
"""
def get_highly_correlated_features(correlation_matrix, threshold):
  correlated_pairs = []
  for i in range(len(correlation_matrix.columns)):
    for j in range(i):
      if abs(correlation_matrix.iloc[i, j]) > threshold:
        pair = (correlation_matrix.columns[i], correlation_matrix.columns[j])
        coefficient = correlation_matrix.iloc[i, j]
        correlated_pairs.append((pair, coefficient))
  return sorted(correlated_pairs, key= lambda pair: pair[1], reverse=True)
"""

'\ndef get_highly_correlated_features(correlation_matrix, threshold):\n  correlated_pairs = []\n  for i in range(len(correlation_matrix.columns)):\n    for j in range(i):\n      if abs(correlation_matrix.iloc[i, j]) > threshold:\n        pair = (correlation_matrix.columns[i], correlation_matrix.columns[j])\n        coefficient = correlation_matrix.iloc[i, j]\n        correlated_pairs.append((pair, coefficient))\n  return sorted(correlated_pairs, key= lambda pair: pair[1], reverse=True)\n'

In [82]:
"""
corr_matrix = df.corr().abs()
correlation_list = get_highly_correlated_features(corr_matrix, 0.95)
"""

'\ncorr_matrix = df.corr().abs()\ncorrelation_list = get_highly_correlated_features(corr_matrix, 0.95)\n'

In [83]:
#correlation_list[:10]

In [84]:
"""
f2drop = []
for feature_pair, _ in correlation_list:
  if feature_pair[0] not in f2drop and feature_pair[1] not in f2drop:
    f2drop.append(feature_pair[1])
"""

'\nf2drop = []\nfor feature_pair, _ in correlation_list:\n  if feature_pair[0] not in f2drop and feature_pair[1] not in f2drop:\n    f2drop.append(feature_pair[1])\n'

In [85]:
#f2drop

In [86]:
#df = df.drop(f2drop, axis='columns')

## Lidando com valores nulos

In [87]:
#imputer = KNNImputer(n_neighbors=3)

In [88]:
#df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

## Dividindo dados nos conjuntos de treino, validação e teste

In [89]:
labels = df[['aluno contemplado com bolsa?']].copy()
df = df.drop(columns=['aluno contemplado com bolsa?'])

In [90]:
#def convert_label(value):
#    return False if value <= 0.5 else True

In [91]:
labels

Unnamed: 0,aluno contemplado com bolsa?
0,False
1,True
2,True
3,False
4,True
...,...
14533,True
14534,True
14535,True
14536,True


In [92]:
X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.1)#, random_state=RANDOM_SEED)

## Normalizando os dados numéricos

In [93]:
#numeric_columns = df.select_dtypes(include=['number']).columns

std_scaler = StandardScaler()
std_scaler = std_scaler.fit(X_train)

X_train = std_scaler.transform(X_train)
X_test = std_scaler.transform(X_test)

# Inteligências artificiais

## Funções de métricas

In [94]:
def get_overall_metrics(y_true, y_pred):
  tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
  acc = (tp+tn)/(tp+tn+fp+fn)
  tpr = tp/(tp+fn)
  fpr = fp/(fp+tn)
  precision = tp/(tp+fp)
  f1 = (2*tpr*precision)/(tpr+precision)
  return {'acc':acc,'tpr':tpr,'fpr':fpr,'precision':precision,'f1-score':f1}

In [95]:
def transform_prob_in_pred(probs):
    predict_list = list()
    for prob in probs:
        if prob[0] > prob[1]:
            predict_list.append(False)
        elif prob[0] < prob[1]:
            predict_list.append(True)

    predict_array = np.array(predict_list)
    return predict_array

## Modelos de aprendizagem de máquina clássicos

### Naive bayesian

#### Treinamento

In [96]:
model = GaussianNB()

In [97]:
model.fit(X_train, y_train)

#### Teste

In [98]:
y_pred_proba = model.predict_proba(X_test)

In [99]:
y_pred = model.predict(X_test)

In [100]:
class_labels = model.classes_
print(class_labels)

[False  True]


In [101]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.7668500687757909
Precision: 0.7674448024884744
Recall: 0.7668500687757909


In [102]:
get_overall_metrics(y_test, y_pred)

{'acc': 0.7668500687757909,
 'tpr': 0.7981438515081206,
 'fpr': 0.27871621621621623,
 'precision': 0.8065650644783119,
 'f1-score': 0.8023323615160349}

### Logistic regression

#### Treinamento

In [103]:
model = LogisticRegression()

In [104]:
model.fit(X_train, y_train)

#### Teste

In [105]:
class_labels = model.classes_
print(class_labels)

[False  True]


In [106]:
y_pred_proba = model.predict_proba(X_test)

In [107]:
y_pred = model.predict(X_test)

In [108]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.8108665749656121
Precision: 0.838664348089463
Recall: 0.8108665749656121


In [109]:
get_overall_metrics(y_test, y_pred)

{'acc': 0.8108665749656121,
 'tpr': 0.9756380510440835,
 'fpr': 0.42905405405405406,
 'precision': 0.7680365296803653,
 'f1-score': 0.8594787940725599}

### Random Forest

#### Treinamento

In [110]:
model = RandomForestClassifier(n_estimators=100)#, random_state=RANDOM_SEED)

In [111]:
model.fit(X_train, y_train)

#### Teste

In [112]:
class_labels = model.classes_
print(class_labels)

[False  True]


In [113]:
y_pred_proba = model.predict_proba(X_test)

In [114]:
y_pred = model.predict(X_test)

In [115]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.8088033012379643
Precision: 0.8170050271517402
Recall: 0.8088033012379643


In [116]:
get_overall_metrics(y_test, y_pred)

{'acc': 0.8088033012379643,
 'tpr': 0.9303944315545244,
 'fpr': 0.36824324324324326,
 'precision': 0.7862745098039216,
 'f1-score': 0.8522848034006376}

### Gradient Boost Machine

#### Treinamento

In [117]:
model = GradientBoostingClassifier(n_estimators=100)#, random_state=RANDOM_SEED)

In [118]:
model.fit(X_train, y_train)

#### Teste

In [119]:
class_labels = model.classes_
print(class_labels)

[False  True]


In [120]:
y_pred_proba = model.predict_proba(X_test)

In [121]:
y_pred = model.predict(X_test)

In [122]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.8246217331499313
Precision: 0.8510593253362161
Recall: 0.8246217331499313


In [123]:
get_overall_metrics(y_test, y_pred)

{'acc': 0.8246217331499313,
 'tpr': 0.9802784222737819,
 'fpr': 0.40202702702702703,
 'precision': 0.7802400738688827,
 'f1-score': 0.8688946015424165}

### Support Vector Machines

#### Treinamento

In [124]:
model = SVC(probability=True)#, random_state=RANDOM_SEED)

In [125]:
model.fit(X_train, y_train)

#### Teste

In [126]:
y_pred_proba = model.predict_proba(X_test)

In [127]:
y_pred = model.predict(X_test)

In [128]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.8046767537826685
Precision: 0.8328800772040927
Recall: 0.8046767537826685


In [129]:
get_overall_metrics(y_test, y_pred)

{'acc': 0.8046767537826685,
 'tpr': 0.9733178654292344,
 'fpr': 0.4408783783783784,
 'precision': 0.7627272727272727,
 'f1-score': 0.855249745158002}

### Cross Batch Model (XBM)

#### Treinamento

In [130]:
model = XGBClassifier(
    objective='binary:logistic',  # Problema de classificação binária
    booster='gbtree',             # Tipo de booster
    n_estimators=100,             # Número de árvores
    learning_rate=0.178,            # Taxa de aprendizado
    max_depth=6,                  # Profundidade máxima das árvores
    min_child_weight=10,           # Peso mínimo da criança
    gamma=0.5,                      # Redução mínima na função de perda para fazer uma divisão
    subsample=0.8,                # Fração de amostras usadas para ajustar cada árvore
    colsample_bytree=0.8,         # Fração de características usadas para cada árvore            
    alpha=1,                      # Regularização L1
    scale_pos_weight=1           # Peso para a classe positiva
    #random_state=RANDOM_SEED               # Semente para reprodutibilidade
    #n_jobs=6
)

In [131]:
model.fit(X_train, y_train)

#### Teste

In [132]:
y_pred_proba = model.predict_proba(X_test)

In [133]:
y_pred = model.predict(X_test)

In [134]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.8253094910591472
Precision: 0.8364266965290246
Recall: 0.8253094910591472


In [135]:
get_overall_metrics(y_test, y_pred)

{'acc': 0.8253094910591472,
 'tpr': 0.9477958236658933,
 'fpr': 0.3530405405405405,
 'precision': 0.7962962962962963,
 'f1-score': 0.8654661016949151}

## Modelos de aprendizagem profunda

Configurações opcionais:

In [136]:
#labels = tf.keras.utils.to_categorical(labels, num_classes=2)

In [137]:
#labels = to_categorical(labels, num_classes=2)

### Rede neural simples (MLP)

#### Treinamento

In [138]:
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),  # Camada de entrada com 1536 neurônios
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax')
])

In [139]:
optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [140]:
early_stopping = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)

In [141]:
model.fit(X_train, y_train, epochs=300, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/300


[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 874us/step - accuracy: 0.4973 - loss: 0.8079 - val_accuracy: 0.6611 - val_loss: 0.6262
Epoch 2/300
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 636us/step - accuracy: 0.6309 - loss: 0.6606 - val_accuracy: 0.7253 - val_loss: 0.5805
Epoch 3/300
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 629us/step - accuracy: 0.6855 - loss: 0.6069 - val_accuracy: 0.7726 - val_loss: 0.5308
Epoch 4/300
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 630us/step - accuracy: 0.7247 - loss: 0.5633 - val_accuracy: 0.7959 - val_loss: 0.4846
Epoch 5/300
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 648us/step - accuracy: 0.7586 - loss: 0.5219 - val_accuracy: 0.8047 - val_loss: 0.4565
Epoch 6/300
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 627us/step - accuracy: 0.7738 - loss: 0.4994 - val_accuracy: 0.8066 - val_loss: 0.4435
Epoch 7/300
[1m328/32

<keras.src.callbacks.history.History at 0x1a7fa01ac00>

#### Teste

In [142]:
y_pred_proba = model.predict(X_test)

[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 856us/step


In [143]:
y_pred = transform_prob_in_pred(y_pred_proba)

In [144]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.8088033012379643
Precision: 0.8394408798744663
Recall: 0.8088033012379643


In [145]:
get_overall_metrics(y_test, y_pred)

{'acc': 0.8088033012379643,
 'tpr': 0.9791183294663574,
 'fpr': 0.4391891891891892,
 'precision': 0.7644927536231884,
 'f1-score': 0.8585961342828078}

: 

## Aprendizagem de máquina automática (AutoML)

## Tpot

In [146]:
tpot = TPOTClassifier(verbosity=2, generations=5, population_size=20)
tpot.fit(X_train, y_train)

Optimization Progress:  15%|█▌        | 18/120 [01:00<07:26,  4.38s/pipeline]

In [None]:
print(tpot.score(X_test, y_test))

AttributeError: 'TPOTClassifier' object has no attribute 'fitted_pipeline_'