# Importações e configurações

In [510]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dropout

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import BatchNormalization

#import matplotlib.pyplot as plt

import warnings

In [193]:
warnings.filterwarnings('ignore')

In [194]:
RANDOM_SEED = 42

# Carregando os dados

In [195]:
df = pd.read_excel("../data/cleaned_datas/cleaned_data.xlsx")

In [196]:
df

Unnamed: 0,abastecimento de água_fonte/água não tratada,abastecimento de água_outros,abastecimento de água_poço/água não tratada,abastecimento de água_rede geral/água tratada,alimentação no ambiente escolar,aluno contemplado com bolsa?,como você acessa os serviços de saúde_plano de saúde (privado),como você acessa os serviços de saúde_público e privado,como você acessa os serviços de saúde_sistema público de saúde,como você acessa/acessou à educação básica (ensino médio)_bolsista de escola privada,...,sexo_feminino,sexo_masculino,turno_integral,turno_manhã,turno_noite,turno_tarde,você possui filhos entre 0 e 6 anos,você possui filhos maiores que 6 anos,você é chefe de família ou responsável pela própria subsistência?,"é cotista por renda inferior a 1,5 salário mínimo?"
0,False,False,True,False,70.0,False,False,False,True,False,...,True,False,False,True,False,False,0,0,False,False
1,False,False,False,True,25.0,True,False,False,True,False,...,True,False,False,False,False,True,0,0,False,False
2,False,False,True,False,180.0,True,False,False,True,False,...,True,False,False,True,False,False,0,0,False,True
3,False,False,False,True,150.0,False,False,False,True,False,...,True,False,False,True,False,False,0,0,False,False
4,False,False,False,True,50.0,True,False,False,True,False,...,False,True,True,False,False,False,0,0,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14533,False,False,False,True,45.0,True,True,False,False,False,...,False,True,False,False,False,True,0,0,False,False
14534,False,False,False,True,0.0,True,False,False,True,False,...,True,False,False,False,False,True,0,0,False,True
14535,False,True,False,False,10.0,True,False,False,True,False,...,True,False,False,False,True,False,0,2,True,True
14536,False,False,False,True,40.0,True,False,False,True,False,...,False,True,False,False,True,False,0,0,True,True


# Pré-processamento dos dados

## Dropando a coluna Relatos de vida (TEMPORÁRIO)

In [197]:
df = df.drop(columns=["relato de vida"])

## Transformando float64 em float

In [198]:
df.dtypes

abastecimento de água_fonte/água não tratada                            bool
abastecimento de água_outros                                            bool
abastecimento de água_poço/água não tratada                             bool
abastecimento de água_rede geral/água tratada                           bool
alimentação no ambiente escolar                                      float64
                                                                      ...   
turno_tarde                                                             bool
você possui filhos entre 0 e 6 anos                                    int64
você possui filhos maiores que 6 anos                                  int64
você é chefe de família ou responsável pela própria subsistência?       bool
é cotista por renda inferior a 1,5 salário mínimo?                      bool
Length: 106, dtype: object

## Analisando correlação entre features

Por enquanto, vou manter sem o drop destas colunas correlacionadas.

In [199]:
"""
def get_highly_correlated_features(correlation_matrix, threshold):
  correlated_pairs = []
  for i in range(len(correlation_matrix.columns)):
    for j in range(i):
      if abs(correlation_matrix.iloc[i, j]) > threshold:
        pair = (correlation_matrix.columns[i], correlation_matrix.columns[j])
        coefficient = correlation_matrix.iloc[i, j]
        correlated_pairs.append((pair, coefficient))
  return sorted(correlated_pairs, key= lambda pair: pair[1], reverse=True)
"""

'\ndef get_highly_correlated_features(correlation_matrix, threshold):\n  correlated_pairs = []\n  for i in range(len(correlation_matrix.columns)):\n    for j in range(i):\n      if abs(correlation_matrix.iloc[i, j]) > threshold:\n        pair = (correlation_matrix.columns[i], correlation_matrix.columns[j])\n        coefficient = correlation_matrix.iloc[i, j]\n        correlated_pairs.append((pair, coefficient))\n  return sorted(correlated_pairs, key= lambda pair: pair[1], reverse=True)\n'

In [200]:
"""
corr_matrix = df.corr().abs()
correlation_list = get_highly_correlated_features(corr_matrix, 0.95)
"""

'\ncorr_matrix = df.corr().abs()\ncorrelation_list = get_highly_correlated_features(corr_matrix, 0.95)\n'

In [201]:
#correlation_list[:10]

In [202]:
"""
f2drop = []
for feature_pair, _ in correlation_list:
  if feature_pair[0] not in f2drop and feature_pair[1] not in f2drop:
    f2drop.append(feature_pair[1])
"""

'\nf2drop = []\nfor feature_pair, _ in correlation_list:\n  if feature_pair[0] not in f2drop and feature_pair[1] not in f2drop:\n    f2drop.append(feature_pair[1])\n'

In [203]:
#f2drop

In [204]:
#df = df.drop(f2drop, axis='columns')

## Dividindo dados nos conjuntos de treino, validação e teste

In [205]:
labels = df[['aluno contemplado com bolsa?']].copy()
df = df.drop(columns=['aluno contemplado com bolsa?'])

In [206]:
X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.1)#, random_state=RANDOM_SEED)

## Normalizando os dados numéricos

In [207]:
#numeric_columns = df.select_dtypes(include=['number']).columns

std_scaler = StandardScaler()
std_scaler = std_scaler.fit(X_train)

X_train = std_scaler.transform(X_train)
X_test = std_scaler.transform(X_test)

# Inteligências artificiais

## Funções de métricas

In [208]:
def get_overall_metrics(y_true, y_pred):
  tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
  acc = (tp+tn)/(tp+tn+fp+fn)
  tpr = tp/(tp+fn)
  fpr = fp/(fp+tn)
  precision = tp/(tp+fp)
  f1 = (2*tpr*precision)/(tpr+precision)
  return {'acc':acc,'tpr':tpr,'fpr':fpr,'precision':precision,'f1-score':f1}

In [209]:
def transform_prob_in_pred(probs):
    predict_list = list()
    for prob in probs:
        if prob[0] > prob[1]:
            predict_list.append(False)
        elif prob[0] < prob[1]:
            predict_list.append(True)

    predict_array = np.array(predict_list)
    return predict_array

## Modelos de aprendizagem de máquina clássicos

### Naive bayesian

#### Treinamento

In [210]:
model = GaussianNB()

In [211]:
model.fit(X_train, y_train)

#### Teste

In [212]:
y_pred_proba = model.predict_proba(X_test)

In [213]:
y_pred = model.predict(X_test)

In [214]:
class_labels = model.classes_
print(class_labels)

[False  True]


In [215]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.7414030261348006
Precision: 0.7454890821064801
Recall: 0.7414030261348006


In [216]:
get_overall_metrics(y_test, y_pred)

{'acc': 0.7414030261348006,
 'tpr': 0.7446300715990454,
 'fpr': 0.262987012987013,
 'precision': 0.7938931297709924,
 'f1-score': 0.768472906403941}

### Logistic regression

#### Treinamento

In [217]:
model = LogisticRegression()

In [218]:
model.fit(X_train, y_train)

#### Teste

In [219]:
class_labels = model.classes_
print(class_labels)

[False  True]


In [220]:
y_pred_proba = model.predict_proba(X_test)

In [221]:
y_pred = model.predict(X_test)

In [222]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.8046767537826685
Precision: 0.8278823321831952
Recall: 0.8046767537826685


In [223]:
get_overall_metrics(y_test, y_pred)

{'acc': 0.8046767537826685,
 'tpr': 0.9606205250596659,
 'fpr': 0.4074675324675325,
 'precision': 0.7623106060606061,
 'f1-score': 0.8500527983104541}

### Random Forest

#### Treinamento

In [224]:
model = RandomForestClassifier(n_estimators=100)#, random_state=RANDOM_SEED)

In [225]:
model.fit(X_train, y_train)

#### Teste

In [226]:
class_labels = model.classes_
print(class_labels)

[False  True]


In [227]:
y_pred_proba = model.predict_proba(X_test)

In [228]:
y_pred = model.predict(X_test)

In [229]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.8039889958734525
Precision: 0.8122430306122802
Recall: 0.8039889958734525


In [230]:
get_overall_metrics(y_test, y_pred)

{'acc': 0.8039889958734525,
 'tpr': 0.9212410501193318,
 'fpr': 0.3555194805194805,
 'precision': 0.7790110998990918,
 'f1-score': 0.8441771459814106}

### Gradient Boost Machine

#### Treinamento

In [231]:
model = GradientBoostingClassifier(n_estimators=100)#, random_state=RANDOM_SEED)

In [232]:
model.fit(X_train, y_train)

#### Teste

In [233]:
class_labels = model.classes_
print(class_labels)

[False  True]


In [234]:
y_pred_proba = model.predict_proba(X_test)

In [235]:
y_pred = model.predict(X_test)

In [236]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.8053645116918845
Precision: 0.8277930050939069
Recall: 0.8053645116918845


In [237]:
get_overall_metrics(y_test, y_pred)

{'acc': 0.8053645116918845,
 'tpr': 0.9594272076372315,
 'fpr': 0.4042207792207792,
 'precision': 0.7635327635327636,
 'f1-score': 0.8503437334743523}

### Support Vector Machines

#### Treinamento

In [238]:
model = SVC(probability=True)#, random_state=RANDOM_SEED)

In [239]:
model.fit(X_train, y_train)

#### Teste

In [240]:
y_pred_proba = model.predict_proba(X_test)

In [241]:
y_pred = model.predict(X_test)

In [242]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.797799174690509
Precision: 0.8335617165616245
Recall: 0.797799174690509


In [243]:
get_overall_metrics(y_test, y_pred)

{'acc': 0.797799174690509,
 'tpr': 0.9785202863961814,
 'fpr': 0.44805194805194803,
 'precision': 0.7481751824817519,
 'f1-score': 0.8479834539813857}

## Modelos de aprendizagem profunda

Configurações opcionais:

In [244]:
#labels = tf.keras.utils.to_categorical(labels, num_classes=2)

In [245]:
#labels = to_categorical(labels, num_classes=2)

### Rede neural simples (MLP)

In [292]:
X_train.shape

(13084, 105)

#### Treinamento

In [555]:
model = Sequential([
    Dense(2048, activation='relu', input_shape=(X_train.shape[1],)),  # Camada de entrada com 1536 neurônios
    BatchNormalization(),  # Normalização da camada
    Dropout(0.05),  # Dropout para regularização
    Dense(1536, activation='relu'),  # Primeira camada oculta com 1536 neurônios
    BatchNormalization(),  # Normalização da camada
    Dropout(0.05),  # Dropout para regularização
    Dense(1024, activation='relu'),  # Segunda camada oculta com 1024 neurônios
    BatchNormalization(),  # Normalização da camada
    Dropout(0.05),  # Dropout para regularização
    Dense(512, activation='relu'),  # Terceira camada oculta com 512 neurônios
    BatchNormalization(),  # Normalização da camada
    Dropout(0.05),  # Dropout para regularização
    Dense(256, activation='relu'),  # Quarta camada oculta com 256 neurônios
    BatchNormalization(),  # Normalização da camada
    Dropout(0.05),  # Dropout para regularização
    Dense(128, activation='relu'),  # Quinta camada oculta com 128 neurônios
    Dense(64, activation='relu'),  # Sexta camada oculta com 64 neurônios
    Dense(32, activation='relu'),  # Sétima camada oculta com 32 neurônios
    Dense(2, activation='softmax')  # Camada de saída para duas classes
])

In [556]:
optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [557]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [558]:
model.fit(X_train, y_train, epochs=300, batch_size=32, validation_split=0.1, callbacks=[early_stopping])

Epoch 1/300
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - accuracy: 0.6757 - loss: 0.5844 - val_accuracy: 0.7861 - val_loss: 0.4591
Epoch 2/300
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.8016 - loss: 0.4218 - val_accuracy: 0.8044 - val_loss: 0.4484
Epoch 3/300
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.8212 - loss: 0.3780 - val_accuracy: 0.8060 - val_loss: 0.4389
Epoch 4/300
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.8315 - loss: 0.3557 - val_accuracy: 0.7907 - val_loss: 0.4651
Epoch 5/300
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.8492 - loss: 0.3366 - val_accuracy: 0.7892 - val_loss: 0.4685
Epoch 6/300
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.8475 - loss: 0.3252 - val_accuracy: 0.7892 - val_loss: 0.4991


<keras.src.callbacks.history.History at 0x1d28e5f2540>

#### Teste

In [559]:
y_pred_proba = model.predict(X_test)

[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [560]:
y_pred = transform_prob_in_pred(y_pred_proba)

In [561]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.8005502063273727
Precision: 0.8196921780440116
Recall: 0.8005502063273727


In [562]:
get_overall_metrics(y_test, y_pred)

{'acc': 0.8005502063273727,
 'tpr': 0.9498806682577565,
 'fpr': 0.4025974025974026,
 'precision': 0.7624521072796935,
 'f1-score': 0.8459086078639744}