# EP Inteligência Artificial
## Aluno: Natanael Magalhães Cardoso, 8914122

## Itens 1 e 2

* Construir uma rede neural do tipo MLP com número configurável de camadas escondidas que seja treinável pelo algoritmo SGD (3.0)
  * A única diferença dessa entrega e o trabalho realizado no laboratório é o "S" do SGD, que envolve o treinamento utilizando:
  * Grupos de exemplos a cada iteração, ao invés do dataset de treino inteiro. Esses grupos são tipicamente chamados de "batches".
  * A learning rate inicial deve ser reduzida a cada passo para auxiliar a convergência. Formalmente ela deve ser quadraticamente somável.
  * Para lr_0 < 1.0, a expressão de decrescimento lr_i = lr_{i-1}/(1+i) é suficiente.
* Adicionar um termo de regularização do tipo L2 ao treinamento (2.0)

In [16]:
# PCS3438 - Inteligência Artificial - 2023/2
# Template para aula de laboratório em Redes Neurais - 20/09/2023

from typing import List
import numpy as np
from sklearn.datasets import load_breast_cancer


def sigmoid(x: np.ndarray):
  return 1 / (1 + np.exp(-x))


def sigmoid_derivative(x: np.ndarray):
  return x * (1 - x)


def mse_loss(y: np.ndarray, y_hat: np.ndarray):
  return np.mean(np.power(y - y_hat, 2))


def mse_loss_derivative(y: np.ndarray, y_hat: np.ndarray):
  return y_hat - y



class Layer:
  def __init__(self, input_dim: int, output_dim: int, reg_strength: float = 0.0):
    self.weights = 2 * np.random.random((input_dim, output_dim)) - 1
    self.biases = np.zeros((1, output_dim))
    self.input: np.ndarray | None = None
    self.output: np.ndarray | None = None
    self.reg_strength = reg_strength  # Força da regularização L2

  def forward(self, input_data) -> np.ndarray:
    self.input = input_data
    raw_output = np.dot(input_data, self.weights) + self.biases
    self.output = sigmoid(raw_output)
    return self.output

  def backward(self, output_error: np.ndarray, learning_rate: float) -> np.ndarray:
    local_gradient = sigmoid_derivative(self.output)
    layer_error = output_error * local_gradient

    # Termo de regularização L2
    reg_term = 2 * self.reg_strength * self.weights

    # Atualiza os pesos e biases usando gradiente descendente com termo de regularização L2
    self.weights -= (np.dot(self.input.T, layer_error) + reg_term) * learning_rate
    self.biases -= np.sum(layer_error, axis=0, keepdims=True) * learning_rate

    # Retorna o erro para a camada anterior
    return np.dot(layer_error, self.weights.T)


def forward(input: np.ndarray, layers: list[Layer]):
  """
  Args:
    input (np.ndarray): Input data
    layers (list[Layer]): List of layers

  Returns:
    np.ndarray: Output of the MLP model
  """
  current_input = input
  for layer in layers:
    current_input = layer.forward(current_input)
  return current_input


def backward(
  y: np.ndarray, y_hat: np.ndarray, layers: list[Layer], learning_rate: float
) -> None:
  """
  Args:
    y (np.ndarray): Ground truth
    y_hat (np.ndarray): Predicted values
    layers (list[Layer]): List of layers
    learning_rate (float): Learning rate
  """
  output_error = mse_loss_derivative(y, y_hat)
  for layer in reversed(layers):
    output_error = layer.backward(output_error, learning_rate)
    

def train_sgd(
  layers: List[Layer], 
  X: np.ndarray, 
  y: np.ndarray, 
  epochs: int, 
  lr: float, 
  batch_size: int, 
  reg_strength: float,
  verbose: bool = True,
):
  # Treinar o modelo MLP
  for epoch in range(epochs):
    # Embaralhamento SGD
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X_train_shuffled = X[indices]
    y_train_shuffled = y[indices]

    for i in range(0, X.shape[0], batch_size):
      # Criação dos batches
      X_batch = X_train_shuffled[i:i+batch_size]
      y_batch = y_train_shuffled[i:i+batch_size]

      # Passo forward
      y_hat = forward(X_batch, layers)

      # Loss com o termo de regularização
      loss = mse_loss(y_batch.reshape(-1, 1), y_hat) + 0.5 * reg_strength * sum(np.sum(layer.weights**2) for layer in layers)

      # Backward
      backward(y_batch.reshape(-1, 1), y_hat, layers, lr)

    # Update learning rate
    lr = lr / (1 + epoch)

    if verbose and epoch % 1000 == 0:
      print(f"Epoch {epoch} Loss: {np.mean(loss)}")

## Item 3

* Obter o dataset Breast Cancer Winsconsin Dataset (https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html) (3.0)
  * Realizar a divisão do dataset em subset de treino (80%) e de teste (20%)
  * Realizar a normalização das features (Z-Score ou Max-min)
  * Realizar o treinamento do MLP desenvolvido para classificação do dataset de teste (já com a regularização)
  * Reportar a acurácia da classificação
  * Reportar a matriz de confusão da classificação

In [17]:
def standard_scaler(train_data, test_data=None):
  # Calcular a média e o desvio padrão do conjunto de treino
  mean = np.mean(train_data, axis=0)
  std = np.std(train_data, axis=0)

  # Aplicar a normalização Z-Score ao conjunto de treino
  train_data_normalized = (train_data - mean) / std

  if test_data is not None:
    # Aplicar a mesma normalização ao conjunto de teste
    test_data_normalized = (test_data - mean) / std
    return train_data_normalized, test_data_normalized
  else:
    return train_data_normalized
  
  
def train_test_split(
  data: np.ndarray, 
  labels: np.ndarray, 
  test_size: float = 0.25, 
  random_state: int = None
):
  rng = np.random.default_rng(random_state)

  num_samples = len(data)
  num_test = int(test_size * num_samples)

  indices = np.arange(num_samples)
  rng.shuffle(indices)

  test_indices = indices[:num_test]
  train_indices = indices[num_test:]

  X_train, X_test = data[train_indices], data[test_indices]
  y_train, y_test = labels[train_indices], labels[test_indices]

  return X_train, X_test, y_train, y_test


def accuracy_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
  # Contar o número de predições corretas
  correct_predictions = np.sum(y_true == y_pred)
  # Calcular a acurácia
  accuracy = correct_predictions / len(y_true)

  return accuracy


def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
  # Calcular a matriz de confusão
  unique_labels = np.unique(np.concatenate([y_true, y_pred]))
  num_labels = len(unique_labels)
  confusion_mat = np.zeros((num_labels, num_labels), dtype=int)

  for true_label, pred_label in zip(y_true, y_pred):
    true_idx = np.where(unique_labels == true_label)[0][0]
    pred_idx = np.where(unique_labels == pred_label)[0][0]
    confusion_mat[true_idx, pred_idx] += 1

  return confusion_mat


def test_3():
  # Carregar o conjunto de dados Breast Cancer Wisconsin
  data = load_breast_cancer()
  X = data.data
  y = data.target

  # Dividir o conjunto de dados em treino e teste (80% treino, 20% teste)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # Normalizar as features usando Z-Score
  X_train_normalized, X_test_normalized = standard_scaler(X_train, X_test)

  # Definir hiperparâmetros
  layer_sizes = [X.shape[1], 10, 5, 1]
  epochs = 10000
  lr_0 = 0.1
  batch_size = 64
  lr = lr_0
  reg_strength = 0.001

  # Inicializar camadas com regularização
  layers = [Layer(layer_sizes[i], layer_sizes[i + 1], reg_strength) for i in range(len(layer_sizes) - 1)]

  train_sgd(layers, X_train_normalized, y_train, epochs, lr, batch_size, reg_strength)

  # Testar o modelo no conjunto de teste
  y_test_hat = forward(X_test_normalized, layers)
  y_test_hat_binary = (y_test_hat > 0.5).astype(int).flatten()

  # Calcular e imprimir a acurácia
  accuracy = accuracy_score(y_test, y_test_hat_binary)
  print(f"\nAcurácia: {accuracy:.4f}")

  # Calcular e imprimir a matriz de confusão
  conf_matrix = confusion_matrix(y_test, y_test_hat_binary)
  tn, fp, fn, tp = conf_matrix.ravel()
  print("\nMatriz de Confusão:")
  print(conf_matrix)
  print('Verdadeiro Negativo:', tn)
  print('Verdadeito Positivo:', tp)
  print('Falso Positivo:', fp)
  print('Falso Negativo:', fn)
  
test_3()

Epoch 0 Loss: 0.19279935262281278
Epoch 1000 Loss: 0.07605301964698802
Epoch 2000 Loss: 0.08916311712198355
Epoch 3000 Loss: 0.12029573452909006
Epoch 4000 Loss: 0.07381216851748475
Epoch 5000 Loss: 0.08865671909001557
Epoch 6000 Loss: 0.17089502855718594
Epoch 7000 Loss: 0.11202978619948985
Epoch 8000 Loss: 0.13263213383350625
Epoch 9000 Loss: 0.10019762431368218

Acurácia: 0.9558

Matriz de Confusão:
[[35  3]
 [ 2 73]]
Verdadeiro Negativo: 35
Verdadeito Positivo: 73
Falso Positivo: 3
Falso Negativo: 2


## Item 4

* Utilizar a técnica de k-fold cross-validation para selecionar os hiperparâmetros alpha (coeficiente da regularização L2) e learning rate. (2.0)
* Plotar a variação da loss do dataset de validação vs cada hiperparâmetro

In [18]:
def k_fold(layers, X, y, k, epochs, learning_rate, batch_size, reg_strength, verbose=True):
  fold_size = len(X) // k
  accuracies = []
  for i in range(k):
    if verbose:
      print(f'Fold {i+1} of {k}')
    # Separar o conjunto em folds de treino e validação
    val_start = i * fold_size
    val_end = (i + 1) * fold_size
    X_val_fold = X[val_start:val_end]
    y_val_fold = y[val_start:val_end]
    X_train_fold = np.concatenate([X[:val_start], X[val_end:]])
    y_train_fold = np.concatenate([y[:val_start], y[val_end:]])

    # Normalizar as features usando Z-Score
    X_train_fold, X_val_fold = standard_scaler(X_train_fold, X_val_fold)

    # Treinar o modelo MLP
    train_sgd(layers, X_train_fold, y_train_fold, epochs, learning_rate, batch_size, reg_strength, verbose=False)

    # Testar o modelo no conjunto de validação
    y_val_hat = forward(X_val_fold, layers)
    y_val_hat_binary = (y_val_hat > 0.5).astype(int).flatten()

    # Calcular acurácia
    accuracy = accuracy_score(y_val_fold, y_val_hat_binary)
    accuracies.append(accuracy)
  return accuracies


def grid_search_l2_lr(layer_sizes, X, y, k, epochs, reg_strengths, learning_rates, batch_size):
  results = []
  for reg_strength in reg_strengths:
    layers = [Layer(layer_sizes[i], layer_sizes[i + 1], reg_strength) for i in range(len(layer_sizes) - 1)]
    for learning_rate in learning_rates:
      print(f'Hyperparams: reg strength: {reg_strength} | learning rate: {learning_rate}')
      acc = k_fold(layers, X, y, k, epochs, learning_rate, batch_size, reg_strength, verbose=False)
      mean_acc = np.mean(acc)
      results.append({
        'reg_strength': reg_strength,
        'learning_rate': learning_rate,
        'mean_accuracy': mean_acc
      })
  return sorted(results, key=lambda x: x['mean_accuracy'], reverse=True)


def test_4():
  # Carregar o conjunto de dados Breast Cancer Wisconsin
  data = load_breast_cancer()
  X = data.data
  y = data.target
  k = 5  # Número de pastas
  reg_strengths = [0.001, 0.01, 0.1]  # Coeficientes de regularização L2
  learning_rates = [0.01, 0.1]  # Taxas de aprendizado
  layer_sizes = [X.shape[1], 10, 5, 1]
  epochs = 1000
  batch_size = 64

  print('Realizando busca pelo melhor conjunto de hiperparâmetros')
  results = grid_search_l2_lr(layer_sizes, X, y, k, epochs, reg_strengths, learning_rates, batch_size)

  # Imprimir os resultados
  print("\nResultados da otimização:")
  for i, result in enumerate(results):
    print(f"{i+1}) Reg. Strength: {result['reg_strength']}, Learning Rate: {result['learning_rate']}, Mean Accuracy: {result['mean_accuracy']:.4f}")
  
  print(f'\nO melhor conjunto de hiperparâmetros é: Reg. Strength: {results[0]["reg_strength"]} e Learning Rate: {results[0]["learning_rate"]}')

test_4()


Realizando busca pelo melhor conjunto de hiperparâmetros
Hyperparams: reg strength: 0.001 | learning rate: 0.01
Hyperparams: reg strength: 0.001 | learning rate: 0.1
Hyperparams: reg strength: 0.01 | learning rate: 0.01
Hyperparams: reg strength: 0.01 | learning rate: 0.1
Hyperparams: reg strength: 0.1 | learning rate: 0.01
Hyperparams: reg strength: 0.1 | learning rate: 0.1

Resultados da otimização:
1) Reg. Strength: 0.1, Learning Rate: 0.1, Mean Accuracy: 0.9681
2) Reg. Strength: 0.01, Learning Rate: 0.1, Mean Accuracy: 0.9664
3) Reg. Strength: 0.001, Learning Rate: 0.1, Mean Accuracy: 0.9628
4) Reg. Strength: 0.01, Learning Rate: 0.01, Mean Accuracy: 0.7434
5) Reg. Strength: 0.001, Learning Rate: 0.01, Mean Accuracy: 0.7221
6) Reg. Strength: 0.1, Learning Rate: 0.01, Mean Accuracy: 0.7150

O melhor conjunto de hiperparâmetros é: Reg. Strength: 0.1 e Learning Rate: 0.1


## Item 5

* Utilizar a técnica de k-fold cross-validation para selecionar o número de camadas escondidas e o número de neurônios em cada camada. (2.0)

In [19]:
def grid_search_layers_units(layers, units, X, y, k, epochs, reg_strength, learning_rate, batch_size):
  results = []
  for n_layer in layers:
    for n_units in units:
      layer_sizes = [X.shape[1]] + [n_units] * n_layer + [1]
      layers = [Layer(layer_sizes[i], layer_sizes[i + 1], reg_strength) for i in range(len(layer_sizes) - 1)]
      print(f'Hyperparams: layers: {n_layer} | units: {n_units}')
      acc = k_fold(layers, X, y, k, epochs, learning_rate, batch_size, reg_strength, verbose=False)
      mean_acc = np.mean(acc)
      results.append({
        'n_layers': n_layer,
        'n_units': n_units,
        'mean_accuracy': mean_acc
      })
  return sorted(results, key=lambda x: x['mean_accuracy'], reverse=True)


def test_5():
  # Carregar o conjunto de dados Breast Cancer Wisconsin
  data = load_breast_cancer()
  X = data.data
  y = data.target
  k = 5  # Número de folds
  reg_strength = 0.001
  learning_rate = 0.1
  epochs = 1000
  batch_size = 64
  layers = [1, 2, 3]
  units = [2, 4, 8]

  print('Realizando busca pelo melhor conjunto de hiperparâmetros')
  results = grid_search_layers_units(layers, units, X, y, k, epochs, reg_strength, learning_rate, batch_size)

  # Imprimir os resultados
  print("\nResultados da otimização:")
  for i, result in enumerate(results):
    print(f"{i+1}) Layers: {result['n_layers']}, Neurons: {result['n_units']}, Mean Accuracy: {result['mean_accuracy']:.4f}")
  
  print(f'\nO melhor conjunto de hiperparâmetros é: Layers: {results[0]["n_layers"]} e Neurons: {results[0]["n_units"]}')
  
test_5()

Realizando busca pelo melhor conjunto de hiperparâmetros
Hyperparams: layers: 1 | units: 2
Hyperparams: layers: 1 | units: 4
Hyperparams: layers: 1 | units: 8
Hyperparams: layers: 2 | units: 2
Hyperparams: layers: 2 | units: 4
Hyperparams: layers: 2 | units: 8
Hyperparams: layers: 3 | units: 2
Hyperparams: layers: 3 | units: 4
Hyperparams: layers: 3 | units: 8

Resultados da otimização:
1) Layers: 1, Neurons: 8, Mean Accuracy: 0.9735
2) Layers: 1, Neurons: 4, Mean Accuracy: 0.9699
3) Layers: 1, Neurons: 2, Mean Accuracy: 0.9664
4) Layers: 2, Neurons: 8, Mean Accuracy: 0.9575
5) Layers: 2, Neurons: 4, Mean Accuracy: 0.9487
6) Layers: 3, Neurons: 8, Mean Accuracy: 0.8991
7) Layers: 3, Neurons: 4, Mean Accuracy: 0.7735
8) Layers: 2, Neurons: 2, Mean Accuracy: 0.7699
9) Layers: 3, Neurons: 2, Mean Accuracy: 0.6301

O melhor conjunto de hiperparâmetros é: Layers: 1 e Neurons: 8
