In [89]:
"""
Atividade PEL219 - Multi Layer Perceptron [teste manual] - Gabriel Melo. Matrícula: 125.304-6

Versão Jupyter, para rodar será necessário apenas adicionar os datasets no diretório de execução.
"""

import pandas as pd
import numpy as np
import time
import math
import matplotlib.pyplot as plt

from sklearn.neural_network import MLPClassifier  # just for comparison

from sklearn.metrics import accuracy_score, confusion_matrix, multilabel_confusion_matrix, precision_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, OneHotEncoder

from tqdm import tqdm

In [90]:
# loading the dataset into memory
df_diabetes = pd.read_csv("pima-indians-diabetes.data.csv", header=None) 

X = df_diabetes.iloc[:, :-1].values

scaler = Normalizer()


X_scaled = scaler.fit_transform(X)
y = df_diabetes.iloc[:, -1].values

In [91]:
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42)

x_train.shape, y_train.shape, x_test.shape, y_test.shape

((537, 8), (537,), (231, 8), (231,))

# modelo do scikit learn para comparação

In [92]:
mlp = MLPClassifier(activation='tanh', hidden_layer_sizes=(6), max_iter=10000)

mlp.fit(x_train, y_train)

out_pima = mlp.predict(x_test)

mlp.coefs_, mlp.intercepts_

([array([[-0.05941509, -0.64572209, -0.12900098, -0.81922695, -0.86825654,
          -0.00277135],
         [ 0.22656516,  0.27772738,  0.31376222,  0.29479553, -0.67544632,
          -0.00854736],
         [-0.77103761,  0.45441258,  0.90319854, -0.26998709,  0.64903419,
           0.66089376],
         [-0.73926521,  0.16783226,  0.58753992,  0.88288875,  0.69030112,
           0.9531464 ],
         [-0.1318258 ,  0.42442908, -0.54618005, -0.24285198,  0.76925826,
          -0.27781866],
         [ 0.31110435,  0.24189466,  0.78678566, -0.48151502, -0.28303616,
           0.22626403],
         [ 0.19229393,  0.188037  , -0.64301082, -0.26438211, -0.67962921,
           0.51823934],
         [ 0.16798271, -0.15668607,  0.60115427, -0.50564691,  0.37991681,
          -0.39193808]]),
  array([[ 0.7443777 ],
         [-0.82579041],
         [-0.66211925],
         [-0.2405405 ],
         [-0.38033637],
         [-0.32755307]])],
 [array([ 0.37119433,  0.01748752, -0.47975683,  0.01873927

In [93]:
accuracy_score(y_test, out_pima), precision_score(y_test, out_pima), f1_score(y_test, out_pima), confusion_matrix(y_test, out_pima)

(0.658008658008658,
 0.5333333333333333,
 0.16842105263157894,
 array([[144,   7],
        [ 72,   8]]))

## rede manual, implmentação arcaica do backpropagation

In [94]:
def tanh_derivative(activation: np.array):
    return 1-np.tanh(activation)**2

In [95]:
target = np.array(y_test[0])

input_layer = x_test[0]  # first layer is the input size 8
hidden_layer1 = np.random.rand(6, 8)
hidden_layer2 = np.random.rand(4, 6)
out_layer = np.random.rand(1, 4)

bias_h1 = np.random.rand(hidden_layer1.shape[0])
bias_h2 = np.random.rand(hidden_layer2.shape[0])
bias_out = np.random.rand(out_layer.shape[0])

learning_rate = 0.1

target, y_test[0], bias_h1.shape, bias_h2.shape, bias_out.shape

(array(0), np.int64(0), (6,), (4,), (1,))

In [96]:
neth1 = np.dot(hidden_layer1, input_layer) + bias_h1.T
activationh1 = np.tanh(neth1)
neth1, activationh1, hidden_layer1, input_layer

(array([1.17086917, 1.48187881, 0.88653433, 1.7326292 , 1.29595071,
        1.39539952]),
 array([0.82455061, 0.9018194 , 0.70967775, 0.93936587, 0.86067709,
        0.88435318]),
 array([[0.31023127, 0.50088648, 0.91407622, 0.33503491, 0.35925256,
         0.34662731, 0.23783755, 0.29107883],
        [0.00721813, 0.14915239, 0.40950027, 0.04751513, 0.86235814,
         0.71900869, 0.30546914, 0.4448057 ],
        [0.39837967, 0.11585927, 0.65891073, 0.67113776, 0.05549212,
         0.34538988, 0.27873822, 0.84630068],
        [0.80970715, 0.19295683, 0.9019686 , 0.85904008, 0.45906738,
         0.58074043, 0.11432786, 0.40701624],
        [0.14729265, 0.63797463, 0.04673697, 0.22042855, 0.63883872,
         0.86147232, 0.80705973, 0.80301586],
        [0.67731652, 0.34925978, 0.82508814, 0.93332152, 0.57985159,
         0.206018  , 0.60992382, 0.91729177]]),
 array([0.02601374, 0.42489113, 0.25146618, 0.14307559, 0.82376852,
        0.14741121, 0.00186432, 0.18643182]))

In [97]:
neth2 = np.dot(hidden_layer2, activationh1) + bias_h2.T
activationh2 = np.tanh(neth2)
neth2, activationh2, hidden_layer2

(array([2.53732809, 3.25404917, 2.93230099, 3.3817988 ]),
 array([0.98757125, 0.99702182, 0.99433976, 0.99769253]),
 array([[0.24312196, 0.02260397, 0.24603318, 0.16666804, 0.57067439,
         0.79506192],
        [0.81292969, 0.47994531, 0.36912505, 0.78882859, 0.50517197,
         0.67068625],
        [0.63531996, 0.73295327, 0.01619535, 0.81334873, 0.31970285,
         0.48833356],
        [0.40840209, 0.88396651, 0.66664572, 0.66507953, 0.55395127,
         0.61279575]]))

In [98]:
netol = np.dot(out_layer, activationh2) + bias_out.T
activationol = np.tanh(netol)
predict = np.heaviside(activationol, 0.5)

netol, activationol, predict, out_layer

(array([2.31468356]),
 array([0.98066684]),
 array([1.]),
 array([[0.47079427, 0.13980706, 0.85913935, 0.313083  ]]))

## fitting do modelo via backpropagation

In [99]:
# CAMADA DE SAIDA um paralelo no slide é o e_m
e_out = np.sum(target - activationol) * tanh_derivative(netol)  # target - ativacao * derivada da rede da camada atual
deltaw_outh2 = np.apply_along_axis(lambda activation_layer: learning_rate * e_out * activation_layer, 0, np.expand_dims(neth2, axis=0).repeat([1], axis=0))
delta_bias_out = bias_out * tanh_derivative(netol) 

e_m, deltaw_outh2, delta_bias_out

(array([-0.00613418]),
 array([[-0.00952823, -0.01221968, -0.01101144, -0.01269941]]),
 array([0.02082016]))

In [100]:
# 2 HIDDEN LAYER um paralelo no slide é o e_k aqui é e_h2

e_h2 = np.dot(e_out,out_layer) * tanh_derivative(neth2)  # adiciona 1 no w0 para o coeficiente linear
delta_bias_h2 = bias_h2 * tanh_derivative(neth2) 
deltaw_h2h1 = np.apply_along_axis(lambda activation_layer: learning_rate * e_h2 * activation_layer, 0, np.expand_dims(neth1, axis=0).repeat([hidden_layer2.shape[0]], axis=0))

e_h2, hidden_layer2 + deltaw_h2h1, bias_h2 + delta_bias_h2

(array([-4.36734158e-04, -3.12247721e-05, -3.64194498e-04, -5.41950243e-05]),
 array([[0.24307082, 0.02253925, 0.24599446, 0.16659237, 0.57061779,
         0.79500097],
        [0.81292604, 0.47994068, 0.36912228, 0.78882318, 0.50516792,
         0.6706819 ],
        [0.63527732, 0.7328993 , 0.01616306, 0.81328563, 0.31965565,
         0.48828274],
        [0.40839574, 0.88395848, 0.66664091, 0.66507014, 0.55394425,
         0.61278819]]),
 array([0.81056938, 0.12076557, 0.26790071, 0.13192009]))

In [101]:
# 1 HIDDEN LAYER um paralelo no slide é o e_j aqui é e_h1

e_h1 = np.dot(e_h2, hidden_layer2) * tanh_derivative(neth1)
delta_bias_h1 = bias_h1  * tanh_derivative(neth1)
deltaw_h1_input = np.apply_along_axis(lambda activation_layer: learning_rate * e_h1 * activation_layer, 0, np.expand_dims(input_layer, axis=0).repeat([hidden_layer1.shape[0]], axis=0))

e_h1, hidden_layer1 + deltaw_h1_input, bias_h1 + delta_bias_h1

(array([-1.23269298e-04, -6.34298056e-05, -7.99155803e-05, -5.05270338e-05,
        -1.06665405e-04, -1.26225830e-04]),
 array([[0.31023095, 0.50088124, 0.91407312, 0.33503315, 0.3592424 ,
         0.34662549, 0.23783753, 0.29107653],
        [0.00721797, 0.14914969, 0.40949868, 0.04751422, 0.86235292,
         0.71900775, 0.30546913, 0.44480452],
        [0.39837946, 0.11585587, 0.65890872, 0.67113662, 0.05548554,
         0.3453887 , 0.27873821, 0.8462992 ],
        [0.80970702, 0.19295469, 0.90196733, 0.85903936, 0.45906322,
         0.58073969, 0.11432785, 0.40701529],
        [0.14729237, 0.6379701 , 0.04673429, 0.22042703, 0.63882993,
         0.86147075, 0.80705971, 0.80301387],
        [0.67731619, 0.34925442, 0.82508496, 0.93331971, 0.57984119,
         0.20601614, 0.6099238 , 0.91728942]]),
 array([0.35700517, 0.48498343, 0.46432298, 0.82700779, 0.21822588,
        0.25355011]))