# Backpropagation

In [14]:
import numpy as np
import pandas as pd
import time

Pada post-test kali ini akan membandingkan dua jenis fungsi aktivasi yang biasa digunakan dalam backpropogation

In [52]:
#Fungsi Aktivasi Sigmoid dengan turunannya
def sig(X):
  return [1 / (1 + np.exp(-x)) for x in X]

def sigd(X):
  output = []

  for i, x in enumerate(X):
      s = sig([x])[0]
      output.append(s * (1 - s))
  return output

#Fungsi Aktivasi Hyperbolic Tangent dengan turunannya
def tanh(X):
  return [np.tanh(x) for x in X]

def tanhd(X):
  output = []

  for x in X:
      t = np.tanh(x)
      output.append(1 - t ** 2)
  return output



In [53]:
def onehot_enc(lbl, min_val=0):
  mi = min(lbl)
  enc = np.full((len(lbl), max(lbl) - mi + 1), min_val, np.int8)

  for i, x in enumerate(lbl):
    enc[i, x - mi] = 1

  return enc

def onehot_dec(enc, mi=0):
  return [np.argmax(e) + mi for e in enc]

### a) Fungsi *Training* Backpropagation

Tulis kode ke dalam *cell* di bawah ini:

In [54]:
import numpy as np
import time

def bp_fit_sig(X, target, layer_conf, max_epoch, max_error=0.1, learn_rate=0.1, print_per_epoch=100):
    start_time = time.time()
    np.random.seed(1)

    n = [np.empty(j + 1) if i < len(layer_conf) - 1 else np.empty(j) for i, j in enumerate(layer_conf)]
    w = [np.random.rand(layer_conf[i] + 1, layer_conf[i + 1]) for i in range(len(layer_conf) - 1)]
    dw = [np.zeros((layer_conf[i] + 1, layer_conf[i + 1])) for i in range(len(layer_conf) - 1)]
    d = [np.zeros(s) for s in layer_conf[1:]]

    epoch = 0
    mse = 1

    for i in range(0, len(n) - 1):
        n[i][-1] = 1

    while (max_epoch == -1 or epoch < max_epoch) and mse > max_error:
        epoch += 1
        mse = 0

        for r in range(len(X)):
            n[0][:-1] = X[r]
            for L in range(1, len(layer_conf)):
                nin = np.dot(n[L - 1], w[L - 1])
                n[L][:len(nin)] = sig(nin)

            e = target[r] - n[-1]
            mse += np.sum(e ** 2)

            d[-1] = e * sigd(n[-1])
            dw[-1] = learn_rate * np.outer(n[-2], d[-1])

            for L in range(len(layer_conf) - 2, 0, -1):
                din = np.dot(d[L], w[L][:-1].T)
                d[L - 1] = din * sigd(n[L][:-1])
                dw[L - 1] = learn_rate * np.outer(n[L - 1], d[L - 1])

            for i in range(len(w)):
                w[i] += dw[i]

        mse /= len(X)

        if print_per_epoch > -1 and epoch % print_per_epoch == 0:
            print(f'Epoch {epoch}, MSE: {mse}')

    execution_time = time.time() - start_time
    print("Waktu eksekusi: %s detik" % execution_time)

    return w, epoch, mse


In [65]:
def bp_fit_tanh(X, target, layer_conf, max_epoch, max_error=.1, learn_rate=.1, print_per_epoch=100):
    start_time = time.time()
    np.random.seed(1)
    nin = [np.empty(i) for i in layer_conf]
    n = [np.empty(j + 1) if i < len(layer_conf) - 1 else np.empty(j) for i, j in enumerate(layer_conf)]
    w = [np.random.rand(layer_conf[i] + 1, layer_conf[i + 1]) for i in range(len(layer_conf) - 1)]
    dw = [np.empty((layer_conf[i] + 1, layer_conf[i + 1])) for i in range(len(layer_conf) - 1)]
    d = [np.empty(s) for s in layer_conf[1:]]
    din = [np.empty(s) for s in layer_conf[1:-1]]
    epoch = 0
    mse = 1
    for i in range(0, len(n)-1):
        n[i][-1] = 1
    while (max_epoch == -1 or epoch < max_epoch) and mse > max_error:
        epoch += 1
        mse = 0
        for r in range(len(X)):
            n[0][:-1] = X[r]
            for L in range(1, len(layer_conf)):
                nin[L] = np.dot(n[L-1], w[L-1])
                n[L][:len(nin[L])] = tanh(nin[L])
            e = target[r] - n[-1]
            mse += sum(e ** 2)
            d[-1] = e * tanhd(nin[-1])
            dw[-1] = learn_rate * d[-1] * n[-2].reshape((-1, 1))
            for L in range(len(layer_conf) - 1, 1, -1):
                din[L-2] = np.dot(d[L-1], np.transpose(w[L-1][:-1]))
                d[L-2] = din[L-2] * np.array(tanhd(nin[L-1]))
                dw[L-2] = (learn_rate * d[L-2]) * n[L-2].reshape((-1, 1))
            w[L-1] += dw[L-1]
        mse /= len(X)
        if print_per_epoch > -1 and epoch % print_per_epoch == 0:
            print(f'Epoch {epoch}, MSE: {mse}')
    execution = time.time() - start_time
    print("Waktu eksekusi: %s detik" % execution)
    return w, epoch, mse

### b) Fungsi *Testing* Backpropagation

Tulis kode ke dalam *cell* di bawah ini:

In [55]:
def bp_predict_sig(X, w):
  n = [np.empty(len(i)) for i in w]
  nin = [np.empty(len(i[0])) for i in w]
  predict = []
  n.append(np.empty(len(w[-1][0])))
  for x in X:
    n[0][:-1] = x
    for L in range(0, len(w)):
      nin[L] = np.dot(n[L], w[L])
      n[L + 1][:len(nin[L])] = sig(nin[L])
    predict.append(n[-1].copy())
  return predict

In [66]:
#Membuat fungsi testing backpropagation dengan menggunakan fungsi aktivasi tanh
def bp_predict_tanh(X, w):
    n = [np.empty(len(i)) for i in w]
    nin = [np.empty(len(i[0])) for i in w]
    predict = []
    n.append(np.empty(len(w[-1][0])))
    for x in X:
        n[0][:-1] = x
        for L in range(0, len(w)):
            nin[L] = np.dot(n[L], w[L])
            n[L + 1][:len(nin[L])] = tanh(nin[L])
        predict.append(n[-1].copy())
    return predict

### c) Klasifikasi dataset wine


Lakukan pelatihan pada dataset wine dengan menggunakan 2 fungsi pelatihan yang telah dibuat!

Konfigurasi kedua pelatihan harus sama (epoch, hidden layer, learning rate, dll).
Akurasi yang diharapkan di setiap pelatihan adalah > 0.98

In [22]:
import numpy as np

In [67]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
from sklearn.metrics import accuracy_score

wine = datasets.load_wine()
X = minmax_scale(wine.data)
Y = onehot_enc(wine.target)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.3,random_state=1)
#Isi jumlah layer yang digunakan dengan jumlah hidden layer 8
w, ep, mse = bp_fit_sig(X_train, y_train, layer_conf=(13, 8, 3),learn_rate=0.1, max_epoch=100, max_error=0.1, print_per_epoch=25)

print(f'Epochs: {ep}, MSE: {mse}')

predict = bp_predict_sig(X_test, w)
predict = onehot_dec(predict)
y_test = onehot_dec(y_test)
accuracy = accuracy_score(predict, y_test)

print('Output:', predict)
print('True :', y_test)
print('Accuracy:', accuracy)

Epoch 25, MSE: 0.4425340061469445
Epoch 50, MSE: 0.3322178999354772
Epoch 75, MSE: 0.13894317248490345
Epoch 100, MSE: 0.14104871228706387
Waktu eksekusi: 2.1015806198120117 detik
Epochs: 100, MSE: 0.14104871228706387
Output: [2, 1, 0, 1, 0, 2, 1, 0, 2, 1, 0, 0, 1, 0, 1, 1, 2, 0, 1, 0, 0, 1, 2, 0, 0, 2, 0, 0, 0, 2, 1, 2, 2, 0, 1, 1, 1, 0, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0]
True : [2, 1, 0, 1, 0, 2, 1, 0, 2, 1, 0, 0, 1, 0, 1, 1, 2, 0, 1, 0, 0, 1, 2, 1, 0, 2, 0, 0, 0, 2, 1, 2, 2, 0, 1, 1, 1, 1, 1, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 0]
Accuracy: 0.9444444444444444


In [72]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
from sklearn.metrics import accuracy_score

wine = datasets.load_wine()
X = minmax_scale(wine.data)
Y = onehot_enc(wine.target)

X_train, X_test, y_train, y_test = train_test_split(X, Y,
test_size=.3,random_state=1)
#Isi jumlah layer yang digunakan dengan jumlah hidden layer 8
w, ep, mse = bp_fit_tanh(X_train, y_train, layer_conf=(13, 8, 3), learn_rate=0.1, max_epoch=10000, max_error=0.01, print_per_epoch=1000)

print(f'Epochs: {ep}, MSE: {mse}')

predict = bp_predict_tanh(X_test, w)
predict = onehot_dec(predict)
y_test = onehot_dec(y_test)
accuracy = accuracy_score(predict, y_test)

print('Output:', predict)
print('True :', y_test)
print('Accuracy:', accuracy)

Epoch 1000, MSE: 0.6849287877208196
Epoch 2000, MSE: 0.6056967502504933
Epoch 3000, MSE: 0.565134862766249
Epoch 4000, MSE: 0.5418064255715432
Epoch 5000, MSE: 0.5270358877580862
Epoch 6000, MSE: 0.5168828761371292
Epoch 7000, MSE: 0.5094293569626595
Epoch 8000, MSE: 0.5036698397311112
Epoch 9000, MSE: 0.49903787576269937
Epoch 10000, MSE: 0.4951935006667101
Waktu eksekusi: 109.00974559783936 detik
Epochs: 10000, MSE: 0.4951935006667101
Output: [2, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 2, 0, 1, 0, 0, 1, 2, 0, 0, 2, 0, 0, 0, 2, 1, 0, 2, 0, 1, 1, 1, 0, 1, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
True : [2, 1, 0, 1, 0, 2, 1, 0, 2, 1, 0, 0, 1, 0, 1, 1, 2, 0, 1, 0, 0, 1, 2, 1, 0, 2, 0, 0, 0, 2, 1, 2, 2, 0, 1, 1, 1, 1, 1, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 0]
Accuracy: 0.8333333333333334


# Pertanyaan

1.  Apa perbedaan dari penggunaan fungsi aktivasi sigmoid dengan fungsi aktivasi hyperbolic tangent?
2. Coba jelaskan alasan dari perbedaan tersebut sebisa kalian

# Jawaban

1.  Fungsi aktivasi sigmoid dan hyperbolic tangent (tanh) memiliki perbedaan signifikan dalam rentang output, sifat simetris, dan respons terhadap input. Sigmoid menghasilkan output dalam rentang (0, 1) dan tidak simetris, yang dapat menyebabkan bias dalam representasi data, terutama jika input tidak terpusat di sekitar nol. Di sisi lain, tanh menghasilkan output dalam rentang (-1, 1) dan simetris, memberikan representasi yang lebih seimbang dan efektif untuk data yang berpusat di nol. Derivatif tanh juga lebih informatif dan tidak mengalami masalah vanishing gradient secepat sigmoid, membuat tanh lebih disukai dalam lapisan tersembunyi jaringan saraf. Sementara sigmoid sering digunakan di lapisan output untuk klasifikasi biner, tanh lebih umum di lapisan tersembunyi karena kemampuannya untuk mempercepat konvergensi selama pelatihan.

2.  Perbedaan utama antara kedua fungsi ini terletak pada cara mereka merespons input dan seberapa baik mereka dapat mempertahankan gradien saat dilatih. Fungsi tanh mengatasi beberapa kelemahan fungsi sigmoid dengan memberikan rentang output yang lebih baik, derivate yang lebih informatif, dan simetri yang lebih menguntungkan. Ini membuat tanh menjadi pilihan yang lebih baik dalam banyak situasi, terutama untuk lapisan tersembunyi dalam jaringan saraf yang dalam.