<a href="https://colab.research.google.com/github/notnsas/cs229-ml-from-scratch/blob/version-1/cs229_ml_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CS229 ML IMPLEMENTATION**

In [1]:
import numpy as np
import pandas as pd

## **Linear Regression**

In [2]:
# Model Linear Regressi
class LinearRegression:
  def __init__(self, X, y=None, lr=0.01):
    # Params untuk menyimpan parameter dan gradientnya
    self.params = {}

    # inisiasi parameter
    # Menambah col baru dengan semua isinya bernilai 1 ke x
    # yang digunakan agar kita memiliki intercept di rumus linear regresi
    # yang menggunakan vector form
    self.X = np.concatenate((np.ones([np.shape(X)[0], 1]), X),axis=1)
    self.y = y
    # learning rate sebagai konstanta belajar seberapa cepat mengupdate gradient
    self.lr = lr
    # Inisiasi parameter linear regresi dengan menggunakan nilai random
    self.params['W'] = np.random.rand(1, self.X.shape[1])

  def predict(self, X, mode="test"):
    """Untuk Prediksi y Berdasarkan input X menggunakan model linear regresi"""
    if mode == "test":
      # Menambah col baru dengan semua isinya bernilai 1 ke x
      # yang digunakan agar kita memiliki intercept di rumus linear regresi
      # yang menggunakan vector form
      X = np.concatenate((np.ones([np.shape(X)[0], 1]), X),axis=1)
    # Memprediksi y berdasarkan X menggunakan model yang sudah dibuat
    output = np.dot(X, self.params["W"].T)
    return output

  def loss(self, output, y):
    """Menghitung loss"""
    # Menghitung loss menggunakan mean squared error dan dikali 1/2 agar
    # gradientnya nanti gampang dihitung
    return 1/2 * np.mean(np.square(output - y))

  def batch(self, batch_size):
    """Menggunakan mini-batch untuk menghitung loss, output, dan gradien"""
    # Meninisiasi loss dan menshuffle X & y
    loss = 0
    # Menshuffle menggunakan fungsi np.random.permutation() sebagai index
    # lalu menggunakan index tersebut untuk menshuffle X dan y
    indices = np.random.permutation(len(self.y))
    X_shuffle = self.X[indices]
    y_shuffle = self.y[indices]

    # Membagi data menjadi minibatch yg diitung loss nya
    for i in range(0, len(self.y), batch_size):
      end = min(i + batch_size, len(self.y))
      # Membagi data per batch dengan banyak n (training example) sesuai
      # dengan batch_size yang dipilih
      X_batch = X_shuffle[i:end, :]
      y_batch = y_shuffle[i:end]
      # Kalulasi loss per batch
      a = self.calculate(X_batch, y_batch, "train", "gradient_descent")[0]

      # Melakukan running average dari loss sebelumnya agar mendapatkan rata
      # rata loss dari mini batch
      loss = loss + a * (end - i)
    # Return total loss dari loss yang didapatkan menggunakan running average
    # dan dibagi total training (n)
    return loss / len(self.y)


  def calculate(self, X, y, mode="test", solver_type="least_square"):
    """Untuk kalkulasi gradien output dan least square dari model"""
    # Mendapatkan output dari linear regresi
    output = self.predict(X, mode=mode)
    # Bila modenya train
    if mode == "train":
      # Training menggunakan gradient descent
      # Menghitung loss menggunakan output dan y
      loss = self.loss(output, y)

      # Menentukan solver type gradient descent atau metode least square
      if solver_type == "gradient_descent":
        # Menghitung gradient menggunakan rumus dibawah
        self.params['dW'] = np.dot((y[:, np.newaxis] - output).T, X) / len(y)
        # Update parameter berdasarkan gradient dikali konstanta belajar (lr/learning rate)
        self.params['W'] += self.lr * self.params['dW']
      # Bila menggunakan metode least square / metode kuadrat terkecil
      else:
        # Perhitungan menggunakan metode kuadrat terkecil agar mendapatkan
        # Parameter optimal untuk model secara langsung agar meminimalkan
        # loss function
        a = np.linalg.inv(np.dot(X.T, X))
        b = np.dot(X.T, y)
        c = np.dot(a, b)

        # Mengupdate parameter berdasarkan parameter optimal
        self.params['W'] = c[:, np.newaxis].T

    # Return output bila mode test
    else:
      return output

    # Return loss serta output
    return loss, output

  def train(self, solver_type="least_square", epoch=50, batch_size=216):
    """Untuk train model linear regresi"""
    if solver_type == "gradient_descent":
      # Bila solver type gradient descent update parameter dilakukan sampai
      # epoch/iterasi tertentu
      for i in range(epoch):
        # Menghitung loss dari model yang di update parameternya
        loss = self.batch(batch_size)
        print(f"[Epoch {i + 1}] : loss = {loss}")
    else:
      # Menghitung loss yang sudah di update parameternya menggunakan least square/metode kuadrat terkecil
      loss, _ = self.calculate(self.X, self.y, "train", "least_square")
      print(f"loss : {loss}")

In [3]:
# Mengambil data untuk feature X dan label y yaitu 'carat' sebagai kolom yang akan di tebak, lalu menormalisasikanya
X = numeric_data[:, 1:] / np.max(numeric_data[:, 1:],axis=0)
y = numeric_data[:, 0]

# Ngesplit data menjadi train dan testing
X_train, y_train, X_test, y_test = X[:40000, :], y[:40000], X[40000:, :], y[40000:]

# Membuat objek linear regresi dengan data tersebut
model = LinearRegression(X_train, y_train, 0.1)

NameError: name 'numeric_data' is not defined

In [None]:
# Melatih model linear regresi dan melihat loss dari model tersebut ke data
# Melatih menggunakan metode gradient descent (Karena datanya terlalu banyak)
model.train("gradient_descent", epoch=50, batch_size=512)

In [None]:
# Melakukan Testing
# Mendapatkan prediksi model berdasarkan X_test yaitu test data
output = model.predict(X_test)
print(f"Hasil output atau prediksi y berdasarkan X adalah : \n{output}")
print(f"\nLabel atau y yang asli sebagai perbandingan : \n{y_test}")

# Menghitung seberapa bagus model menggunakan test data menggunakan Mean Squared Error
mse = np.mean(np.square(output - y_test))
print(f"\n\nMean Squared Error dari Testing data Menggunakan Gradient Descent adalah = {mse}")

Menggunakan metode kuadrat terkecil sebagai metode optimasi

In [None]:
# Mengambil 15000 data pertama untuk feature X dan 5000 label y yaitu 'carat' sebagai kolom yang akan di tebak
# Data hanya 15000 yang dipakai karena metode kuadrat terkecil tidak dapat dipakai untuk data yang banyak
# Ngesplit data menjadi train dan testing
X_train, y_train, X_test, y_test = X[:15000, :], y[:15000], X[15000:20000, :], y[15000:20000]

# Membuat objek linear regresi dengan data tersebut
model = LinearRegression(X_train, y_train, 0.1)

In [None]:
# Melatih model linear regresi dan melihat loss dari model tersebut ke data
# Melatih menggunakan metode kuadrat terkecil menggunakan 1000 data
model.train("least_square")

In [None]:
# Melakukan Testing
# Mendapatkan prediksi model berdasarkan X_test yaitu test data
output = model.predict(X_test)
print(f"Hasil output atau prediksi y berdasarkan X adalah : \n{output}")
print(f"\nLabel atau y yang asli sebagai perbandingan : \n{y_test}")

# Menghitung seberapa bagus model menggunakan test data menggunakan Mean Squared Error
mse = np.mean(np.square(output - y_test))
print(f"\n\nMean Squared Error dari Testing data Menggunakan Metode Kuadrat Terkecil adalah = {mse}")

## **Logistic Regression**

In [4]:
import numpy as np
class LogisticRegression:
  def __init__(self, X, y=None, lr=0.01):
    # Params untuk menyimpan parameter dan gradientnya
    self.params = {}

    # inisiasi parameter
    # Menambah col baru dengan semua isinya bernilai 1 ke x
    # yang digunakan agar kita memiliki intercept di rumus linear regresi
    # yang menggunakan vector form
    self.X = np.concatenate((np.ones([np.shape(X)[0], 1]), X),axis=1)
    self.y = y
    # learning rate sebagai konstanta belajar seberapa cepat mengupdate gradient
    self.lr = lr
    # Inisiasi parameter linear regresi dengan menggunakan nilai random
    self.params['W'] = np.random.rand(1, self.X.shape[1])

  def predict(self, X, mode="test"):
    """Untuk Prediksi y Berdasarkan input X menggunakan model linear regresi"""
    if mode == "test":
      # Menambah col baru dengan semua isinya bernilai 1 ke x
      # yang digunakan agar kita memiliki intercept di rumus linear regresi
      # yang menggunakan vector form
      X = np.concatenate((np.ones([np.shape(X)[0], 1]), X),axis=1)
      output = self.forward(X, mode)
      output = self.sigmoid(output, mode)
      return np.round(output)
    # Memprediksi y berdasarkan X menggunakan model yang sudah dibuat
    output = self.forward(X, mode)
    output = self.sigmoid(output, mode)
    return output

  def forward(self, X, mode="test"):
    return np.dot(X, self.params["W"].T)

  def sigmoid(self, X, mode="test"):
    return 1 / (1 + np.exp(-X))

  def loss(self, output, y):
    """Menghitung loss"""
    # Menghitung loss menggunakan mean squared error dan dikali 1/2 agar
    # gradientnya nanti gampang dihitung
    # return 1/2 * np.mean(np.square(output - y))
    loss = y * np.log(output) + (1 - y) * np.log((1 - output))
    loss = np.sum(loss)
    # print(loss)
    return loss

  def batch(self, batch_size):
    """Menggunakan mini-batch untuk menghitung loss, output, dan gradien"""
    # Meninisiasi loss dan menshuffle X & y
    loss = 0
    # Menshuffle menggunakan fungsi np.random.permutation() sebagai index
    # lalu menggunakan index tersebut untuk menshuffle X dan y
    indices = np.random.permutation(len(self.y))
    X_shuffle = self.X[indices]
    y_shuffle = self.y[indices]

    # Membagi data menjadi minibatch yg diitung loss nya
    for i in range(0, len(self.y), batch_size):
      end = min(i + batch_size, len(self.y))
      # Membagi data per batch dengan banyak n (training example) sesuai
      # dengan batch_size yang dipilih
      X_batch = X_shuffle[i:end, :]
      y_batch = y_shuffle[i:end]
      # Kalulasi loss per batch
      a = self.calculate(X_batch, y_batch, "train", "gradient_descent")[0]

      # Melakukan running average dari loss sebelumnya agar mendapatkan rata
      # rata loss dari mini batch
      loss = loss + a * (end - i)
    # Return total loss dari loss yang didapatkan menggunakan running average
    # dan dibagi total training (n)
    return loss / len(self.y)


  def calculate(self, X, y, mode="test", solver_type="gradient_descent"):
    """Untuk kalkulasi gradien output dan least square dari model"""
    # Mendapatkan output dari linear regresi
    output = self.predict(X, mode=mode)
    # Bila modenya train
    if mode == "train":
      # Training menggunakan gradient descent
      # Menghitung loss menggunakan output dan y
      loss = self.loss(output, y)

      # Menentukan solver type gradient descent atau metode least square
      if solver_type == "gradient_descent":
        # Menghitung gradient menggunakan rumus dibawah
        self.params['dW'] = np.dot((y[:, np.newaxis] - output).T, X) / len(y)
        # Update parameter berdasarkan gradient dikali konstanta belajar (lr/learning rate)
        self.params['W'] += self.lr * self.params['dW']
      # Bila menggunakan metode least square / metode kuadrat terkecil
      else:
        # Perhitungan menggunakan metode kuadrat terkecil agar mendapatkan
        # Parameter optimal untuk model secara langsung agar meminimalkan
        # loss function
        a = np.linalg.inv(np.dot(X.T, X))
        b = np.dot(X.T, y)
        c = np.dot(a, b)

        # Mengupdate parameter berdasarkan parameter optimal
        self.params['W'] = c[:, np.newaxis].T

    # Return output bila mode test
    else:
      return output

    # Return loss serta output
    return loss, output

  def train(self, solver_type="gradient_descent", epoch=50, batch_size=216):
    """Untuk train model linear regresi"""
    if solver_type == "gradient_descent":
      # Bila solver type gradient descent update parameter dilakukan sampai
      # epoch/iterasi tertentu
      for i in range(epoch):
        # Menghitung loss dari model yang di update parameternya
        loss = self.batch(batch_size)
        print(f"[Epoch {i + 1}] : loss = {loss}")
    else:
      # Menghitung loss yang sudah di update parameternya menggunakan least square/metode kuadrat terkecil
      loss, _ = self.calculate(self.X, self.y, "train", "least_square")
      print(f"loss : {loss}")


In [5]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("fedesoriano/stroke-prediction-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/stroke-prediction-dataset


In [6]:
import pandas as pd
df = pd.read_csv("/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")
df = df.select_dtypes(include='number')
df = df.fillna(df.mean())

In [7]:
X = df[[i for i in df.select_dtypes(include='number').columns if i!= "stroke"]]
y = df['stroke']


In [8]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
standard_scaler = StandardScaler()

# Fit and transform the data
X = pd.DataFrame(standard_scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi
0,-1.298312,1.051434,-0.328602,4.185032,2.706375,1.001234e+00
1,0.716371,0.786070,-0.328602,-0.238947,2.121559,4.615554e-16
2,-0.255478,1.626390,-0.328602,4.185032,-0.005028,4.685773e-01
3,1.118363,0.255342,-0.328602,-0.238947,1.437358,7.154182e-01
4,-1.647136,1.582163,3.043196,-0.238947,1.501184,-6.357112e-01
...,...,...,...,...,...,...
5105,-0.864089,1.626390,3.043196,-0.238947,-0.494658,4.615554e-16
5106,0.394863,1.670617,-0.328602,-0.238947,0.420775,1.442949e+00
5107,-0.793720,-0.363842,-0.328602,-0.238947,-0.511443,2.217363e-01
5108,0.048497,0.343796,-0.328602,-0.238947,1.328257,-4.278451e-01


In [9]:
X, y = X.to_numpy(), y.to_numpy()

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y ,
                                   random_state=104,
                                   test_size=0.25,
                                   shuffle=True)

In [11]:
model = LogisticRegression(X_train, y_train)


In [12]:
uniques, counts = np.unique(y, return_counts=True)
percentages = dict(zip(uniques, counts * 100 / len(y)))

In [13]:
percentages

{np.int64(0): np.float64(95.12720156555773),
 np.int64(1): np.float64(4.87279843444227)}

In [14]:
model.train()

[Epoch 1] : loss = -41911.14953012513
[Epoch 2] : loss = -39850.051757193294
[Epoch 3] : loss = -37908.12258186405
[Epoch 4] : loss = -36155.51906089623
[Epoch 5] : loss = -34348.28473667739
[Epoch 6] : loss = -32709.77932680555
[Epoch 7] : loss = -31136.51627181972
[Epoch 8] : loss = -29601.97380991357
[Epoch 9] : loss = -28292.8580004503
[Epoch 10] : loss = -27073.500990517325
[Epoch 11] : loss = -25849.003187281047
[Epoch 12] : loss = -24726.414122659873
[Epoch 13] : loss = -23642.792028528573
[Epoch 14] : loss = -22654.487077803613
[Epoch 15] : loss = -21819.7591529841
[Epoch 16] : loss = -20945.811366876147
[Epoch 17] : loss = -20202.0445049793
[Epoch 18] : loss = -19503.17960977193
[Epoch 19] : loss = -18863.798936752697
[Epoch 20] : loss = -18237.925734276792
[Epoch 21] : loss = -17733.355492656607
[Epoch 22] : loss = -17184.67709608335
[Epoch 23] : loss = -16711.07935669439
[Epoch 24] : loss = -16250.171035412024
[Epoch 25] : loss = -15905.11732550905
[Epoch 26] : loss = -15521

In [15]:
output = model.predict(X_test)


In [16]:
acc = np.mean(output == y_test)
acc

np.float64(0.9530516431924883)

In [17]:
# import the class
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression(random_state=16)

# fit the model with data
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

In [18]:
np.mean(y_pred == y_test)

np.float64(0.9530516431924883)

In [19]:
rows = 5
cols = 10

# Create a zero matrix
arr = np.zeros((rows, cols), dtype=int)

# Randomly assign one '1' per row
for i in range(rows):
    j = np.random.randint(cols)  # random column index
    arr[i, j] = 1
arr[1, 1] = 1
arr[2, 2] = 1
print(arr)
import numpy as np

arrs = np.arange(0, 5).reshape(-1, 1)

print(arrs)


[[1 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 1 0 0 0 0]
 [0 0 1 0 0 0 0 0 1 0]
 [0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0]]
[[0]
 [1]
 [2]
 [3]
 [4]]


In [20]:
a = np.random.randint(0, 10, (4,4))
a

array([[0, 7, 2, 8],
       [7, 4, 9, 9],
       [3, 1, 3, 1],
       [5, 3, 8, 9]])

In [21]:
np.argmax(a, axis=1)

array([3, 2, 0, 3])

## **Generalized Linear Model**

The generalized linear model that is used is using multinomial as the label, which the model is called softmax

In [22]:
class Softmax:
  def __init__(self, X, y=None, lr=0.01):
    # Params untuk menyimpan parameter dan gradientnya
    self.params = {}

    # inisiasi parameter
    # Menambah col baru dengan semua isinya bernilai 1 ke x
    # yang digunakan agar kita memiliki intercept di rumus linear regresi
    # yang menggunakan vector form
    self.X = np.concatenate((np.ones([np.shape(X)[0], 1]), X),axis=1)
    self.y = y
    # learning rate sebagai konstanta belajar seberapa cepat mengupdate gradient
    self.lr = lr
    # Inisiasi parameter linear regresi dengan menggunakan nilai random
    self.params['W'] = np.random.rand(len(np.unique(y)), self.X.shape[1])

  def predict(self, X, mode="test"):
    """Untuk Prediksi y Berdasarkan input X menggunakan model linear regresi"""
    if mode == "test":
      # Menambah col baru dengan semua isinya bernilai 1 ke x
      # yang digunakan agar kita memiliki intercept di rumus linear regresi
      # yang menggunakan vector form
      X = np.concatenate((np.ones([np.shape(X)[0], 1]), X),axis=1)
      output = self.forward(X, mode)
      output = self.softmax(output)
      output = np.argmax(output, axis=1)
      return output
    # Memprediksi y berdasarkan X menggunakan model yang sudah dibuat
    output = self.forward(X, mode)
    output = self.softmax(output)
    return output

  def forward(self, X, mode="test"):
    return np.dot(X, self.params["W"].T)

  def softmax(self, X):
    return np.exp(X) / np.sum(np.exp(X), axis=1, keepdims=True)

  # def loss(self, output, y):
  #   """Menghitung loss"""
  #   y = y.flatten()
  #   num_train = len(y)
  #   # Menghitung loss menggunakan mean squared error dan dikali 1/2 agar
  #   # gradientnya nanti gampang dihitung
  #   # return 1/2 * np.mean(np.square(output - y))
  #   loss = np.log(output[np.arange(num_train), y].reshape(num_train, 1))
  #   loss = np.sum(loss)
  #   # print(loss)
  #   return loss

  def batch(self, batch_size):
    """Menggunakan mini-batch untuk menghitung loss, output, dan gradien"""
    # Meninisiasi loss dan menshuffle X & y
    loss = 0
    # Menshuffle menggunakan fungsi np.random.permutation() sebagai index
    # lalu menggunakan index tersebut untuk menshuffle X dan y
    indices = np.random.permutation(len(self.y))
    X_shuffle = self.X[indices]
    y_shuffle = self.y[indices]

    # Membagi data menjadi minibatch yg diitung loss nya
    for i in range(0, len(self.y), batch_size):
      end = min(i + batch_size, len(self.y))
      # Membagi data per batch dengan banyak n (training example) sesuai
      # dengan batch_size yang dipilih
      X_batch = X_shuffle[i:end, :]
      y_batch = y_shuffle[i:end]
      # Kalulasi loss per batch
      a = self.calculate(X_batch, y_batch, "train", "gradient_descent")[0]

      # Melakukan running average dari loss sebelumnya agar mendapatkan rata
      # rata loss dari mini batch
      loss = loss + a * (end - i)
    # Return total loss dari loss yang didapatkan menggunakan running average
    # dan dibagi total training (n)
    return loss / len(self.y)


  def calculate(self, X, y, mode="test", solver_type="gradient_descent"):
    """Untuk kalkulasi gradien output dan least square dari model"""
    # Mendapatkan output dari linear regresi
    num_train = X.shape[0]
    output = self.predict(X, mode=mode)
    theta = output
    f = theta - np.max(theta, axis=1, keepdims=True)
    scores = np.exp(f) / np.sum(np.exp(f), axis=1, keepdims=True)
    true_scores = scores[np.arange(num_train).flatten(), y.flatten()]
    loss = -np.sum(np.log(true_scores)) / num_train
    # Bila modenya train
    if mode == "train":
      # Training menggunakan gradient descent
      # Menghitung loss menggunakan output dan y


      # Menentukan solver type gradient descent atau metode least square
      if solver_type == "gradient_descent":
        # Menghitung gradient menggunakan rumus dibawah


         # Shift scores to avoid numerical instability
        print(f"X : {X.shape}")
        w_shape = self.params["W"].shape
        print(f"W : {w_shape}")


        print(f"scpres : {scores.shape}")
        print(f"y : {y.shape}")

        print(f"true ; {scores.shape}")

        dx = np.copy(scores)
        dx[np.arange(num_train), y] -= 1
        dx = dx / num_train
        # self.params['dW'] = np.dot((y[:, np.newaxis] - output).T, X) / len(y)
        # Update parameter berdasarkan gradient dikali konstanta belajar (lr/learning rate)
        print(f"X shape : {X.shape}")
        print(f"dx shape : {dx.shape}")
        self.params['W'] -= self.lr * (dx.T @ X)
      # Bila menggunakan metode least square / metode kuadrat terkecil
      else:
        # Perhitungan menggunakan metode kuadrat terkecil agar mendapatkan
        # Parameter optimal untuk model secara langsung agar meminimalkan
        # loss function
        a = np.linalg.inv(np.dot(X.T, X))
        b = np.dot(X.T, y)
        c = np.dot(a, b)

        # Mengupdate parameter berdasarkan parameter optimal
        self.params['W'] = c[:, np.newaxis].T

    # Return output bila mode test
    else:
      return output

    # Return loss serta output
    return loss, output

  def train(self, solver_type="gradient_descent", epoch=50, batch_size=216):
    """Untuk train model linear regresi"""
    if solver_type == "gradient_descent":
      # Bila solver type gradient descent update parameter dilakukan sampai
      # epoch/iterasi tertentu
      for i in range(epoch):
        # Menghitung loss dari model yang di update parameternya
        loss = self.batch(batch_size)
        print(f"[Epoch {i + 1}] : loss = {loss}")
    else:
      # Menghitung loss yanag sudah di update parameternya menggunakan least square/metode kuadrat terkecil
      loss, _ = self.calculate(self.X, self.y, "train", "least_square")
      print(f"loss : {loss}")


In [23]:
softmax_scratch = Softmax(X_train, y_train)
softmax_scratch.train()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
y : (216,)
true ; (216, 2)
X shape : (216, 7)
dx shape : (216, 2)
X : (216, 7)
W : (2, 7)
scpres : (216, 2)
y : (216,)
true ; (216, 2)
X shape : (216, 7)
dx shape : (216, 2)
X : (216, 7)
W : (2, 7)
scpres : (216, 2)
y : (216,)
true ; (216, 2)
X shape : (216, 7)
dx shape : (216, 2)
X : (216, 7)
W : (2, 7)
scpres : (216, 2)
y : (216,)
true ; (216, 2)
X shape : (216, 7)
dx shape : (216, 2)
X : (216, 7)
W : (2, 7)
scpres : (216, 2)
y : (216,)
true ; (216, 2)
X shape : (216, 7)
dx shape : (216, 2)
X : (216, 7)
W : (2, 7)
scpres : (216, 2)
y : (216,)
true ; (216, 2)
X shape : (216, 7)
dx shape : (216, 2)
X : (160, 7)
W : (2, 7)
scpres : (160, 2)
y : (160,)
true ; (160, 2)
X shape : (160, 7)
dx shape : (160, 2)
[Epoch 11] : loss = 0.44461996519941177
X : (216, 7)
W : (2, 7)
scpres : (216, 2)
y : (216,)
true ; (216, 2)
X shape : (216, 7)
dx shape : (216, 2)
X : (216, 7)
W : (2, 7)
scpres : (216, 2)
y : (216,)
true ; (216, 2)
X sh

## **import data for multiclass classification**

In [24]:
df_multi = pd.read_csv("https://raw.githubusercontent.com/MachineLearningBCAM/Datasets/refs/heads/main/data/multi_class_datasets/iris.csv")
df_multi.columns = df_multi.iloc[0]
df_multi = df_multi.drop(0)

col_x =  [i for i in df_multi.columns if i != "class"]
df_multi[col_x] = df_multi[col_x].astype(float)
df_multi

Unnamed: 0,sepal-length-in-cm,sepal-width-in-cm,petal-length-in-cm,petal-width-in-cm,class
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,Iris-virginica
147,6.3,2.5,5.0,1.9,Iris-virginica
148,6.5,3.0,5.2,2.0,Iris-virginica
149,6.2,3.4,5.4,2.3,Iris-virginica


In [25]:
df_multi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 1 to 150
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   sepal-length-in-cm  150 non-null    float64
 1   sepal-width-in-cm   150 non-null    float64
 2   petal-length-in-cm  150 non-null    float64
 3   petal-width-in-cm   150 non-null    float64
 4   class               150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [26]:
# Preprocessing
# Encoding
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
df_multi['class']= label_encoder.fit_transform(df_multi['class'])

df_multi['class'].unique()

array([0, 1, 2])

In [27]:
X = df_multi[col_x]
y = df_multi['class']

In [28]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
standard_scaler = StandardScaler()

# Fit and transform the data
X = pd.DataFrame(standard_scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,sepal-length-in-cm,sepal-width-in-cm,petal-length-in-cm,petal-width-in-cm
0,-0.900681,1.032057,-1.341272,-1.312977
1,-1.143017,-0.124958,-1.341272,-1.312977
2,-1.385353,0.337848,-1.398138,-1.312977
3,-1.506521,0.106445,-1.284407,-1.312977
4,-1.021849,1.263460,-1.341272,-1.312977
...,...,...,...,...
145,1.038005,-0.124958,0.819624,1.447956
146,0.553333,-1.281972,0.705893,0.922064
147,0.795669,-0.124958,0.819624,1.053537
148,0.432165,0.800654,0.933356,1.447956


In [29]:
X, y = X.to_numpy(), y.to_numpy()

In [30]:
# Split data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y ,
                                   random_state=104,
                                   test_size=0.25,
                                   shuffle=True)

## **Using the softmax model made from scratch and training it**

In [31]:
# Initialize
softmax_scratch = Softmax(X_train, y_train, 0.5)
# Train
softmax_scratch.train(epoch=1000)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
X : (112, 5)
W : (3, 5)
scpres : (112, 3)
y : (112,)
true ; (112, 3)
X shape : (112, 5)
dx shape : (112, 3)
[Epoch 376] : loss = 0.7098765203628344
X : (112, 5)
W : (3, 5)
scpres : (112, 3)
y : (112,)
true ; (112, 3)
X shape : (112, 5)
dx shape : (112, 3)
[Epoch 377] : loss = 0.7098749922236769
X : (112, 5)
W : (3, 5)
scpres : (112, 3)
y : (112,)
true ; (112, 3)
X shape : (112, 5)
dx shape : (112, 3)
[Epoch 378] : loss = 0.7098734668247227
X : (112, 5)
W : (3, 5)
scpres : (112, 3)
y : (112,)
true ; (112, 3)
X shape : (112, 5)
dx shape : (112, 3)
[Epoch 379] : loss = 0.7098719442163601
X : (112, 5)
W : (3, 5)
scpres : (112, 3)
y : (112,)
true ; (112, 3)
X shape : (112, 5)
dx shape : (112, 3)
[Epoch 380] : loss = 0.709870424447755
X : (112, 5)
W : (3, 5)
scpres : (112, 3)
y : (112,)
true ; (112, 3)
X shape : (112, 5)
dx shape : (112, 3)
[Epoch 381] : loss = 0.7098689075668748
X : (112, 5)
W : (3, 5)
scpres : (112, 3)
y : (1

In [32]:
output = softmax_scratch.predict(X_test)
acc = np.mean(output == y_test)

print(f"Accuracy of the softmax made by myself is : {acc}")

Accuracy of the softmax made by myself is : 0.8157894736842105


## **Comparing it with the sklearn softmax regression model**

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create the softmax regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=50)

# Fit the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

output = model.predict(X_test)
acc = np.mean(output == y_test)

print(f"Accuracy of the softmax made by sklearn is : {acc}")

Accuracy of the softmax made by sklearn is : 1.0




In [34]:
import numpy as np

# First matrix: random integers from 0 to 10
matrix1 = np.random.randint(0, 11, size=(10, 1))

# Second matrix: random 0 or 1
matrix2 = np.random.randint(0, 2, size=(10, 1))

print("Matrix 1:\n", matrix1)
print("\nMatrix 2:\n", matrix2)

Matrix 1:
 [[6]
 [8]
 [9]
 [1]
 [6]
 [3]
 [8]
 [1]
 [0]
 [7]]

Matrix 2:
 [[0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]]


In [35]:
# import numpy as np
matrix1[np.where(matrix2 == 0)[0]]

array([[6],
       [9],
       [1],
       [6]])

In [36]:
np.sum(matrix1, axis=1)

array([6, 8, 9, 1, 6, 3, 8, 1, 0, 7])

In [37]:
d = 8
a = np.zeros(d)
a

array([0., 0., 0., 0., 0., 0., 0., 0.])

## **Generative Learning Algorithms**

In [38]:
class GDA:
  def __init__(self, X, y=None, lr=0.01):
    # Params untuk menyimpan parameter dan gradientnya
    self.params = {}

    # inisiasi parameter
    # Menambah col baru dengan semua isinya bernilai 1 ke x
    # yang digunakan agar kita memiliki intercept di rumus linear regresi
    # yang menggunakan vector form
    # self.X = np.concatenate((np.ones([np.shape(X)[0], 1]), X),axis=1)
    self.X = X
    self.y = y
    # learning rate sebagai konstanta belajar seberapa cepat mengupdate gradient
    self.lr = lr
    # Inisiasi parameter linear regresi dengan menggunakan nilai random
    self.params['W'] = np.random.rand(len(np.unique(y)), self.X.shape[1])

  def predict(self, X, mode="test"):
    """Untuk Prediksi y Berdasarkan input X menggunakan model linear regresi"""
    if mode == "test":
      # Menambah col baru dengan semua isinya bernilai 1 ke x
      # yang digunakan agar kita memiliki intercept di rumus linear regresi
      # yang menggunakan vector form
      print('sblm x')
      # X = np.concatenate((np.ones([np.shape(X)[0], 1]), X),axis=1)
      print('hbs x')
      output = self.forward(X)
      print('hbs fwd')
      output = np.where(output > 0.5, 1, 0)
      return output
    # Memprediksi y berdasarkan X menggunakan model yang sudah dibuat
    # output = self.forward(X, mode)
    # output = self.softmax(output)
    return output

  def forward(self, X):
    num_train = int(X.shape[0])
    print(f"num train: {num_train}")
    output = np.zeros(num_train)
    print(f"output : {output.shape}")
    sigma_inv = np.linalg.inv(self.params['sigma'])
    for i in range(num_train):
      # print(f"X shape : {X.shape}")
      # # cov1st = (X[i] - self.params['mu_0']).T
      # print(f"cov1st shape : {cov1st.shape}")
      # print(f"hbs cov")
      # # sigma_inv = np.linalg.inv(self.params['sigma'])
      # print(f"hbs simga inv")
      # # cov2nd = (X[i] - self.params['mu_0'])
      # print(f"hbs cov2nd")
      # # tes = cov1st * sigma_inv
      # print('hbs yo')
      exp_1st = -1/2 * (X[i] - self.params['mu_0']).T @ sigma_inv @ (X[i] - self.params['mu_0'])
      print('hbs 1st')


      exp_2nd = 1/2 * (X[i] - self.params['mu_1']).T @ sigma_inv @ (X[i] - self.params['mu_1'])
      phi_ins = ((1 - self.params['phi']) / self.params['phi'])
      exp_3rd = np.log(phi_ins)
      print(f"exp_1st : {exp_1st}")
      print(f"exp_2nd : {exp_2nd}")
      print(f"exp_3rd : {exp_3rd}")
      # print(f"phi : {phi_ins}")
      exp = np.exp(exp_1st + exp_2nd + exp_3rd)
      print(f"exp(1+2+3) : {exp}")
      output[i] = (1 / (1 + exp))
      print(f'OUtput : {output[i]}')
    return output


  def calculate(self, X, y):
    """Untuk kalkulasi gradien output dan least square dari model"""
    # Mendapatkan output dari linear regresi
    # num_train = X.shape[0]
    # output = self.predict(X, mode=mode)
    # theta = output
    # f = theta - np.max(theta, axis=1, keepdims=True)
    # scores = np.exp(f) / np.sum(np.exp(f), axis=1, keepdims=True)
    # true_scores = scores[np.arange(num_train).flatten(), y.flatten()]
    # loss = -np.sum(np.log(true_scores)) / num_train


    # Training menggunakan gradient descent
    # Menghitung loss menggunakan output dan y
    num_train = len(y)
    print(f'y : {y}')


    # Menentukan solver type gradient descent atau metode least square

      # Menghitung gradient menggunakan rumus dibawah
    self.params['phi'] = np.sum(y) / num_train
    list_x_0 = np.where(y == 0)[0]
    self.params['X_0'] = X[list_x_0]
    list_x_1 = np.where(y == 1)[0]
    self.params['X_1'] = X[list_x_1]
    self.params['mu_0'] = np.sum(self.params['X_0'], axis=0) / len(self.params['X_0'])
    self.params['mu_1'] = np.sum(self.params['X_1'], axis=0) / len(self.params['X_1'])
    # self.params['sigma'] = (self.params['X_0'] - self.params['mu_0']) @ (self.params['X_0'] - self.params['mu_0']).T
    d = int(np.shape(X[0])[0])
    sum_cov0 = np.zeros((d, d))

    # Reshaping the mu_0
    mu_0 = self.params['mu_0'].reshape(self.params['mu_0'].shape[0], 1)
    for i in list_x_0:
      # Reshape so that itll be matrix
      x_i = X[i].reshape(X[i].shape[0], 1)

      # Find cov
      first = (x_i - mu_0)
      second = (x_i - mu_0).T
      sum_cov0 += first @ second

    sum_cov1 = np.zeros((d, d))
    # Reshaping the mu_1
    mu_1 = self.params['mu_1'].reshape(self.params['mu_1'].shape[0], 1)
    for i in list_x_1:
      # Reshape so that itll be matrix
      x_i = X[i].reshape(X[i].shape[0], 1)

      # Find cov
      first = (x_i - mu_1)
      second = (x_i - mu_1).T
      sum_cov1 += first @ second
      # sum_cov1 += (x_i - self.params['mu_1']) @ (x_i - self.params['mu_1']).T

    self.params['sigma'] = (sum_cov0 + sum_cov1 ) / num_train

    # + np.sum((self.params['X_1'] - self.params['mu_1']) @ (self.params['X_1'] - self.params['mu_1']).T)
    # self.params['sigma'] = self.params['sigma'] / num_train


  def train(self):
    """Untuk train model linear regresi"""
    self.calculate(self.X, self.y)


## **import data for binary classification**

In [39]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Load dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

In [40]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
standard_scaler = StandardScaler()

# Fit and transform the data
X = pd.DataFrame(standard_scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,1.097064,-2.073335,1.269934,0.984375,1.568466,3.283515,2.652874,2.532475,2.217515,2.255747,...,1.886690,-1.359293,2.303601,2.001237,1.307686,2.616665,2.109526,2.296076,2.750622,1.937015
1,1.829821,-0.353632,1.685955,1.908708,-0.826962,-0.487072,-0.023846,0.548144,0.001392,-0.868652,...,1.805927,-0.369203,1.535126,1.890489,-0.375612,-0.430444,-0.146749,1.087084,-0.243890,0.281190
2,1.579888,0.456187,1.566503,1.558884,0.942210,1.052926,1.363478,2.037231,0.939685,-0.398008,...,1.511870,-0.023974,1.347475,1.456285,0.527407,1.082932,0.854974,1.955000,1.152255,0.201391
3,-0.768909,0.253732,-0.592687,-0.764464,3.283553,3.402909,1.915897,1.451707,2.867383,4.910919,...,-0.281464,0.133984,-0.249939,-0.550021,3.394275,3.893397,1.989588,2.175786,6.046041,4.935010
4,1.750297,-1.151816,1.776573,1.826229,0.280372,0.539340,1.371011,1.428493,-0.009560,-0.562450,...,1.298575,-1.466770,1.338539,1.220724,0.220556,-0.313395,0.613179,0.729259,-0.868353,-0.397100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,2.110995,0.721473,2.060786,2.343856,1.041842,0.219060,1.947285,2.320965,-0.312589,-0.931027,...,1.901185,0.117700,1.752563,2.015301,0.378365,-0.273318,0.664512,1.629151,-1.360158,-0.709091
565,1.704854,2.085134,1.615931,1.723842,0.102458,-0.017833,0.693043,1.263669,-0.217664,-1.058611,...,1.536720,2.047399,1.421940,1.494959,-0.691230,-0.394820,0.236573,0.733827,-0.531855,-0.973978
566,0.702284,2.045574,0.672676,0.577953,-0.840484,-0.038680,0.046588,0.105777,-0.809117,-0.895587,...,0.561361,1.374854,0.579001,0.427906,-0.809587,0.350735,0.326767,0.414069,-1.104549,-0.318409
567,1.838341,2.336457,1.982524,1.735218,1.525767,3.272144,3.296944,2.658866,2.137194,1.043695,...,1.961239,2.237926,2.303601,1.653171,1.430427,3.904848,3.197605,2.289985,1.919083,2.219635


In [41]:
X, y = X.to_numpy(), y.to_numpy()

In [42]:
# Split data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y ,
                                   random_state=104,
                                   test_size=0.25,
                                   shuffle=True)

## **Trying GDA Model made from scratch**

In [43]:
model = GDA(X_train, y_train)
model.train()
output = model.predict(X_test)

y : [1 1 1 1 0 1 0 1 1 1 0 1 0 1 1 0 1 0 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0
 1 1 1 1 0 0 1 0 0 0 1 0 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1 0 1
 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 0 0 1 0 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1
 0 0 0 1 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 0 1 1 1 0 1 1 0
 1 0 1 1 1 1 1 0 1 1 0 0 1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1
 1 0 0 1 0 1 1 1 0 0 0 1 0 1 0 0 1 1 0 0 1 1 1 0 1 0 0 0 0 0 0 1 0 0 1 0 1
 0 1 1 1 1 0 0 0 0 0 1 0 0 1 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1
 0 1 1 1 0 0 1 0 1 0 0 0 1 1 1 1 1 1 1 0 0 0 1 0 1 0 0 0 0 1 0 0 1 1 1 1 0
 1 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 1 1 0 1 1 1 1 0 1 1 0
 0 1 0 1 0 0 1 1 1 1 1 0 0 1 0 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1
 1 1 1 0 0 1 1 1 0 1 0 1 1 0 1 0 1 0 1 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 0 0 1 0 1 1 1 0 1 1 1 1 0 1 1 1]
sblm x
hbs x
num train: 143
output : (143,)
hbs 1st
exp_1st : -60.58911540121791
exp_2nd : 62.9965346380427
exp_3rd : -0.5485659517

In [44]:
# Accuracy test
acc = np.mean(output == y_test)
print(f"The accuracy of the model is : {acc}")

The accuracy of the model is : 0.958041958041958


## **Comparision using GDA from sklearn**

In [45]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score

# Fit the GDA (LDA) model
model = LinearDiscriminantAnalysis()
model.fit(X_train, y_train)

# Predict on the same X or new test data
y_pred = model.predict(X_test)

# Evaluate accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.958041958041958


## **Naive Bayes**

In [251]:
class NaiveBayes:
  def __init__(self, X, y=None, lr=0.01):
    # Params untuk menyimpan parameter dan gradientnya
    self.params = {}

    # inisiasi parameter
    # Menambah col baru dengan semua isinya bernilai 1 ke x
    # yang digunakan agar kita memiliki intercept di rumus linear regresi
    # yang menggunakan vector form
    # self.X = np.concatenate((np.ones([np.shape(X)[0], 1]), X),axis=1)
    self.X = X
    self.y = y
    # learning rate sebagai konstanta belajar seberapa cepat mengupdate gradient
    self.lr = lr
    # Inisiasi parameter linear regresi dengan menggunakan nilai random
    self.params['W'] = np.random.rand(len(np.unique(y)), self.X.shape[1])

  def predict(self, X, mode="test"):
    """Untuk Prediksi y Berdasarkan input X menggunakan model linear regresi"""
    if mode == "test":
      output = self.forward(X)
      return output
    return output

  def forward(self, X):
    # Cari likelihood
    X = X.toarray()


    print(X.dtype)
    phi1 = self.params['phi_1'].shape
    print(f"phi shape : {phi1}")
    print(f"X shape : {X.shape}")
    like_1 = np.where(X == 1, self.params['phi_1'], (1 - self.params['phi_1']))
            #  np.where(matrix1 == 1, matrix2, matrix3)
    print(f"evid : {like_1}")
    like_1 = np.prod(like_1, axis=1)

    like_0 = np.where(X == 1, self.params['phi_0'], (1 - self.params['phi_0']))
    like_0 = np.prod(like_0, axis=1)
    print(f"like1:{like_1}")
    # Cari numerator
    num_1 = like_1 * self.params['phi_y']
    num_0 = like_0 * (1 - self.params['phi_y'] )

    # Cari evidence
    evid = num_1 + num_0

    # Cari posterior
    post_1 = num_1 / evid
    post_0 = num_0 / evid

    return np.where(post_1 > post_0, 1, 0)

  def calculate(self, X, y):
    """Untuk kalkulasi mle dari parameter naive bayes"""

    # Training menggunakan gradient descent
    # Menghitung loss menggunakan output dan y
    num_train = len(y)
    y = y.to_numpy().reshape(-1, 1)
    # print(y.shape)

    self.params['phi_1'] = ((np.sum(X[np.where(y == 1)[0]], axis=0) + 1) / (2 + np.sum(y))).reshape(1, -1)

    self.params['phi_0'] = ((np.sum(X[np.where(y == 0)[0]], axis=0) + 1) / (2 + (len(y) - np.sum(y)))).reshape(1, -1)
    self.params['phi_y'] = np.sum(y)/num_train
    print(self.params['phi_0'])

  def train(self):
    """Untuk train model linear regresi"""
    self.calculate(self.X, self.y)


In [252]:
model = NaiveBayes(X_train, y_train)
model.train()
output = model.predict(X_test)
output

[[0.00025893 0.00025893 0.00051787 ... 0.00025893 0.00181253 0.00051787]]
int64
phi shape : (1, 8672)
X shape : (1115, 8672)
evid : [[0.98497496 0.96327212 0.99833055 ... 0.9966611  0.99833055 0.99833055]
 [0.98497496 0.96327212 0.99833055 ... 0.9966611  0.99833055 0.99833055]
 [0.98497496 0.96327212 0.99833055 ... 0.9966611  0.99833055 0.99833055]
 ...
 [0.98497496 0.96327212 0.99833055 ... 0.9966611  0.99833055 0.99833055]
 [0.98497496 0.96327212 0.99833055 ... 0.9966611  0.99833055 0.99833055]
 [0.98497496 0.96327212 0.99833055 ... 0.9966611  0.99833055 0.99833055]]
like1:[1.44298241e-64 2.93108601e-83 7.77336868e-37 ... 5.26917414e-35
 5.39406323e-23 6.94492493e-43]


array([0, 0, 1, ..., 0, 0, 1])

In [253]:
np.mean(output == y_test)

np.float64(0.97847533632287)

In [205]:
# 10x5 matrix with values 0 or 1
matrix1 = np.random.randint(0, 2, size=(10, 5))
yy = np.random.randint(0, 2, size=(10, 1))

# 1x5 matrix with fractional values between 0 and 1
matrix2 = np.random.rand(1, 5)

print("Matrix 1 (10x5, 0 or 1):\n", matrix1)
print("\nMatrix 2 (1x5, fractional 0-1):\n", matrix2)
print("\nMatrix 2 (1x5, fractional 0-1):\n", yy)

Matrix 1 (10x5, 0 or 1):
 [[0 0 0 0 0]
 [1 0 1 1 1]
 [1 0 0 0 1]
 [1 1 1 0 1]
 [0 0 0 0 1]
 [1 1 1 0 1]
 [0 0 0 1 1]
 [0 0 1 0 0]
 [0 1 0 1 0]
 [0 0 1 0 1]]

Matrix 2 (1x5, fractional 0-1):
 [[0.46084031 0.49551127 0.98339438 0.83261174 0.07472049]]

Matrix 2 (1x5, fractional 0-1):
 [[1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]]


In [201]:
matrix1 + 1

array([[2, 1, 1, 2, 1],
       [2, 2, 2, 1, 2],
       [2, 1, 1, 2, 2],
       [1, 2, 1, 2, 1],
       [1, 2, 1, 1, 2],
       [1, 1, 2, 1, 2],
       [2, 1, 2, 1, 2],
       [1, 2, 2, 2, 1],
       [1, 1, 1, 2, 1],
       [2, 1, 1, 2, 2]])

In [181]:
np.where(matrix1 == 1, 0, 1)

array([[0, 1, 1, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 1, 1, 0, 0],
       [1, 0, 1, 0, 1],
       [1, 0, 1, 1, 0],
       [1, 1, 0, 1, 0],
       [0, 1, 0, 1, 0],
       [1, 0, 0, 0, 1],
       [1, 1, 1, 0, 1],
       [0, 1, 1, 0, 0]])

In [105]:
matrix1[np.where(yy == 1)[0]]

array([[1, 0, 0, 1, 0],
       [1, 1, 1, 0, 1],
       [0, 0, 1, 0, 1],
       [0, 1, 1, 1, 0],
       [0, 0, 0, 1, 0],
       [1, 0, 0, 1, 1]])

In [60]:
matrix3 = 1 - matrix2

array([[0.98269097, 0.87831634, 0.94415875, 0.14311454, 0.71438613]])

In [64]:
np.prod(np.where(matrix1 == 1, matrix2, matrix3), axis=1)

array([8.67951920e-05, 3.51294107e-03, 8.12855119e-05, 6.91114567e-02,
       3.51294107e-03, 3.47009964e-05, 2.87847381e-05, 2.07769107e-04,
       8.33162490e-02, 1.99441263e-01])

In [65]:
# Two 10x1 matrices with fractional values between 0 and 1
matrix_a = np.random.rand(10, 1)
matrix_b = np.random.rand(10, 1)

print("Matrix A (10x1):\n", matrix_a)
print("\nMatrix B (10x1):\n", matrix_b)

Matrix A (10x1):
 [[0.71726744]
 [0.30944522]
 [0.32702466]
 [0.09107432]
 [0.49674894]
 [0.2279108 ]
 [0.29558839]
 [0.34251823]
 [0.91826165]
 [0.32619772]]

Matrix B (10x1):
 [[0.04579104]
 [0.30267564]
 [0.75802592]
 [0.78048965]
 [0.36627346]
 [0.6937706 ]
 [0.74670115]
 [0.07904908]
 [0.15025806]
 [0.22507448]]


In [66]:
np.where(matrix_a > matrix_b, 1, 0)

array([[1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1]])

Import data dan preprocessing

In [69]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/sms-spam-collection-dataset


In [118]:
df = pd.read_csv("/kaggle/input/sms-spam-collection-dataset/spam.csv", sep=',', encoding='latin-1')[['v1', 'v2']]
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [119]:
from sklearn.feature_extraction.text import CountVectorizer
df.columns = ['label', 'message']

# Convert labels to binary values
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Convert text messages to binary feature vectors
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(df['message'])

# Labels
y = df['label']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Mencoba model yg sudah dibikin sendiri

In [195]:
model = NaiveBayes(X_train, y_train)
model.train()
output = model.predict(X_test)

(597, 1)
int64
phi shape : (1, 8672)
X shape : (1115, 8672)


  post_1 = num_1 / evid
  post_0 = num_0 / evid


In [200]:
sum(output)

np.int64(0)

In [196]:
# Akurasi
akurasi = np.mean(output == y_test)
print(f"Akurasi dari model Naive Bayes bikinan sendiri adalah : {akurasi}")

Akurasi dari model Naive Bayes bikinan sendiri adalah : 0.8654708520179372


Mencoba model default Naive Bayes di sklearn

In [198]:
from sklearn.naive_bayes import BernoulliNB

nb = BernoulliNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.97847533632287


In [199]:
y_test.value_counts()/sum(y_test.value_counts())*100

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,86.547085
1,13.452915
