<a href="https://colab.research.google.com/github/notnsas/cs229-ml-from-scratch/blob/main/cs229_ml_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
class LogisticRegression:
  def __init__(self, X, y=None, lr=0.01):
    # Params untuk menyimpan parameter dan gradientnya
    self.params = {}

    # inisiasi parameter
    # Menambah col baru dengan semua isinya bernilai 1 ke x
    # yang digunakan agar kita memiliki intercept di rumus linear regresi
    # yang menggunakan vector form
    self.X = np.concatenate((np.ones([np.shape(X)[0], 1]), X),axis=1)
    self.y = y
    # learning rate sebagai konstanta belajar seberapa cepat mengupdate gradient
    self.lr = lr
    # Inisiasi parameter linear regresi dengan menggunakan nilai random
    self.params['W'] = np.random.rand(1, self.X.shape[1])

  def predict(self, X, mode="test"):
    """Untuk Prediksi y Berdasarkan input X menggunakan model linear regresi"""
    if mode == "test":
      # Menambah col baru dengan semua isinya bernilai 1 ke x
      # yang digunakan agar kita memiliki intercept di rumus linear regresi
      # yang menggunakan vector form
      X = np.concatenate((np.ones([np.shape(X)[0], 1]), X),axis=1)
      output = self.forward(X, mode)
      output = self.sigmoid(output, mode)
      return np.round(output)
    # Memprediksi y berdasarkan X menggunakan model yang sudah dibuat
    output = self.forward(X, mode)
    output = self.sigmoid(output, mode)
    return output

  def forward(self, X, mode="test"):
    return np.dot(X, self.params["W"].T)

  def sigmoid(self, X, mode="test"):
    return 1 / (1 + np.exp(-X))

  def loss(self, output, y):
    """Menghitung loss"""
    # Menghitung loss menggunakan mean squared error dan dikali 1/2 agar
    # gradientnya nanti gampang dihitung
    # return 1/2 * np.mean(np.square(output - y))
    loss = y * np.log(output) + (1 - y) * np.log((1 - output))
    loss = np.sum(loss)
    # print(loss)
    return loss

  def batch(self, batch_size):
    """Menggunakan mini-batch untuk menghitung loss, output, dan gradien"""
    # Meninisiasi loss dan menshuffle X & y
    loss = 0
    # Menshuffle menggunakan fungsi np.random.permutation() sebagai index
    # lalu menggunakan index tersebut untuk menshuffle X dan y
    indices = np.random.permutation(len(self.y))
    X_shuffle = self.X[indices]
    y_shuffle = self.y[indices]

    # Membagi data menjadi minibatch yg diitung loss nya
    for i in range(0, len(self.y), batch_size):
      end = min(i + batch_size, len(self.y))
      # Membagi data per batch dengan banyak n (training example) sesuai
      # dengan batch_size yang dipilih
      X_batch = X_shuffle[i:end, :]
      y_batch = y_shuffle[i:end]
      # Kalulasi loss per batch
      a = self.calculate(X_batch, y_batch, "train", "gradient_descent")[0]

      # Melakukan running average dari loss sebelumnya agar mendapatkan rata
      # rata loss dari mini batch
      loss = loss + a * (end - i)
    # Return total loss dari loss yang didapatkan menggunakan running average
    # dan dibagi total training (n)
    return loss / len(self.y)


  def calculate(self, X, y, mode="test", solver_type="gradient_descent"):
    """Untuk kalkulasi gradien output dan least square dari model"""
    # Mendapatkan output dari linear regresi
    output = self.predict(X, mode=mode)
    # Bila modenya train
    if mode == "train":
      # Training menggunakan gradient descent
      # Menghitung loss menggunakan output dan y
      loss = self.loss(output, y)

      # Menentukan solver type gradient descent atau metode least square
      if solver_type == "gradient_descent":
        # Menghitung gradient menggunakan rumus dibawah
        self.params['dW'] = np.dot((y[:, np.newaxis] - output).T, X) / len(y)
        # Update parameter berdasarkan gradient dikali konstanta belajar (lr/learning rate)
        self.params['W'] += self.lr * self.params['dW']
      # Bila menggunakan metode least square / metode kuadrat terkecil
      else:
        # Perhitungan menggunakan metode kuadrat terkecil agar mendapatkan
        # Parameter optimal untuk model secara langsung agar meminimalkan
        # loss function
        a = np.linalg.inv(np.dot(X.T, X))
        b = np.dot(X.T, y)
        c = np.dot(a, b)

        # Mengupdate parameter berdasarkan parameter optimal
        self.params['W'] = c[:, np.newaxis].T

    # Return output bila mode test
    else:
      return output

    # Return loss serta output
    return loss, output

  def train(self, solver_type="gradient_descent", epoch=50, batch_size=216):
    """Untuk train model linear regresi"""
    if solver_type == "gradient_descent":
      # Bila solver type gradient descent update parameter dilakukan sampai
      # epoch/iterasi tertentu
      for i in range(epoch):
        # Menghitung loss dari model yang di update parameternya
        loss = self.batch(batch_size)
        print(f"[Epoch {i + 1}] : loss = {loss}")
    else:
      # Menghitung loss yang sudah di update parameternya menggunakan least square/metode kuadrat terkecil
      loss, _ = self.calculate(self.X, self.y, "train", "least_square")
      print(f"loss : {loss}")


In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("fedesoriano/stroke-prediction-dataset")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/fedesoriano/stroke-prediction-dataset/versions/1


In [None]:
from os import listdir
from os.path import isfile, join

mypath = '/root/.cache/kagglehub/datasets/fedesoriano/stroke-prediction-dataset/versions/1'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [None]:
onlyfiles

['healthcare-dataset-stroke-data.csv']

In [None]:
import pandas as pd
df = pd.read_csv("/root/.cache/kagglehub/datasets/fedesoriano/stroke-prediction-dataset/versions/1/healthcare-dataset-stroke-data.csv")
df = df.select_dtypes(include='number')
df = df.fillna(df.mean())

In [None]:
X = df[[i for i in df.select_dtypes(include='number').columns if i!= "stroke"]]
y = df['stroke']


In [None]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
standard_scaler = StandardScaler()

# Fit and transform the data
X = pd.DataFrame(standard_scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi
0,-1.298312,1.051434,-0.328602,4.185032,2.706375,1.001234e+00
1,0.716371,0.786070,-0.328602,-0.238947,2.121559,4.615554e-16
2,-0.255478,1.626390,-0.328602,4.185032,-0.005028,4.685773e-01
3,1.118363,0.255342,-0.328602,-0.238947,1.437358,7.154182e-01
4,-1.647136,1.582163,3.043196,-0.238947,1.501184,-6.357112e-01
...,...,...,...,...,...,...
5105,-0.864089,1.626390,3.043196,-0.238947,-0.494658,4.615554e-16
5106,0.394863,1.670617,-0.328602,-0.238947,0.420775,1.442949e+00
5107,-0.793720,-0.363842,-0.328602,-0.238947,-0.511443,2.217363e-01
5108,0.048497,0.343796,-0.328602,-0.238947,1.328257,-4.278451e-01


In [None]:
X, y = X.to_numpy(), y.to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y ,
                                   random_state=104,
                                   test_size=0.25,
                                   shuffle=True)

In [None]:
model = LogisticRegression(X_train, y_train)


In [None]:
uniques, counts = np.unique(y, return_counts=True)
percentages = dict(zip(uniques, counts * 100 / len(y)))

In [None]:
percentages

{np.int64(0): np.float64(95.12720156555773),
 np.int64(1): np.float64(4.87279843444227)}

In [None]:
model.train()

[Epoch 1] : loss = -63042.118385910995
[Epoch 2] : loss = -59918.79532856278
[Epoch 3] : loss = -56760.628473459474
[Epoch 4] : loss = -53790.748735083616
[Epoch 5] : loss = -50933.967460661996
[Epoch 6] : loss = -48384.22100917863
[Epoch 7] : loss = -45811.161390781555
[Epoch 8] : loss = -43321.32274046149
[Epoch 9] : loss = -41123.938667039496
[Epoch 10] : loss = -38920.59747865657
[Epoch 11] : loss = -36912.43057683033
[Epoch 12] : loss = -34904.86603557671
[Epoch 13] : loss = -33217.73454719173
[Epoch 14] : loss = -31508.253915980116
[Epoch 15] : loss = -29997.791144697712
[Epoch 16] : loss = -28543.128431005127
[Epoch 17] : loss = -27195.807884996288
[Epoch 18] : loss = -25859.535125291062
[Epoch 19] : loss = -24761.836068890432
[Epoch 20] : loss = -23741.224006797256
[Epoch 21] : loss = -22708.455161700764
[Epoch 22] : loss = -21849.764745784712
[Epoch 23] : loss = -20960.904272235806
[Epoch 24] : loss = -20224.72231136164
[Epoch 25] : loss = -19516.816541965054
[Epoch 26] : loss

In [None]:
output = model.predict(X_test)


In [None]:
acc = np.mean(output == y_test)
acc

np.float64(0.9523426421859272)

In [None]:
output

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [None]:
from sklearn.metrics import classification_report
# y_true = [1, 0, 1, 1, 0]
# y_pred = [1, 0, 1, 0, 0]

accuracy = classification_report(y_test, output)
print("Accuracy Score:", accuracy)

Accuracy Score:               precision    recall  f1-score   support

           0       0.95      1.00      0.98      1218
           1       0.00      0.00      0.00        60

    accuracy                           0.95      1278
   macro avg       0.48      0.50      0.49      1278
weighted avg       0.91      0.95      0.93      1278



In [None]:
display(accuracy)

'              precision    recall  f1-score   support\n\n           0       0.95      1.00      0.98      1218\n           1       0.00      0.00      0.00        60\n\n    accuracy                           0.95      1278\n   macro avg       0.48      0.50      0.49      1278\nweighted avg       0.91      0.95      0.93      1278\n'

In [None]:
# import the class
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression(random_state=16)

# fit the model with data
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

In [None]:
np.mean(y_pred == y_test)

np.float64(0.9530516431924883)