In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Logistic Regression from scratch

In [2]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Step size applied to every gradient update.
    n_iters : int, default=1000
        Number of gradient descent iterations performed by ``fit``.

    Attributes
    ----------
    weights : numpy.ndarray of shape (n_features,) or None
        Learned coefficients; ``None`` until ``fit`` has been called.
    bias : float or None
        Learned intercept; ``None`` until ``fit`` has been called.
    """

    def __init__(self, learning_rate=0.01, n_iters=1000):
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

        The naive formula overflows in ``exp`` for large negative ``z``.
        Here only ``exp(-|z|)`` is evaluated, which always lies in (0, 1]:
        for ``z >= 0`` the result is ``1 / (1 + exp(-z))`` and for ``z < 0``
        it is the algebraically equal ``exp(z) / (1 + exp(z))``.

        Parameters
        ----------
        z : float or array-like
            Linear scores.

        Returns
        -------
        numpy.float64 or numpy.ndarray
            Values in [0, 1] with the same shape as ``z``.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        probs = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
        # Indexing with an empty tuple turns a 0-d result back into a scalar
        # (scalar in -> scalar out) and is a no-op view for n-d arrays.
        return probs[()]

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` by gradient descent on the log-loss.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Binary targets encoded as 0/1.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or ``y`` does not hold exactly one
            label per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so a column vector cannot broadcast the residual to (n, n).
        y = np.asarray(y, dtype=float).ravel()

        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("fit requires at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} labels"
            )

        self.weights = np.zeros(n_features)
        self.bias = 0.0

        # Gradient descent
        for _ in range(self.n_iters):
            # Linear model followed by the sigmoid gives class-1 probabilities.
            y_predicted = self.sigmoid(X @ self.weights + self.bias)

            # Gradients of the mean log-loss w.r.t. weights and bias.
            error = y_predicted - y
            dw = (X.T @ error) / n_samples
            db = error.sum() / n_samples

            # Parameter update.
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict_proba(self, X):
        """Return the estimated probability of class 1 for each row of ``X``."""
        linear_model = np.asarray(X, dtype=float) @ self.weights + self.bias
        return self.sigmoid(linear_model)

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the class-1 probability exceeds ``threshold``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to classify.
        threshold : float, default=0.5
            Decision cut-off; the comparison is strict (``proba > threshold``).

        Returns
        -------
        numpy.ndarray of int
            Predicted labels, one per row of ``X``.
        """
        return (self.predict_proba(X) > threshold).astype(int)

* Dataset: scikit-learn breast cancer data (binary target)

In [3]:
# Load the scikit-learn breast cancer dataset: feature matrix X and label vector y.
bc = datasets.load_breast_cancer()
X, y = bc.data, bc.target

# Tabular view of the features plus the label column, built in one expression
# so re-running the cell never mutates an already-displayed frame.
bc_df = pd.DataFrame(data=X, columns=bc.feature_names).assign(target=y)

# Seeded so the preview shows the same rows on every Restart & Run All.
bc_df.sample(5, random_state=42)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
246,13.2,17.43,84.13,541.6,0.07215,0.04524,0.04336,0.01105,0.1487,0.05635,...,27.82,88.28,602.0,0.1101,0.1508,0.2298,0.0497,0.2767,0.07198,1
368,21.71,17.25,140.9,1546.0,0.09384,0.08562,0.1168,0.08465,0.1717,0.05054,...,26.44,199.5,3143.0,0.1363,0.1628,0.2861,0.182,0.251,0.06494,0
447,14.8,17.66,95.88,674.8,0.09179,0.0889,0.04069,0.0226,0.1893,0.05886,...,22.74,105.9,829.5,0.1226,0.1881,0.206,0.08308,0.36,0.07285,1
44,13.17,21.81,85.42,531.5,0.09714,0.1047,0.08259,0.05252,0.1746,0.06177,...,29.89,105.5,740.7,0.1503,0.3904,0.3728,0.1607,0.3693,0.09618,0
268,12.87,16.21,82.38,512.2,0.09425,0.06219,0.039,0.01615,0.201,0.05769,...,23.64,89.27,597.5,0.1256,0.1808,0.1992,0.0578,0.3604,0.07062,1


* Train model

In [4]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Standardize the features: the scaler is fitted on the training split only and
# the same transform is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the from-scratch model on the standardized training features.
model_scratch = LogisticRegression(learning_rate=0.001, n_iters=1000)
model_scratch.fit(X_train_scaled, y_train)

# Score it on the held-out split.
predictions_scratch = model_scratch.predict(X_test_scaled)
accuracy_scratch = accuracy_score(y_test, predictions_scratch)
print(f"Độ chính xác (from scratch): {accuracy_scratch:.4f}")

Độ chính xác (from scratch): 0.9737


## Logistic Regression from sklearn

In [6]:
# Baseline for comparison: scikit-learn's logistic regression (liblinear solver),
# fitted on the standardized training features.
model_sklearn = SklearnLogisticRegression(solver='liblinear').fit(
    X_train_scaled, y_train
)

# Score the baseline on the standardized test features.
predictions_sklearn = model_sklearn.predict(X_test_scaled)
accuracy_sklearn = accuracy_score(y_true=y_test, y_pred=predictions_sklearn)
print(f"Độ chính xác (Sklearn): {accuracy_sklearn:.4f}")

Độ chính xác (Sklearn): 0.9737
