In [1]:
import numpy as np

In [2]:
class Logistic_Regression():
    """Binary logistic regression trained with per-sample stochastic gradient descent.

    Labels are expected to be 0 or 1: the update rule compares each label with a
    sigmoid output in (0, 1), and ``predict`` returns 0 or 1.

    Parameters
    ----------
    n_epoch : int
        Maximum number of passes over the training data (at least one pass is
        always performed).
    learning_rate : float
        Step size of each per-sample update.
    loss_tolerance : float
        Training stops early once the summed absolute error of two consecutive
        epochs differs by less than this value.
    random_state : None, int or numpy.random.Generator
        Seed for weight initialisation and per-epoch shuffling. ``None`` (the
        default) draws fresh entropy, so results differ from run to run.
    """

    def __init__(self, n_epoch = 500, learning_rate = 0.01, loss_tolerance = 0.0001, random_state = None):
        self._n_epoch = n_epoch
        self._lr = learning_rate
        self._loss_tolerance = loss_tolerance
        self._random_state = random_state

    def fit(self, X, y):
        """Train the model on ``X`` (n_sample, n_feature) and 0/1 labels ``y`` (n_sample,).

        Raises
        ------
        ValueError
            If ``X`` is not a non-empty 2-D array or ``y`` does not hold exactly
            one label per row of ``X``.
        """
        X_train = np.asarray(X, dtype=float)
        y_train = np.asarray(y, dtype=float)
        if X_train.ndim != 2 or X_train.size == 0:
            raise ValueError("X must be a non-empty 2-D array of shape (n_sample, n_feature)")
        n_sample, n_feature = X_train.shape
        if y_train.shape != (n_sample,):
            raise ValueError("y must be a 1-D array with one label per row of X")

        # One generator drives both initialisation and shuffling so that a
        # fixed random_state reproduces the whole training run.
        rng = np.random.default_rng(self._random_state)

        # Initialise the weights uniformly in [-1/sqrt(n_feature), 1/sqrt(n_feature)]
        # and the bias at zero.
        bound = 1 / np.sqrt(n_feature)
        self._w = rng.uniform(-bound, bound, size = n_feature)
        self._b = 0.0

        num_epoch = 0
        # Infinity guarantees the tolerance test cannot fire before two real
        # epoch losses are available for comparison.
        pre_loss = np.inf

        while True:
            cur_loss = 0.0
            # Visit the samples in a new random order every epoch.
            X_epoch, y_epoch = self.shuffleData(X_train, y_train, rng)
            for i in range(n_sample):
                y_pred = self._sigmoid(np.dot(self._w, X_epoch[i]) + self._b)
                diff = y_epoch[i] - y_pred
                self._w += self._lr * diff * X_epoch[i]
                self._b += self._lr * diff
                cur_loss += abs(diff)
            num_epoch += 1
            loss_diff = cur_loss - pre_loss
            # Remember this epoch's loss for the next comparison.
            pre_loss = cur_loss

            # Stop once the epoch budget is used up or the loss has stopped changing.
            if num_epoch >= self._n_epoch or abs(loss_diff) < self._loss_tolerance:
                break

    def predict(self, X):
        """Predict the class for one sample or for a batch of samples.

        A 1-D ``X`` (one sample) returns the Python int 0 or 1. A 2-D ``X`` of
        shape (n_sample, n_feature) returns an integer array of 0/1 labels.
        """
        X = np.asarray(X)
        logit = np.dot(X, self._w) + self._b
        if X.ndim == 1:
            return 1 if logit > 0 else 0
        return (logit > 0).astype(int)

    def _sigmoid(self, z):
        """Logistic function 1 / (1 + exp(-z)), evaluated without overflow."""
        # exp(-log(1 + exp(-z))) is the same value, but logaddexp never
        # evaluates exp() of a large positive number.
        return np.exp(-np.logaddexp(0, -z))

    def shuffleData(self, xtrain, ytrain, rng = None):
        """Return ``xtrain`` and ``ytrain`` reordered by one shared random permutation.

        ``rng`` is an optional ``numpy.random.Generator``; when omitted the
        global ``numpy.random`` state is used.
        """
        if rng is None:
            order = np.arange(len(xtrain))
            np.random.shuffle(order)
        else:
            order = rng.permutation(len(xtrain))
        return xtrain[order], ytrain[order]

In [3]:
def standardization(data):
    """Standardize each column of ``data`` to zero mean and unit standard deviation.

    The mean and (population) standard deviation are computed along axis 0.
    A column with zero standard deviation is only centred, so it becomes all
    zeros instead of NaN from a 0/0 division.
    """
    mu = np.mean(data, axis=0)
    sigma = np.std(data, axis=0)
    # Dividing by 1 where sigma == 0 keeps constant columns finite.
    safe_sigma = np.where(sigma == 0, 1.0, sigma)
    return (data - mu) / safe_sigma

In [4]:
# Classification accuracy of the hand-written logistic regression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# return_X_y=True returns the features and the target as two separate arrays
iris_data, iris_y = load_iris(return_X_y=True)
X = standardization(iris_data)

# The model is binary, so keep only the samples labelled 0 or 1.
binary_mask = iris_y < 2
# A fixed random_state makes the split, and therefore the reported accuracy,
# repeatable on a fresh kernel.
xtrain, xtest, ytrain, ytest = train_test_split(
    X[binary_mask], iris_y[binary_mask], train_size=0.8, shuffle=True, random_state=42
)

model = Logistic_Regression(n_epoch = 1000, learning_rate = 0.01, loss_tolerance = 0.00001)
model.fit(xtrain, ytrain)

# Score the test set one sample at a time and count the correct predictions.
n_test = xtest.shape[0]
n_right = sum(int(model.predict(sample) == label) for sample, label in zip(xtest, ytest))
print("逻辑回归在测试集上的准确率为：{}%".format((n_right * 100) / n_test))

逻辑回归在测试集上的准确率为：100.0%


In [5]:
from sklearn.linear_model import LogisticRegression

# Reference result: scikit-learn's LogisticRegression fitted on the same split.
clf = LogisticRegression(max_iter = 500).fit(xtrain, ytrain)
sklearn_accuracy_pct = 100 * clf.score(xtest, ytest)
print("sklearn逻辑回归模型在测试集上准确率为：{}%".format(sklearn_accuracy_pct))

sklearn逻辑回归模型在测试集上准确率为：100.0%
