#### 线性模型

##### 线性回归

In [1]:
# 构造数据集
import numpy as np

# 生成一个样本量为100，特征量为10的随机数据集
X = np.random.rand(100, 10)
print('X:', X[:2, :])  # 打印前2行数据
# 构造目标值 y = 3 + 2*X1 +5*X2 - 3*X3
y = 3 + 2 * X[:, 0] + 5 * X[:, 1] - 3 * X[:, 2] + np.random.randn(100)
print('y:', y)
print('y shape:', y.shape)

X: [[0.09150346 0.38253303 0.56802796 0.65080838 0.69838482 0.45911975
  0.81531773 0.10825329 0.32132788 0.65849552]
 [0.34823725 0.52689761 0.76052561 0.811579   0.11093559 0.38922179
  0.81909977 0.84519138 0.43731844 0.33471118]]
y: [ 3.69663462e+00  3.50398986e+00  5.59184559e+00  4.97493724e+00
  3.96939341e+00  3.23792563e+00  5.67313057e+00  6.03525765e+00
  4.20998822e+00  4.00753883e+00  6.21259445e+00  5.58509087e+00
 -1.79304101e-03  7.29437041e+00  5.09061056e+00  6.28512879e+00
  2.80010001e+00  7.45869234e+00  1.94071552e+00  4.00514433e+00
  3.54019583e+00  3.75516783e+00  6.00561174e+00  7.62568859e+00
  4.79979986e+00  9.17358380e+00  6.34191230e+00  7.54555877e+00
  5.88254095e+00  5.75928525e+00  1.82670652e+00  6.22472443e+00
  7.00252864e+00  6.33119612e+00  4.26640287e+00  2.24156293e+00
  2.83210507e+00  2.54247546e+00  5.45860576e+00  4.54260322e+00
  2.36256421e+00  8.32969236e+00  4.24480674e+00  5.04100416e+00
  5.77213199e+00  5.58086714e+00  8.22814652e+00

In [2]:
# 构建模型
class LR():
    def __init__(self):
        self.w = None
        self.b = None
    
    def get_loss(self, y, y_pre):
        loss = np.mean((y - y_pre) ** 2)
        return loss
    
    def fit(self, X, y, epochs=500, lr=0.01):
        sample, feature = X.shape
        if self.w is None:
            self.w = np.random.randn(feature)
            self.b = 0.0
        
        # 开始训练
        for i in range(epochs):
            y_pre = np.dot(X, self.w) + self.b
            diff = y_pre - y
            dw = 2/sample * np.dot(X.T, diff)
            db = 2/sample * np.sum(diff, axis=0)
            # 更新参数
            self.w -= lr * dw
            self.b -= lr * db
            # 计算损失
            loss = self.get_loss(y, y_pre)
            print(f'Epoch {i+1}/{epochs}, Loss: {loss:.4f}')
    def predict(self, X):
        return np.dot(X, self.w) + self.b

In [3]:
#生成一个样本量为100 特征量为10的数据集
sample=1
feature=10
test_x=np.random.rand(sample,10)
#构造目标值 y=3+2*x1+5*x2-3*x3
test_y=3+2*test_x[:,0]+5*test_x[:,1]-3*test_x[:,2]+np.random.randn(sample) # 增加一点噪音
#np.random.randn 生成满足正态分布的随机数

model=LR()
model.fit(X,y)
y_pre=model.predict(test_x)

Epoch 1/500, Loss: 12.8899
Epoch 2/500, Loss: 11.7523
Epoch 3/500, Loss: 10.7677
Epoch 4/500, Loss: 9.9153
Epoch 5/500, Loss: 9.1772
Epoch 6/500, Loss: 8.5378
Epoch 7/500, Loss: 7.9837
Epoch 8/500, Loss: 7.5033
Epoch 9/500, Loss: 7.0867
Epoch 10/500, Loss: 6.7251
Epoch 11/500, Loss: 6.4111
Epoch 12/500, Loss: 6.1383
Epoch 13/500, Loss: 5.9010
Epoch 14/500, Loss: 5.6944
Epoch 15/500, Loss: 5.5143
Epoch 16/500, Loss: 5.3573
Epoch 17/500, Loss: 5.2200
Epoch 18/500, Loss: 5.0999
Epoch 19/500, Loss: 4.9946
Epoch 20/500, Loss: 4.9021
Epoch 21/500, Loss: 4.8207
Epoch 22/500, Loss: 4.7489
Epoch 23/500, Loss: 4.6854
Epoch 24/500, Loss: 4.6290
Epoch 25/500, Loss: 4.5788
Epoch 26/500, Loss: 4.5340
Epoch 27/500, Loss: 4.4937
Epoch 28/500, Loss: 4.4576
Epoch 29/500, Loss: 4.4248
Epoch 30/500, Loss: 4.3951
Epoch 31/500, Loss: 4.3680
Epoch 32/500, Loss: 4.3431
Epoch 33/500, Loss: 4.3202
Epoch 34/500, Loss: 4.2990
Epoch 35/500, Loss: 4.2792
Epoch 36/500, Loss: 4.2607
Epoch 37/500, Loss: 4.2433
Epoch 3

##### 使用sklearn包实现

In [4]:
# 生成样本量为100，特征量为10的数据集
X = np.random.rand(100, 10)

# 构造目标值
y = 3  + 2 * X[:, 0] + 5 * X[:, 1] - 3 * X[:, 2] + np.random.randn(100)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(X, y)
y_pre = model.predict(test_x)

# 显示权重系数
weights = model.coef_
intercept = model.intercept_

print('权重：', weights)
print('截距：', intercept)

权重： [ 2.28761353  5.16264964 -2.66373939 -0.05104705  0.22371288  0.04072163
 -0.12239478  0.7921249  -0.0543275   0.68380205]
截距： 1.9553038376273193


##### 对数几率回归

In [18]:
# sigmoid函数
import numpy as np
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

x = np.array([5])
print(sigmoid(x))

[0.99330715]


In [19]:
# 构造数据集
#生成一个样本量为100 特征量为10的数据集
sample=100
feature=10
X=np.random.rand(sample,10)
#构造目标值 y=3+2*x1+5*x2-3*x3
y=3+2*X[:,0]+5*X[:,1]-3*X[:,2]+np.random.randn(sample) # 增加一点噪音

# 构建分类问题
mean_ = np.mean(y)
# 一半数据低于平均，一半高于平均
y = np.where(y>mean_,1,0)

In [20]:
# sklearn包实现
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2025)

# 创建逻辑回归模型
log_reg = LogisticRegression()

# 训练模型
log_reg.fit(X_train, y_train)

# 预测
predictions = log_reg.predict(X_test)

# 计算准确率
accuracy = accuracy_score(y_test, predictions)
print('准确率：', accuracy)



准确率： 0.85


##### 线性判别分析

In [25]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.datasets import load_iris
from sklearn.model_selection  import train_test_split
from sklearn.metrics import accuracy_score

# 加载数据集
iris = load_iris()
X = iris.data
y = iris.target

# 划分训练和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2025)

# 创建模型
lda = LinearDiscriminantAnalysis()

# 训练模型
lda.fit(X_train, y_train)

# 预测测试集
predictions = lda.predict(X_test)

# 计算准确率
accuracy = accuracy_score(y_test, predictions)
print('lda准确率：', accuracy)


lda准确率： 0.9333333333333333


多分类学习

In [26]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 加载鸢尾花数据集
iris = load_iris()
X = iris.data
y = iris.target

# 将数据集划分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 创建逻辑回归模型
logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

# 训练模型
logreg.fit(X_train, y_train)

# 预测测试集
y_pred = logreg.predict(X_test)

# 计算准确率
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 1.00


##### 类别不平衡问题

In [27]:
# 类别不平衡
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, n_features=20, n_informative=2, n_redundant=10, n_classes=3, random_state=42,n_clusters_per_class=1)
print("特征数据：\n", X)
print("标签数据：\n", y)

特征数据：
 [[-0.90447949  0.40579743  0.83992346 ...  1.03307436 -1.63732419
  -0.66011757]
 [ 0.4818873   0.6162199   0.4308624  ... -0.75106561  0.31506254
   0.38420953]
 [ 0.75192393  1.79550572  1.55230014 ... -1.37298251 -0.06669316
   0.63208385]
 ...
 [-0.65617246  0.77519106  1.11666824 ...  0.63356167 -1.50970179
  -0.4601161 ]
 [ 0.6944778   1.57243472  1.34306973 ... -1.24738177 -0.00409399
   0.58043839]
 [-0.57605299  1.07916446  1.40094425 ...  0.46010921 -1.59222741
  -0.38836568]]
标签数据：
 [0 2 2 1 1 0 1 0 1 2 1 1 0 0 0 2 2 0 1 2 0 0 1 2 2 1 1 1 1 1 2 0 1 0 0 2 0
 0 2 2 2 0 1 1 0 1 0 2 1 1 2 1 2 0 0 1 2 1 2 0 0 2 0 1 2 1 1 1 1 2 1 1 0 0
 0 2 0 2 1 2 2 2 2 0 0 2 0 2 1 1 1 0 1 2 0 0 2 0 2 0]


In [29]:
#欠采样

from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# 创建一个不平衡的数据集
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)

# 打印原始数据集的类别分布
print("原始数据集类别分布：", Counter(y))

# 实例化RandomUnderSampler对象
rus = RandomUnderSampler(random_state=42)

# 对数据集进行欠采样
X_resampled, y_resampled = rus.fit_resample(X, y)

# 打印欠采样后的数据集类别分布
print("欠采样后数据集类别分布：", Counter(y_resampled))

原始数据集类别分布： Counter({1: 900, 0: 100})
欠采样后数据集类别分布： Counter({0: 100, 1: 100})


In [30]:
# 过采样

from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# 创建一个不平衡的数据集
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)

# 打印原始数据集的类别分布
print("原始数据集类别分布：", Counter(y))

# 实例化RandomOverSampler对象
ros = RandomOverSampler(random_state=42)

# 对数据集进行过采样
X_resampled, y_resampled = ros.fit_resample(X, y)

# 打印过采样后的数据集类别分布
print("过采样后数据集类别分布：", Counter(y_resampled))

原始数据集类别分布： Counter({1: 900, 0: 100})
过采样后数据集类别分布： Counter({0: 900, 1: 900})


In [31]:
# 阈值移动
#分类的时候，当不同类别的样本量差异很大时，很容易影响分类结果，因此要么每个类别的数据量大致相同，要么就要进行校正。
#sklearn的做法可以是加权，加权就要涉及到class_weight和sample_weight
#当不设置class_weight参数时，默认值是所有类别的权值为1

#那么'balanced'的计算方法是什么呢？
import numpy as np

y = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,2,2]  #标签值，一共16个样本

a = np.bincount(y)  # array([8, 6, 2], dtype=int64) 计算每个类别的样本数量
aa = 1/a  #倒数 array([0.125     , 0.16666667, 0.5       ])
print(aa)

from sklearn.utils.class_weight import compute_class_weight 
y = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,2,2]

# 计算类别权重
class_weights = compute_class_weight('balanced', classes=[0, 1,2], y=y)
print("类别权重：", class_weights) # [0.66666667 0.88888889 2.66666667]


#weight_ = n_samples / (n_classes * np.bincount(y))

print(16/(3*8))  #输出 0.6666666666666666
print(16/(3*6))  #输出 0.8888888888888888
print(16/(3*2))  #输出 2.6666666666666665
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 创建一个不平衡的数据集
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 创建逻辑回归模型，并设置class_weight为'balanced'
#class_weight 可以设置为 {0:99:，1:1}
logreg = LogisticRegression(class_weight='balanced', solver='liblinear', max_iter=1000)

# 训练模型
logreg.fit(X_train, y_train)

# 预测测试集
y_pred = logreg.predict(X_test)

# 计算准确率
accuracy = accuracy_score(y_test, y_pred)
print("逻辑回归模型的准确率：", accuracy)

[0.125      0.16666667 0.5       ]
类别权重： [0.66666667 0.88888889 2.66666667]
0.6666666666666666
0.8888888888888888
2.6666666666666665
逻辑回归模型的准确率： 0.99
