## 梯度下降与`sklean`实现逻辑回归

### 梯度下降法实现逻辑回归

In [108]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score 

#### 构建`sigmoid`函数

In [109]:
def sigmoid(z):
    return 1/(1+np.exp(-z))
def predict(theta,X):
    print(X)
    print(theta)
    prob = sigmoid(X * theta.T)
    return [1 if a >= 0.5 else 0 for a in prob] # 根据admit的概率设定阈值，大于0.5记为1，否则为0

#### 构建计算函数

In [110]:
def gradientDescent(X,y,theta,alpha,m,epoch):
    for i in range(epoch):
        theta = np.matrix(theta) # 矩阵化
        pred = np.array(predict(theta, X))
        loss = pred-y
        gradient = np.dot(X.T,loss) / m # 计算梯度
        theta = theta - alpha*gradient  # 更新theta
    return theta

#### 加载数据

In [111]:
df = pd.read_csv('Social_Network_Ads.csv')
df = df.drop(columns='Gender')
df = df.drop(columns='User ID')
df.insert(0,'ones',1)
df.head(10)


Unnamed: 0,ones,Age,EstimatedSalary,Purchased
0,1,19,19000,0
1,1,35,20000,0
2,1,26,43000,0
3,1,27,57000,0
4,1,19,76000,0
5,1,27,58000,0
6,1,27,84000,0
7,1,32,150000,1
8,1,25,33000,0
9,1,35,65000,0


#### 梯度下降法求解参数

In [112]:
X = df.iloc[:,0:3] # 取前四列为X变量
y = df['Purchased']

# 把X、y转化为数组形式
X = np.array(X.values)
y = np.array(y.values)
m,n = np.shape(X)   # 设置训练样本值和变量个数

theta = np.ones(n) #初始化
X.shape,theta.shape,y.shape
epoch = 10000
alpha = 0.05
theta = gradientDescent(X,y,theta,alpha,m,epoch)
print(theta)

[[    1    19 19000]
 [    1    35 20000]
 [    1    26 43000]
 ...
 [    1    50 20000]
 [    1    36 33000]
 [    1    49 36000]]
[[1. 1. 1.]]
[[    1    19 19000]
 [    1    35 20000]
 [    1    26 43000]
 ...
 [    1    50 20000]
 [    1    36 33000]
 [    1    49 36000]]
[[ 9.67875e-01 -5.35000e-02 -1.94400e+03]]
[[    1    19 19000]
 [    1    35 20000]
 [    1    26 43000]
 ...
 [    1    50 20000]
 [    1    36 33000]
 [    1    49 36000]]
[[   0.98575    0.77575 -401.875  ]]
[[    1    19 19000]
 [    1    35 20000]
 [    1    26 43000]
 ...
 [    1    50 20000]
 [    1    36 33000]
 [    1    49 36000]]
[[1.003625e+00 1.605000e+00 1.140250e+03]]
[[    1    19 19000]
 [    1    35 20000]
 [    1    26 43000]
 ...
 [    1    50 20000]
 [    1    36 33000]
 [    1    49 36000]]
[[ 9.7150e-01  5.5150e-01 -8.0475e+02]]
[[    1    19 19000]
 [    1    35 20000]
 [    1    26 43000]
 ...
 [    1    50 20000]
 [    1    36 33000]
 [    1    49 36000]]
[[  0.989375   1.38075  737.375 

  return 1/(1+np.exp(-z))


[[    1    19 19000]
 [    1    35 20000]
 [    1    26 43000]
 ...
 [    1    50 20000]
 [    1    36 33000]
 [    1    49 36000]]
[[  0.690625   1.063    585.25    ]]
[[    1    19 19000]
 [    1    35 20000]
 [    1    26 43000]
 ...
 [    1    50 20000]
 [    1    36 33000]
 [    1    49 36000]]
[[ 6.58500e-01  9.50000e-03 -1.35975e+03]]
[[    1    19 19000]
 [    1    35 20000]
 [    1    26 43000]
 ...
 [    1    50 20000]
 [    1    36 33000]
 [    1    49 36000]]
[[  0.676375   0.83875  182.375   ]]
[[    1    19 19000]
 [    1    35 20000]
 [    1    26 43000]
 ...
 [    1    50 20000]
 [    1    36 33000]
 [    1    49 36000]]
[[ 6.442500e-01 -2.147500e-01 -1.762625e+03]]
[[    1    19 19000]
 [    1    35 20000]
 [    1    26 43000]
 ...
 [    1    50 20000]
 [    1    36 33000]
 [    1    49 36000]]
[[   0.662125    0.6145   -220.5     ]]
[[    1    19 19000]
 [    1    35 20000]
 [    1    26 43000]
 ...
 [    1    50 20000]
 [    1    36 33000]
 [    1    49 36000]]
[[6.8

#### 预测并计算准确率

In [113]:
pred = predict(theta, X)
correct = [1 if ((a == 1 and b ==1) or (a == 0 and b == 0)) else 0 for (a,b) in zip(pred,y)]
accuracy = (sum(map(int,correct)) % len(correct))

print("accuracy = {:.2f}%".format(100 * accuracy/m))

[[    1    19 19000]
 [    1    35 20000]
 [    1    26 43000]
 ...
 [    1    50 20000]
 [    1    36 33000]
 [    1    49 36000]]
[[ -41.35    -32.0205 1184.25  ]]
accuracy = 35.75%


#### 分析
即便更改了迭代次数以及学习率，预测结果仍不乐观

### 利用`sklearn`做逻辑回归

In [116]:
from sklearn.linear_model._logistic import LogisticRegression

clf = LogisticRegression()
clf.fit(X,y)
print(clf.coef_)

pred_sk = clf.predict(X)
correct_sk = [1 if ((a == 1 and b ==1) or (a == 0 and b == 0)) else 0 for (a,b) in zip(pred_sk,y)]
accuracy = (sum(map(int,correct_sk)) % len(correct_sk))

print("accuracy = {:.2f}%".format(100 * accuracy/m))

print(clf.score(X,y))

[[-2.24944774e-10 -2.10415213e-09 -2.69301408e-06]]
accuracy = 64.25%
0.6425
