In [1]:
import math
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

data = pd.read_csv('data/telcoPreprocess.csv')
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :41], data.iloc[:, 41:], test_size=0.3,
                                                    random_state=42)
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [2]:
# sigmod函数
def sigmoid(x):
    return 1.0 / (1+math.exp(-x))


# 计算hessian矩阵
def computeHessianMatrix(data, hypothesis):
    hessianMatrix = []
    n = len(data)

    for i in range(n):
        row = []
        for j in range(n):
            row.append(-data[i]*data[j]*(1-hypothesis)*hypothesis)
        hessianMatrix.append(row)
    return hessianMatrix


# 计算两个向量的点积
def computeDotProduct(a, b):
    if len(a) != len(b):
        return False
    n = len(a)
    dotProduct = 0
    for i in range(n):
        dotProduct += a[i] * b[i]
    return dotProduct


# 计算两个向量的和
def computeVectPlus(a, b):
    if len(a) != len(b):
        return False
    n = len(a)
    sum = []
    for i in range(n):
        sum.append(a[i]+b[i])
    return sum


# 计算某个向量的n倍
def computeTimesVect(vect, n):
    nTimesVect = []
    for i in range(len(vect)):
        nTimesVect.append(*(n * vect[i]))
    return nTimesVect


# 牛顿法
def newtonMethod(dataMat, labelMat, iterNum=10):
    m = len(dataMat)  # 训练集个数
    n = len(dataMat[0])  # 数据特征纬度
    theta = [0.0] * n

    while (iterNum):
        gradientSum = [0.0] * n
        hessianMatSum = [[0.0] * n] * n
        for i in range(m):
            try:
                hypothesis = sigmoid(computeDotProduct(dataMat[i], theta))
            except:
                continue
            error = labelMat[i] - hypothesis
            gradient = computeTimesVect(dataMat[i], error / m)
            gradientSum = computeVectPlus(gradientSum, gradient)
            hessian = computeHessianMatrix(dataMat[i], hypothesis / m)
            for j in range(n):
                hessianMatSum[j] = computeVectPlus(hessianMatSum[j], hessian[j])

        # 计算hessian矩阵的逆矩阵有可能异常，如果捕获异常则忽略此轮迭代
        try:
            hessianMatInv = np.mat(hessianMatSum).I.tolist()
        except:
            continue
        for k in range(n):
            theta[k] -= computeDotProduct(hessianMatInv[k], gradientSum)

        iterNum -= 1
    return theta

In [3]:
def Hypothesis(theta, x):
    z = 0
    for i in range(len(theta)):
        z += x[i] * theta[i]
    return sigmoid(z)


def Declare_Winner(theta):
    score = 0
    length = len(X_test)
    for i in range(length):
        prediction = round(Hypothesis(X_test[i], theta))
        answer = y_test[i]
        if prediction == answer:
            score += 1

    my_score = float(score) / float(length)
    print('Your score: ', my_score)

In [4]:
theta = newtonMethod(X_train, y_train)
print(theta)
Declare_Winner(theta)

[-0.11071955169751357, -0.024310378174405543, 0.007080868787172948, 0.12945431254976425, -0.03569746122753176, -0.0006125351071733579, -0.019866709300612392, -0.05058989976041072, -0.4793373793078319, 0.21917474697641692, -0.062035337928069506, 2.6812957905196746, -0.4571590789211556, 0.08887267550077721, 0.18596956523576336, 0.0690221060284142, -0.12776802441229418, 0.017956540737877575, -0.026149197081969666, 0.028416048193072654, -0.0011939440136531057, 0.0013414426528306767, 0.0002509597382315725, -6.43539817349325e-05, -0.0004722746612622063, 0.41564199783195094, 0.3832100766656462, -0.4246146816940337, 0.5000862976847763, 0.008015432936708897, -0.4908744091600162, 0.5970367108620283, -0.5797814159931519, 0.351254545501901, -0.5760948722996599, 1.719111638609769, -2.337600352902394, 1.0001590610980846, -0.10303477161354878, 0.32950388069322967, -0.04627219860135797]
Your score:  0.78


In [5]:
from sklearn.preprocessing import Normalizer

nor = Normalizer()
X_train = nor.fit_transform(X_train)
X_test = nor.transform(X_test)

theta = newtonMethod(X_train, y_train)
print(theta)
Declare_Winner(theta)

[28.67751144749691, 5.306455057353073, -1.7038771885839044, 1.9323754235730366, -4.00544676956844, -0.4841745262033978, -17.258305934234194, -13.561428076171659, -322.4461119199384, -8.075894968543832, 8.073985713316048, 999.7283395852726, -107.22661326072357, -411.28888911536063, -627.743580857512, 67.76327211688455, -33.91969560869811, 3.102077768134201, 25.350453374336585, 27.734220587112848, -3.479012822865501, -0.9138255391342573, 0.41824685565876835, -1.5579297461873884, -0.15750500590993713, 126.78589027408053, 31.7143589138043, -71.6070425557785, 144.1176783478349, 20.201344086630094, -127.93062404062157, 40.01256565983115, -80.7004610126074, -53.015097226281384, -121.49732628750154, 704.6681724493399, -288.8745123230913, -207.15062541684833, -306.6464397135834, 147.9161764539983, -5.139109323846027]
Your score:  0.79
