In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

data=np.loadtxt(fname="Breast_cancer_data.txt")

X=data[:,:9]
T=data[:,9]
X=X.astype(int)
T=T.astype(int)

X_n=data.shape[0]

print(X_n)
print(X)
print(X[0]) 

683
[[ 5 10  6 ...  4 10 10]
 [ 3  1  1 ...  1  1  1]
 [ 3  1  2 ...  2  1  1]
 ...
 [ 3  1  1 ...  1  1  1]
 [ 3  1  1 ...  1  1  1]
 [ 5  1  1 ...  3  1  1]]
[ 5 10  6  1 10  4  4 10 10]


In [2]:
def logistic(x,w):   
    y1=1/(1+np.exp(-(w[0]*x[0] + w[1]*x[1] + w[2]*x[2] + w[3]*x[3] +  w[4]*x[4] + w[5]*x[5] + w[6]*x[6] + w[7]*x[7] + w[8]*x[8]+w[9])))
          
          
    return y1


W=[1,0,0.5,0,-1,0,0.4,0,0,0]
# W=W.astype(int)
pp=logistic(X,W)  
print(pp)            

[0.9987706  0.99999863 0.9999546  0.98901306 0.9999925  0.99592986
 0.9900482  0.99999695 0.99999545]


In [3]:
def cee_logistic(w,x,t):   
    y=logistic(x,w)
    cee=0
    for n in range(len(y)):  
        cee=cee-(t[n]*np.log(y[n])+(1-t[n])*np.log(1-y[n]))
    cee=cee/X_n
    return cee
#1,0,0.5,0,-1,0,0.4,0,0,0
#-2.27939392 , 0.25030328 , 0.89458987 , 0.64590032 , -0.55887856 , 2.21174359 , -1.7981906 , 0.88993109 , 0.9867008 , -0.02873069
W=[-2.27939392 , 0.25030328 , 0.89458987 , 0.64590032 , -0.55887856 , 2.21174359 , -1.7981906 , 0.88993109 , 0.9867008 , -0.02873069]
cee_logistic(W,X,T)


2.353799626149109e-06

In [4]:
def dcee_logistic(w,x,t):   #기울기구하는함수  기울기를 구하는 함수를 정의하고 다음칸에서 사용하는 미니마이즈의 인수로 넣어줘야하기때문
    y=logistic(x,w)
    dcee=np.zeros(10)
    for n in range(len(y)):
        dcee[0]=dcee[0]+(y[n]-t[n])*x[0][n]
        dcee[1]=dcee[1]+(y[n]-t[n])*x[1][n]
        dcee[2]=dcee[2]+(y[n]-t[n])*x[2][n]
        dcee[3]=dcee[3]+(y[n]-t[n])*x[3][n]
        dcee[4]=dcee[4]+(y[n]-t[n])*x[4][n]
        dcee[5]=dcee[5]+(y[n]-t[n])*x[5][n]
        dcee[6]=dcee[6]+(y[n]-t[n])*x[6][n]
        dcee[7]=dcee[7]+(y[n]-t[n])*x[7][n]
        dcee[8]=dcee[8]+(y[n]-t[n])*x[8][n]
        dcee[9]=dcee[9]+(y[n]-t[n])
    dcee=dcee/X_n
    return dcee

W=[1,0,0.5,0,-1,0,0.4,0,0,0]  
dcee_logistic(W,X,T)
        

array([0.06870621, 0.01020678, 0.01165628, 0.01016932, 0.01019221,
       0.04952295, 0.07587051, 0.01309747, 0.01017584, 0.00874627])

In [5]:
from scipy.optimize import minimize

def fit_logistic(w_init,x,t):
    res1=minimize(cee_logistic,w_init,args=(x,t),jac=dcee_logistic,method="CG")
    #미니마이즈(최적화할놈, 초깃값, w에대해서 최적화, 기울기구하는함수, 경사하강법사용)의 구성이다
    #cee_logistic을 최소화한다, x와t는 제외 w에 대해서만 최소화. jac는 기울기를구하는방법
    #CG는 Conjugate Gradient
    return res1.x


W_init=[1,0,0.5,0,-1,0,0.4,0,0,0]
W=fit_logistic(W_init,X,T)
print(W)



[-2.27939392  0.25030328  0.89458987  0.64590032 -0.55887856  2.21174359
 -1.7981906   0.88993109  0.9867008  -0.02873069]


In [6]:
def validate_model(w):
    d=np.loadtxt(fname="Sample_test_data.txt")
    X=d[:,:9].astype(int)
    T=d[:,9].astype(int)
    
    N=X.shape[0]
    
    y=np.zeros(N)
    decision=np.zeros(N).astype(int)
    err_cnt=0
    
    print('No. \t Y \t T')
    print("-------------------")
    for i in range(N):
        x=np.r_[X[i,:],1]
        u=np.array(w).dot(x)
        y[i]=1/(1+np.exp(-u))
        if y[i]<0.5:
            decision[i]=1
        
        if decision[i] !=T[i]:
            err_cnt=err_cnt+1
            
        print("{0} \t {1} \t {2}".format(i,decision[i],T[i]))
        
    hit_ratio=np.round((1-err_cnt/N)*100,1)
    
    print("-------------------")
    print("Total error : {0} out of {1}".format(err_cnt,N))
    print("Hit ratio : {0:.1f} %".format(hit_ratio))
    
    return hit_ratio

StudentID='2016146036'
W=np.loadtxt(fname=StudentID+".txt")

print("Student ID: "+ StudentID)
print("W= "+np.str(W))

validate_model(W)

Student ID: 2016146036
W= [-2.27939392  0.25030328  0.89458987  0.64590032 -0.55887856  2.21174359
 -1.7981906   0.88993109  0.9867008  -0.02873069]
No. 	 Y 	 T
-------------------
0 	 0 	 0
1 	 1 	 0
2 	 1 	 0
3 	 1 	 0
4 	 1 	 1
5 	 0 	 0
6 	 0 	 1
7 	 1 	 1
8 	 0 	 0
9 	 1 	 0
-------------------
Total error : 5 out of 10
Hit ratio : 50.0 %


50.0

In [7]:
def kfold_model_A(x,t,k):
    n=len(x)
    cee_train=np.zeros(k)
    cee_test=np.zeros(k)
    
    for i in range(0,k):
        x_train=x[np.fmod(range(n),k)!=i] 
        t_train=t[np.fmod(range(n),k)!=i]  
        x_test=x[np.fmod(range(n),k)==i]   
        t_test=t[np.fmod(range(n),k)==i]
        
        wm=fit_logistic(np.array([1,0,0.5,0,-1,0,0.4,0,0,0]),x,t) 
        cee_train[i]=logistic(x_train,t_train)
        cee_test[i]=logistic(x_test,t_test)
    return cee_train, cee_test

K=11

Cv_A_train, Cv_A_test = kfold_model_A(X,T,K)
mean_A_test=np.sqrt(np.mean(Cv_A_test))  #평균오차

print("Gauss(M=3) SD={0:.2f} cm".format(mean_gauss_test[1]))
print("Model A SD={0:.2f} cm".format(mean_A_test))

SD=np.append(mean_gauss_test[0:5], mean_A_test)
M=range(6)

label=["M=2", "M=3", "M=4", "M=5", "M=6", "Model A"]

plt.figure(figsize=(8,5))
plt.bar(M,SD,tick_label=label, align="center", facecolor="cornflowerblue")
#plt.bar(xy) 막대그래프만들어줌
#선형회귀모델과 비선형회귀모델과의 평균오차에 대한 그래프
#(선형회귀모델에서)M=3일때 오차가 6.51로 최저지만 비선형모델에선 4.72로 더 적다
plt.grid(True)
plt.show()

ValueError: setting an array element with a sequence.