In [34]:
# Binary Classification의 대표적인 2개의 예제(위스콘신 유방암 데이터, Titanic)를 구현해보자
# Binary Classification - 위스콘신 유방암 데이터 by Sklearn

import numpy as np
from sklearn import linear_model
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score # cross validation

import warnings
warnings.filterwarnings('ignore')

# from sklearn.preprocessing import MinMaxScaler
# from scipy import stats
# import matplotlib.pyplot as plt


cancer = load_breast_cancer()
# print(type(cancer))
# <class 'sklearn.utils.Bunch'>. Sklearn이 데이터를 표현하기 위해 사용하는 자료구조. Python의 Dict와 유사함

# print(cancer) # data이 독립변수, target이 종속변수
# print(cancer.data.shape, cancer.target.shape) # (569, 30) (569,)

# print(np.unique(cancer.target, return_counts=True)) # return_counts=True 값이 몇 개가 있는지
# (array([0, 1]), array([212, 357], dtype=int64))

# print(cancer.DESCR) # 유방암 데이터에 대한 상세 내용
# Missing Attribute Values: None 결측치 없음
# Class Distribution: 212 - Malignant(악성), 357 - Benign(정상)

# Data Set
x_data = cancer.data
t_data = cancer.target

# print(type(x_data), x_data.shape) # <class 'numpy.ndarray'> (569, 30)
# print(type(t_data), t_data.shape) # <class 'numpy.ndarray'> (569,)

# Hold-out Validation을 위해, Train과 Validation Data Set으로 분리.default는 75% : 25%
train_x_data, test_x_data, train_t_data, test_t_data = \
train_test_split(x_data, t_data, test_size=0.2, random_state=2, stratify=t_data) # stratify는 class의 개수를 맞춰줌
# print(train_x_data.shape, train_t_data.shape) # (455, 30) (455,)
# print(np.unique(train_t_data, return_counts=True)) # (array([0, 1]), array([170, 285], dtype=int64))

# Model 생성
model = linear_model.LogisticRegression()

# K-Flod Cross Validation
test_score = cross_val_score(model, x_data, t_data, scoring='accuracy', cv=5) # scoring은 metric 종류, cv은 몇 번 검증할지
# print(test_score)
print(test_score.mean()) # 0.9490451793199813%

# Hold-out Validation
model.fit(train_x_data, train_t_data)
test_score = model.score(test_x_data, test_t_data)
print(test_score) # 0.956140350877193

0.9490451793199813
0.956140350877193


In [38]:
# Binary Classification - 위스콘신 유방암 데이터 by Tensorflow

import tensorflow as tf

# Placeholder
X = tf.placeholder(shape=[None,30], dtype=tf.float32)
T = tf.placeholder(shape=[None,1], dtype=tf.float32)

# Weight, bias
W = tf.Variable(tf.random.normal([30,1]))
b = tf.Variable(tf.random.normal([1]))

# Hypothesis, model, predict model, Logistic Regression Model
logit = tf.matmul(X,W) + b
H = tf.sigmoid(logit)

# cross entropy(loss func)
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit, labels=T))

# train
train = tf.train.GradientDescentOptimizer(learning_rate=1e-4).minimize(loss)

# Session & 초기화
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# 반복 학습
# 전체 데이터를 이용해서 1번 학습하는 것을 1 epoch이라 함
for step in range(100000):
    _, loss_val = sess.run([train, loss], feed_dict={X: train_x_data, T: train_t_data.reshape(-1,1)})
    
    if step % 10000 == 0:
        print('loss value : {}'.format(loss_val))

loss value : 703.2249145507812
loss value : 0.799042820930481
loss value : 0.5356259942054749
loss value : 0.4693799614906311
loss value : 0.45146843791007996
loss value : 0.4825952351093292
loss value : 0.5068585872650146
loss value : 0.5115243196487427
loss value : 0.5085011124610901
loss value : 0.502677321434021


In [39]:
# Accuracy 측정

# validation data(test_x_data, test_t_data)를 이용해서 정확도를 측정
predict = tf.cast(H >= 0.5, dtype=tf.float32) # True -> 1.0, False -> 0.0
correct = tf.equal(predict, T) # equal은 predict와 T가 같은지 비교. boolean으로 나옮(True, False, Flase, True...)
accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32)) # correct를 실수로 바꿔서 평균을 구함(cross entropy)

accuracy_val = sess.run(accuracy, feed_dict={X: test_x_data, T: test_t_data.reshape(-1,1)})
print('Accuracy : {}'.format(accuracy_val)) # Accuracy : 0.9122806787490845

Accuracy : 0.9122806787490845
