In [32]:
# Binary Classification - 위스콘신 유방암 데이터 by Gradient Descent Classifier

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Load the Wisconsin breast cancer data set that ships with scikit-learn.
cancer = load_breast_cancer()

# Features (2-D ndarray) and binary labels (1-D ndarray).
x_data, t_data = cancer.data, cancer.target

# 70/30 train/test split; stratify=t_data keeps the class ratio the same in
# both parts so neither split ends up skewed toward one class.
(train_x_data,
 test_x_data,
 train_t_data,
 test_t_data) = train_test_split(
    x_data,
    t_data,
    test_size=0.3,
    random_state=2,
    stratify=t_data,
)

# Create the model and train it in one step (fit() returns the estimator itself).
model = linear_model.LogisticRegression().fit(train_x_data, train_t_data)

# Evaluate with accuracy on the held-out test split.
test_score = model.score(test_x_data, test_t_data)

print('Logistic Regression Model의 정확도 : {}'.format(test_score)) # author's run: 0.9473684210526315

Logistic Regression Model의 정확도 : 0.9473684210526315


In [47]:
# Binary Classification - 위스콘신 유방암 데이터 by SGD Classifier Ver. 1(정규화 안 함)

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Load the Wisconsin breast cancer data set.
cancer = load_breast_cancer()

# Features and labels.
x_data, t_data = cancer.data, cancer.target

# Stratified 70/30 train/test split (same seed as the other cells so results are comparable).
(train_x_data,
 test_x_data,
 train_t_data,
 test_t_data) = train_test_split(
    x_data,
    t_data,
    test_size=0.3,
    random_state=2,
    stratify=t_data,
)

# Logistic regression trained by stochastic gradient descent.
#   loss='log' : logistic (log) loss, i.e. binary classification by logistic regression.
#                NOTE(review): scikit-learn 1.1 renamed this value to 'log_loss' and
#                1.3 removed 'log'; switch the value when moving to a newer release.
#   tol=1e-5   : stopping tolerance - training ends once the loss has stopped improving
#                by at least this much for several consecutive epochs. It is NOT a loss
#                value the model has to reach.
#   random_state=2 : seed for the data shuffling, for repeatable results.
sgd = linear_model.SGDClassifier(
    loss='log',
    tol=1e-5,
    random_state=2,
)

# Train on the raw (unscaled) features.
sgd.fit(train_x_data, train_t_data)

# Evaluate with accuracy on the held-out test split.
test_score = sgd.score(test_x_data, test_t_data)

# Author's run: 0.8947368421052632 - lower because the features were not standardized.
print('SGD Classifier의 정확도 : {}'.format(test_score))

SGD Classifier의 정확도 : 0.8947368421052632


In [48]:
# Binary Classification - 위스콘신 유방암 데이터 by SGD Classifier Ver. 2(정규화함)

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Load the Wisconsin breast cancer data set.
cancer = load_breast_cancer()

# Features and labels.
x_data, t_data = cancer.data, cancer.target

# Stratified 70/30 train/test split.
(train_x_data,
 test_x_data,
 train_t_data,
 test_t_data) = train_test_split(
    x_data,
    t_data,
    test_size=0.3,
    random_state=2,
    stratify=t_data,
)

# Standardization: the scaler is fitted on the training split only, so no
# statistics from the test split leak into training.
scaler = StandardScaler().fit(train_x_data)

# Logistic regression trained by stochastic gradient descent.
#   loss='log' : logistic (log) loss, i.e. binary classification by logistic regression.
#                NOTE(review): scikit-learn 1.1 renamed this value to 'log_loss' and
#                1.3 removed 'log'; switch the value when moving to a newer release.
#   tol=1e-5   : stopping tolerance - training ends once the loss has stopped improving
#                by at least this much for several consecutive epochs.
sgd = linear_model.SGDClassifier(
    loss='log',
    tol=1e-5,
    random_state=2,
)

# Train on the standardized training features.
sgd.fit(scaler.transform(train_x_data), train_t_data)

# Evaluate on the test features, transformed with the same fitted scaler.
test_score = sgd.score(scaler.transform(test_x_data), test_t_data)

# Author's run: 0.9649122807017544 - standardization applied.
print('정규화를 이용한 SGD Classifier의 정확도 : {}'.format(test_score))

정규화를 이용한 SGD Classifier의 정확도 : 0.9649122807017544


In [49]:
# Binary Classification - 위스콘신 유방암 데이터 by SGD Classifier Ver. 3(정규화 + L2 Regularization <- Over-fitting(과대 적합) 방지)

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Load the Wisconsin breast cancer data set.
cancer = load_breast_cancer()

# Features and labels.
x_data, t_data = cancer.data, cancer.target

# Stratified 70/30 train/test split.
(train_x_data,
 test_x_data,
 train_t_data,
 test_t_data) = train_test_split(
    x_data,
    t_data,
    test_size=0.3,
    random_state=2,
    stratify=t_data,
)

# Standardization, fitted on the training split only.
scaler = StandardScaler().fit(train_x_data)

# Logistic regression by SGD with an explicit L2 (ridge-style) penalty to curb over-fitting.
#   loss='log'     : logistic (log) loss. NOTE(review): renamed to 'log_loss' in
#                    scikit-learn 1.1 and removed in 1.3.
#   tol=1e-5       : stopping tolerance on the loss improvement between epochs.
#   random_state=2 : random seed.
#   penalty='l2'   : L2 regularization. This is already SGDClassifier's default penalty,
#                    so the effective change from Ver. 2 is the strength below.
#   alpha=0.001    : regularization strength (the library default is 0.0001).
sgd = linear_model.SGDClassifier(
    loss='log',
    tol=1e-5,
    random_state=2,
    penalty='l2',
    alpha=0.001,
)

# Train on the standardized training features.
sgd.fit(scaler.transform(train_x_data), train_t_data)

# Evaluate on the test features, transformed with the same fitted scaler.
test_score = sgd.score(scaler.transform(test_x_data), test_t_data)

# Author's run: 0.9707602339181286 - standardization + L2 with alpha=0.001.
print('정규화를 이용한 SGD Classifier의 정확도 : {}'.format(test_score))

정규화를 이용한 SGD Classifier의 정확도 : 0.9707602339181286


In [83]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [158]:
# Multinomial Classification - BMI 데이터 by Sklearn

import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler # 결측치가 없기 때문에 정규화에 MinMaxScaler 사용
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy import stats
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

# The first three rows of bmi.csv are comment lines; skiprows=3 skips them so
# that the header row is parsed correctly.
df = pd.read_csv('./data/bmi.csv', skiprows=3)
# label 0: thin
# label 1: normal
# label 2: fat

display(df.head())
print(df.shape) # (20000, 3) in the author's run
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in display(): doing so rendered a stray "None" under the report.
df.info()

Unnamed: 0,label,height,weight
0,1,188,71
1,2,161,68
2,0,178,52
3,2,136,63
4,1,145,52


(20000, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   label   20000 non-null  int64
 1   height  20000 non-null  int64
 2   weight  20000 non-null  int64
dtypes: int64(3)
memory usage: 468.9 KB


None

In [159]:
# Data Preprocessing

# Missing values: df.isnull().sum() reported none in the author's run.

# Outlier check by z-score. Min-max scaling (used below) is sensitive to extreme
# values, so the number of rows with |z| above the threshold is reported for each
# feature. Previously these counts were computed as bare expressions in the middle
# of the cell and silently discarded; the author's note says none were found.
zscore_threshold = 2.0
for column in ('height', 'weight'):
    outlier_count = (np.abs(stats.zscore(df[column])) > zscore_threshold).sum()
    print('{} outliers (|z| > {}) : {}'.format(column, zscore_threshold, outlier_count))

# Class balance observed by the author via np.unique(df['label'], return_counts=True):
# (array([0, 1, 2]), array([6470, 5857, 7673]))

# Split into train and validation/test sets (70/30), stratified on the label so
# both parts keep the class ratio.
train_x_data, test_x_data, train_t_data, test_t_data = train_test_split(
    df[['height', 'weight']],
    df['label'],
    test_size=0.3,
    random_state=1,
    stratify=df['label'],
)

# Normalization: fit the min-max scaler on the training split only, then apply
# the same transformation to both splits.
scaler = MinMaxScaler()
scaler.fit(train_x_data)

norm_train_x_data = scaler.transform(train_x_data)
norm_test_x_data = scaler.transform(test_x_data)

In [156]:
# Create the model, then train and evaluate it.
# C is the inverse of the regularization strength, so C=1000 corresponds to an
# L2 penalty weight of about 1/1000 = 0.001.
model = linear_model.LogisticRegression(C=1000)
model.fit(norm_train_x_data, train_t_data)

predict_val = model.predict(norm_test_x_data)
# accuracy_score's signature is (y_true, y_pred): ground truth first, predictions
# second. The value is the same either way for plain accuracy, but the swapped
# order is an API misuse that gives wrong results if copied to asymmetric metrics.
acc = accuracy_score(test_t_data, predict_val)

print('Sklearn으로 구현한 Accuracy : {}'.format(acc)) # author's runs: 0.9851666666666666, 0.9845

# Prediction for one new sample (height=187, weight=81), scaled with the scaler
# that was fitted on the training split.
result = model.predict(scaler.transform(np.array([[187, 81]]))) # author's run: class 1
print(result)

Sklearn으로 구현한 Accuracy : 0.9845
[1]


In [173]:
# Multinomial Classification - BMI 데이터 by Tensorflow

import warnings
warnings.filterwarnings('ignore')

# The one-hot encoding is done with a TensorFlow op, which creates a graph node,
# so a session is needed to evaluate it into a concrete array.
sess = tf.Session()
onehot_train_t_data = sess.run(tf.one_hot(train_t_data, depth=3)) # depth = number of classes
onehot_test_t_data = sess.run(tf.one_hot(test_t_data, depth=3))

# Placeholders
X = tf.placeholder(shape=[None,2], dtype=tf.float32) # 2 columns = number of features
T = tf.placeholder(shape=[None,3], dtype=tf.float32) # one-hot labels: 3 columns = number of classes

# Weight, bias: 2 input features, one logit per class (3).
W = tf.Variable(tf.random.normal([2,3]))
b = tf.Variable(tf.random.normal([3]))

# Model / hypothesis: softmax over the linear logits (multinomial classification).
logit = tf.matmul(X,W) + b
H = tf.nn.softmax(logit)

# Loss: mean softmax cross entropy, computed from the logits.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logit, labels=T))

# Training op
train = tf.train.GradientDescentOptimizer(learning_rate=1e-1).minimize(loss)

# Initialize the variables.
sess.run(tf.global_variables_initializer())

# Mini-batch training: the data is fed in slices instead of all at once to
# avoid memory faults.
num_of_epoch = 1000 # total number of passes over the training data
num_of_batch = 100   # number of samples fed per training step (batch size)

# The number of batches per epoch does not change between epochs, so compute it
# once. Ceiling division keeps a trailing partial batch instead of silently
# dropping it when the training-set size is not a multiple of the batch size
# (slicing past the end simply yields the shorter remainder).
num_train = norm_train_x_data.shape[0]
total_batch = (num_train + num_of_batch - 1) // num_of_batch

for step in range(num_of_epoch):
    for i in range(total_batch):
        batch_x = norm_train_x_data[i*num_of_batch:(i+1)*num_of_batch] # [0:100], [100:200], ...
        batch_y = onehot_train_t_data[i*num_of_batch:(i+1)*num_of_batch]
        _, loss_val = sess.run([train, loss], feed_dict={X:batch_x, T:batch_y})

    # loss_val is the loss of the last mini-batch of this epoch, not an epoch average.
    if step % 100 == 0:
        print('loss value : {}'.format(loss_val))

loss value : 0.9441623687744141
loss value : 0.16499893367290497
loss value : 0.12282320111989975
loss value : 0.10350026935338974
loss value : 0.09194015711545944
loss value : 0.08408293128013611
loss value : 0.07831946760416031
loss value : 0.07387088239192963
loss value : 0.07030844688415527
loss value : 0.06737658381462097


In [190]:
# Performance evaluation (accuracy)

# Class probabilities for one new sample (height=187, weight=81), scaled with
# the scaler fitted on the training split.
result = sess.run(
    H,
    feed_dict={X: scaler.transform(np.array([[187, 81]]))},
)

# Values from the author's run, kept for reference:
#   result                    -> [[4.761967e-05 9.079144e-01 9.203796e-02]]
#   result[0,1]               -> 0.9079144
#   result.max()              -> 0.9079144, the middle entry (class 1) is the most probable
#   np.argmax(result, axis=1) -> [1], the index of the largest probability

# Accuracy graph: compare the arg-max class of the prediction with the arg-max
# of the one-hot label, then average the matches.
predict = tf.argmax(H, axis=1)
correct = tf.equal(predict, tf.argmax(T, axis=1))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

result = sess.run(
    accuracy,
    feed_dict={X: norm_test_x_data, T: onehot_test_t_data},
)
print(result) # author's run: 0.9855

0.9855
