In [159]:
# 라이브러리 버전 확인
import sklearn

print(sklearn.__version__)

1.0.2


모듈 import하기

In [160]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

데이터 준비

In [161]:
breast_cancer = load_breast_cancer()

print(dir(breast_cancer))

['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'frame', 'target', 'target_names']


In [162]:
# digits에 담긴 정보 확인
breast_cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [163]:
# 데이터 크기 확인 
breast_cancer_data = breast_cancer.data

print(breast_cancer_data.shape)  # shape는 배열의 형상정보를 출력

(569, 30)


In [164]:
# 0번째 데이터
breast_cancer_data[0]

array([1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
       3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
       8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
       3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
       1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01])

In [165]:
# digits의 타겟 정보
breast_cancer_label = breast_cancer.target
print(breast_cancer_label.shape)
breast_cancer_label

(569,)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [166]:
# 라벨의 이름
breast_cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [167]:
# digits에 대한 설명
print(breast_cancer.DESCR)

# 총 569개의 데이터
# 각 데이터에는 30개의 속성

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [168]:
# feature에 대한 설명
breast_cancer.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

데이터 전처리

In [169]:
# digits 데이터셋을 DataFrame으로 변환
import pandas as pd

breast_cancer_df = pd.DataFrame(data=breast_cancer_data, columns=breast_cancer.feature_names)
breast_cancer_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [170]:
# 컬럼 'label' 추가
breast_cancer_df["label"] = breast_cancer.target
breast_cancer_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


데이터 나누기

In [171]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data,
                                                    breast_cancer_label,
                                                    test_size=0.2,
                                                    random_state=7)

print('X_train 개수: ', len(X_train), 'x_test 개수: ', len(X_test))

X_train 개수:  455 x_test 개수:  114


In [172]:
# 데이터셋 확인
X_train.shape, y_train.shape

((455, 30), (455,))

In [173]:
# 데이터셋 확인
X_test.shape, y_test.shape

((114, 30), (114,))

In [174]:
# Decision Tree 사용
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=32)
print(decision_tree._estimator_type)

classifier


In [175]:
# 모델 학습시키기
decision_tree.fit(X_train, y_train)

DecisionTreeClassifier(random_state=32)

In [176]:
# 예측
y_pred = decision_tree.predict(X_test)
y_pred

array([1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1])

In [177]:
# 실제 정답
# 예측과 어느정도 잘 맞는 것을 확인
y_test

array([1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1])

In [178]:
# 정확도 확인
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9122807017543859

train, test 분리

In [179]:
X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data,
                                                    breast_cancer_label,
                                                    test_size=0.2,
                                                    random_state=7)

Decision Tree

In [180]:
from sklearn.tree import DecisionTreeClassifier

# train
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

# test
y_pred_dt = decision_tree.predict(X_test)

Random Forest

In [181]:
from sklearn.ensemble import RandomForestClassifier

# train
random_forest= RandomForestClassifier()
random_forest.fit(X_train,y_train)

# test
y_pred_rf = random_forest.predict(X_test)

SVM

In [182]:
from sklearn import svm

# train
svm_model = svm.SVC()
svm_model.fit(X_train,y_train)

# test
y_pred_svm = svm_model.predict(X_test)

SGD

In [183]:
from sklearn.linear_model import SGDClassifier

# train
sgd_model = SGDClassifier()
svm_model.fit(X_train,y_train)

# test
y_pred_sgd = svm_model.predict(X_test)

Logistic Regression

In [184]:
from sklearn.linear_model import LogisticRegression

# train
logistic_model = LogisticRegression(max_iter = 3000)
logistic_model.fit(X_train, y_train)

# test
y_pred_log = logistic_model.predict(X_test)

성능 지표  : 재현율(recall)

In [185]:
from sklearn.metrics import recall_score

print('Decision Tree의 재현율       : {}'.format(recall_score(y_test, y_pred_dt, average='weighted')))
print('Random Forest의 재현율       : {}'.format(recall_score(y_test, y_pred_rf, average='weighted')))
print('SVM의 재현율                 : {}'.format(recall_score(y_test, y_pred_svm, average='weighted')))
print('SGD의 재현율                 : {}'.format(recall_score(y_test, y_pred_sgd, average='weighted')))
print('Logistic Regression의 재현율 : {}'.format(recall_score(y_test, y_pred_log, average='weighted')))

Decision Tree의 재현율       : 0.9473684210526315
Random Forest의 재현율       : 0.9736842105263158
SVM의 재현율                 : 0.9035087719298246
SGD의 재현율                 : 0.9035087719298246
Logistic Regression의 재현율 : 0.9473684210526315


In [186]:
# test
y_pred_dt = decision_tree.predict(X_test)

# accuracy
recall_dt = recall_score(y_test, y_pred_dt)
recall_dt

0.972972972972973

In [187]:
print('Decision Tree의 정확도 : ', recall_dt)


Decision Tree의 정확도 :  0.972972972972973


모델 평가

In [188]:
from sklearn.metrics import classification_report

print("Decision Tree")
print(classification_report(y_test, y_pred_dt))
print("------------------------------------------------------")
print()

print("Random Forest")
print(classification_report(y_test, y_pred_rf))
print("------------------------------------------------------")
print()

print("SVM")
print(classification_report(y_test, y_pred_svm))
print("------------------------------------------------------")
print()

print("SGD Classifier")
print(classification_report(y_test, y_pred_sgd))
print("------------------------------------------------------")
print()

print("Logistic Regression")
print(classification_report(y_test, y_pred_log))

Decision Tree
              precision    recall  f1-score   support

           0       0.95      0.90      0.92        40
           1       0.95      0.97      0.96        74

    accuracy                           0.95       114
   macro avg       0.95      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114

------------------------------------------------------

Random Forest
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        40
           1       0.96      1.00      0.98        74

    accuracy                           0.97       114
   macro avg       0.98      0.96      0.97       114
weighted avg       0.97      0.97      0.97       114

------------------------------------------------------

SVM
              precision    recall  f1-score   support

           0       1.00      0.72      0.84        40
           1       0.87      1.00      0.93        74

    accuracy                          

Confusion Matrix

In [189]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred_rf)

array([[37,  3],
       [ 0, 74]])

🌟 모델 평가

* breast cancer의 데이터셋은 총 30개의 속성이 있으며, WDBC-Malignant(악성), WDBC-Benign(양성)을 class로 두고 있다. 즉, 악성 종양인지 정상 종양인지 분류하는 모델을 작동시키는 것이다. 유방암은 정확한 진단을 하는 것이 중요하기 때문에 성능지표로 재현율(recall)을 이용하는 것이 적합하다.

* 5가지 모델의 재현율을 비교한 결과, Random Forest가 0.937으로 가장 우수한 성능을 지닌 것을 확인할 수 있다. Random Forest 다음으로 Decision Tree와 Logistic Regression이 약 0.9474으로 뒤를 잇는다. 

* 가장 높은 재현율을 기록한 Random Froest으로 오차행렬을 확인해본다. 그 결과, Random Forest는 클래스끼리 혼동하지 않음을 통하여 오차가 없는 상태임을 확인할 수 있다.

* 정리하면, 유방암을 진단할 때 재현율을 이용하여 분류하였다. 5가지 모델을 동작시켜 재현율을 측정한 결과, Random Forest가 높은  재현율을 기록하였으며, 그 다음으로 Decision Tree와 Logistic Regression이 뒤따랐다.. 즉, Random Forest 모델의 성능이 가장 우수하다. 이에 따라 Random Forest의 오차행렬을 살펴보았을 때, 오차가 없는 상태임을 확인할 수 있다.


✍ **회고**

* Logistic Regression 모델을 사용하였을 때, 
'extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG' convergence warning이 떴다. 이는 총 반복 횟수가 너무 적어서 한계에 도달한 것이며, 반복 횟수를 늘리거나(max_iter) 데이터 크기를 조정하라는 solution을 받았다. 그래서 max_iter = 3000으로 설정한 후 진행한 결과, Logistic Regression 모델이 정상적으로 작동하였다.
(이전 프로젝트에서 해결한 것을 바탕으로 쉽게 해결할 수 있었다.)

* digits 프로젝트에서 사용했던 accuracy 양식을 breast cancer에서도 적용하고자 하였다. accuracy를 recall로 바꿔주면서 런을 돌렸는데 에러가 발생하였다. 에러의 이유는 해당 변수가 선언되지 않았다고 한다. 분명 선언을 해주었는데도 불구하고 선언이 되지 않을 걸로 보인다. 정확한 해결방법을 찾지 못하였고 인터넷에서 타인의 코드를 참고하여 진행하였다.

* 이번 프로젝트도 시간에 쫓기면 작성한 터라 다양한 시도를 해보지 못한 것에 대한 아쉬움이 남는다. 다음 프로젝트에선 빠른 방향성을 설정하고 다른 요소들을 추가해보는 시도를 해보고 싶다.

🥕 참고자료

https://ceuity.tistory.com/6