In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Data Load / EDA

In [3]:
# Load the Wisconsin breast-cancer dataset bundled with scikit-learn and
# list the attributes exposed by the returned object.
breast_cancer = load_breast_cancer()
print(dir(breast_cancer))

['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'frame', 'target', 'target_names']


In [4]:
# The returned object also supports dict-style access; show its keys.
breast_cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [16]:
# `frame` is None here. Per the scikit-learn docs it only holds a pandas
# DataFrame when the loader is called with as_frame=True.
print(breast_cancer.frame) # prints None with the default loader arguments

None


In [5]:
# Print the dataset's built-in description (instances, attributes, feature meaning).
print(breast_cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [6]:
# Feature matrix dimensions: (number of samples, number of features).
breast_cancer.data.shape

(569, 30)

In [7]:
# Feature matrix used as model input in the cells below.
data = breast_cancer.data

In [8]:
# Inspect the feature vector of the first sample.
data[0]

array([1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
       3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
       8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
       3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
       1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01])

In [9]:
# Target vector (class label per sample) used for training and evaluation below.
label = breast_cancer.target

In [10]:
# One label per sample: the length should match the first dimension of `data`.
label.shape

(569,)

In [11]:
# Class label of the first sample.
label[0]

0

In [12]:
# Names of the feature columns, in the same order as the columns of `data`.
breast_cancer.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [17]:
# The index in this array is the numeric label used in `label`.
breast_cancer.target_names # label 0 = malignant (cancer), label 1 = benign (no cancer)

array(['malignant', 'benign'], dtype='<U9')

In [20]:
import numpy as np

# Class balance check: returns the distinct labels together with how many
# samples carry each of them.
np.unique(label, return_counts=True)

(array([0, 1]), array([212, 357]))

# Train / Test Split

In [51]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# identical across runs.
# NOTE(review): the split is not stratified - consider stratify=label so both
# parts keep the same malignant/benign ratio.
X_train, X_test, y_train, y_test = train_test_split(
    data, label, test_size=0.2, random_state=7
)

print('X_train 개수: ', len(X_train), ', X_test 개수: ', len(X_test))

X_train 개수:  455 , X_test 개수:  114


# Decision Tree

In [52]:
from sklearn.tree import DecisionTreeClassifier

# Train a seeded decision tree (fit returns the estimator itself, so the call
# can be chained) and predict labels for the held-out samples.
decision_tree = DecisionTreeClassifier(random_state=32).fit(X_train, y_train)
y_pred_dt = decision_tree.predict(X_test)

# Random Forest

In [53]:
from sklearn.ensemble import RandomForestClassifier

# Train a seeded random forest on the training split, then predict the
# held-out samples.
random_forest = RandomForestClassifier(random_state=32).fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)

# SVM

In [54]:
from sklearn import svm

# Support vector classifier with scikit-learn's default hyperparameters.
# NOTE(review): the features are passed in unscaled and their magnitudes differ
# by orders of magnitude - consider standardizing them before fitting.
svm_model = svm.SVC().fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# SGD Classifier

In [55]:
from sklearn.linear_model import SGDClassifier

# SGDClassifier shuffles the training data while fitting, so without a fixed
# seed every run produces a different model and different metrics. Pin the
# seed (same value as the tree-based models) so that Restart & Run All
# reproduces the reported scores.
# NOTE(review): SGD is sensitive to feature scale and the features are passed
# in unscaled - consider standardizing them (e.g. StandardScaler) first.
sgd_model = SGDClassifier(random_state=32)
sgd_model.fit(X_train, y_train)
y_pred_sgd = sgd_model.predict(X_test)

# Logistic Regression

In [56]:
from sklearn.linear_model import LogisticRegression

# max_iter is raised to 2000 to give the solver more iterations on these
# unscaled features; then predict the held-out samples.
logistic_model = LogisticRegression(max_iter=2000).fit(X_train, y_train)
y_pred_lr = logistic_model.predict(X_test)

# Performance Evaluation

### Decision Tree

In [57]:
# Per-class precision / recall / F1 of the decision tree on the test split.
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

           0       0.92      0.82      0.87        40
           1       0.91      0.96      0.93        74

    accuracy                           0.91       114
   macro avg       0.91      0.89      0.90       114
weighted avg       0.91      0.91      0.91       114



In [58]:
from sklearn.metrics import confusion_matrix

# Rows are true labels, columns are predicted labels (scikit-learn convention).
# Row 0 / column 1 therefore counts malignant samples predicted as benign.
print(confusion_matrix(y_test,y_pred_dt)) # FN = 7 when malignant (label 0) is the positive class

[[33  7]
 [ 3 71]]


### Random Forest

In [59]:
# Per-class precision / recall / F1 of the random forest on the test split.
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        74

    accuracy                           1.00       114
   macro avg       1.00      1.00      1.00       114
weighted avg       1.00      1.00      1.00       114



### SVM

In [60]:
# Per-class precision / recall / F1 of the SVM on the test split.
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       1.00      0.72      0.84        40
           1       0.87      1.00      0.93        74

    accuracy                           0.90       114
   macro avg       0.94      0.86      0.89       114
weighted avg       0.92      0.90      0.90       114



In [61]:
# Rows are true labels, columns are predicted labels (scikit-learn convention).
print(confusion_matrix(y_test,y_pred_svm)) # FN = 11 when malignant (label 0) is the positive class

[[29 11]
 [ 0 74]]


### SGD Classifier

In [62]:
# Per-class precision / recall / F1 of the SGD classifier on the test split.
print(classification_report(y_test, y_pred_sgd))

              precision    recall  f1-score   support

           0       0.94      0.78      0.85        40
           1       0.89      0.97      0.93        74

    accuracy                           0.90       114
   macro avg       0.91      0.87      0.89       114
weighted avg       0.91      0.90      0.90       114



In [63]:
# Rows are true labels, columns are predicted labels (scikit-learn convention).
print(confusion_matrix(y_test,y_pred_sgd)) # FN = 9 when malignant (label 0) is the positive class

[[31  9]
 [ 2 72]]


### Logistic Regression

In [64]:
# Per-class precision / recall / F1 of logistic regression on the test split.
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       1.00      0.85      0.92        40
           1       0.93      1.00      0.96        74

    accuracy                           0.95       114
   macro avg       0.96      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114



In [65]:
# Rows are true labels, columns are predicted labels (scikit-learn convention).
print(confusion_matrix(y_test,y_pred_lr)) # FN = 6 when malignant (label 0) is the positive class

[[34  6]
 [ 0 74]]


# Conclusion

breast_cancer 데이터는 label이 binary이고, 클래스별 데이터 수(malignant 212개, benign 357개)의 차이가 크지 않아 불균형 데이터(imbalanced data) 문제는 두드러지지 않았다. 하지만 암을 발견하는 문제에서는, 암이 아닌데 암으로 판단하는 것(False Positive)은 상대적으로 괜찮지만, 암인데 암이 아니라고 판단하는 것(False Negative)은 있어서는 안 되기 때문에 Recall이 중요하다. 따라서 본 문제의 평가 척도는 malignant(label 0)를 positive 클래스로 둔 Recall로 정하였다.

각 모델 별 Recall은 다음과 같다:

In [68]:
from sklearn.metrics import recall_score

# Malignant is encoded as label 0 in this dataset, so pos_label has to be set
# explicitly (recall_score defaults to pos_label=1).
predictions_by_heading = (
    ("Decision Tree:", y_pred_dt),
    ("Random Forest:", y_pred_rf),
    ("SVM:", y_pred_svm),
    ("SGD Classifier:", y_pred_sgd),
    ("Logistic Regression:", y_pred_lr),
)

# Print one "<model>: <recall>" line per classifier, in training order.
for heading, predicted in predictions_by_heading:
    print(heading, recall_score(y_test, predicted, pos_label=0))

Decision Tree: 0.825
Random Forest: 1.0
SVM: 0.725
SGD Classifier: 0.775
Logistic Regression: 0.85


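#### 결과: Random Forest가 Recall 1.0으로 가장 높은 성능을 보였다. 나머지 모델(Decision Tree, SVM, SGD Classifier, Logistic Regression)은 암인데도 암이 아니라고 판단한 케이스(False Negative)가 각각 7, 11, 9, 6건으로, 암 진단 용도로는 아주 위험한 모델들이다. 단, 이는 테스트 샘플 114개에 대한 단일 분할 결과이므로 교차 검증으로 재확인할 필요가 있다.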