# [E-02]Classification

## load_digits

In [1]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
digits = load_digits()
digits_data = digits.data
digits_label = digits.target

In [3]:
digits.target_names

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [4]:
import pandas as pd

digits_df = pd.DataFrame(data=digits_data, columns=digits.feature_names)
digits_df

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,4.0,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0


### Decision Tree

In [5]:
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(digits_data, 
                                                    digits_label, 
                                                    test_size=0.2, 
                                                    random_state=7)

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
decision_tree_y_pred = decision_tree.predict(X_test)

print(classification_report(y_test, decision_tree_y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        43
           1       0.80      0.93      0.86        42
           2       0.86      0.80      0.83        40
           3       0.94      0.91      0.93        34
           4       0.85      0.95      0.90        37
           5       0.84      0.96      0.90        28
           6       0.87      0.93      0.90        28
           7       0.87      0.82      0.84        33
           8       0.87      0.63      0.73        43
           9       0.81      0.81      0.81        32

    accuracy                           0.87       360
   macro avg       0.87      0.87      0.87       360
weighted avg       0.87      0.87      0.87       360



### Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
random_forest_y_pred = random_forest.predict(X_test)

print(classification_report(y_test, random_forest_y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.91      1.00      0.95        42
           2       1.00      1.00      1.00        40
           3       0.97      1.00      0.99        34
           4       0.93      1.00      0.96        37
           5       0.90      1.00      0.95        28
           6       1.00      0.93      0.96        28
           7       0.94      0.97      0.96        33
           8       1.00      0.84      0.91        43
           9       1.00      0.94      0.97        32

    accuracy                           0.96       360
   macro avg       0.97      0.96      0.96       360
weighted avg       0.97      0.96      0.96       360



### SVM

In [7]:
from sklearn import svm

svm_model = svm.SVC()
svm_model.fit(X_train, y_train)
svm_y_pred = svm_model.predict(X_test)

print(classification_report(y_test, svm_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       0.95      1.00      0.98        42
           2       1.00      1.00      1.00        40
           3       1.00      1.00      1.00        34
           4       1.00      1.00      1.00        37
           5       0.93      1.00      0.97        28
           6       1.00      1.00      1.00        28
           7       1.00      1.00      1.00        33
           8       1.00      0.93      0.96        43
           9       1.00      0.97      0.98        32

    accuracy                           0.99       360
   macro avg       0.99      0.99      0.99       360
weighted avg       0.99      0.99      0.99       360



### SGD Classifier

In [8]:
from sklearn.linear_model import SGDClassifier

sgd_model = SGDClassifier()
sgd_model.fit(X_train, y_train)
sgd_model_y_pred = sgd_model.predict(X_test)

print(classification_report(y_test, sgd_model_y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        43
           1       0.76      0.98      0.85        42
           2       1.00      1.00      1.00        40
           3       0.97      0.88      0.92        34
           4       0.95      0.97      0.96        37
           5       0.90      1.00      0.95        28
           6       0.96      0.93      0.95        28
           7       0.97      0.97      0.97        33
           8       0.97      0.77      0.86        43
           9       0.97      0.88      0.92        32

    accuracy                           0.93       360
   macro avg       0.94      0.93      0.94       360
weighted avg       0.94      0.93      0.93       360



### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(max_iter = 4000)
logistic_model.fit(X_train, y_train)
logistic_model_y_pred = logistic_model.predict(X_test)

print(classification_report(y_test, logistic_model_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       0.95      0.95      0.95        42
           2       0.98      1.00      0.99        40
           3       0.94      0.97      0.96        34
           4       1.00      1.00      1.00        37
           5       0.79      0.96      0.87        28
           6       1.00      0.96      0.98        28
           7       0.94      0.97      0.96        33
           8       0.92      0.81      0.86        43
           9       0.97      0.88      0.92        32

    accuracy                           0.95       360
   macro avg       0.95      0.95      0.95       360
weighted avg       0.95      0.95      0.95       360



## load_wine

In [10]:
from sklearn.datasets import load_wine

In [11]:
wine = load_wine()
wine_data = wine.data
wine_label = wine.target

In [12]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [31]:
digits_df = pd.DataFrame(data=wine_data, columns=wine.feature_names)
digits_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


In [13]:
X_train, X_test, y_train, y_test = train_test_split(wine_data,
                                                   wine_label, 
                                                   test_size=0.2, 
                                                   random_state=7)

### Decision Tree

In [14]:
decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(X_train, y_train)
decision_tree_y_pred = decision_tree.predict(X_test)
print(classification_report(y_test, decision_tree_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.89      1.00      0.94        17
           2       1.00      0.83      0.91        12

    accuracy                           0.94        36
   macro avg       0.96      0.94      0.95        36
weighted avg       0.95      0.94      0.94        36



### Random Forest

In [15]:
random_forest = RandomForestClassifier(random_state=32)
random_forest.fit(X_train, y_train)
random_forest_y_pred = random_forest.predict(X_test)
print(classification_report(y_test, random_forest_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00        17
           2       1.00      1.00      1.00        12

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



### SVM

In [16]:
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)
svm_model_y_pred = svm_model.predict(X_test)
print(classification_report(y_test, svm_model_y_pred))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86         7
           1       0.58      0.88      0.70        17
           2       0.33      0.08      0.13        12

    accuracy                           0.61        36
   macro avg       0.59      0.61      0.56        36
weighted avg       0.55      0.61      0.54        36



### SGD Classifier

In [17]:
sgd_model = SGDClassifier()
sgd_model.fit(X_train, y_train)
sgd_model_y_pred = sgd_model.predict(X_test)
print(classification_report(y_test, sgd_model_y_pred))

              precision    recall  f1-score   support

           0       1.00      0.29      0.44         7
           1       1.00      0.18      0.30        17
           2       0.39      1.00      0.56        12

    accuracy                           0.47        36
   macro avg       0.80      0.49      0.43        36
weighted avg       0.80      0.47      0.41        36



### Logistic Regression

In [18]:
logistic_model = LogisticRegression(max_iter=4096)
logistic_model.fit(X_train, y_train)
logistic_model_y_pred = logistic_model.predict(X_test)
print(classification_report(y_test, logistic_model_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.94      1.00      0.97        17
           2       1.00      0.92      0.96        12

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.98        36
weighted avg       0.97      0.97      0.97        36



## load_breast_cancer

In [19]:
from sklearn.datasets import load_breast_cancer

In [20]:
breast_cancer = load_breast_cancer()
breast_cancer_data = breast_cancer.data
breast_cancer_label = breast_cancer.target

In [21]:
breast_cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [30]:
digits_df = pd.DataFrame(data=breast_cancer_data, columns=breast_cancer.feature_names)
digits_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [22]:
X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data, 
                                                   breast_cancer_label, 
                                                   test_size=0.2, 
                                                   random_state=7)

### Decision Tree

In [23]:
decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(X_train, y_train)
decision_tree_y_pred = decision_tree.predict(X_test)
print(classification_report(y_test, decision_tree_y_pred))

              precision    recall  f1-score   support

           0       0.92      0.82      0.87        40
           1       0.91      0.96      0.93        74

    accuracy                           0.91       114
   macro avg       0.91      0.89      0.90       114
weighted avg       0.91      0.91      0.91       114



### Random Forest

In [24]:
random_forest = RandomForestClassifier(random_state=32)
random_forest.fit(X_train, y_train)
random_forest_y_pred = random_forest.predict(X_test)
print(classification_report(y_test, random_forest_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        74

    accuracy                           1.00       114
   macro avg       1.00      1.00      1.00       114
weighted avg       1.00      1.00      1.00       114



### SVM

In [25]:
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)
svm_model_y_pred = svm_model.predict(X_test)
print(classification_report(y_test, svm_model_y_pred))

              precision    recall  f1-score   support

           0       1.00      0.72      0.84        40
           1       0.87      1.00      0.93        74

    accuracy                           0.90       114
   macro avg       0.94      0.86      0.89       114
weighted avg       0.92      0.90      0.90       114



### SGD Classifier

In [26]:
sgd_model = SGDClassifier()
sgd_model.fit(X_train, y_train)
sgd_model_y_pred = sgd_model.predict(X_test)
print(classification_report(y_test, sgd_model_y_pred))

              precision    recall  f1-score   support

           0       0.92      0.82      0.87        40
           1       0.91      0.96      0.93        74

    accuracy                           0.91       114
   macro avg       0.91      0.89      0.90       114
weighted avg       0.91      0.91      0.91       114



### Logistic Regression

In [27]:
logistic_model = LogisticRegression(max_iter=4096)
logistic_model.fit(X_train, y_train)
logistic_model_y_pred = logistic_model.predict(X_test)
print(classification_report(y_test, logistic_model_y_pred))

              precision    recall  f1-score   support

           0       1.00      0.85      0.92        40
           1       0.93      1.00      0.96        74

    accuracy                           0.95       114
   macro avg       0.96      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114



# 회고

* load_digits:  
    digits 데이터 셋은 SVM 모델의 accuracy가 가장 높게 나왔다. 이미지는 2차원 배열이기 때문일 것이다.  
* load_wine:  
    wine 데이터 셋은 Random Forest 모델의 accuracy가 가장 높게 나왔다. 와인의 종류가 여러 가지이기 때문일 것이다.  
* load_breast_cancer:  
    breast cancer 데이터 셋은 Random Forest 모델의 accuracy가 가장 높게 나왔다. coloumn의 개수가 많기 때문일 것이다.  
    
여러 번 반복해서 코드를 쳐보니 분류 데이터 셋을 모델 학습시키는 절차가 조금 익숙하게 느껴졌다. 그런데 아직 이론적인 부분은 잘 모르겠다. 모델의 성능을 알아보는 방법도 여러 가지 있지만, 어떤 모델에 어떤 방법을 사용해야 하는지 몰라서 그나마 익숙한 accuracy로 판단해 보았다. 노드를 다시 한번 읽어봐야겠다.