### 목표 : 생선 분류 모델
- 데이터 : fish.csv
- 피쳐 : 5개 Weight, Length, Diagonal, Height, Width
- 타겟 : 1개 Species
- 방법 : 지도학습 + 다중 분류

In [64]:
# 모듈로딩
import pandas as pd
import numpy as np

In [65]:
# Prepare the data: read the fish dataset from CSV into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
data_file = '../data/fish.csv'

fishDF = pd.read_csv(data_file)

# Bare expression on the last line so the notebook renders the table.
fishDF

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.5200,4.0200
1,Bream,290.0,26.3,31.2,12.4800,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.7300,4.4555
4,Bream,430.0,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...,...
154,Smelt,12.2,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,12.4,13.5,2.4300,1.2690
156,Smelt,12.2,13.0,13.8,2.2770,1.2558
157,Smelt,19.7,14.3,15.2,2.8728,2.0672


(2) 학습 위한 데이터 준비

(2-1) 피쳐/타겟 분리

In [66]:
# The first column holds the target; every remaining column is a feature.
target_col, *feature_cols = fishDF.columns
featureDF = fishDF[feature_cols]
targetDF = fishDF[target_col]

In [67]:
# Sanity check: features and target should have the same number of rows.
print(f'featureDF : {featureDF.shape}, targetDF : {targetDF.shape}')

featureDF : (159, 5), targetDF : (159,)


In [68]:
# Check the number of distinct classes in the target.
targetDF.nunique()

7

In [69]:
# Check how many samples each target class has (sorted by frequency).
targetDF.value_counts()

Species
Perch        56
Bream        35
Roach        20
Pike         17
Smelt        14
Parkki       11
Whitefish     6
Name: count, dtype: int64

In [70]:
from sklearn.preprocessing import LabelEncoder

# Fit a label encoder on the species names so they can later be mapped to
# integer class codes.
encoder = LabelEncoder()
encoder.fit(targetDF)

In [71]:
# Encode the species names as integer class codes.
# The codes are computed from the original column in fishDF rather than from
# targetDF itself, so re-running this cell does not feed already-encoded
# integers back into the encoder (which would raise a ValueError for unseen
# labels). The source index is kept so the target stays row-aligned with
# featureDF.
targetDF = pd.Series(encoder.transform(fishDF[fishDF.columns[0]]),
                     index=fishDF.index)
targetDF

0      0
1      0
2      0
3      0
4      0
      ..
154    5
155    5
156    5
157    5
158    5
Length: 159, dtype: int32

(2-2) 학습용/테스트용 데이터셋 준비

In [72]:
from sklearn.model_selection import train_test_split

In [73]:
# Hold out a quarter of the rows for testing. Stratifying on the target keeps
# the class proportions similar in both splits, and the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    featureDF,
    targetDF,
    test_size=0.25,
    random_state=11,
    stratify=targetDF,
)

In [74]:
# Report the shapes of the resulting train and test splits.
print(f'[Train Dataset] {X_train.shape}, {y_train.shape}')
print(f'[Test Dataset] {X_test.shape}, {y_test.shape}')

[Train Dataset] (119, 5), (119,)
[Test Dataset] (40, 5), (40,)


(3) 학습 진행

In [75]:
from sklearn.linear_model import LogisticRegression

In [76]:
# Create the model instance and train it on the training split.
# NOTE(review): per the scikit-learn docs, the 'liblinear' solver handles a
# multiclass target with a one-vs-rest scheme, and recent releases deprecate
# that usage — confirm against the installed version before upgrading.
model = LogisticRegression(max_iter=1000, solver='liblinear')
model.fit(X_train, y_train)

In [77]:
# Inspect the fitted model: class labels, input feature names, the iteration
# cap, and the learned coefficients and intercepts.
print(f'classes_ : {model.classes_}')
print(f'feature_name_in : {model.feature_names_in_}')
print(f'max_iter : {model.max_iter}')
print(f'coef_ : {model.coef_}')
print(f'intercept_ : {model.intercept_}')

classes_ : [0 1 2 3 4 5 6]
feature_name_in : ['Weight' 'Length' 'Diagonal' 'Height' 'Width']
max_iter : 1000
coef_ : [[ 1.31151754e-02 -1.64944470e+00  8.28009575e-01  1.41621595e+00
  -4.15067201e-01]
 [-2.10617657e-02  3.33701594e-01 -9.64909143e-01  2.19381184e+00
   2.66611701e-02]
 [-1.97453974e-03  2.60616873e+00 -2.66412260e+00 -7.93176743e-03
   1.91659551e+00]
 [ 1.01422059e-02  2.55168743e-01  1.51461260e-01 -1.94779290e+00
  -8.36602128e-01]
 [-9.89829706e-03 -1.72578825e+00  1.53807538e+00 -5.12880032e-01
   1.65750894e+00]
 [-7.29426634e-02  3.82049401e-01  1.62783679e-01 -1.55364795e+00
  -5.97839461e-01]
 [ 5.68775586e-03 -5.20399292e-01  2.54546484e-01 -2.46921990e-01
   8.40269158e-01]]
intercept_ : [-0.27362898  0.07982094 -0.34682853 -1.23222237 -1.32590576  0.41907035
 -0.34453293]


(4) 평가

In [78]:
# Score the model on both splits so train and test performance can be compared.
print(f'[Train Score] {model.score(X_train, y_train)}')
print(f'[Test Score] {model.score(X_test, y_test)}')

[Train Score] 0.9495798319327731
[Test Score] 0.975


(5) 모델 활용

In [79]:
# Predict the class of the first test sample. Slicing with iloc[:1] keeps the
# row as a one-row DataFrame, i.e. the 2-D input that predict() expects.
first_sample = X_test.iloc[:1]
y_pre = model.predict(first_sample)

# Show the prediction next to the true label of that same sample.
y_pre, y_test.iloc[:1]

(array([0]),
 1    0
 dtype: int32)

In [80]:
# Class probabilities for the first five test samples, rounded to 3 decimals
# and shown alongside their true labels. The classes are printed first so the
# probability columns can be read against them.
print(model.classes_)
first_five = X_test.iloc[:5]
rounded_proba = model.predict_proba(first_five).round(3)
rounded_proba, y_test.iloc[:5].to_list()

[0 1 2 3 4 5 6]


(array([[0.504, 0.311, 0.   , 0.   , 0.173, 0.   , 0.012],
        [0.158, 0.73 , 0.044, 0.   , 0.057, 0.   , 0.01 ],
        [0.772, 0.024, 0.001, 0.   , 0.18 , 0.   , 0.023],
        [0.001, 0.089, 0.719, 0.002, 0.155, 0.004, 0.03 ],
        [0.   , 0.021, 0.753, 0.009, 0.176, 0.009, 0.031]]),
 [0, 1, 0, 2, 2])

In [81]:
# For each of the first five test samples, take the column index with the
# highest predicted probability.
proba_head = model.predict_proba(X_test.iloc[:5])
result = np.argmax(proba_head, axis=1)
result

array([0, 1, 0, 2, 2], dtype=int64)

In [82]:
# Map the winning column indices back to class labels and pair them with the
# true labels of the same five samples.
predicted_labels = list(model.classes_[result])
true_labels = y_test.iloc[:5].to_list()
data = {"Pre Y": predicted_labels, "True Y": true_labels}

In [83]:
# Render predicted vs. true labels side by side as a table.
pd.DataFrame(data)

Unnamed: 0,Pre Y,True Y
0,0,0
1,1,1
2,0,0
3,2,2
4,2,2


(6) 모델 성능 평가
- 정확도
- 정밀도
- 재현율
- F1-score
- Confusion Matrix
- Classification Report

In [84]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report

In [87]:
# Per-class precision / recall / F1 on the test split. zero_division=0 makes
# metrics with a zero denominator report 0 instead of emitting a warning.
test_pred = model.predict(X_test)
report = classification_report(y_test, test_pred, zero_division=0)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00         3
           2       0.93      1.00      0.97        14
           3       1.00      1.00      1.00         4
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         4
           6       0.00      0.00      0.00         1

    accuracy                           0.97        40
   macro avg       0.85      0.86      0.85        40
weighted avg       0.95      0.97      0.96        40


In [89]:
# F1 score on the test split, averaged across classes with average='weighted'.
print(f1_score(y_test, model.predict(X_test), average='weighted'))

0.9629310344827587


In [90]:
# Recall on the test split, averaged across classes with average='weighted'.
recall_score(y_test, model.predict(X_test), average='weighted')

0.975

In [92]:
# Confusion matrix on the test split: true labels first, predictions second.
print(confusion_matrix(y_test, model.predict(X_test)))

[[ 9  0  0  0  0  0  0]
 [ 0  3  0  0  0  0  0]
 [ 0  0 14  0  0  0  0]
 [ 0  0  0  4  0  0  0]
 [ 0  0  0  0  5  0  0]
 [ 0  0  0  0  0  4  0]
 [ 0  0  1  0  0  0  0]]
