### 모델 성능 검증 

In [74]:
import pandas as pd 

# 광물 데이터 
df = pd.read_csv("./sonar3.csv", header=None)
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,0
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,0
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,0
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,0
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,0


In [75]:
df[60].value_counts()

# 일반 암석 : 97, 광석 : 111

60
1    111
0     97
Name: count, dtype: int64

#### 데이터 분리 

In [76]:
x = df.iloc[:, 0:60]
y = df.iloc[:, 60]

##### 딥러닝 실행 

In [77]:
from tensorflow import keras 
from keras import Sequential, Input 
from keras.layers import Dense

##### 모델 설계

In [78]:
model = Sequential()
model.add(Input(shape=(60,)))
model.add(Dense(24, activation="relu"))
model.add(Dense(10, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

In [79]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [80]:
history = model.fit(x,y, epochs=200, batch_size=10, verbose=2)

Epoch 1/200
21/21 - 0s - loss: 0.6758 - accuracy: 0.6250 - 225ms/epoch - 11ms/step
Epoch 2/200
21/21 - 0s - loss: 0.6496 - accuracy: 0.6635 - 14ms/epoch - 689us/step
Epoch 3/200
21/21 - 0s - loss: 0.6327 - accuracy: 0.7500 - 14ms/epoch - 679us/step
Epoch 4/200
21/21 - 0s - loss: 0.6162 - accuracy: 0.7692 - 15ms/epoch - 707us/step
Epoch 5/200
21/21 - 0s - loss: 0.6025 - accuracy: 0.7404 - 14ms/epoch - 687us/step
Epoch 6/200
21/21 - 0s - loss: 0.5846 - accuracy: 0.7885 - 13ms/epoch - 637us/step
Epoch 7/200
21/21 - 0s - loss: 0.5693 - accuracy: 0.7837 - 13ms/epoch - 636us/step
Epoch 8/200
21/21 - 0s - loss: 0.5508 - accuracy: 0.7837 - 13ms/epoch - 625us/step
Epoch 9/200
21/21 - 0s - loss: 0.5325 - accuracy: 0.8077 - 13ms/epoch - 628us/step
Epoch 10/200
21/21 - 0s - loss: 0.5211 - accuracy: 0.7692 - 13ms/epoch - 619us/step
Epoch 11/200
21/21 - 0s - loss: 0.5074 - accuracy: 0.7692 - 13ms/epoch - 634us/step
Epoch 12/200
21/21 - 0s - loss: 0.4882 - accuracy: 0.8029 - 14ms/epoch - 647us/step
E

##### 정확도가 거의 100%에 가까움 -> 어떤 광물이라도 100% 확률로 판별하는 모델이라고 하기 어려움 

### 과적합 
<b>모델이 학습 데이터셋 안에서 일정 수준 이상의 예측 정확도를 보이지만, 새로운 데이터에 적용하면 맞지 않음 </b>

<h4>해결 방법</h4>
학습셋, 테스트셋으로 완전히 구분 -> 학습 + 테스트 병행
1. 학습셋으로 딥러닝 학습 
2. 학습 결과 저장 
3. 테스트셋으로 검증 및 예측 수행 

##### 은닉층 개수에 따른 학습셋 및 테스트셋에서 예측률 
은닉층 개수가 올라감에 따라 학습셋(계속 상승) & 테스트셋(상승하다가 떨어짐)

In [81]:
# 사이킷런 사용 
from sklearn.model_selection import train_test_split

# 학습셋, 테스트셋 분리 
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, shuffle=True)

In [82]:
# 만든 모델을 테스트셋에 적용 : model.evaluate()
# model.evaluate() : loss[0], accuracy[1] 계산하여 출력 

score = model.evaluate(x_test, y_test)
print("학습용 손실 : %.2f %%" % (score[0]*100))
print("학습용 정확도 : %.2f %%" % (score[1]*100))

학습용 손실 : 1.23 %
학습용 정확도 : 100.00 %


In [83]:
model = Sequential()
model.add(Input(shape=(60,)))
model.add(Dense(24, activation="relu"))
model.add(Dense(10, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

history = model.fit(x_train, y_train, epochs=200, batch_size=10, verbose=2)

score = model.evaluate(x_test, y_test)
print("테스트 손실 : %.2f %%" % (score[0]*100))
print("테스트 정확도 : %.2f %%" % (score[1]*100))

Epoch 1/200
15/15 - 0s - loss: 0.7380 - accuracy: 0.4759 - 213ms/epoch - 14ms/step
Epoch 2/200
15/15 - 0s - loss: 0.6945 - accuracy: 0.4759 - 11ms/epoch - 723us/step
Epoch 3/200
15/15 - 0s - loss: 0.6817 - accuracy: 0.5517 - 10ms/epoch - 694us/step
Epoch 4/200
15/15 - 0s - loss: 0.6722 - accuracy: 0.6207 - 10ms/epoch - 693us/step
Epoch 5/200
15/15 - 0s - loss: 0.6659 - accuracy: 0.6621 - 10ms/epoch - 676us/step
Epoch 6/200
15/15 - 0s - loss: 0.6473 - accuracy: 0.7103 - 10ms/epoch - 683us/step
Epoch 7/200
15/15 - 0s - loss: 0.6255 - accuracy: 0.6897 - 10ms/epoch - 671us/step
Epoch 8/200
15/15 - 0s - loss: 0.6150 - accuracy: 0.6828 - 10ms/epoch - 647us/step
Epoch 9/200
15/15 - 0s - loss: 0.6005 - accuracy: 0.7310 - 10ms/epoch - 680us/step
Epoch 10/200
15/15 - 0s - loss: 0.5908 - accuracy: 0.7310 - 10ms/epoch - 660us/step
Epoch 11/200
15/15 - 0s - loss: 0.5802 - accuracy: 0.7655 - 10ms/epoch - 645us/step
Epoch 12/200
15/15 - 0s - loss: 0.5704 - accuracy: 0.7517 - 10ms/epoch - 657us/step
E

In [84]:
# 모델 저장하기 : model.save()
model.save("./my_model_13.hdf5") 

# 모델 불러오기 
from keras.models import load_model

del model # 메모리에 저장된 모델 삭제 

# 모델 불러오기 
model = load_model("./my_model_13.hdf5")

# 모델을 테스트셋에 적용하여 정확도 구하기 
print("학습용: %.2f%%" % (100*model.evaluate(x_train, y_train, verbose=0)[1]))
print("시험용: %.2f%%" % (100*model.evaluate(x_test, y_test, verbose=0)[1]))

학습용: 100.00%
시험용: 85.71%


  saving_api.save_model(


#### K-Fold Cross-Validation  (CV)

<b>데이터셋을 여러 개로 나누어 하나씩 테스트셋으로 사용하고 나머지를 합하여 학습셋으로 사용</b>

In [85]:
# KFold 함수 사용 
from sklearn.model_selection import KFold

k = 5 # 몇 개로 나눌 것인지 정함 
kfold = KFold(n_splits=k, shuffle=True) # k개 만큼 분리, 데이터 셔플 ㅇㅇ 
acc_score = [] # 정확도 저장 변수 

# k개의 학습셋, 테스트셋으로 분리 
for train_idx, test_idx in kfold.split(x):
    x_train, x_test = x.iloc[train_idx, :], x.iloc[test_idx, : ] 
    y_train, y_test = y.iloc[train_idx] , y.iloc[test_idx]


In [86]:
# 정확도 구하기 
accuarcy = model.evaluate(x_test, y_test, verbose=2)[1]
acc_score.append(accuarcy)

2/2 - 0s - loss: 0.0396 - accuracy: 1.0000 - 10ms/epoch - 5ms/step


In [87]:
# 모델 학습까지 작성 
for train_index, test_index in kfold.split(x):
  x_train = x.iloc[train_index, 0:60]
  x_test = x.iloc[test_index, :]

  y_train = y.iloc[train_index]
  y_test = y.iloc[test_index]

  model = Sequential()
  model.add(Input(shape=(60, )))  # 입력층 추가!
  model.add(Dense(24, activation="relu"))  # 밀집층 추가! (은닉층)
  model.add(Dense(10, activation="relu"))  # 밀집층 추가! (은닉층)
  model.add(Dense(1, activation="sigmoid"))  # 밀집층 추가! -> 출력층!
  
  model.compile(loss="binary_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])
  
  model.fit(x_train, y_train, epochs=200, batch_size=10, verbose=0)

  acc_score.append(model.evaluate(x_test, y_test, verbose=0)[1])
  

# 정확도 구하기 
avg_acc = sum(acc_score)/k

# 값 출력 
print("정확도 : ", acc_score)

# 평균 출력 
print("모델의 평균 정확도:", sum(acc_score) / k)

정확도 :  [1.0, 0.8809523582458496, 0.8333333134651184, 0.761904776096344, 0.8048780560493469, 0.8048780560493469]
모델의 평균 정확도: 1.0171893119812012
