In [1]:
# 필수 라이브러리
import pandas as pd
import numpy as np
import random
import tensorflow as tf

# 랜덤 시드 고정
SEED=12
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)  
print("시드 고정: ", SEED)

시드 고정:  12


In [14]:
data = pd.read_csv("data/train.csv")

print(data.shape)

(5497, 14)


In [15]:
data.head(2)

Unnamed: 0,index,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0,5,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,white
1,1,5,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,red


In [16]:
data['quality']

0       5
1       5
2       5
3       6
4       6
       ..
5492    5
5493    6
5494    7
5495    5
5496    6
Name: quality, Length: 5497, dtype: int64

In [18]:
data['type'].value_counts()

0    5497
Name: type, dtype: int64

In [19]:
data['type'] = np.where(train['type']=='white', 1, 0).astype(int)
data['type'].value_counts()

0    5497
Name: type, dtype: int64

In [20]:
data['quality'].value_counts()

6    2416
5    1788
7     924
4     186
8     152
3      26
9       5
Name: quality, dtype: int64

In [21]:
from tensorflow.keras.utils import to_categorical

y_data = to_categorical(data.loc[:, 'quality'] - 3)
y_data

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [28]:
# 피처 선택
X_data = data.loc[:, 'fixed acidity':]
X_data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,0
1,8.8,0.610,0.14,2.4,0.067,10.0,42.0,0.99690,3.19,0.59,9.5,0
2,7.9,0.210,0.39,2.0,0.057,21.0,138.0,0.99176,3.05,0.52,10.9,0
3,7.0,0.210,0.31,6.0,0.046,29.0,108.0,0.99390,3.26,0.50,10.8,0
4,7.8,0.400,0.26,9.5,0.059,32.0,178.0,0.99550,3.04,0.43,10.9,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5492,7.7,0.150,0.29,1.3,0.029,10.0,64.0,0.99320,3.35,0.39,10.1,0
5493,6.3,0.180,0.36,1.2,0.034,26.0,111.0,0.99074,3.16,0.51,11.0,0
5494,7.8,0.150,0.34,1.1,0.035,31.0,93.0,0.99096,3.07,0.72,11.3,0
5495,6.6,0.410,0.31,1.6,0.042,18.0,101.0,0.99195,3.13,0.41,10.5,0


In [30]:
# 피처 스케일링
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(X_data)
X_data_scaled = scaler.fit_transform(X_data)

In [31]:
print(X_data_scaled.shape, y_data.shape)

(5497, 12) (5497, 7)


In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data_scaled, y_data, 
                                                    test_size=0.2, 
                                                    shuffle=True, 
                                                    random_state=100)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(4397, 12) (4397, 7)
(1100, 12) (1100, 7)


In [41]:
X_train.shape[1]

12

In [39]:
# 심층 신경망 모델
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

def build_model(train_data, train_target):
    model = Sequential()
    model.add(Dense(128, activation='tanh', input_dim=X_train.shape[1]))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='tanh'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='tanh'))
    model.add(Dense(train_target.shape[1], activation='softmax'))

    model.compile(optimizer='RMSprop', loss='categorical_crossentropy', 
                metrics=['acc', 'mae'])

    return model

model = build_model(X_train_scaled, y_train)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 128)               1664      
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_5 (Dense)             (None, 64)                8256      
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 32)                2080      
                                                                 
 dense_7 (Dense)             (None, 7)                 231       
                                                                 
Total params: 12,231
Trainable params: 12,231
Non-trai

In [49]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.1, 
                                            shuffle=True, random_state=SEED)

early_stopping = EarlyStopping(monitor='val_loss',  patience=10)
history = model.fit(X_tr, y_tr, batch_size=64, epochs=200,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping],                    
                    verbose=2)

Epoch 1/200
62/62 - 0s - loss: 1.0296 - acc: 0.5598 - mae: 0.1619 - val_loss: 1.0296 - val_acc: 0.5341 - val_mae: 0.1615 - 100ms/epoch - 2ms/step
Epoch 2/200
62/62 - 0s - loss: 1.0293 - acc: 0.5537 - mae: 0.1616 - val_loss: 1.0216 - val_acc: 0.5523 - val_mae: 0.1615 - 83ms/epoch - 1ms/step
Epoch 3/200
62/62 - 0s - loss: 1.0299 - acc: 0.5540 - mae: 0.1616 - val_loss: 1.0429 - val_acc: 0.5432 - val_mae: 0.1660 - 82ms/epoch - 1ms/step
Epoch 4/200
62/62 - 0s - loss: 1.0271 - acc: 0.5605 - mae: 0.1616 - val_loss: 1.0224 - val_acc: 0.5523 - val_mae: 0.1626 - 94ms/epoch - 2ms/step
Epoch 5/200
62/62 - 0s - loss: 1.0274 - acc: 0.5648 - mae: 0.1615 - val_loss: 1.0242 - val_acc: 0.5477 - val_mae: 0.1617 - 84ms/epoch - 1ms/step
Epoch 6/200
62/62 - 0s - loss: 1.0284 - acc: 0.5572 - mae: 0.1619 - val_loss: 1.0204 - val_acc: 0.5591 - val_mae: 0.1604 - 82ms/epoch - 1ms/step
Epoch 7/200
62/62 - 0s - loss: 1.0274 - acc: 0.5633 - mae: 0.1616 - val_loss: 1.0295 - val_acc: 0.5591 - val_mae: 0.1620 - 107ms/

In [50]:
loss, acc, _ = model.evaluate(X_val, y_val)
print('loss>> ', loss)
print('acc>> ', acc)

loss>>  1.0449318885803223
acc>>  0.5454545617103577


In [51]:
# test 데이터에 대한 예측값 정리
y_pred_proba = model.predict(X_test)
y_pred_proba[:5]

array([[3.3087403e-04, 3.2664004e-03, 5.5649336e-02, 4.5105955e-01,
        4.5133883e-01, 3.8333744e-02, 2.1240527e-05],
       [1.0129079e-03, 4.0062489e-03, 1.0825557e-01, 6.0407233e-01,
        2.1205018e-01, 6.9749780e-02, 8.5301540e-04],
       [3.5701350e-03, 1.2336301e-02, 1.0268133e-01, 6.0857946e-01,
        2.3046944e-01, 4.1447483e-02, 9.1583433e-04],
       [8.3621620e-04, 2.4825364e-02, 1.2484629e-01, 5.4735380e-01,
        2.7213880e-01, 2.9942650e-02, 5.6844616e-05],
       [6.1295956e-04, 1.8178798e-02, 5.0340134e-01, 4.5329362e-01,
        2.4066139e-02, 4.4642010e-04, 7.9762265e-07]], dtype=float32)

In [52]:
y_pred_label = np.argmax(y_pred_proba, axis=-1) + 3
y_pred_label[:5]

array([7, 6, 6, 6, 5], dtype=int64)

In [None]:
######[ test데이터를 바꾸어 테스트 해보자]####### 

In [53]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                            shuffle=True, random_state=200)

In [59]:
# test 데이터에 대한 예측값 정리
y_pred_proba = model.predict(X_val)
y_pred_proba[:5]

array([[1.4871861e-03, 1.7312469e-02, 1.4497018e-01, 5.8491778e-01,
        2.1505442e-01, 3.5792690e-02, 4.6518020e-04],
       [1.6229150e-03, 1.6559653e-03, 2.6314253e-02, 4.5993119e-01,
        4.0891257e-01, 1.0131080e-01, 2.5234459e-04],
       [9.9172187e-04, 1.8438471e-02, 2.4883485e-01, 5.6535047e-01,
        1.5928097e-01, 7.0756576e-03, 2.7887418e-05],
       [3.7104872e-04, 8.1696734e-03, 3.0554438e-01, 5.8799380e-01,
        8.6080633e-02, 1.1820494e-02, 1.9916180e-05],
       [3.4547567e-03, 6.9892280e-02, 6.8983310e-01, 2.2064486e-01,
        1.5097243e-02, 1.0644988e-03, 1.3244996e-05]], dtype=float32)

In [60]:
y_pred_label = np.argmax(y_pred_proba, axis=-1) + 3
y_pred_label[:5]

array([6, 6, 6, 6, 5], dtype=int64)