# CNN을 이용한 상품 종류 분류
pickle data를 load해서 상품 품목을 분류하는 알고리즘

In [None]:
# Basic library
import json
import glob
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
import pickle

In [None]:
# 이미지, plotting 관련
import matplotlib.pyplot as plt
from skimage.io import imread
from skimage.transform import resize
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive so that files stored there are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
ls  # check the contents of the current folder

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [None]:
# Move into the project folder on the mounted Drive.
# The path is absolute (Drive is mounted at /content/drive) so the cell can be
# re-run safely: a relative './drive/...' path only resolves while the working
# directory is still the folder that contains 'drive', and fails afterwards.
os.chdir('/content/drive/MyDrive/projects/농산물')

## Import data(pickle format)

In [None]:
# One pickle file per entry in data/ (10 files expected).
# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps
# the sample order, and therefore the seeded train/test split, reproducible.
filenames = sorted(glob.glob('data/*'))

In [None]:
def import_data(filename, seed, n_samples=10000):
  """
  Load one pickle file and draw a seeded random sample from it.

  Args:
    filename: path of a pickle file holding a dict with an 'X' entry and a
      parallel 'file_name' entry (both must support integer-array indexing).
    seed: value used to seed NumPy's global RNG so the draw is repeatable.
    n_samples: number of rows drawn without replacement (default 10000).

  Returns:
    (X, y): the sampled rows of data['X'] and, for each sampled row, a label
    built from the first two '_'-separated tokens of its file name.

  Raises:
    ValueError: if the file holds fewer than n_samples rows.
  """
  # NOTE: pickle.load can execute arbitrary code; only open trusted files.
  with open(filename, 'rb') as f:
    data = pickle.load(f)

  n_total = len(data['X'])
  if n_samples > n_total:
    raise ValueError(
        f'{filename}: requested {n_samples} samples but only {n_total} rows are available')

  np.random.seed(seed)
  random_idx = np.random.choice(n_total, n_samples, replace=False)
  X = data['X'][random_idx]
  y = ['_'.join(name.split('_')[:2]) for name in data['file_name'][random_idx]]

  return X, y

In [None]:
def class_to_int(y):
  """
  Encode class labels as integer ids.

  Ids are assigned in sorted label order, so the mapping is the same on every
  run (iterating a bare set of str depends on per-process hash randomisation).

  Args:
    y: iterable of hashable, mutually comparable labels (str in this notebook).

  Returns:
    list of int, one id per element of y, in the same order as y.
  """
  y = list(y)  # y is traversed twice below; this also accepts one-shot iterables
  label_to_id = {label: idx for idx, label in enumerate(sorted(set(y)))}
  return [label_to_id[label] for label in y]

In [None]:
# Start the dataset with the seeded sample drawn from the first pickle file
X, y_origin = import_data(filename=filenames[0], seed=2021)

In [None]:
# Add the seeded samples of the remaining files to the first file's sample.
# Labels must be accumulated in y_origin, the list the relabelling step reads;
# `y` does not exist yet at this point of the notebook.
X_parts = [X]
for fn in tqdm(filenames[1:]):
  X_temp, y_temp = import_data(fn, 2021)
  X_parts.append(X_temp)
  y_origin.extend(y_temp)
# Concatenate once instead of re-copying the growing array on every iteration
X = np.concatenate(X_parts, axis=0)
del X_parts  # drop the extra references to the per-file arrays

100%|██████████| 9/9 [02:56<00:00, 19.58s/it]


In [None]:
# Apple, pear and potato are not split into sub-classes: any label starting
# with one of these product names collapses to the bare name, and every other
# label is kept unchanged.
y = []
for label in y_origin:
  for product in ('apple', 'pear', 'potato'):
    if label.startswith(product):
      y.append(product)
      break
  else:
    y.append(label)

In [None]:
# The chinese cabbage file names follow a slightly different format, so every
# label beginning with 'chinese' is mapped to one canonical class name.
y = [('chinese cabbage' if label.startswith('chinese') else label) for label in y]

In [None]:
# Replace the string labels with integer class ids
y = class_to_int(y)

In [None]:
# Convert features and labels to NumPy arrays, the format fed to the deep learning model
X, y = np.asarray(X), np.asarray(y)

In [None]:
# Hold out 25% of the samples as a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## CNN Modeling

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

In [None]:
# ref : https://www.tensorflow.org/tutorials/images/classification?hl=ko
num_classes = len(set(y))  # number of distinct class ids present in the labels
img_height, img_width = 100, 100  # NOTE(review): assumes the pickled images are 100x100 — confirm
model = Sequential([
  # Rescale the inputs by 1/255 inside the model; the declared input has 3 channels
  layers.experimental.preprocessing.Rescaling(1./255, input_shape=(img_height, img_width, 3)),
  # Three Conv2D (3x3, relu, 'same' padding) + MaxPooling2D stages with 16, 32 and 64 filters
  layers.Conv2D(16, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(32, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(64, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  # Dense classifier on top of the flattened feature maps
  layers.Flatten(),
  layers.Dense(128, activation='relu'),
  layers.Dense(num_classes)  # no activation: one raw logit per class
])

In [None]:
# Integer labels -> sparse categorical cross-entropy; the loss is told it
# receives logits (from_logits=True)
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train on the training split for 12 epochs and keep the returned history object
epochs = 12
history = model.fit(X_train, y_train, epochs=epochs)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [None]:
# Predict on the test dataset.
# model.predict returns one score per class for every sample; a single
# vectorised argmax over the class axis replaces the per-row Python loop
# (which also reused the name `y` as its loop variable).
y_pred = np.argmax(model.predict(X_test), axis=1)
result = np.mean(y_pred == y_test)   # accuracy on the test set

In [None]:
# Report the product-classification accuracy stored in `result`
print(f'품목 분류 Test accuracy: {result}')

품목 분류 Test accuracy: 0.97928


## VGG-19 Model(pretrained model)
결과적으로 위의 CNN(test accuracy 0.979)보다 낮은 성능(test accuracy 0.946)을 보여, 좋은 성능을 내지는 못함

In [None]:
from keras.applications.vgg19 import VGG19
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten

In [None]:
# VGG19 with ImageNet weights and without its top classifier (include_top=False),
# built for the same input size as the CNN above.
# NOTE(review): no vgg19.preprocess_input call is visible before images reach this
# network — confirm whether raw pixel values are what the pretrained weights expect.
base_model = VGG19(
    weights = 'imagenet',
    # weights = None,  # alternative kept for reference: no pretrained weights
    include_top=False, input_shape=(img_height, img_width, 3))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# Classification head: flatten the base model's output and map it to
# per-class probabilities with a softmax Dense layer
x = Flatten()(base_model.output)
predictions = Dense(num_classes, activation='softmax')(x)

In [None]:
# Assemble base + head into one network; note this rebinds `model`, replacing the CNN defined earlier
model = Model(inputs=base_model.input, outputs=predictions)

In [None]:
# Choose which layers to train: only the newly added top layer is trained, so
# every layer of the pretrained base is frozen (trainable = False fixes its
# weights). Each layer's flag is printed to verify the setting.
for layer in base_model.layers:
    layer.trainable = False
    print(layer, layer.trainable)

<tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7fc7ee0d6908> False
<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fc7ecf307b8> False
<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fc7ecf30898> False
<tensorflow.python.keras.layers.pooling.MaxPooling2D object at 0x7fc7ecf35da0> False
<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fc7edfe0dd8> False
<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fc7ecf308d0> False
<tensorflow.python.keras.layers.pooling.MaxPooling2D object at 0x7fc7ecf46e48> False
<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fc7ee1152b0> False
<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fc7ecf30e48> False
<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fc7ecf46dd8> False
<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fc7ecf57358> False
<tensorflow.python.keras.layers.pooling.MaxPooling2D object at 

In [None]:
# The classification head ends in a softmax Dense layer, so the model emits
# class probabilities; the loss must be told it is NOT receiving logits
# (from_logits=True would apply softmax a second time inside the loss).
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [None]:
# Train on the training split for 7 epochs and keep the returned history object
epochs = 7
history = model.fit(X_train, y_train, epochs=epochs)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [None]:
# Predict on the test dataset.
# model.predict returns one score per class for every sample; a single
# vectorised argmax over the class axis replaces the per-row Python loop
# (which also reused the name `y` as its loop variable).
y_pred = np.argmax(model.predict(X_test), axis=1)
result = np.mean(y_pred == y_test)   # accuracy on the test set

In [None]:
# Report the VGG-19 product-classification accuracy stored in `result`
print(f'품목 분류 by VGG-19 Test accuracy: {result}')

품목 분류 by VGG-19 Test accuracy: 0.94648
