# CNN을 이용한 상품 품질(등급) 분류
품목별 pickle 데이터를 불러와 CNN으로 상품 등급(특, 상, 보통)을 분류하는 노트북입니다.

In [None]:
import json
import glob
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
import pickle

In [None]:
# Image I/O, plotting, and train/test splitting utilities
import matplotlib.pyplot as plt
from skimage.io import imread
from skimage.transform import resize
from sklearn.model_selection import train_test_split

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
os.chdir('./drive/MyDrive/projects/농산물')  # change the current working directory to the project folder

In [None]:
filenames = glob.glob('data/*')  # paths of the pickle files under data/ (10 files in the listing shown for this run)

In [None]:
filenames

['data/chinese.pkl',
 'data/garlic.pkl',
 'data/mandarin.pkl',
 'data/onion.pkl',
 'data/pear.pkl',
 'data/persimmon.pkl',
 'data/potato.pkl',
 'data/radish.pkl',
 'data/apple.pkl',
 'data/cabbage.pkl']

In [None]:
def import_data(filename, seed, n_samples=10000):
  """
  Load one produce pickle file and draw a random subset of its samples.

  The pickle must hold a mapping with an 'X' entry and a 'file_name' entry
  that are indexed with the same NumPy integer array (presumably index-aligned
  ndarrays -- confirm against the code that writes the pickles).

  Args:
    filename: path of the pickle file to read.
    seed: value passed to np.random.seed before sampling, so the same subset
      is drawn on every call with the same arguments.
    n_samples: maximum number of samples to draw without replacement. When the
      file holds fewer samples, all of them are returned (in shuffled order)
      instead of raising ValueError.

  Returns:
    A tuple (X, y, fn): the sampled entries of data['X'], a list with the
    first two '_'-separated tokens of each sampled file name re-joined by '_',
    and the sampled file names themselves.
  """
  # NOTE: pickle.load can execute arbitrary code; only open trusted files.
  with open(filename, 'rb') as f:
    data = pickle.load(f)  # deserializes the whole stored object in one call

  n_available = len(data['X'])
  n_draw = min(n_samples, n_available)

  # Reseeds NumPy's *global* RNG; kept as-is so anything downstream that
  # relies on the global seed behaves exactly as before.
  np.random.seed(seed)
  random_idx = np.random.choice(n_available, n_draw, replace=False)

  fn = data['file_name'][random_idx]
  X = data['X'][random_idx]
  y = ['_'.join(name.split('_')[:2]) for name in fn]
  return X, y, fn

In [None]:
def class_to_int(y):
  """
  Encode string class labels as integer ids.

  Ids are assigned in sorted label order. Iterating a bare set of strings
  (as done previously) yields an order that can change between interpreter
  runs, which made the label -> id mapping non-reproducible.

  Args:
    y: iterable of hashable, mutually comparable labels (e.g. grade strings).

  Returns:
    A list of ints, one per element of y, where equal labels share an id and
    ids run from 0 to (number of distinct labels - 1).
  """
  label_to_id = {label: idx for idx, label in enumerate(sorted(set(y)))}
  return [label_to_id[label] for label in y]

In [None]:
def split_sub_class(data):
  """
  Group the samples of one produce type by sub-class and extract their grades.

  Args:
    data: an (X, y, file_names) triple as returned by import_data. Only
      data[0] (samples) and data[2] (file names) are read.

  Returns:
    A dict with:
      'X': data[0], passed through unchanged.
      'y': NumPy array of upper-cased grade tokens, one per file name.
      'subclass': dict mapping each sub-class name to the list of sample
        indices belonging to it. When the first '_'-separated token of the
        last file name is 'apple', 'pear' or 'potato', this is replaced by a
        single 'integrated' entry covering every sample index.
  """
  file_names = data[2]
  subclass_list = []
  grade_list = []
  product = None  # first token of the most recently parsed file name

  for name in file_names:
    tokens = name.split('_')
    product = tokens[0]
    # Two file-name layouts exist (the original note says Chinese cabbage is
    # formatted differently from the other produce): if the second token is
    # not a grade, it is the sub-class and the grade follows it.
    # NOTE(review): this membership test is case-sensitive although grades are
    # upper-cased below -- confirm a lowercase grade never appears in the
    # second position.
    if tokens[1] not in ['M', 'L', 'S']:
      subclass_list.append(tokens[1])
      grade_list.append(tokens[2])
    else:
      subclass_list.append(tokens[0])
      grade_list.append(tokens[1])

  # Single pass: collect the sample indices of every sub-class.
  sb_dict = {}
  for idx, sub in enumerate(subclass_list):
    sb_dict.setdefault(sub, []).append(idx)

  result = {}
  result['X'] = data[0]
  result['y'] = np.array([grade.upper() for grade in grade_list])
  result['subclass'] = sb_dict
  if product in ['apple', 'pear', 'potato']:
    # These products are handled as one group; cover the samples actually
    # present rather than assuming exactly 10000 of them.
    result['subclass'] = {'integrated': list(range(len(subclass_list)))}
  return result

In [None]:
result = {}  # maps '<pickle base name>_<subclass>' to the test-set accuracy; filled by the training loop

In [None]:
# Train and evaluate one small CNN per (produce file, sub-class) pair and
# record its test accuracy in `result`.

# Loop-invariant settings, hoisted out of the loops.
img_height, img_width = 100, 100  # input size declared to the first layer
epochs = 10

for fn in tqdm(filenames):
  data = import_data(fn, 2021)
  data_splited = split_sub_class(data)
  pure_name = fn.split('/')[-1].split('.')[0]  # e.g. 'data/apple.pkl' -> 'apple'

  for sn, idxes in data_splited['subclass'].items():
    # Select this sub-class and convert to arrays the network can consume.
    X_sub = np.asarray(data_splited['X'][idxes])
    y_sub = np.asarray(class_to_int(data_splited['y'][idxes]))  # str grades -> int ids

    X_train, X_test, y_train, y_test = train_test_split(X_sub, y_sub, test_size=0.25, random_state=42)

    num_classes = len(set(y_sub))  # number of distinct grade labels in this subset

    # A new model is built on every iteration; release the Keras global state
    # left behind by the previous one so memory does not keep growing.
    tf.keras.backend.clear_session()

    model = Sequential([
      # Scales inputs by 1/255 (presumably 0-255 pixel values -- confirm).
      layers.experimental.preprocessing.Rescaling(1./255, input_shape=(img_height, img_width, 3)),
      layers.Conv2D(16, 3, padding='same', activation='relu'),
      layers.MaxPooling2D(),
      layers.Conv2D(32, 3, padding='same', activation='relu'),
      layers.MaxPooling2D(),
      layers.Conv2D(64, 3, padding='same', activation='relu'),
      layers.MaxPooling2D(),
      layers.Flatten(),
      layers.Dense(128, activation='relu'),
      layers.Dense(num_classes)  # raw logits; the loss below uses from_logits=True
    ])

    model.compile(optimizer='adam',
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    history = model.fit(
      X_train, y_train,
      epochs=epochs,
      verbose=0  # silent
    )

    # Predict on the held-out split: the class id is the arg-max logit per row.
    y_pred = np.argmax(model.predict(X_test), axis=1)
    result[f'{pure_name}_{sn}'] = np.mean(y_pred == y_test)   # test-set accuracy

100%|██████████| 10/10 [03:34<00:00, 21.44s/it]


In [None]:
result  # test accuracy

{'apple_integrated': 0.8348,
 'cabbage_green': 0.8571428571428571,
 'cabbage_red': 0.9778794813119756,
 'chinese_chinese cabbage': 0.9264,
 'garlic_uiseong': 0.9736,
 'mandarin_hallabong': 0.6783103168155971,
 'mandarin_onjumilgam': 0.8251968503937008,
 'onion_red': 0.9943729903536977,
 'onion_white': 0.9952267303102625,
 'pear_integrated': 0.79,
 'persimmon_bansi': 0.9721407624633431,
 'persimmon_booyu': 0.9968186638388123,
 'persimmon_daebong': 0.8540478905359179,
 'potato_seolbong': 0.9209914794732765,
 'potato_sumi': 0.6685950413223141,
 'radish_winter radish': 0.9268}