# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import os
from PIL import Image
from tqdm.notebook import tqdm
import pickle 
import random

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchvision import transforms
import torchvision.models as models

In [2]:
# Choose the compute device once; later cells move the model and batches to it.
_cuda_ready = torch.cuda.is_available()
device = torch.device('cuda:0' if _cuda_ready else 'cpu')

if not _cuda_ready:
  print('GPU is not avalible.')
  print('Working on CPU')
else:
  print('GPU is avalible.')
  print('Working on:', torch.cuda.get_device_name())

GPU is avalible.
Working on: GeForce GTX 750 Ti


建立字典，用來儲存圖片中含有的瑕疵

In [4]:
def _load_defect_labels(csv_path):
  """Parse an annotation CSV into ``{first field: set of defect labels}``.

  Every non-blank line is split on commas and empty fields are discarded
  (rows may be padded with trailing commas). The first remaining field is
  used as the dictionary key (the Dataset class looks entries up by image
  file name). The rest is read as consecutive groups of five fields, and the
  fifth field of each group is kept as a defect label.
  NOTE(review): the first four fields of a group are presumably bounding-box
  values -- confirm against the dataset description.

  Args:
    csv_path: path of the CSV file (read as UTF-8, BOM tolerated).

  Returns:
    dict mapping each key to the set of its labels. A row with no groups maps
    to an empty set. If a key occurs on several lines the last line wins.

  Raises:
    ValueError: if a row's fields after the key are not a multiple of five.
  """
  labels_by_image = {}
  with open(csv_path, 'r', encoding='utf-8-sig') as fh:
    for line_no, line in enumerate(fh, start=1):
      fields = [field for field in line.strip().split(',') if field != '']
      if not fields:
        # Blank line (e.g. trailing newline at the end of the file).
        continue
      if (len(fields) - 1) % 5 != 0:
        raise ValueError(
            '{}:{}: expected groups of 5 fields after the file name, got {} fields'
            .format(csv_path, line_no, len(fields) - 1))
      # fields[0] is the key; labels sit at positions 5, 10, 15, ...
      labels_by_image[fields[0]] = set(fields[5::5])
  return labels_by_image


train_data = _load_defect_labels('./C2_TrainDev/train.csv')

#===============================================================================
dev_data = _load_defect_labels('./C2_TrainDev/dev.csv')

圖片處理

In [6]:
# Preprocessing applied to every image before it is fed to the networks:
# tensor conversion, resize to 224x224, random rotation / horizontal flip,
# then per-channel normalisation.
# NOTE(review): this same pipeline (including the random rotation and flip)
# is passed to both the Train and the Dev dataset -- confirm that augmenting
# the Dev split is intended.
composed = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Resize(size=(224, 224)),
        transforms.RandomRotation(degrees=15),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

Dataset

In [7]:
class Data(Dataset):
  """Images of one split folder paired with a 5-column multi-hot defect label.

  The label is a float tensor of shape (1, 5); column k is set to 1 when the
  k-th defect name of ``_DEFECT_COLUMNS`` is listed for the image in the
  module-level ``train_data`` (folder 'Train') or ``dev_data`` (any other
  folder) dictionary.
  """

  # Column order of the label vector.
  _DEFECT_COLUMNS = (
      '不良-乳汁吸附',
      '不良-機械傷害',
      '不良-炭疽病',
      '不良-著色不佳',
      '不良-黑斑病',
  )

  def __init__(self, root, folder = 'Train', transforms=None):
    """Remember the location/transform and index the files of ``root/folder``."""
    self.root, self.transforms, self.folder = root, transforms, folder
    split_dir = os.path.join(root, self.folder)
    # Sorted so that sample indices are stable across runs.
    self.imgs = sorted(os.listdir(split_dir))

  def __getitem__(self, idx):
    """Return ``(image, label)`` for the idx-th file of the split."""
    file_name = self.imgs[idx]
    img = Image.open(os.path.join(self.root, self.folder, file_name)).convert("RGB")

    label = torch.zeros(1,5)

    # Pick the annotation dictionary that belongs to this split.
    data = train_data if self.folder == 'Train' else dev_data

    for column, defect in enumerate(self._DEFECT_COLUMNS):
      if defect in data[file_name]:
        label[0, column] = 1

    if self.transforms is not None:
      img = self.transforms(img)

    return img, label

  def __len__(self):
    """Number of files found in the split folder."""
    return len(self.imgs)

In [8]:
# One dataset per split; both read from the same root and share `composed`.
train_set = Data(
    root="./C2_TrainDev/",
    folder='Train',
    transforms=composed,
)
dev_set = Data(
    root="./C2_TrainDev/",
    folder='Dev',
    transforms=composed,
)

data loader

In [9]:
# Batch size 1: each iteration yields a single image and its label.
Train_loader = DataLoader(dataset=train_set, batch_size=1)
Dev_loader = DataLoader(dataset=dev_set, batch_size=1)

model

In [10]:
class Net(nn.Module):
    """Feature extractor that concatenates the outputs of AlexNet and VGG16."""

    def __init__(self):
        """Load both pretrained torchvision backbones."""
        super().__init__()
        self.alexnet = models.alexnet(pretrained=True)
        self.vgg16 = models.vgg16(pretrained=True)

    def forward(self, x):
        """Run ``x`` through both networks and join the results along dim 1."""
        alexnet_out = self.alexnet(x)
        vgg16_out = self.vgg16(x)
        return torch.cat([alexnet_out, vgg16_out], dim=1)

In [11]:
# Build the extractor on the selected device and switch it to eval mode
# (it is only used for inference in this notebook).
model = Net().to(device)
model.eval();

Downloading: "https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth" to C:\Users\EXIA/.cache\torch\hub\checkpoints\alexnet-owt-4df8aa71.pth


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=244418560.0), HTML(value='')))




Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to C:\Users\EXIA/.cache\torch\hub\checkpoints\vgg16-397923af.pth


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=553433881.0), HTML(value='')))




## building feature list
過程需要約40分鐘

In [12]:
# train_feature = []

# for image,label in tqdm(Train_loader):
#   image = image.to(device)
#   with torch.no_grad():
#     out = model(image).to('cpu').numpy()
#   train_feature.append((out,label))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25768.0), HTML(value='')))




In [22]:
# dev_feature = []

# for image,label in tqdm(Dev_loader):
#   image = image.to(device)
#   with torch.no_grad():
#     out = model(image).to('cpu').numpy()
#   dev_feature.append((out,label))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3681.0), HTML(value='')))




save to pickle

In [None]:
# with open('./C2_TrainDev/train_feature.pkl', 'wb') as f:
#   pickle.dump(train_feature, f)

# with open('./C2_TrainDev/dev_feature.pkl', 'wb') as f:
#   pickle.dump(dev_feature, f)

load from pickle

In [16]:
def _load_or_build_features(cache_path, loader):
    """Return the extracted features for ``loader``, using ``cache_path`` as cache.

    If ``cache_path`` exists it is unpickled and returned. Otherwise every
    batch of ``loader`` is passed through the module-level ``model`` (without
    gradient tracking), the result is stored as a list of
    ``(numpy output, label)`` tuples, written to ``cache_path`` and returned.
    This keeps the notebook runnable from a fresh checkout where the pickle
    files have not been produced yet.

    Args:
        cache_path: pickle file to read from / write to.
        loader: iterable yielding ``(image, label)`` batches.

    Returns:
        list of ``(features, label)`` tuples, one per batch.
    """
    if os.path.exists(cache_path):
        # NOTE: pickle.load can execute arbitrary code -- only load cache
        # files that were produced by this notebook.
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    features = []
    for image, label in tqdm(loader):
        image = image.to(device)
        with torch.no_grad():
            out = model(image).to('cpu').numpy()
        features.append((out, label))

    with open(cache_path, 'wb') as f:
        pickle.dump(features, f)
    return features


train_feature = _load_or_build_features('./C2_TrainDev/train_feature.pkl', Train_loader)
dev_feature = _load_or_build_features('./C2_TrainDev/dev_feature.pkl', Dev_loader)

In [23]:
# Sanity check: number of feature entries loaded for each split.
train_count = len(train_feature)
print('len of train_feature:', train_count)
dev_count = len(dev_feature)
print('len of dev_feature:', dev_count)

len of train_feature: 25768
len of dev_feature: 3681


## Splitting data into 5 stacks

init data

In [17]:
# One positive and one negative bucket per defect class (c0..c4), train split.
c0_pos, c0_neg = [], []
c1_pos, c1_neg = [], []
c2_pos, c2_neg = [], []
c3_pos, c3_neg = [], []
c4_pos, c4_neg = [], []
#==========================
# Same layout for the dev split.
c0_pos_dev, c0_neg_dev = [], []
c1_pos_dev, c1_neg_dev = [], []
c2_pos_dev, c2_neg_dev = [], []
c3_pos_dev, c3_neg_dev = [], []
c4_pos_dev, c4_neg_dev = [], []

splitting data

In [18]:
# Route every training sample into the positive or negative bucket of each of
# the five classes, storing it as (features, 1) or (features, 0).
train_buckets = [
    (c0_pos, c0_neg),
    (c1_pos, c1_neg),
    (c2_pos, c2_neg),
    (c3_pos, c3_neg),
    (c4_pos, c4_neg),
]

# Empty the buckets first so that re-running this cell does not append every
# sample a second time.
for pos_bucket, neg_bucket in train_buckets:
    pos_bucket.clear()
    neg_bucket.clear()

for img, label in train_feature:
    # label[0][0] holds the five per-class flags of this sample.
    flags = label[0][0]
    for class_idx, (pos_bucket, neg_bucket) in enumerate(train_buckets):
        if flags[class_idx] == 1:
            pos_bucket.append((img, 1))
        else:
            neg_bucket.append((img, 0))

In [19]:
# Route every dev sample into the positive or negative bucket of each of the
# five classes, storing it as (features, 1) or (features, 0).
dev_buckets = [
    (c0_pos_dev, c0_neg_dev),
    (c1_pos_dev, c1_neg_dev),
    (c2_pos_dev, c2_neg_dev),
    (c3_pos_dev, c3_neg_dev),
    (c4_pos_dev, c4_neg_dev),
]

# Empty the buckets first so that re-running this cell does not append every
# sample a second time.
for pos_bucket, neg_bucket in dev_buckets:
    pos_bucket.clear()
    neg_bucket.clear()

for img, label in dev_feature:
    # label[0][0] holds the five per-class flags of this sample.
    flags = label[0][0]
    for class_idx, (pos_bucket, neg_bucket) in enumerate(dev_buckets):
        if flags[class_idx] == 1:
            pos_bucket.append((img, 1))
        else:
            neg_bucket.append((img, 0))

In [21]:
# Per-class positive / negative / total counts, first for train, then for dev.
print('Train:')
for template, pos_bucket, neg_bucket in (
    ('c0: pos= {:<5}  neg= {:<5}  sum= {:<5}', c0_pos, c0_neg),
    ('c1: pos= {:<5}  neg= {:<5}  sum= {:<5}', c1_pos, c1_neg),
    ('c2: pos= {:<5}  neg= {:<5}  sum= {:<5}', c2_pos, c2_neg),
    ('c3: pos= {:<5}  neg= {:<5}  sum= {:<5}', c3_pos, c3_neg),
    ('c4: pos= {:<5}  neg= {:<5}  sum= {:<5}', c4_pos, c4_neg),
):
    n_pos, n_neg = len(pos_bucket), len(neg_bucket)
    print(template.format(n_pos, n_neg, n_pos + n_neg))
print()

print('Dev:')
for template, pos_bucket, neg_bucket in (
    ('c0: pos= {:<5}  neg= {:<5}  sum= {:<5}', c0_pos_dev, c0_neg_dev),
    ('c1: pos= {:<5}  neg= {:<5}  sum= {:<5}', c1_pos_dev, c1_neg_dev),
    ('c2: pos= {:<5}  neg= {:<5}  sum= {:<5}', c2_pos_dev, c2_neg_dev),
    ('c3: pos= {:<5}  neg= {:<5}  sum= {:<5}', c3_pos_dev, c3_neg_dev),
    ('c4: pos= {:<5}  neg= {:<5}  sum= {:<5}', c4_pos_dev, c4_neg_dev),
):
    n_pos, n_neg = len(pos_bucket), len(neg_bucket)
    print(template.format(n_pos, n_neg, n_pos + n_neg))

Train:
c0: pos= 2122   neg= 23646  sum= 25768
c1: pos= 419    neg= 25349  sum= 25768
c2: pos= 11489  neg= 14279  sum= 25768
c3: pos= 14515  neg= 11253  sum= 25768
c4: pos= 953    neg= 24815  sum= 25768

Dev:
c0: pos= 308    neg= 3373   sum= 3681 
c1: pos= 60     neg= 3621   sum= 3681 
c2: pos= 1765   neg= 1916   sum= 3681 
c3: pos= 1938   neg= 1743   sum= 3681 
c4: pos= 170    neg= 3511   sum= 3681 


shuffle the data list

In [24]:
# Shuffle every bucket in place; each shuffle uses its own generator seeded
# with 0, so the resulting order is reproducible.
train_bucket_lists = (
    c0_pos, c0_neg, c1_pos, c1_neg, c2_pos,
    c2_neg, c3_pos, c3_neg, c4_pos, c4_neg,
)
for bucket in train_bucket_lists:
    random.Random(0).shuffle(bucket)

dev_bucket_lists = (
    c0_pos_dev, c0_neg_dev, c1_pos_dev, c1_neg_dev, c2_pos_dev,
    c2_neg_dev, c3_pos_dev, c3_neg_dev, c4_pos_dev, c4_neg_dev,
)
for bucket in dev_bucket_lists:
    random.Random(0).shuffle(bucket)

In [25]:
# Merge the positive and negative samples, shuffle them, and return the result.

def data_list(pos_list, neg_list, train=True, seed=0):
    """Combine positive and negative samples into one shuffled list.

    Args:
        pos_list: sequence of positive samples.
        neg_list: sequence of negative samples.
        train: when true, both inputs are truncated to the length of the
            shorter one so the result is class-balanced; when false, all
            samples are kept.
        seed: seed of the private random generator used for shuffling
            (default 0, the value that was previously hard-coded).

    Returns:
        A new list with the selected samples in shuffled order. The inputs
        are not modified.
    """
    if train:
        number = min(len(pos_list), len(neg_list))
        pos, neg = pos_list[:number], neg_list[:number]
    else:
        pos, neg = pos_list, neg_list
    # list(...) makes sure a fresh list is built even for non-list sequences.
    output = list(pos) + list(neg)
    random.Random(seed).shuffle(output)
    return output

In [26]:
# Balanced, shuffled training list per class.
c0, c1, c2, c3, c4 = [
    data_list(pos_bucket, neg_bucket)
    for pos_bucket, neg_bucket in (
        (c0_pos, c0_neg),
        (c1_pos, c1_neg),
        (c2_pos, c2_neg),
        (c3_pos, c3_neg),
        (c4_pos, c4_neg),
    )
]

# Dev lists keep every sample (train=False disables the balancing).
c0_dev, c1_dev, c2_dev, c3_dev, c4_dev = [
    data_list(pos_bucket, neg_bucket, train=False)
    for pos_bucket, neg_bucket in (
        (c0_pos_dev, c0_neg_dev),
        (c1_pos_dev, c1_neg_dev),
        (c2_pos_dev, c2_neg_dev),
        (c3_pos_dev, c3_neg_dev),
        (c4_pos_dev, c4_neg_dev),
    )
]

In [27]:
# Check whether the positive and negative sample counts match.

def check_balance(data):
    """Print how many samples of ``data`` are labelled 1, how many are not, and the total.

    ``data`` is an iterable of ``(features, label)`` pairs.
    """
    labels = [label for _, label in data]
    pos = sum(1 for label in labels if label == 1)
    neg = len(labels) - pos
    print('pos:',pos)
    print('neg:',neg)
    print('sum:',pos+neg)

In [28]:
# Print the class balance of each training list, separated by a dashed rule.
for class_samples in (c0, c1, c2, c3, c4):
    check_balance(class_samples)
    print('-'*10)

pos: 2122
neg: 2122
sum: 4244
----------
pos: 419
neg: 419
sum: 838
----------
pos: 11489
neg: 11489
sum: 22978
----------
pos: 11253
neg: 11253
sum: 22506
----------
pos: 953
neg: 953
sum: 1906
----------


In [29]:
# Print the class balance of each dev list, separated by a dashed rule.
for class_samples in (c0_dev, c1_dev, c2_dev, c3_dev, c4_dev):
    check_balance(class_samples)
    print('-'*10)

pos: 308
neg: 3373
sum: 3681
----------
pos: 60
neg: 3621
sum: 3681
----------
pos: 1765
neg: 1916
sum: 3681
----------
pos: 1938
neg: 1743
sum: 3681
----------
pos: 170
neg: 3511
sum: 3681
----------


In [30]:
# Separate the features from the labels.

def split_x_y(data):
    """Split an iterable of ``(feature, label)`` pairs into two parallel lists.

    Returns:
        ``(X, y)`` where ``X`` holds the features and ``y`` the labels, both
        in the order in which the pairs were given.
    """
    pairs = [(feature, target) for feature, target in data]
    X = [feature for feature, _ in pairs]
    y = [target for _, target in pairs]
    return X,y

In [31]:
# Feature / label lists for every training class ...
(c0_X, c0_y), (c1_X, c1_y), (c2_X, c2_y), (c3_X, c3_y), (c4_X, c4_y) = [
    split_x_y(class_samples) for class_samples in (c0, c1, c2, c3, c4)
]

# ... and for every dev class.
(
    (c0_X_dev, c0_y_dev),
    (c1_X_dev, c1_y_dev),
    (c2_X_dev, c2_y_dev),
    (c3_X_dev, c3_y_dev),
    (c4_X_dev, c4_y_dev),
) = [
    split_x_y(class_samples)
    for class_samples in (c0_dev, c1_dev, c2_dev, c3_dev, c4_dev)
]

## 儲存訓練資料

In [32]:
# Write every training feature/label list to its own pickle file.
all_list = [c0_X, c1_X, c2_X, c3_X, c4_X, c0_y, c1_y, c2_y, c3_y, c4_y]
all_list_name = ['c0_X', 'c1_X', 'c2_X', 'c3_X', 'c4_X', 'c0_y', 'c1_y', 'c2_y', 'c3_y', 'c4_y']

# zip() silently drops unmatched items, so make a mismatch loud.
assert len(all_list) == len(all_list_name), 'every list needs exactly one file name'

# open(..., 'wb') does not create missing folders; make sure the target exists.
train_out_dir = './C2_TrainDev/processed_data/baseline/train/'
os.makedirs(train_out_dir, exist_ok=True)

for list_, name in zip(all_list, all_list_name):
    root = train_out_dir + name +'.pkl'
    print(root)
    with open(root, 'wb') as f:
        pickle.dump(list_, f)

./C2_TrainDev/processed_data/baseline/train/c0_X.pkl
./C2_TrainDev/processed_data/baseline/train/c1_X.pkl
./C2_TrainDev/processed_data/baseline/train/c2_X.pkl
./C2_TrainDev/processed_data/baseline/train/c3_X.pkl
./C2_TrainDev/processed_data/baseline/train/c4_X.pkl
./C2_TrainDev/processed_data/baseline/train/c0_y.pkl
./C2_TrainDev/processed_data/baseline/train/c1_y.pkl
./C2_TrainDev/processed_data/baseline/train/c2_y.pkl
./C2_TrainDev/processed_data/baseline/train/c3_y.pkl
./C2_TrainDev/processed_data/baseline/train/c4_y.pkl


In [33]:
# Write every dev feature/label list to its own pickle file.
all_list = [c0_X_dev, c1_X_dev, c2_X_dev, c3_X_dev, c4_X_dev, c0_y_dev, c1_y_dev, c2_y_dev, c3_y_dev, c4_y_dev]
all_list_name = ['c0_X_dev', 'c1_X_dev', 'c2_X_dev', 'c3_X_dev', 'c4_X_dev', 'c0_y_dev', 'c1_y_dev', 'c2_y_dev', 'c3_y_dev', 'c4_y_dev']

# zip() silently drops unmatched items, so make a mismatch loud.
assert len(all_list) == len(all_list_name), 'every list needs exactly one file name'

# open(..., 'wb') does not create missing folders; make sure the target exists.
dev_out_dir = './C2_TrainDev/processed_data/baseline/dev/'
os.makedirs(dev_out_dir, exist_ok=True)

for list_, name in zip(all_list, all_list_name):
    root = dev_out_dir + name +'.pkl'
    print(root)
    with open(root, 'wb') as f:
        pickle.dump(list_, f)

./C2_TrainDev/processed_data/baseline/dev/c0_X_dev.pkl
./C2_TrainDev/processed_data/baseline/dev/c1_X_dev.pkl
./C2_TrainDev/processed_data/baseline/dev/c2_X_dev.pkl
./C2_TrainDev/processed_data/baseline/dev/c3_X_dev.pkl
./C2_TrainDev/processed_data/baseline/dev/c4_X_dev.pkl
./C2_TrainDev/processed_data/baseline/dev/c0_y_dev.pkl
./C2_TrainDev/processed_data/baseline/dev/c1_y_dev.pkl
./C2_TrainDev/processed_data/baseline/dev/c2_y_dev.pkl
./C2_TrainDev/processed_data/baseline/dev/c3_y_dev.pkl
./C2_TrainDev/processed_data/baseline/dev/c4_y_dev.pkl
