# Import Libraries

In [1]:
import cv2
import numpy as np
import pandas as pd

# Data path setting & analysis

In [2]:
# Root directory of the dataset. Every path below is derived from it, so
# pointing the notebook at another copy of the data means editing one line.
DATA_DIR = '/data/backup/pervinco_2020/datasets/plant-pathology-2020-fgvc7/'

# Image folder; the trailing slash is kept because file names are appended
# to this string directly when images are loaded.
IMAGE_PATH = DATA_DIR + 'images/'
TEST_PATH = DATA_DIR + 'test.csv'
TRAIN_PATH = DATA_DIR + 'train.csv'
SUB_PATH = DATA_DIR + 'sample_submission.csv'

In [3]:
# Read the training table, the test image list, and the sample submission
# file into DataFrames.
train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)
sub = pd.read_csv(SUB_PATH)

In [4]:
# Preview the first rows of the training table: an image id plus one 0/1
# column per class (healthy, multiple_diseases, rust, scab).
train_data.head()

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,Train_0,0,0,0,1
1,Train_1,0,1,0,0
2,Train_2,1,0,0,0
3,Train_3,0,0,1,0
4,Train_4,1,0,0,0


In [5]:
# Preview the first rows of the test table, which lists image ids only.
test_data.head()

Unnamed: 0,image_id
0,Test_0
1,Test_1
2,Test_2
3,Test_3
4,Test_4


In [6]:
# tqdm provides progress bars; tqdm.pandas() registers `progress_apply` on
# pandas objects, which is used below while the images are loaded.
from tqdm import tqdm
tqdm.pandas()

# Number of training images (taken from the start of train_data) to load.
SAMPLE_LEN = 100

def load_image(image_id):
    """Load one dataset image as an RGB array.

    Args:
        image_id: Image identifier without extension (e.g. "Train_0"); the
            file is read from IMAGE_PATH + image_id + ".jpg".

    Returns:
        The image converted from OpenCV's BGR channel order to RGB.

    Raises:
        FileNotFoundError: If the file is missing or cannot be decoded.
    """
    file_path = IMAGE_PATH + image_id + ".jpg"
    image = cv2.imread(file_path)
    # cv2.imread does not raise on failure; it returns None, which would
    # otherwise surface as an opaque cv2.error inside cvtColor.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {file_path}")
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Load the first SAMPLE_LEN training images, displaying a progress bar.
train_images = (
    train_data["image_id"]
    .iloc[:SAMPLE_LEN]
    .progress_apply(load_image)
)

100%|██████████| 100/100 [00:02<00:00, 47.80it/s]


In [9]:
# python visualization tool
import plotly.express as px

# Resize the first loaded image to 205x136 (OpenCV's dsize is
# (width, height)) and render it as an interactive figure.
first_image_small = cv2.resize(train_images[0], (205, 136))
fig = px.imshow(first_image_small)
fig.show()

Sample data 중 첫 번째 이미지를 plotting하여 출력했다. 각 픽셀의 RGB 값은 이미지 위에 마우스 포인터를 올리면 확인할 수 있다.
이미지의 녹색 부분은 Blue channel 값이 매우 낮지만, 갈색 부분은 Blue channel 값이 높은 것을 확인할 수 있다.
이는 녹색(건강한) 부분은 Blue channel 값이 낮고, 건강하지 않은 부분은 Blue channel 값이 높을 수 있음을 나타낸다.
따라서 Blue channel이 식물의 질병을 감지하는 열쇠가 될 수 있음을 시사한다.

## Channel Distributions

In [10]:
def _mean_pixel_values(images, channel=None):
    """Return the mean pixel value of each image as a list.

    Args:
        images: Iterable of image arrays indexed as [row, column, channel].
        channel: Channel index to average over, or None to average over
            every value in the image.

    Returns:
        A list with one mean per image, in iteration order.
    """
    # Iterate over the images themselves rather than range(len(...)):
    # integer lookups on a pandas Series are label-based, so positional
    # indices only work while the index happens to be 0..n-1.
    if channel is None:
        return [np.mean(image) for image in images]
    return [np.mean(image[:, :, channel]) for image in images]


# load_image returns RGB images, so channels 0/1/2 are red/green/blue.
red_values = _mean_pixel_values(train_images, channel=0)
green_values = _mean_pixel_values(train_images, channel=1)
blue_values = _mean_pixel_values(train_images, channel=2)
values = _mean_pixel_values(train_images)

In [12]:
import plotly.figure_factory as ff

# Distribution plot of the per-image mean pixel values (all channels pooled).
fig = ff.create_distplot([values], group_labels=["Channels"], colors=["purple"])
fig.update_layout(
    showlegend=False,
    template="simple_white",
    title_text="Distribution of channel values",
)

# Give the first trace's markers a thin black outline.
first_trace = fig.data[0]
first_trace.marker.line.color = 'rgb(0, 0, 0)'
first_trace.marker.line.width = 0.5
fig