In [1]:
import numpy as np
import os
import zipfile
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.models import Sequential
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

Set up data

In [2]:
#Set paths to data folder
path_data = os.path.join('../','Data')

In [3]:
#unzip train data
def unzip():
    with zipfile.ZipFile(os.path.join(path_data,'NonsegmentedV2.zip'),'r') as tr:
        tr.extractall(os.path.join(path_data,'train_images'))
    with zipfile.ZipFile(os.path.join(path_data,'ImagesFromTheWild.zip'),'r') as te:
        te.extractall(os.path.join(path_data,'test_images'))

In [4]:
#unzip()

In [5]:
train_path = os.path.join(path_data,'train_images')
test_path = os.path.join(path_data,'test_images')

Now the images are stored in the 'train_images' and 'test_images' directories


The images are organized in subfolders per class. Let's organize the images in a single dataframe to do some EDA

In [6]:
def images_df(path):
    data = []
    for folder in os.listdir(path):
        folder_path = os.path.join(path,folder)
        for image in os.listdir(folder_path):
            img_path = os.path.join(folder_path,image)
            data.append([image,folder,img_path])
    df = pd.DataFrame(data,columns = ['Image','Class','image path'])
    return df

In [22]:
df_train = images_df(train_path)

In [23]:
def plot_frequency_class(df,color):
    counts = df_train['Class'].value_counts()
    plt.figure(figsize=(10,5))
    counts.plot.bar(colormap = color)
    plt.show()
    counts_df = pd.DataFrame(counts)
    counts_df['percentage'] = (counts_df['Class'] / counts_df['Class'].sum())*100
    return counts_df

In [9]:
#plot_frequency_class(df_train,'summer')

Looks like we have a problem with an imbalanced dataset. Keep that in mind.

Now let's plot some examples per class

In [10]:
seeds_train = os.listdir(train_path)
def plot_examples(path,seeds):
    for seed in seeds:
        plt.figure(figsize = (40,40))
        full_path = os.path.join(path,seed)
        for image,i in zip(os.listdir(os.path.join(path,seed)),range(4)): #Loops 4 times in each folder picking 4 images per class
            img = plt.imread(os.path.join(full_path,image)) #image converted to array
            plt.subplot(12,6,i+1) 
            plt.axis('off')
            plt.title(seed)        
            plt.imshow(img)


In [11]:
#plot_examples(train_path,seeds_train)

Well it looks like we are going to have some problems here. 

The background could introduce some bias to the algorithm mainly for two reasons: It is the same in almost all the images, the model might extract information from the rocks and not from the plants, specially in classes like 'Loose Silky-bent' and 'Black Grass' that the seedlings are very thin. Second, the dataset is created in a laboratory using a high resolution camera (see [this](https://arxiv.org/pdf/1711.05458.pdf) paper) so if we train a model using this images it might not perform very well in an real environment. Real weeds in a farm will have a very different distribution if a farmer takes a photo using a smartphone. To solve this issues, we will probably need to use segmentation techniques and test the model in images that are not from a laboratory. 

Thanks to the authors of this paper for the dataset provided [PAPER: A Public Image Database for Benchmark of Plant Seedling Classification Algorithms](https://vision.eng.au.dk/plant-seedlings-dataset/)


Now we have three issues. Imbalanced dataset, the background of the images and the distribution of the data

But for now, let's look the test data. The authors proposed a dataset of images taken in a real world environment with no artifical light they called:  'images from the wild'

In [12]:
seed_test = os.listdir(test_path)

In [13]:
#plot_examples(test_path,seed_test)

In [14]:
df_test = images_df(test_path)

In [19]:
df_test

Unnamed: 0,Image,Class,image path
0,WP_20150521_15_19_40_Pro__highres_0.tiff,Fat Hen,../Data/test_images/Fat Hen/WP_20150521_15_19_...
1,WP_20150611_09_37_38_Pro__highres_4.tiff,Fat Hen,../Data/test_images/Fat Hen/WP_20150611_09_37_...
2,20160524_143721_0.tiff,Fat Hen,../Data/test_images/Fat Hen/20160524_143721_0....
3,WP_20160517_15_19_13_Pro_2.tiff,Fat Hen,../Data/test_images/Fat Hen/WP_20160517_15_19_...
4,IMG_1286_13.tiff,Fat Hen,../Data/test_images/Fat Hen/IMG_1286_13.tiff
...,...,...,...
91,WP_20150619_10_42_25_Pro__highres_3.tiff,Shepherds Purse,../Data/test_images/Shepherds Purse/WP_2015061...
92,WP_20150521_13_45_56_Pro__highres_9.tiff,Shepherds Purse,../Data/test_images/Shepherds Purse/WP_2015052...
93,IMG_1336_0.tiff,Shepherds Purse,../Data/test_images/Shepherds Purse/IMG_1336_0...
94,WP_20150611_09_35_37_Pro__highres_0.tiff,Shepherds Purse,../Data/test_images/Shepherds Purse/WP_2015061...


In [16]:
df_test['Class'].value_counts()

Cleavers                     12
Fat Hen                      12
Common wheat                 12
Shepherds Purse              12
Scentless Mayweed            12
Charlock                     12
Common Chickweed             12
Small-flowered Cranesbill    12
Name: Class, dtype: int64

In [17]:
seed_test

['Fat Hen',
 'Charlock',
 'Common wheat',
 'Common Chickweed',
 'Small-flowered Cranesbill',
 'Scentless Mayweed',
 'Cleavers',
 'Shepherds Purse']