# Import libraries

In [1]:
import os
import glob
from natsort import natsorted
from PIL import Image
# import random
import pandas as pd
import numpy as np

# `def` functions

`test_image_all_black()` tests if the image is all black (`img.getextrema() == (0,0)`), which means it doesn't have a mask for a particular label.

In [2]:
def test_image_all_black(path_img_file):
    """ Tests if image file is all black, meaning no mask/label == 0 """
    
    img = Image.open(path_img_file)
    
    if img.getextrema() == (0,0):  # test if all black
        return 0
    else:
        return 1
    
    
    
def make_label_dict(list_img_files):
    """ Make label dictionary structure """
    
    label_dict = {}
    
    for file in list_img_files:
    
        label = file.split('/')[-2]
        res = test_image_all_black(file)

        if file not in label_dict:
            label_dict[file]={}
            label_dict[file][label]=res
        else:
            label_dict[file][label]=res
            
    return label_dict



# Make dictionary of labels

I don't know how to deal with multiple layers. So, I am getting all the files with only one label and doing exploratory analysis on those data.

Each label directory contains all mask files. Image files in these directory that are entirely black do not have a mask for that particular label. If the image files have some white in them, they have a mask for that label.

I am creating a dictionary structure to gather the label for each image like so:


```
label_dict = {

    'filename_1.png' : {
        'cloud_shadow' : 1,
        'double_plant' : 0,
        (...)
    },
    
    'filename_2.png' : {
        'cloud_shadow' : 0,
        'double_plant' : 1,
        (...)
    },
    
    (...)
}
```

1 indicates the images has that label  
0 indicates no label

In [3]:
train_files = natsorted(glob.glob('dataset/Agriculture-Vision/train/labels/*/*.png'))
train_dict = make_label_dict(train_files)

In [4]:
val_files = natsorted(glob.glob('dataset/Agriculture-Vision/val/labels/*/*.png'))
val_dict = make_label_dict(val_files)

# Transform dictionary into a dataframe

I am transforming that dictionary structure into a `pd.DataFrame()` and removing all files that contain more than one label (because I am not sure how to deal with multiple labels).

## Train

In [11]:
train_df = pd.DataFrame.from_dict(train_dict, orient='index')
train_df = train_df.replace(np.nan, 0)
i = train_df.shape[0]
train_singles = train_df[train_df.sum(axis=1)==1]
f = train_singles.shape[0]
train_singles = train_singles.reset_index()
train_singles = train_singles.rename(columns={'index':'filename'})
print(f"{i-f} image files with multiple labels. Now, the exploratory dataset has {f} files.")

62970 image files with multiple labels. Now, the exploratory dataset has 14436 files.


## Val

In [12]:
val_df = pd.DataFrame.from_dict(val_dict, orient='index')
val_df = val_df.replace(np.nan, 0)
i = val_df.shape[0]
val_singles = val_df[val_df.sum(axis=1)==1]
f = val_singles.shape[0]
val_singles = val_singles.reset_index()
val_singles = val_singles.rename(columns={'index':'filename'})
print(f"{i-f} image files with multiple labels. Now, the exploratory dataset has {f} files.")

21872 image files with multiple labels. Now, the exploratory dataset has 4714 files.


I know I am leaving out a lot of files... but I don't know how to deal with multiple labels!

In [13]:
# save dataframes
train_singles.to_csv('training_singles.csv', index=False)
val_singles.to_csv('validation_singles.csv', index=False)