# Preprocessing
As mentioned in our proposal, we intend to:
* Group species into a larger category, instead by genus.
* Discard genus of less than 75 images.
* Apply data augmentation techniques, including rotations, flips, and Gaussian noise.
* Format data for ease of training, validating, and testing.

## Download
This uses the Kaggle API, but you can alternatively direct download. Make sure you have a key at `C:/Users/<username>/.kaggle/kaggle.json` (for Windows machines). We'll be doing our file manipulations in-place since the dataset is fairly large (~3.0 GB).

In [None]:
import nest_asyncio
nest_asyncio.apply()

: 

In [3]:
# download via API
!kaggle datasets download -d anaclaricerezende/mind-funga -p ../data --unzip

^C


Dataset URL: https://www.kaggle.com/datasets/anaclaricerezende/mind-funga
License(s): CC-BY-SA-3.0
Downloading mind-funga.zip to ../data




  0%|          | 0.00/2.98G [00:00<?, ?B/s]
  0%|          | 1.00M/2.98G [00:00<05:59, 8.88MB/s]
  0%|          | 6.00M/2.98G [00:00<01:41, 31.5MB/s]
  0%|          | 11.0M/2.98G [00:00<01:18, 40.3MB/s]
  1%|          | 17.0M/2.98G [00:00<01:08, 46.4MB/s]
  1%|          | 22.0M/2.98G [00:00<01:21, 38.8MB/s]
  1%|          | 26.0M/2.98G [00:00<01:34, 33.6MB/s]
  1%|          | 33.0M/2.98G [00:01<01:53, 27.8MB/s]
  1%|▏         | 40.0M/2.98G [00:01<01:29, 35.4MB/s]
  1%|▏         | 45.0M/2.98G [00:01<01:43, 30.3MB/s]
  2%|▏         | 49.0M/2.98G [00:01<02:07, 24.6MB/s]
  2%|▏         | 55.0M/2.98G [00:01<01:42, 30.5MB/s]
  2%|▏         | 59.0M/2.98G [00:02<02:17, 22.8MB/s]
  2%|▏         | 67.0M/2.98G [00:02<01:37, 32.1MB/s]
  2%|▏         | 73.0M/2.98G [00:02<01:26, 36.3MB/s]
  3%|▎         | 81.0M/2.98G [00:02<02:04, 24.9MB/s]
  3%|▎         | 87.0M/2.98G [00:03<01:43, 29.9MB/s]
  3%|▎         | 94.0M/2.98G [00:03<01:23, 36.9MB/s]
  3%|▎         | 100M/2.98G [00:03<01:21, 38.0MB/s] 
 

## Genus grouping

In [3]:
import os
import shutil

def move_files(src: str, dst: str) -> None:
    files = os.scandir(src)
    for file in files:
        shutil.move(file.path, os.path.join(dst, file.name))

In [5]:
# run once, don't touch again
data_path = '../data/MIND.Funga App'
subdir = os.scandir(data_path)

prev_genus = ''
for species in subdir:
    if species.is_dir():
        curr_genus = species.name.split(' ', 1)[0]
        new_path = os.path.join(data_path, curr_genus)
        if curr_genus != prev_genus or len(prev_genus) == 0: # if different genus
            prev_genus = curr_genus
            print(f'Preprocessing {curr_genus}, num files: {len(os.listdir(new_path))}')
            new_path = os.path.join(data_path, curr_genus)
            os.mkdir(new_path) # create new dir
            move_files(species.path, new_path)
            shutil.rmtree(species.path) # remove old dir
        else: # if same genus
            move_files(species.path, new_path)
            shutil.rmtree(species.path)

FileNotFoundError: [WinError 3] The system cannot find the path specified: '../data/MIND.Funga App\\Abrachium'

In [4]:
data_path = '../data/MIND.Funga App'
subdirs = os.scandir(data_path)
file_counts = [(genus, len(os.listdir(genus))) for genus in subdirs]
file_counts.sort(key=lambda x: x[1], reverse=True)

for genus, count in file_counts[10:]: # keep top 10 directories
    shutil.rmtree(genus)

## Data augmentation

In [8]:
# RUN THIS TO REMOVE AUGMENTED IMAGES

subdir = os.scandir(data_path)
for genus in subdir:
    if genus.is_dir():
        genus_name = genus.name
        genus_path = os.path.join(data_path, genus_name)
        for filename in os.listdir(genus_path):
            if "_hflip" in filename:
                file_path = os.path.join(genus_path, filename)
                os.remove(file_path)
            if "_90_rotate" in filename:
                file_path = os.path.join(genus_path, filename)
                os.remove(file_path)
            if "_180_rotate" in filename:
                file_path = os.path.join(genus_path, filename)
                os.remove(file_path)
            if "_270_rotate" in filename:
                file_path = os.path.join(genus_path, filename)
                os.remove(file_path)
            if "_gauss" in filename:
                file_path = os.path.join(genus_path, filename)
                os.remove(file_path)
            if "_random_erasing" in filename:
                file_path = os.path.join(genus_path, filename)
                os.remove(file_path)

In [5]:
import collections

In [13]:
def splitFunction(trainRatio = 0.75, valTratio = 0.15, testRatio = 0.1):
    src = '../data/MIND.Funga App'

    #Make a new folder called splittedImgFolder
    newLocation = '../splittedImgFolder'


    Allclasses = [d for d in os.listdir(src) if os.path.isdir(os.path.join(src, d))]
    for currClass in Allclasses:
        folder = os.path.join(src, currClass)

        #Its not copying the image correctly 
        #The clases got copied
        imgArr = [img for img in os.listdir(folder) if os.path.isfile(os.path.join(folder, img))]

        print(len(imgArr))
        
        numTrain = int(len(imgArr) * trainRatio)
        numVal = int(len(imgArr) * (valTratio))

        splitDictionary = collections.defaultdict(list)
        splitDictionary['trainSet'] = imgArr[:numTrain]
        splitDictionary['validationSet'] = imgArr[numTrain:numVal+numTrain]
        splitDictionary['testSet'] = imgArr[numVal+numTrain:]

        #loop through each split then add into folder
        for typeSet, images in splitDictionary.items():
            #make new folder. join location, typeSet
            newFolder = os.path.join(newLocation, typeSet, currClass)
            #construct folder
            os.makedirs(newFolder, exist_ok=True)
            #print(len(images))
            for img in images:
                from_ = os.path.join(folder, img)
                to_ = os.path.join(newFolder, img)

                shutil.copy2(from_, to_)

In [14]:
#Make sure the folder is first created
splitFunction(0.75, 0.15, 0.1)

In [9]:
# BE CAREFUL WHEN RUNNING THIS MORE THAN ONCE, IF YOU WANT TO RUN THIS MORE THAN ONCE, USE THE CELL ABOVE TO CLEAR THE DATABASE FIRST

import numpy as np
import random
from PIL import Image
import matplotlib.pyplot as plt

def da_horizontal_flip(img_path: str, save_dir: str, base_name: str) -> None:
    #print("get")
    img = Image.open(img_path)
    img_hflip = img.transpose(Image.FLIP_LEFT_RIGHT)
    img_hflip.save(os.path.join(save_dir, base_name.replace('.jpg', '_hflip.jpg')))

def da_90_rotate(img_path: str, save_dir: str, base_name: str) -> None:
    #print("get")
    img = Image.open(img_path)
    img_90_rotate = img.rotate(90, expand=False)
    img_90_rotate.save(os.path.join(save_dir, base_name.replace('.jpg', '_90_rotate.jpg')))

def da_180_rotate(img_path: str, save_dir: str, base_name: str) -> None:
    #print("get")
    img = Image.open(img_path)
    img_180_rotate = img.rotate(180, expand=False)
    img_180_rotate.save(os.path.join(save_dir, base_name.replace('.jpg', '_180_rotate.jpg')))

def da_270_rotate(img_path: str, save_dir: str, base_name: str) -> None:
    #print("get")
    img = Image.open(img_path)
    img_270_rotate = img.rotate(270, expand=False)
    img_270_rotate.save(os.path.join(save_dir, base_name.replace('.jpg', '_270_rotate.jpg')))

def da_gaussian_noise(img_path: str, save_dir: str, base_name: str) -> None:
    #print("get")
    img = Image.open(img_path)
    img_gauss = np.array(img)
    row, col, ch = img_gauss.shape
    mean = 0
    var = 0.2 # this is doubled already, use this to adjust the noise
    sigma = var**0.5
    gauss = np.random.normal(mean, sigma, (row, col, ch))
    gaussed = img_gauss + gauss
    finalgauss = Image.fromarray(np.uint8(gaussed))
    finalgauss.save(os.path.join(save_dir, base_name.replace('.jpg', '_gauss.jpg')))

def da_random_erasing(img_path: str, save_dir: str, base_name: str) -> None:
    img = Image.open(img_path)
    img_re = np.array(img)
    row, col, ch = img_re.shape
    s = random.uniform(0.1, 0.4) # these vary the size of the rectangle
    r = random.uniform(0.1, 0.4)
    random_row = int(row * s)
    random_col = int(col * r)
    random_x = np.random.randint(0, row - random_row)
    random_y = np.random.randint(0, col - random_col)
    img_re[random_x:random_x+random_row, random_y:random_y+random_col, :] = 0
    img_re = Image.fromarray(np.uint8(img_re))
    img_re.save(os.path.join(save_dir, base_name.replace('.jpg', '_random_erasing.jpg')))

# this is where we choose which function we want to use to data augment
subdir = os.scandir(data_path)
for genus in subdir:
    if genus.is_dir():
        genus_name = genus.name
        genus_path = os.path.join(data_path, genus_name)
        files = [entry for entry in os.scandir(genus_path) if entry.is_file()]
        for i in range(0, len(files), 2):
            img_file = files[i]
            da_horizontal_flip(img_file.path, genus_path, img_file.name)    
            da_90_rotate(img_file.path, genus_path, img_file.name)    
            da_180_rotate(img_file.path, genus_path, img_file.name)    
            da_270_rotate(img_file.path, genus_path, img_file.name)   
            da_gaussian_noise(img_file.path, genus_path, img_file.name)
            da_random_erasing(img_file.path, genus_path, img_file.name)

## Final steps
Run this at the very end.

In [10]:
import os
import shutil
from PIL import Image

try: # CUDA-specific install if running on Colab
    %load_ext cudf.pandas
except ModuleNotFoundError:
    print('CuDF not installed, defaulting to regular pandas')
import pandas as pd

CuDF not installed, defaulting to regular pandas


In [11]:
def generate_labelled_set(dimensions: tuple[int, int] = (300, 300), grayscale: bool = False) -> None:
    df = pd.DataFrame()
    df['Path'] = None
    df['Genus'] = ''
    df['Image'] = None
    
    data_path = '../data/MIND.Funga App'
    directories = os.scandir(data_path)
    for subdir in directories:
        genus = subdir.name
        files = os.scandir(subdir.path)
        
        if grayscale:
            add = pd.DataFrame(
                [{'Path': f"{file.path}",
                  'Genus': genus,
                  'Image': Image.open(file.path).resize(dimensions, Image.Resampling.LANCZOS).convert('L')}
                 for file in files]) # computationally efficient df concatenation
        else:
            add = pd.DataFrame(
                [{'Path': f"{file.path}",
                  'Genus': genus,
                  'Image': Image.open(file.path).resize(dimensions, Image.Resampling.LANCZOS)}
                 for file in files]) # computationally efficient df concatenation
        df = pd.concat([df, add])
        
    print(df.shape)
    df.to_pickle('../data/set.pkl')
    df = df.drop(columns='Image')
    df.to_csv('../data/set.csv', index=False)

In [12]:
generate_labelled_set(grayscale=True) # for random forest

(12316, 3)
