In [7]:
# Optionally switch on the cudf.pandas IPython extension, then import pandas.
# NOTE(review): the extension is loaded ahead of `import pandas` — presumably this
# ordering is required for the accelerator to take effect; confirm against the cuDF docs.
try: # loads an extension (does not install anything); only works where cuDF is available, e.g. a GPU Colab runtime
    %load_ext cudf.pandas
except ModuleNotFoundError:
    # cuDF is not importable here: say so and carry on with stock pandas
    print('CuDF not installed, defaulting to regular pandas')
import pandas as pd

CuDF not installed, defaulting to regular pandas


# Preprocessing
As mentioned in our proposal, we intend to:
* Group images by genus (a broader category) instead of by individual species.
* Discard genera with fewer than 75 images.
* Apply data augmentation techniques, including rotations, flips, and Gaussian noise.
* Format data for ease of training, validating, and testing.

## Download
This uses the Kaggle API; alternatively, you can download the dataset directly from its Kaggle page. Make sure you have an API key at `C:/Users/<username>/.kaggle/kaggle.json` on Windows (or `~/.kaggle/kaggle.json` on Linux/macOS). We'll be doing our file manipulations in place since the dataset is fairly large (~3.0 GB).

In [23]:
# Download the anaclaricerezende/mind-funga dataset with the Kaggle CLI:
#   -d        dataset slug to fetch
#   -p        target directory (../data, relative to this notebook)
#   --unzip   extract the downloaded archive
# Requires the `kaggle` command on PATH and a configured API key.
# NOTE(review): nothing guards this cell, so re-running it fetches the ~3 GB archive again.
!kaggle datasets download -d anaclaricerezende/mind-funga -p ../data --unzip

Dataset URL: https://www.kaggle.com/datasets/anaclaricerezende/mind-funga
License(s): CC-BY-SA-3.0
Downloading mind-funga.zip to ../data




  0%|          | 0.00/2.98G [00:00<?, ?B/s]
  0%|          | 1.00M/2.98G [00:00<07:40, 6.93MB/s]
  0%|          | 8.00M/2.98G [00:00<01:22, 38.6MB/s]
  0%|          | 13.0M/2.98G [00:00<01:19, 39.8MB/s]
  1%|          | 19.0M/2.98G [00:00<01:13, 42.9MB/s]
  1%|          | 26.0M/2.98G [00:00<01:06, 47.6MB/s]
  1%|1         | 33.0M/2.98G [00:00<01:17, 40.8MB/s]
  1%|1         | 40.0M/2.98G [00:00<01:07, 46.6MB/s]
  1%|1         | 45.0M/2.98G [00:01<01:23, 37.6MB/s]
  2%|1         | 50.0M/2.98G [00:01<01:18, 40.0MB/s]
  2%|1         | 57.0M/2.98G [00:01<01:11, 43.7MB/s]
  2%|2         | 65.0M/2.98G [00:01<01:10, 44.3MB/s]
  2%|2         | 73.0M/2.98G [00:01<01:09, 44.8MB/s]
  3%|2         | 81.0M/2.98G [00:01<01:06, 46.9MB/s]
  3%|2         | 89.0M/2.98G [00:02<01:18, 39.7MB/s]
  3%|3         | 97.0M/2.98G [00:02<01:23, 37.0MB/s]
  3%|3         | 105M/2.98G [00:02<01:22, 37.3MB/s] 
  4%|3         | 113M/2.98G [00:02<01:16, 40.4MB/s]
  4%|3         | 121M/2.98G [00:03<01:13, 41.7MB/s]
  4

## Genus grouping

In [24]:
import os
import shutil

def move_files(src: str, dst: str) -> None:
    """Move every entry directly inside ``src`` into the directory ``dst``.

    ``dst`` is created if it does not exist yet. An entry whose name is
    already taken in ``dst`` never overwrites the existing one; it is stored
    as ``<stem>_<n><ext>`` using the first free ``n`` instead, so merging
    several folders that reuse file names keeps every file.

    Args:
        src: Directory whose direct children are moved. The (now empty)
            directory itself is left in place.
        dst: Directory that receives the children.

    Raises:
        FileNotFoundError: If ``src`` does not exist.
        NotADirectoryError: If ``src`` is not a directory.
    """
    # Snapshot the listing and release the scandir handle before touching the
    # directory: whether entries added/removed during iteration are seen is
    # unspecified, so moving while iterating could skip files.
    with os.scandir(src) as entries:
        pending = [(entry.name, entry.path) for entry in entries]

    os.makedirs(dst, exist_ok=True)
    if os.path.samefile(src, dst):
        return  # same directory: nothing to move, and nothing to rename

    for name, path in pending:
        stem, ext = os.path.splitext(name)
        target = os.path.join(dst, name)
        attempt = 0
        # shutil.move would silently replace an existing file, so pick a
        # free name rather than lose data.
        while os.path.lexists(target):
            attempt += 1
            target = os.path.join(dst, f'{stem}_{attempt}{ext}')
        shutil.move(path, target)

In [25]:
# Merge every "<Genus> <species>" directory into a single "<Genus>" directory.
# Safe to re-run: directories that are already genus-level are skipped.
data_path = '../data/MIND.Funga App'

# Snapshot and sort the listing before changing anything. Genus directories are
# created inside data_path itself, and scandir gives no guarantee about whether
# entries created during iteration are yielded or about ordering.
with os.scandir(data_path) as entries:
    species_dirs = sorted(
        (entry.name, entry.path) for entry in entries if entry.is_dir()
    )

seen_genera = set()
for species_name, species_path in species_dirs:
    curr_genus = species_name.split(' ', 1)[0]
    if species_name == curr_genus:
        # Already a genus-level directory (e.g. from an earlier run); moving it
        # into itself and then removing it would destroy its images.
        continue

    if curr_genus not in seen_genera: # first species of this genus
        seen_genera.add(curr_genus)
        print('Preprocessing ' + curr_genus)

    new_path = os.path.join(data_path, curr_genus)
    os.makedirs(new_path, exist_ok=True) # tolerate a genus dir that already exists
    move_files(species_path, new_path)
    # rmdir only succeeds on an empty directory, so a file that failed to move
    # raises here instead of being deleted.
    os.rmdir(species_path)

Preprocessing Abrachium
Preprocessing Abundisporus
Preprocessing Aegis
Preprocessing Agaricus
Preprocessing Agrocybe
Preprocessing Aleurodiscus
Preprocessing Amanita
Preprocessing Amauroderma
Preprocessing Amparoina
Preprocessing Amylostereum
Preprocessing Antrodia
Preprocessing Antrodiella
Preprocessing Arambarria
Preprocessing Armilaria
Preprocessing Artolenzites
Preprocessing Artomyces
Preprocessing Ascopolyporus
Preprocessing Aseroe
Preprocessing Asteridiella
Preprocessing Asterostroma
Preprocessing Astrothelium
Preprocessing Atroporus
Preprocessing Aurantiopileus
Preprocessing Auricularia
Preprocessing Auriporia
Preprocessing Auriscalpium
Preprocessing Beauveria
Preprocessing Biscogniauxia
Preprocessing Boletinellus
Preprocessing Bondarzewia
Preprocessing Bresadolia
Preprocessing Brigantiaea
Preprocessing Brunneocorticium
Preprocessing Byssomerulius
Preprocessing Callistosporium
Preprocessing Calocera
Preprocessing Calvatia
Preprocessing Camarops
Preprocessing Camillea
Preprocessi

In [26]:
# Remove every genus directory that holds fewer than MIN_IMAGES_PER_GENUS entries.
MIN_IMAGES_PER_GENUS = 75

# Snapshot the genus directories first: directories are deleted below, and
# changing a directory while scandir is iterating it has unspecified results.
# Stray non-directory entries (archives, metadata files) are ignored.
with os.scandir(data_path) as entries:
    subdir = [entry for entry in entries if entry.is_dir()]

for genus in subdir:
    files = os.listdir(genus.path)
    if len(files) < MIN_IMAGES_PER_GENUS:
        shutil.rmtree(genus.path)

## Data augmentation