This notebook helps download the images that are used to create the dataset.

In [1]:
import os
from pathlib import Path

from fastai.vision import *

# Download images (Google)

To download the images from a Google search, press <Ctrl+Alt+J> to open the browser's Web Console, then type the following commands:
 
    urls = Array.from(document.querySelectorAll('.rg_di .rg_meta')).map(el=>JSON.parse(el.textContent).ou);
    window.open('data:text/csv;charset=utf-8,' + escape(urls.join('\n')));
    
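This will open a dialog box asking to save the file. Save it in the `csv` folder, named after the mushroom and with a `.csv` extension: the download cell below only processes `.csv` files and uses the file name (without the extension) as the name of the image folder.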

**Note**: I used the Latin name of each mushroom in order to narrow down the search.


In [2]:
# Root data directory of the project, located under the user's home folder.
data_path = Path.home().joinpath('repos_github', 'mushroom-identifier', 'data')
# Sub-folder for everything obtained from the Google image search.
google_path = data_path.joinpath('google')

In [3]:
# Folder scanned by the download cell for `.csv` files of image URLs.
csv_folder = google_path.joinpath('csv')
# Destination root for the downloads: one sub-folder per CSV file.
raw_folder = google_path.joinpath('raw_images')

In [9]:
# Download the images listed in every URL file found in `csv_folder`.
# The images of `<name>.csv` are stored in `raw_folder/<name>/`.
for csv_file in csv_folder.iterdir():
    # Skip anything that is not a URL list before logging or creating folders.
    if csv_file.suffix != '.csv':
        continue
    print(f'Processing {csv_file.name} file...')
    folder_name = raw_folder / csv_file.stem
    if not folder_name.is_dir():
        print(f'Creating folder {folder_name}...')
        # parents=True: `raw_folder` itself may not exist yet on a first run,
        # in which case a plain os.mkdir would raise FileNotFoundError.
        folder_name.mkdir(parents=True, exist_ok=True)
    print(f'Downloading {csv_file.stem} images...')
    download_images(csv_file, folder_name)

# Check image format, rename

In [None]:
def is_image(file, valid_img_types=('jpeg', 'png', 'tiff', 'bmp')):
    """Return True if `file` is accepted as an image, False otherwise.

    The file header is first inspected with `imghdr.what`:

    * if a type is recognised, the file is accepted only when that type is
      listed in `valid_img_types`;
    * if no type is recognised (`None`), the file is accepted when
      `Image_PIL.open` manages to open it.

    Parameters:
        file: path of the file to check.
        valid_img_types: `imghdr` type names that are accepted.

    Returns:
        bool

    NOTE(review): `imghdr` and `Image_PIL` are not imported in the visible
    cells; `Image_PIL` is presumably an alias of `PIL.Image` - confirm.
    """
    img_type = imghdr.what(file)
    if img_type is not None:
        return img_type in valid_img_types
    # Unrecognised header: fall back to PIL. The `with` block closes the file
    # handle that `open` would otherwise leave dangling.
    try:
        with Image_PIL.open(file):
            return True
    except IOError:
        return False

In [None]:
# Folder that the renaming cell moves files into when they fail the image check.
# NOTE(review): `test_train_path` is not defined in the visible cells.
unknown_folder = test_train_path.joinpath('_unknown')
unknown_folder

In [None]:
# Validate, rename and label every downloaded image.
#
# For each mushroom folder inside `train_folder`:
#   * files accepted by `is_image` are renamed to `<folder>_<NNNN>.jpg` and get
#     one row in `df_labels`;
#   * rejected files with a known non-image extension are deleted;
#   * any other rejected file is moved to `unknown_folder`.
#
# NOTE(review): `train_folder`, `dict_mushrooms` and `pd` are not defined in the
# visible cells; they must already be in scope when this cell runs.

# Extensions of downloads that are deleted outright. `Path.suffix` includes the
# leading dot, so every entry needs one too.
non_image_suffixes = {'.mp4', '.gif', '.webp', '.ashx'}

# Rows are collected in a list and the frame is built once at the end:
# `DataFrame.append` was removed in pandas 2.0 and is quadratic in a loop.
label_rows = []

for mushroom in train_folder.iterdir():
    try:
        mushroom_ = mushroom.stem
        print(f'- processing folder {mushroom}')
        i = 0  # Initialize image count
        j = 0  # Initialize bad image count
        for image in mushroom.iterdir():
            if is_image(image):
                # Look the metadata up before touching the file, so that a
                # missing entry does not leave a renamed but unlabelled image.
                info = dict_mushrooms[mushroom_]
                new_name = f'{mushroom_}_{str(i).zfill(4)}.jpg'
                # Prevent overwriting previously downloaded images with same target name
                while image.with_name(new_name).exists():
                    i += 1
                    new_name = f'{mushroom_}_{str(i).zfill(4)}.jpg'
                # `with_name` keeps the file in its current folder, even when
                # the folder name contains a dot (where `stem` != `name`).
                os.rename(str(image), str(image.with_name(new_name)))
                label_rows.append({
                    'name': Path(mushroom_) / new_name,
                    'labels': (f'{info["latin"]};'
                               f'{info["name"]};'
                               f'{info["edibility"]};'
                               f'{info["poisonous"]};')
                })
                i += 1
            else:
                suffix = image.suffix
                if suffix in non_image_suffixes:
                    print(f'Removing not an image file: {image}')
                    os.remove(image)
                else:
                    unknown_folder.mkdir(parents=True, exist_ok=True)
                    # `suffix` already carries its leading dot.
                    new_name = f'{mushroom_}_{str(j).zfill(4)}{suffix}'
                    # `j` restarts at 0 on every run: skip names already taken
                    # so earlier quarantined files are not overwritten.
                    while (unknown_folder / new_name).exists():
                        j += 1
                        new_name = f'{mushroom_}_{str(j).zfill(4)}{suffix}'
                    j += 1
                    print(f'Renaming corrupt image {image}')
                    os.rename(str(image), str(unknown_folder / new_name))
    except NotADirectoryError as e:
        # Raised when an entry of `train_folder` is a plain file, not a folder.
        if '.DS_Store' in str(mushroom):
            print(f'Removing .DS_Store file...')
            os.remove(mushroom)
        else:
            print(e)

df_labels = pd.DataFrame(label_rows, columns=['name', 'labels'])
