# Imports

In [None]:
# Notebook setup: reload edited modules automatically before each cell runs
# (autoreload mode 2) and render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import imghdr # built in module
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image as Image_PIL
from fastai import *
from fastai.vision import *
from fastai.metrics import error_rate, accuracy

In [None]:
# Sanity check: show which `python` executable is first on the shell PATH.
# NOTE(review): this is the shell's interpreter and may differ from the one the
# kernel runs on — compare with `sys.executable` if in doubt.
!which python

# Paths

In [None]:
# --- Dataset locations ---
# Root of the mushroom dataset (under the user's home directory)
data_path = Path.home().joinpath('CloudStation/data_science/01-python_lab/99_datasets/mushrooms')
# Small auxiliary files kept next to the notebook (species dictionary, labels.csv)
data_path_other = Path('../data')
# Train / test image folders
test_train_path = data_path.joinpath('test_and_train')
train_folder = test_train_path.joinpath('train')
test_folder = test_train_path.joinpath('test')
# Images scraped from Google
google_path = data_path.joinpath('google')

# --- Project locations ---
project_path = Path.home().joinpath('repos_github/mushroom-identifier/')
app_path = project_path.joinpath('app')
model_path = app_path.joinpath('models')

In [None]:
# Display the resolved models directory as a quick check of the path setup
model_path

## Load the dictionary

In [None]:
# JSON file holding the per-species metadata used to build the labels
dict_mushrooms_path = data_path_other.joinpath('dict_mushrooms.json')

In [None]:
# Load the species dictionary. Later cells read the "name", "latin" and
# "edibility" entries of each species, keyed by the training folder name.
# JSON is UTF-8 by specification: pass the encoding explicitly so non-ASCII
# species names decode the same way regardless of the platform's locale.
with open(dict_mushrooms_path, 'r', encoding='utf-8') as fp:
    dict_mushrooms = json.load(fp)

# Train the model

## Create `labels.csv` file

In [None]:
# Path to the species dictionary. The previous spelling ('dict_muchrooms.json')
# was a typo of the file loaded earlier in this notebook, which silently rebound
# the variable to a different file name and would break a re-run of the load cell.
dict_mushrooms_path = data_path_other / 'dict_mushrooms.json'

In [None]:
def is_image(file, valid_img_types=('jpeg', 'png', 'tiff', 'bmp')):
    """Tell whether `file` is an image that can be used for training.

    The file header is inspected with `imghdr` first. When `imghdr` recognizes
    the format, the file is accepted only if that format is listed in
    `valid_img_types`. When `imghdr` cannot identify the file at all, Pillow
    is used as a fallback and the file is accepted if Pillow can open it.

    Parameters
    ----------
    file : str or path-like
        Path of the file to check.
    valid_img_types : sequence of str
        `imghdr` format names that are accepted.

    Returns
    -------
    bool
        True if the file is an accepted image, False otherwise.
    """
    img_type = imghdr.what(file)
    if img_type is not None:
        # Format identified from the header: accept only whitelisted formats
        return img_type in valid_img_types

    # Unknown header: let Pillow decide. The context manager closes the file
    # handle that `Image.open` keeps open, so scanning large folders does not
    # leak file descriptors (and the file can be renamed/removed afterwards).
    try:
        with Image_PIL.open(file):
            return True
    except IOError:
        return False

In [None]:
# Folder where files that cannot be read as images are moved by the labelling loop
unknown_folder = test_train_path / '_unknown'
unknown_folder

In [None]:
# Walk every species folder under `train_folder`, give each valid image a
# canonical name (`<species>_<NNNN>.jpg`), and collect one label row per image.
# Files that are clearly not images are deleted; files that look like images
# but cannot be read are moved to `unknown_folder`.

# Extensions of files that can never be training images. `Path.suffix`
# includes the leading dot, so every entry must carry one as well.
non_image_suffixes = {'.mp4', '.gif', '.webp', '.ashx'}

# Make sure the quarantine folder exists before anything is moved into it
unknown_folder.mkdir(parents=True, exist_ok=True)

# Rows are accumulated in a list and turned into a DataFrame once at the end
# (row-by-row DataFrame.append is quadratic and no longer exists in pandas 2).
label_records = []

for mushroom in train_folder.iterdir():
    mushroom_ = mushroom.stem
    print(f'- processing folder {mushroom}')
    try:
        # Snapshot the listing: entries are renamed/removed below, and mutating
        # a directory while iterating over it can skip or revisit files.
        images = sorted(mushroom.iterdir())
    except NotADirectoryError as e:
        # Stray files sitting next to the species folders (e.g. macOS metadata)
        if '.DS_Store' in str(mushroom):
            print(f'Removing .DS_Store file...')
            os.remove(mushroom)
        else:
            print(e)
        continue

    prefix = f'{mushroom_}_'
    i = 0  # Next index to try for a valid image
    j = 0  # Next index to try for a quarantined (unreadable) file
    for image in images:
        if not image.is_file():
            print(f'Skipping non-file entry {image}')
            continue

        if is_image(image):
            # Build the label first so a species missing from the dictionary
            # fails before the file is touched.
            info = dict_mushrooms[mushroom_]
            label = f'{info["name"]} ({info["latin"]}): {info["edibility"]}'

            # Files that already follow the naming scheme are kept as they are,
            # which makes re-running this cell idempotent.
            already_named = (
                image.suffix == '.jpg'
                and image.stem.startswith(prefix)
                and image.stem[len(prefix):].isdigit()
            )
            if already_named:
                new_name = image.name
            else:
                new_name = f'{prefix}{str(i).zfill(4)}.jpg'
                # Prevent overwriting previously downloaded images with the same target name
                while (mushroom / new_name).exists():
                    i += 1
                    new_name = f'{prefix}{str(i).zfill(4)}.jpg'
                os.rename(str(image), str(mushroom / new_name))
                i += 1

            label_records.append({
                'name': f'{mushroom_}/{new_name}',  # path relative to train_folder
                'labels': label,
            })
        else:
            suffix = image.suffix
            if suffix.lower() in non_image_suffixes:
                print(f'Removing not an image file: {image}')
                os.remove(image)
            else:
                # `suffix` already contains the dot, so it is appended as-is
                new_name = f'{prefix}{str(j).zfill(4)}{suffix}'
                # Never overwrite a file quarantined earlier
                while (unknown_folder / new_name).exists():
                    j += 1
                    new_name = f'{prefix}{str(j).zfill(4)}{suffix}'
                print(f'Renaming corrupt image {image}')
                os.rename(str(image), str(unknown_folder / new_name))
                j += 1

df_labels = pd.DataFrame(label_records, columns=['name', 'labels'])


In [None]:
# Persist the image-name / label table next to the notebook data (no index column)
df_labels.to_csv(data_path_other / 'labels.csv', index=False)

## Generate databunch

Add some data augmentation with `get_transforms`:

In [None]:
# Augmentation settings passed to fastai's `get_transforms`:
# vertical flips enabled, lighting change capped at 0.1, zoom at 1.05, warp at 0.5.
tfms = get_transforms(max_warp=0.5, max_zoom=1.05, max_lighting=0.1, flip_vert=True)

Define batch size, image size:

In [None]:
# Batch size (alternative value noted by the author: 64) and image side in pixels
bs, size = 32, 128
# Seed NumPy's global RNG so we always get the same validation set
np.random.seed(42)

Create an `ImageDataBunch` from `path` by splitting the images in `folder`, labelled in the file `csv_labels`, between a training set and a validation set.

Use `valid_pct` to indicate the percentage of the total images to use as the validation set. An optional test folder contains unlabelled data and suffix contains an optional suffix to add to the filenames in `csv_labels` (such as '.jpg').
* `fn_col` is the index (or the name) of the column containing the filenames, and
* `label_col` is the index (indices) (or the name(s)) of the column(s) containing the labels.
* Use `header` to specify the format of the csv header, and
* `delimiter` to specify a non-standard csv-field separator.

In case your csv has no header, column parameters can only be specified as indices. If `label_delim` is passed, split what's in the label column according to that separator.

In [None]:
# Inspect the dataset root, then open the fastai documentation for `ImageDataBunch`
test_train_path

doc(ImageDataBunch)

In [None]:
# Build the labelled image list: read every image under `train_folder`, hold out
# 20% at random for validation, and use the parent folder name as the label.
# The seed is passed explicitly so the split is the same every time this cell
# runs, instead of depending on the global RNG state left by other cells.
src = (ImageList.from_folder(path=train_folder)
       .split_by_rand_pct(0.2, seed=42)
       .label_from_folder())

In [None]:
# Apply the augmentations, resize, batch, and normalize with the ImageNet
# statistics. The notebook-level `size` and `bs` settings are used here so
# the values defined above actually take effect (the image size was hard-coded
# and the batch size was silently left at the library default).
img_data = (src.transform(tfms, size=size)
        .databunch(bs=bs)
        .normalize(imagenet_stats))

In [None]:
# `img_data` was already normalized with `imagenet_stats` when it was built.
# fastai raises "Can not call normalize twice" on a second `normalize()` call,
# which would stop a Restart & Run All here, so just display the DataBunch.
img_data

First image of the training dataset:

In [None]:
# First (image, label) item of the training set
img_data.train_ds[0]

First image of the validation dataset:

In [None]:
# First (image, label) item of the validation set
img_data.valid_ds[0]

Show some images from the dataset:

In [None]:
# Visual sanity check of images and labels: 3 rows of samples from one batch
img_data.show_batch(rows=3, figsize=(10,8))

Number of classes in the dataset

In [None]:
# Number of categories (`c` attribute of the DataBunch)
img_data.c

In [None]:
# Names of the categories; with `label_from_folder` these come from the folder names
img_data.classes

## Train the model, with image size=128 

In [None]:
# Create a CNN learner with a ResNet-34 backbone on the 128px data, reporting
# both accuracy and error rate on the validation set.
model = cnn_learner(img_data, models.resnet34, metrics=[accuracy, error_rate])

In [None]:
# First training pass: one epoch with the one-cycle policy
model.fit_one_cycle(1)

In [None]:
# Checkpoint after the first epoch.
# NOTE(review): an absolute path is passed, so this presumably lands in the home
# directory rather than in the learner's own models folder — confirm.
model.save(Path.home()/'mushrooms_1_cycle')

In [None]:
# Two more epochs with the one-cycle policy
model.fit_one_cycle(2)

In [None]:
# Checkpoint the current weights.
# NOTE(review): only 1 + 2 epochs have been run in this notebook at this point,
# so the "7" in the checkpoint name does not match — confirm or rename.
model.save('train_7_cycles')

In [None]:
# Build an interpretation object from the learner to inspect its mistakes
interpret = ClassificationInterpretation.from_learner(model)

In [None]:
# Show the 4 samples with the highest loss
interpret.plot_top_losses(4, figsize=(20, 25))

In [None]:
# Confusion matrix across all classes (large figure because there are many species)
interpret.plot_confusion_matrix(figsize=(20,20), dpi=60)

In [None]:
# Run the learning-rate finder to choose a rate for the next training stage
model.lr_find()

In [None]:
# Plot the loss recorded by the learning-rate finder against the learning rate
model.recorder.plot()

In [None]:
# Unfreeze the whole network and fine-tune for 3 epochs, with learning rates
# spread from 1e-3 to 1e-2 across the layer groups.
model.unfreeze()
model.fit_one_cycle(3, max_lr=slice(1e-03, 1e-02))

In [None]:
# Five more fine-tuning epochs with the same learning-rate range
model.fit_one_cycle(5, max_lr=slice(1e-03, 1e-02))

In [None]:
# Checkpoint at the end of the 128px stage (3 + 5 unfrozen epochs)
model.save('stage-1-128-rn34_lr_8_cycles')

## Train with image size=256

In [None]:
# Image side in pixels for the second, higher-resolution training stage
size=256

In [None]:
# Rebuild the DataBunch at the new resolution from the already-split `src`:
# - reusing `src` keeps exactly the same train/validation split as the 128px
#   stage (a fresh random split would put images the model has already trained
#   on into the validation set and inflate the metrics);
# - the same augmentations (`tfms`) are applied as in the first stage;
# - the data is normalized with `imagenet_stats`, like the data the model was
#   trained on so far.
img_data = (src.transform(tfms, size=size)
        .databunch(bs=bs)
        .normalize(imagenet_stats))

In [None]:
# Point the existing learner at the new DataBunch (the learner object is reused)
model.data = img_data

In [None]:
# Notes (kept as comments: plain text in a code cell is a SyntaxError):
# See around minute 51 of the Lesson 3 video.
# Do some data augmentation.
# Create new databunch with images of size=256

In [None]:
# Freeze the network again, then rerun the learning-rate finder on the 256px
# data and plot its result.
model.freeze()
model.lr_find()
model.recorder.plot()

In [None]:
# Learning rate of 5e-4 for the 256px stage.
# NOTE(review): presumably read off the learning-rate finder plot — confirm.
lr = 1e-3/2
# Two epochs at that rate
model.fit_one_cycle(2, slice(lr))

In [None]:
# Three more epochs at the same learning rate
model.fit_one_cycle(3, slice(lr))

In [None]:
# Checkpoint at the end of the 256px stage (2 + 3 epochs)
model.save('train_final5_cycles')

In [None]:
# Reload the checkpoint saved in the previous cell
model.load('train_final5_cycles')

In [None]:
# Export the learner for inference.
# NOTE(review): a relative file name is given, so the file is presumably written
# under the learner's own path (the training folder), not under `model_path` — verify.
model.export('export_resnet34_model.pkl')

# Test model with other images:

In [None]:
# Load the learner exported by this notebook. The export cell writes
# 'export_resnet34_model.pkl' relative to the learner's path (the training
# folder), whereas `load_learner(model_path)` would look for the default
# 'export.pkl' under the app's models directory and fail unless the file had
# been copied and renamed by hand. To test the deployed copy instead, pass
# `model_path` and the file name used there.
learn = load_learner(train_folder, 'export_resnet34_model.pkl')

In [None]:
# Test picture stored in the user's home directory
image_path = Path.home() / 'rovello.jpg'

In [None]:
# Load the test picture with fastai's `open_image`
img = open_image(image_path)

In [None]:
# Display the full path of the test picture (same value as `image_path`)
Path.home() / 'rovello.jpg'

In [None]:
# Run inference on the test picture with the loaded learner
learn.predict(img)

In [None]:
# Display the home directory that the paths above are built from
Path.home()

# Deploy Web Page with Elastic Beanstalk

Tutorial Client: https://docs.aws.amazon.com/elasticbeanstalk/latest/dg/eb-cli3.html

Tutorial Bundle: https://docs.aws.amazon.com/elasticbeanstalk/latest/dg/applications-sourcebundle.html

# References

* [Fastai multi label](https://gilberttanner.com/blog/fastai-multi-label-image-classification)
* [Google images scraping](https://medium.com/@intprogrammer/how-to-scrape-google-for-images-to-train-your-machine-learning-classifiers-on-565076972ce)
* [Basic mushroom info](https://bolets.info/)
* [Mushroom names in Latin and additional info](https://ca.wikipedia.org/)
* [Instagram @natros56](https://www.instagram.com/explore/tags/indexboletsnatros56_cat/)




# Not Used