# Train Food Recognition Model

In [1]:
# Directory containing the LMDB databases read by the data-loading cells below.
LMDB_ROOT = "/mnt/data_ssd/lmdb"

LMDB_TRAIN_FILEPATH = f"{LMDB_ROOT}/seefood_train_data_efficientnet_b3"
LMDB_TEST_FILEPATH = f"{LMDB_ROOT}/seefood_test_data_efficientnet_b3"
LMDB_FLICKR30K_FILEPATH = f"{LMDB_ROOT}/flickr_data_efficientnet_b3"

## Imports and Utilities

In [2]:
# Load the lab_black extension so code cells are auto-formatted with Black.
%load_ext lab_black

In [3]:
# Reload imported modules before each cell executes, so edits to local
# packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import logging
import pickle
from datetime import datetime

from seefood.plotting import plot_interactive_scatter, plot_images
from seefood.data import LMDBDataset
from seefood.features import EfficientNetFeatureExtractor
from seefood.model import food_recognition_model

from bokeh.io import output_notebook
from bokeh.layouts import row
from bokeh.plotting import show

import altair as alt

import torch
import umap
import numpy as np
import pandas as pd
import optuna

Using cache found in /home/mike/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master


In [5]:
# Route Bokeh output to the notebook so `show(...)` renders figures inline.
output_notebook()

In [6]:
# Turn off Altair's maximum-row check so charts can be built from data frames
# larger than the default limit.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [7]:
# Prefix every log record with the emitting function name and line number.
LOG_FORMAT = "[%(funcName)s:%(lineno)d] - %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

## Load Data

In [8]:
# Read the training set as a single batch of up to 100000 samples.
train_dataset = LMDBDataset(LMDB_TRAIN_FILEPATH)
dataloader_train = torch.utils.data.DataLoader(
    train_dataset, batch_size=100000, shuffle=False, num_workers=0
)

# Each batch unpacks into (image paths, features, a third field unused here).
train_batch = next(iter(dataloader_train))
image_paths_train, X_train, _ = train_batch
image_paths_train[0], len(image_paths_train)

('/mnt/data_ssd/datasets/sparkrecipes/323873/000009', 100000)

In [9]:
# Read the Flickr30k evaluation set as a single batch of up to 5000 samples.
eval_dataset = LMDBDataset(LMDB_FLICKR30K_FILEPATH)
dataloader_eval = torch.utils.data.DataLoader(
    eval_dataset, batch_size=5000, shuffle=False, num_workers=0
)

# Each batch unpacks into (image paths, features, a third field unused here).
eval_batch = next(iter(dataloader_eval))
image_paths_eval, X_eval, _ = eval_batch
image_paths_eval[0], len(image_paths_eval)

('/mnt/data_ssd/datasets/flickr30k/7589467042.jpg', 5000)

In [10]:
# Read the test set as a single batch of up to 25000 samples.
test_dataset = LMDBDataset(LMDB_TEST_FILEPATH)
dataloader_test = torch.utils.data.DataLoader(
    test_dataset, batch_size=25000, shuffle=False, num_workers=0
)

# Each batch unpacks into (image paths, features, a third field unused here).
test_batch = next(iter(dataloader_test))
image_paths_test, X_test, _ = test_batch
image_paths_test[0], len(image_paths_test)

('/mnt/data_ssd/datasets/sparkrecipes/114054/000015', 25000)

## Tune Hyperparameters

In [None]:
def objective(trial):
    """Optuna objective: fit the pipeline with sampled settings and return its BIC.

    Samples the mixture covariance type, the number of PCA components and the
    number of mixture components, fits ``food_recognition_model()`` on
    ``X_train`` with those settings, and evaluates ``bic`` of the final
    pipeline step on the transformed training data.

    Args:
        trial: the ``optuna`` trial used to sample the hyperparameters.

    Returns:
        The value returned by ``model.bic`` for the transformed ``X_train``.
    """
    covariance_type = trial.suggest_categorical(
        "model__covariance_type", ["full", "tied", "diag", "spherical"]
    )
    pca_n_components = trial.suggest_int("pca__n_components", 128, 1024, step=64)
    gmm_n_components = trial.suggest_int("model__n_components", 1, 128)

    pipe = (
        food_recognition_model()
        .set_params(
            pca__n_components=pca_n_components,
            model__covariance_type=covariance_type,
            model__n_components=gmm_n_components,
        )
        .fit(X_train)
    )

    # Split the fitted pipeline into its preprocessing steps and the final model.
    feature_transform = pipe[:-1]
    model = pipe[-1]

    # The preprocessing steps were already fitted by `.fit(X_train)` above, so
    # only `transform` is needed; `fit_transform` would fit them a second time
    # on every trial.
    return model.bic(feature_transform.transform(X_train))

In [None]:
# The objective returns a BIC value, so the study minimizes it
# ("minimize" is also Optuna's default direction).
study = optuna.create_study(direction="minimize")

In [None]:
# Evaluate `objective` for 150 trials, showing a progress bar while running.
study.optimize(objective, n_trials=150, show_progress_bar=True)

In [None]:
# Hyperparameters of the best trial found by the study.
study.best_params

In [None]:
# Contour of the objective over the two mixture-model hyperparameters.
contour_params = ["model__covariance_type", "model__n_components"]
optuna.visualization.plot_contour(study, params=contour_params)

In [None]:
# Objective value per trial over the course of the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Parallel-coordinate view restricted to the mixture-model hyperparameters.
parallel_params = ["model__covariance_type", "model__n_components"]
optuna.visualization.plot_parallel_coordinate(study, params=parallel_params)

In [None]:
# Per-parameter slice plots for the mixture-model hyperparameters.
slice_params = ["model__covariance_type", "model__n_components"]
optuna.visualization.plot_slice(study, params=slice_params)

In [None]:
# Slice plot for the PCA dimensionality. Only parameters that `objective`
# samples can be plotted; "pca_whiten" is not one of them and makes
# `plot_slice` raise a ValueError, so it is not listed here.
optuna.visualization.plot_slice(study, params=["pca__n_components"])

## Train Final Model

In [None]:
# Settings for the final model.
# NOTE(review): presumably picked from the tuning study above — confirm.
final_params = {
    "pca__n_components": 512,
    "model__covariance_type": "full",
    "model__n_components": 16,
}
model = food_recognition_model().set_params(**final_params).fit(X_train)


From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.

[default_log_callback:35] - [StandardScaler(copy=True, with_mean=True, with_std=True)] shape=(100000, 1536) time=2s
[default_log_callback:35] - [PCA(copy=True, iterated_power='auto', n_components=512, random_state=0,
    svd_solver='auto', tol=0.0, whiten=True)] shape=(100000, 512) time=25s

From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.

[default_log_callback:35] - [PrintPreprocessStats(pca=None)] shape=(100000, 512) time=0s


In [None]:
# Split the fitted pipeline: every step but the last, and the last step itself.
feature_transform, gmm = model[:-1], model[-1]

In [None]:
# BIC of the final model on the training data. The preprocessing steps were
# already fitted together with `model`, so they are only applied here
# (`transform`) instead of being refitted (`fit_transform`).
gmm.bic(feature_transform.transform(X_train))

## Persist Model in Google Cloud Storage

In [None]:
# Make sure the local output directory exists (no error if it already does).
!mkdir -p models

In [None]:
# Serialize the fitted pipeline to a date-stamped file. The `with` block
# guarantees the file is flushed and closed as soon as the dump finishes,
# instead of relying on garbage collection of an anonymous file handle.
model_path = f'models/food_recognition_{datetime.now().strftime("%Y%m%d")}.pickle'
with open(model_path, "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
# Upload every pickle found in models/ (not only the one written above) to the
# gs://seefood-models/ bucket.
!gsutil cp models/*.pickle gs://seefood-models/

## Analyze Results

In [None]:
# Stack the training and evaluation features row-wise for the joint analysis.
X = np.concatenate((X_train, X_eval), axis=0)
X.shape

In [None]:
def extract_path_suffix(image_paths, prefix="/mnt/data_ssd/datasets/"):
    """Return each image path relative to the dataset root directory.

    Args:
        image_paths: iterable of path strings.
        prefix: leading directory to strip from each path.

    Returns:
        A list with one entry per input path, in input order. Paths that start
        with ``prefix`` have it removed; any other path is returned unchanged
        rather than being cut at an arbitrary position.
    """
    return [p[len(prefix) :] if p.startswith(prefix) else p for p in image_paths]

In [None]:
# Relative paths for the training images followed by the evaluation images,
# in the same order as the rows of X.
all_path_suffixes = [
    *extract_path_suffix(image_paths_train),
    *extract_path_suffix(image_paths_eval),
]
all_path_suffixes[0]

In [None]:
img_disk_path = image_paths_train + image_paths_eval
img_urls = [f"http://localhost:8080/{path}" for path in all_path_suffixes]
scores = model.score_samples(X)
labels = ["blue"] * len(image_paths_train) + ["red"] * len(image_paths_eval)
umap_x = umap_mapper.embedding_[:, 0]
umap_y = umap_mapper.embedding_[:, 1]

In [None]:
df_results = pd.DataFrame(
    {
        "img_disk_path": img_disk_path,
        "img_url": img_urls,
        "score": scores,
        "label": labels,
        "umap_x": umap_x,
        "umap_y": umap_y,
    }
)
df_results

### Scatter Plots of Embeddings

In [None]:
umap_mapper = umap.UMAP(
    n_components=2, n_neighbors=5, min_dist=0.01, metric="cosine"
).fit(X)

In [None]:
# Two scatter plots of the same embedding shown side by side: one driven by the
# model scores (`values=`), the other by the dataset labels (`labels=`).
embedding = umap_mapper.embedding_
gmm_score_fig = plot_interactive_scatter(embedding, img_urls, values=scores)
class_fig = plot_interactive_scatter(embedding, img_urls, labels=labels)

show(row(gmm_score_fig, class_fig))

### Score Distribution

In [None]:
# Histogram of scores (up to 150 bins), colored by the `label` column.
score_histogram = (
    alt.Chart(df_results)
    .mark_bar()
    .encode(alt.X("score:Q", bin=alt.Bin(maxbins=150)), y="count()", color="label")
    .properties(width=650, height=500)
    .interactive()
)
score_histogram

In [None]:
# "blue" rows are the training (food) samples, "red" rows the Flickr30k samples.
df_food_results = df_results.loc[df_results["label"] == "blue"]
df_flickr_results = df_results.loc[df_results["label"] == "red"]

In [None]:
# Score cut-off: the 10% quantile of the food-image scores.
threshold = df_food_results["score"].quantile(0.1)
threshold

In [None]:
# Fraction of food images scoring below the threshold.
n_food_below = len(df_food_results[df_food_results["score"] < threshold])
n_food_below / len(df_food_results)

In [None]:
# Fraction of Flickr30k images scoring below the threshold.
n_flickr_below = len(df_flickr_results[df_flickr_results["score"] < threshold])
n_flickr_below / len(df_flickr_results)

### Analyze Images with Low / High Scores

In [None]:
# Food images ordered from lowest to highest score.
food_lowest_first = df_food_results.sort_values(by="score", ascending=True)
plot_images(food_lowest_first["img_disk_path"])

In [None]:
# Food images ordered from highest to lowest score.
food_highest_first = df_food_results.sort_values(by="score", ascending=False)
plot_images(food_highest_first["img_disk_path"])

In [None]:
# Flickr30k images ordered from lowest to highest score.
flickr_lowest_first = df_flickr_results.sort_values(by="score", ascending=True)
plot_images(flickr_lowest_first["img_disk_path"])

In [None]:
# Flickr30k images ordered from highest to lowest score.
flickr_highest_first = df_flickr_results.sort_values(by="score", ascending=False)
plot_images(flickr_highest_first["img_disk_path"])

### Filtered Scatter

In [None]:
def show_filtered_scatter(df, filter_func):
    """Build a scatter of ``df``'s UMAP coordinates, marking rows picked by ``filter_func``.

    Args:
        df: data frame providing the ``umap_x``, ``umap_y`` and ``img_url`` columns.
        filter_func: callable applied to ``df``; its result is mapped
            element-wise to a label, truthy -> "red" and falsy -> "blue".

    Returns:
        Whatever ``plot_interactive_scatter`` returns for these inputs.
    """

    def to_color(is_selected):
        return "red" if is_selected else "blue"

    colors = filter_func(df).map(to_color)
    coordinates = df[["umap_x", "umap_y"]].values
    return plot_interactive_scatter(coordinates, df.img_url, labels=colors)

In [None]:
# Side-by-side scatters with the below-threshold samples marked:
# food images on the left, Flickr30k images on the right.
food_scatter = show_filtered_scatter(
    df_food_results, lambda df: df.score < threshold
)
flickr_scatter = show_filtered_scatter(
    df_flickr_results, lambda df: df.score < threshold
)
show(row(food_scatter, flickr_scatter))

### Test Data

In [None]:
# Score the held-out test features with the fitted pipeline.
test_scores = model.score_samples(X_test)

In [None]:
# One row per test image: its path on disk and its score.
df_test_results = pd.DataFrame(dict(img_disk_path=image_paths_test, score=test_scores))
df_test_results

In [None]:
# Histogram of the test-set scores using 150 bins.
df_test_results.score.hist(bins=150)

In [None]:
# Fraction of test images scoring below the threshold derived from the food scores.
n_test_below = len(df_test_results[df_test_results["score"] < threshold])
n_test_below / len(df_test_results)

In [None]:
# Test images ordered from highest to lowest score.
test_highest_first = df_test_results.sort_values(by="score", ascending=False)
plot_images(test_highest_first["img_disk_path"])

In [None]:
# Test images ordered from lowest to highest score.
test_lowest_first = df_test_results.sort_values(by="score", ascending=True)
plot_images(test_lowest_first["img_disk_path"])