In [63]:
LMDB_FILEPATH_TRAIN = "/mnt/lmdb_storage/seefood_train_data"
LMDB_FILEPATH_TEST = "/mnt/lmdb_storage/seefood_test_data"

## Imports

In [2]:
%load_ext lab_black

In [3]:
%load_ext google.cloud.bigquery
%load_ext line_profiler

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
import sys

sys.path.insert(0, "../..")

In [6]:
import time
import copy

import pandas as pd
import numpy as np
import altair as alt
import lmdb
import pickle
import os
import string

import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split

from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import datasets, models, transforms
from torch.utils.tensorboard import SummaryWriter
from PIL import Image, ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True

from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [7]:
def get_metrics(name, y_test, y_pred):
    return pd.DataFrame(
        {
            "experiment_name": name,
            "r2_score": [r2_score(y_test, y_pred)],
            "explained_variance_score": [explained_variance_score(y_test, y_pred)],
            "max_error": [max_error(y_test, y_pred)],
            "mean_absolute_error": [mean_absolute_error(y_test, y_pred)],
            "mean_squared_error": [mean_squared_error(y_test, y_pred)],
            "median_absolute_error": [median_absolute_error(y_test, y_pred)],
        }
    )

## Load data

# Train Neural Network

In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [84]:
IMAGE_SIZE = 224


class Features:
    def __init__(self, features, target):
        self.shape = features.shape
        self.features = features.numpy().tobytes()
        self.target = target.round().item()

    def get_features(self):
        features = np.frombuffer(self.features, dtype=np.float32)
        return torch.from_numpy(features.reshape(self.shape))


class LMDBDataset(torch.utils.data.Dataset):
    def __init__(self, lmdb_filename):
        self.env = lmdb.open(
            lmdb_filename,
            max_readers=1,
            readonly=True,
            lock=False,
            readahead=False,
            meminit=False,
        )
        print(self.env.stat())
        with self.env.begin(write=False) as txn:
            self.length = txn.stat()["entries"]

    def __getitem__(self, index):
        with self.env.begin(write=False) as txn:
            key = f"{index:08}".encode("ascii")
            buf = txn.get(key)

        features = pickle.loads(buf)
        return features.get_features(), features.target

    def __len__(self):
        return self.length


class CalorieNet(nn.Module):
    """ Predicts calories given an image displaying food """

    def __init__(self):
        super(CalorieNet, self).__init__()
        self.regressor = nn.Sequential(
            nn.Dropout(0.2), nn.Linear(1280, 512), nn.Linear(512, 1)
        )

    def forward(self, x):
        x = x.mean([2, 3])
        y = self.regressor(x)
        y = y.squeeze()
        return y

In [85]:
dataloaders = {
    "train": torch.utils.data.DataLoader(
        LMDBDataset(LMDB_FILEPATH_TRAIN),
        batch_size=64,
        shuffle=False,
        num_workers=0,
        pin_memory=False,
    ),
    "val": torch.utils.data.DataLoader(
        LMDBDataset(LMDB_FILEPATH_TEST),
        batch_size=64,
        shuffle=False,
        num_workers=0,
        pin_memory=False,
    ),
}

{'psize': 4096, 'depth': 3, 'branch_pages': 27, 'leaf_pages': 5671, 'overflow_pages': 54489506, 'entries': 878863}
{'psize': 4096, 'depth': 3, 'branch_pages': 14, 'leaf_pages': 2793, 'overflow_pages': 26838188, 'entries': 432874}


In [86]:
device

device(type='cuda', index=0)

In [87]:
net = CalorieNet().to(device)

In [88]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=5):
    writer = SummaryWriter()
    since = time.time()
    training_loss = []
    validation_loss = []

    for epoch in tqdm(range(num_epochs)):
        for phase in ["train", "val"]:
            if phase == "train":
                model.train()
            else:
                model.eval()
            running_loss = 0.0

            i = 0
            for inputs, targets in tqdm(dataloaders[phase]):
                inputs = inputs.to(device)
                targets = targets.to(device)

                optimizer.zero_grad()
                with torch.set_grad_enabled(phase == "train"):
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)
                    if phase == "train":
                        loss.backward()
                        optimizer.step()
                running_loss += loss.item() * inputs.size(0)

            if phase == "train":
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            if phase == "train":
                training_loss.append(epoch_loss)
                writer.add_scalar("Loss/train", epoch_loss, epoch)
            else:
                validation_loss.append(epoch_loss)
                writer.add_scalar("Loss/val", epoch_loss, epoch)

    time_elapsed = time.time() - since

    print(f"Training complete in {time_elapsed/60}m {time_elapsed % 60}s")
    writer.close()
    return model, (training_loss, validation_loss)

In [89]:
criterion = nn.L1Loss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [97]:
%lprun -f train_model model, metrics = train_model(net, criterion, optimizer, exp_lr_scheduler, num_epochs=100)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=13733.0), HTML(value='')))

Timer unit: 1e-06 s

Total time: 7.87162 s
File: <ipython-input-88-a9b305f5875d>
Function: train_model at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def train_model(model, criterion, optimizer, scheduler, num_epochs=5):
     2         1       2509.0   2509.0      0.0      writer = SummaryWriter()
     3         1          3.0      3.0      0.0      since = time.time()
     4         1          1.0      1.0      0.0      training_loss = []
     5         1          1.0      1.0      0.0      validation_loss = []
     6                                           
     7         1      36856.0  36856.0      0.5      for epoch in tqdm(range(num_epochs)):
     8         1          3.0      3.0      0.0          for phase in ["train", "val"]:
     9         1          1.0      1.0      0.0              if phase == "train":
    10         1         85.0     85.0      0.0                  model.train()
    11          

*** KeyboardInterrupt exception caught in code being profiled.

In [None]:
training_loss, validation_loss = metrics
pd.DataFrame({"training": training_loss, "validation": validation_loss}).plot.line(
    figsize=(15, 10)
)

In [None]:
predict_dataloader = torch.utils.data.DataLoader(
    ImageDataset(X_nn_val, y_nn_val, data_transforms["val"]),
    batch_size=100,
    shuffle=False,
    num_workers=3,
)

In [None]:
y_pred_nn = []
for inputs, _ in predict_dataloader:
    inputs = inputs.to(device)
    y_pred_nn.append(model(inputs).to(cpu))

In [None]:
y_pred_nn_np = torch.flatten(torch.cat(y_pred_nn)).detach().numpy()

In [None]:
y_pred_nn

In [None]:
df_nn_results = get_metrics("nn", y_nn_val, y_pred_nn_np)
df_nn_results

## Create Baseline

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin
from sklearn.utils.validation import check_is_fitted


class BaselineModel(BaseEstimator, RegressorMixin):
    def __init__(self):
        pass

    def fit(self, _, y):
        self.mean_ = y.mean()
        return self

    def predict(self, X):
        check_is_fitted(self, [])

        return np.array(X.shape[0] * [self.mean_])

In [None]:
baseline_model = BaselineModel()

In [None]:
baseline_model.fit(X_nn_train, y_nn_train)

In [None]:
y_pred_baseline = baseline_model.predict(X_nn_val)

In [None]:
df_baseline_results = get_metrics("baseline", y_nn_val, y_pred_baseline)
df_baseline_results

## Compare NN to Baseline

In [None]:
df_results = pd.concat([df_baseline_results, df_nn_results]).reset_index(drop=True).T
df_results.columns = df_results.loc["experiment_name"].values
df_results = df_results.iloc[1:]

In [None]:
df_results

In [None]:
df_results.plot.bar(log=True, figsize=(12, 7))

## Inspect predictions

In [None]:
ylim = (0, 1200)

In [None]:
df_nn.loc[X_nn_train.index].total_calories.plot.hist(
    bins=300, figsize=(16, 11), ylim=ylim
)

In [None]:
df_predictions = df_nn.loc[X_nn_val.index].assign(
    predicted_calories=np.exp(y_pred_nn_np)
)

In [None]:
df_predictions[["total_calories", "predicted_calories"]].plot.hist(
    bins=300, figsize=(16, 11), alpha=0.8, ylim=ylim
)

In [None]:
df_predictions_sample = df_predictions
line = (
    alt.Chart(df_predictions_sample)
    .mark_line()
    .encode(x="total_calories", y="total_calories")
)

scatter = (
    alt.Chart(df_predictions_sample)
    .mark_circle(color="red")
    .encode(
        x="total_calories",
        y="predicted_calories",
        tooltip=["title", "total_calories", "predicted_calories"],
    )
).interactive()

(line + scatter).properties(width=800, height=800)

In [None]:
NON_FEATURE_COLS = ["title", "total_calories", "servings", "predicted_calories"]

In [None]:
df_high_cal = df_predictions[df_predictions.predicted_calories > 300]
df_low_cal = df_predictions[df_predictions.predicted_calories < 120]

In [None]:
df_high_cal[NON_FEATURE_COLS].sample(10)

In [None]:
df_low_cal[NON_FEATURE_COLS].sample(10)

In [None]:
df_high_cal[NON_FEATURE_COLS].describe()

In [None]:
df_low_cal[NON_FEATURE_COLS].describe()

In [None]:
high_cal_wc = WordCloud().generate(" ".join(df_high_cal.title.str.lower()))
low_cal_wc = WordCloud().generate(" ".join(df_low_cal.title.str.lower()))

In [None]:
plt.figure(figsize=(15, 15))
plt.imshow(high_cal_wc, interpolation="nearest")

In [None]:
plt.figure(figsize=(15, 15))
plt.imshow(low_cal_wc, interpolation="nearest")

# Persist Model

In [None]:
!ls ../../models

In [None]:
torch.save(model_ft, "../../models/calorie_regression_mobilenet_.pt")