# Section 1: General EDA

In [None]:
# General modules
import tarfile
import pandas as pd
import numpy as np
import os
import json
import warnings
import time
import shutil

# Image visualization modules
from PIL import Image
import matplotlib.pyplot as plt

# Metrics
from sklearn import metrics

# ResNet/Model modules
from sklearn.model_selection import train_test_split
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets

In [None]:
warnings.filterwarnings("ignore")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
images_path = '/content/drive/Othercomputers/My Computer/food-101/images'
all_image_classes = os.listdir(images_path)
print("There are {0} image clasess in the food-101 dataset".format(len(all_image_classes)))

There are 101 image clasess in the food-101 dataset


In [None]:
metadata_path = '/content/drive/Othercomputers/My Computer/food-101/meta'
metadata_info = os.listdir(metadata_path)
print(metadata_info)

['train.txt', 'train.json', 'test.txt', 'test.json', 'labels.txt', 'classes.txt']


In [None]:
# 'classes.txt'
meta_class_data = os.path.join(metadata_path, 'classes.txt')

with open(meta_class_data, 'r') as f:
    meta_classes = f.read().splitlines()  # Classes listed in the 'classes.txt' file in the 'meta' folder

print(meta_classes)
print("")
print("{0} food classes were found in the 'classes.txt' file in the 'meta' folder.".format(len(meta_classes)))

['apple_pie', 'baby_back_ribs', 'baklava', 'beef_carpaccio', 'beef_tartare', 'beet_salad', 'beignets', 'bibimbap', 'bread_pudding', 'breakfast_burrito', 'bruschetta', 'caesar_salad', 'cannoli', 'caprese_salad', 'carrot_cake', 'ceviche', 'cheesecake', 'cheese_plate', 'chicken_curry', 'chicken_quesadilla', 'chicken_wings', 'chocolate_cake', 'chocolate_mousse', 'churros', 'clam_chowder', 'club_sandwich', 'crab_cakes', 'creme_brulee', 'croque_madame', 'cup_cakes', 'deviled_eggs', 'donuts', 'dumplings', 'edamame', 'eggs_benedict', 'escargots', 'falafel', 'filet_mignon', 'fish_and_chips', 'foie_gras', 'french_fries', 'french_onion_soup', 'french_toast', 'fried_calamari', 'fried_rice', 'frozen_yogurt', 'garlic_bread', 'gnocchi', 'greek_salad', 'grilled_cheese_sandwich', 'grilled_salmon', 'guacamole', 'gyoza', 'hamburger', 'hot_and_sour_soup', 'hot_dog', 'huevos_rancheros', 'hummus', 'ice_cream', 'lasagna', 'lobster_bisque', 'lobster_roll_sandwich', 'macaroni_and_cheese', 'macarons', 'miso_sou

In [None]:
with open(meta_class_data, 'r') as f:
    meta_classes = f.read().splitlines()  # Classes listed in the 'classes.txt' file in the 'meta' folder

print(meta_classes)
print("")
print("{0} food classes were found in the 'classes.txt' file in the 'meta' folder.".format(len(meta_classes)))

['apple_pie', 'baby_back_ribs', 'baklava', 'beef_carpaccio', 'beef_tartare', 'beet_salad', 'beignets', 'bibimbap', 'bread_pudding', 'breakfast_burrito', 'bruschetta', 'caesar_salad', 'cannoli', 'caprese_salad', 'carrot_cake', 'ceviche', 'cheesecake', 'cheese_plate', 'chicken_curry', 'chicken_quesadilla', 'chicken_wings', 'chocolate_cake', 'chocolate_mousse', 'churros', 'clam_chowder', 'club_sandwich', 'crab_cakes', 'creme_brulee', 'croque_madame', 'cup_cakes', 'deviled_eggs', 'donuts', 'dumplings', 'edamame', 'eggs_benedict', 'escargots', 'falafel', 'filet_mignon', 'fish_and_chips', 'foie_gras', 'french_fries', 'french_onion_soup', 'french_toast', 'fried_calamari', 'fried_rice', 'frozen_yogurt', 'garlic_bread', 'gnocchi', 'greek_salad', 'grilled_cheese_sandwich', 'grilled_salmon', 'guacamole', 'gyoza', 'hamburger', 'hot_and_sour_soup', 'hot_dog', 'huevos_rancheros', 'hummus', 'ice_cream', 'lasagna', 'lobster_bisque', 'lobster_roll_sandwich', 'macaroni_and_cheese', 'macarons', 'miso_sou

In [None]:
# 'labels.txt'
labels_data = os.path.join(metadata_path, 'labels.txt')

# 'test.json'
test_json_data = os.path.join(metadata_path, 'test.json')

# 'test.txt'
test_txt_data = os.path.join(metadata_path, 'test.txt')

# 'train.json'
train_json_data = os.path.join(metadata_path, 'train.json')

# 'train.txt'
train_txt_data = os.path.join(metadata_path, 'train.txt')

In [None]:
food_classes = ['apple_pie', 'baby_back_ribs', 'baklava', 'beef_carpaccio', 'beef_tartare', 'beet_salad', 'beignets', 'bibimbap', 'bread_pudding', 'breakfast_burrito', 'bruschetta', 'caesar_salad', 'cannoli', 'caprese_salad', 'carrot_cake', 'ceviche', 'cheesecake', 'cheese_plate', 'chicken_curry', 'chicken_quesadilla', 'chicken_wings', 'chocolate_cake', 'chocolate_mousse', 'churros', 'clam_chowder', 'club_sandwich', 'crab_cakes', 'creme_brulee', 'croque_madame', 'cup_cakes', 'deviled_eggs', 'donuts', 'dumplings', 'edamame', 'eggs_benedict', 'escargots', 'falafel', 'filet_mignon', 'fish_and_chips', 'foie_gras', 'french_fries', 'french_onion_soup', 'french_toast', 'fried_calamari', 'fried_rice', 'frozen_yogurt', 'garlic_bread', 'gnocchi', 'greek_salad', 'grilled_cheese_sandwich', 'grilled_salmon', 'guacamole', 'gyoza', 'hamburger', 'hot_and_sour_soup', 'hot_dog', 'huevos_rancheros', 'hummus', 'ice_cream', 'lasagna', 'lobster_bisque', 'lobster_roll_sandwich', 'macaroni_and_cheese', 'macarons', 'miso_soup', 'mussels', 'nachos', 'omelette', 'onion_rings', 'oysters', 'pad_thai', 'paella', 'pancakes', 'panna_cotta', 'peking_duck', 'pho', 'pizza', 'pork_chop', 'poutine', 'prime_rib', 'pulled_pork_sandwich', 'ramen', 'ravioli', 'red_velvet_cake', 'risotto', 'samosa', 'sashimi', 'scallops', 'seaweed_salad', 'shrimp_and_grits', 'spaghetti_bolognese', 'spaghetti_carbonara', 'spring_rolls', 'steak', 'strawberry_shortcake', 'sushi', 'tacos', 'takoyaki', 'tiramisu', 'tuna_tartare', 'waffles']
len(food_classes)

101

Let's pick 50 food classes randomly to train our model with:

NOTE: If you try to do run the code below, you will get a different set of 50 subclasses of food as it is generated randomly. To reproduce the model results, refer to Section 2.

In [None]:
# random_food_classes = random.sample(food_classes, 50)
# random_food_classes

Now, we need to get a path to each subclass in `random_food_classes`. To do this, we can use variable, `images_path` and append each subclass string-value to it.

In [None]:
# food_classes_path = []

# for subclass in random_food_classes:
#     sc_path = os.path.join(images_path, subclass)
#     food_classes_path.append(sc_path)

# food_classes_path[0]

'/content/drive/Othercomputers/My Computer/food-101/images/tacos'

In [None]:
# food_classes_path

Now, let's save our subclasses and paths conveniently in a dictionary.

In [None]:
# food_classes_dict = dict(zip(random_food_classes, food_classes_path))
# food_classes_dict.keys()

Now, let's download these classes for safe-keeping and easy loading:

In [None]:
# with open('food_classes_path.txt', 'w') as f:
#     for item in food_classes_path:
#         f.write("%s\n" % item)

In [None]:
# import json

# with open('food_classes_dict.txt', 'w') as f:
#     f.write(json.dumps(food_classes_dict))

In [None]:
# from google.colab import files

In [None]:
# files.download('food_classes_path.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# files.download('food_classes_dict.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Section 2: Data Preprocessing

Next, we need to get a list of each file from each subclass path in `subclasses_path` so that we can visualize some examples and do some basic statistics (if necessary).

In [None]:
with open('/content/drive/MyDrive/Colab Notebooks/Large Scale Machine Learning 2/Final Project/food_classes_path.txt', 'r') as f:
    food_classes_path = [line.strip() for line in f]

with open('/content/drive/MyDrive/Colab Notebooks/Large Scale Machine Learning 2/Final Project/food_classes_dict.txt', 'r') as f:
    food_classes_dict = json.loads(f.read())

In [None]:
# with open('food_classes_path.txt', 'r') as f:
#     food_classes_path = [line.strip() for line in f]

# with open('food_classes_dict.txt', 'r') as f:
#     food_classes_dict = json.loads(f.read())

In [None]:
food_classes_path[-1]

'/content/drive/Othercomputers/My Computer/food-101/images/carrot_cake'

In [None]:
# Lists to store images conveniently
tacos_imgs = []
hamburger_imgs = []
chocolate_cake_imgs = []
bread_pudding_imgs = []
creme_brulee_imgs = []
fried_rice_imgs = []
macarons_imgs = []
bruschetta_imgs = []
lobster_bisque_imgs = []
garlic_bread_imgs = []
fried_calamari_imgs = []
deviled_eggs_imgs = []
gyoza_imgs = []
french_toast_imgs = []
steak_imgs = []
omelette_imgs = []
pancakes_imgs = []
chicken_wings_imgs = []
samosa_imgs = []
spaghetti_bolognese_imgs = []
pizza_imgs = []
fish_and_chips_imgs = []
crab_cakes_imgs = []
panna_cotta_imgs = []
baby_back_ribs_imgs = []
pork_chop_imgs = []
paella_imgs = []
bibimbap_imgs = []
huevos_rancheros_imgs = []
takoyaki_imgs = []
seaweed_salad_imgs = []
onion_rings_imgs = []
hummus_imgs = []
foie_gras_imgs = []
risotto_imgs = []
chicken_curry_imgs = []
croque_madame_imgs = []
falafel_imgs = []
lobster_roll_sandwich_imgs = []
peking_duck_imgs = []
shrimp_and_grits_imgs = []
donuts_imgs = []
mussels_imgs = []
edamame_imgs = []
ceviche_imgs = []
grilled_salmon_imgs = []
hot_and_sour_soup_imgs = []
nachos_imgs = []
ramen_imgs = []
carrot_cake_imgs = []

In [None]:
food_classes_dict.keys()

dict_keys(['tacos', 'hamburger', 'chocolate_cake', 'bread_pudding', 'creme_brulee', 'fried_rice', 'macarons', 'bruschetta', 'lobster_bisque', 'garlic_bread', 'fried_calamari', 'deviled_eggs', 'gyoza', 'french_toast', 'steak', 'omelette', 'pancakes', 'chicken_wings', 'samosa', 'spaghetti_bolognese', 'pizza', 'fish_and_chips', 'crab_cakes', 'panna_cotta', 'baby_back_ribs', 'pork_chop', 'paella', 'bibimbap', 'huevos_rancheros', 'takoyaki', 'seaweed_salad', 'onion_rings', 'hummus', 'foie_gras', 'risotto', 'chicken_curry', 'croque_madame', 'falafel', 'lobster_roll_sandwich', 'peking_duck', 'shrimp_and_grits', 'donuts', 'mussels', 'edamame', 'ceviche', 'grilled_salmon', 'hot_and_sour_soup', 'nachos', 'ramen', 'carrot_cake'])

In [None]:
# Tacos
taco_imgs_dir = os.listdir(food_classes_dict['tacos'])

for img in taco_imgs_dir:
    tacos_imgs.append(os.path.join(food_classes_dict['tacos'], img))

# Hamburger
hamburger_imgs_dir = os.listdir(food_classes_dict['hamburger'])

for img in hamburger_imgs_dir:
    hamburger_imgs.append(os.path.join(food_classes_dict['hamburger'], img))

# Chocolate Cake
chocolate_cake_imgs_dir = os.listdir(food_classes_dict['chocolate_cake'])

for img in chocolate_cake_imgs_dir:
    chocolate_cake_imgs.append(os.path.join(food_classes_dict['chocolate_cake'], img))

# Bread Pudding
bread_pudding_imgs_dir = os.listdir(food_classes_dict['bread_pudding'])

for img in bread_pudding_imgs_dir:
    bread_pudding_imgs.append(os.path.join(food_classes_dict['bread_pudding'], img))

# Creme Brulee
creme_brulee_imgs_dir = os.listdir(food_classes_dict['creme_brulee'])

for img in creme_brulee_imgs_dir:
    creme_brulee_imgs.append(os.path.join(food_classes_dict['creme_brulee'], img))

# Fried Rice
fried_rice_imgs_dir = os.listdir(food_classes_dict['fried_rice'])

for img in fried_rice_imgs_dir:
    fried_rice_imgs.append(os.path.join(food_classes_dict['fried_rice'], img))

# Macarons
macarons_imgs_dir = os.listdir(food_classes_dict['macarons'])

for img in macarons_imgs_dir:
    macarons_imgs.append(os.path.join(food_classes_dict['macarons'], img))

# Bruschetta
bruschetta_imgs_dir = os.listdir(food_classes_dict['bruschetta'])

for img in bruschetta_imgs_dir:
    bruschetta_imgs.append(os.path.join(food_classes_dict['bruschetta'], img))

# Lobster Bisque
lobster_bisque_imgs_dir = os.listdir(food_classes_dict['lobster_bisque'])

for img in lobster_bisque_imgs_dir:
    lobster_bisque_imgs.append(os.path.join(food_classes_dict['lobster_bisque'], img))

# Garlic Bread
garlic_bread_imgs_dir = os.listdir(food_classes_dict['garlic_bread'])

for img in garlic_bread_imgs_dir:
    garlic_bread_imgs.append(os.path.join(food_classes_dict['garlic_bread'], img))

# Fried Calamari
fried_calamari_imgs_dir = os.listdir(food_classes_dict['fried_calamari'])

for img in fried_calamari_imgs_dir:
    fried_calamari_imgs.append(os.path.join(food_classes_dict['fried_calamari'], img))

# Deviled Eggs
deviled_eggs_imgs_dir = os.listdir(food_classes_dict['deviled_eggs'])

for img in deviled_eggs_imgs_dir:
    deviled_eggs_imgs.append(os.path.join(food_classes_dict['deviled_eggs'], img))

# Gyoza
gyoza_imgs_dir = os.listdir(food_classes_dict['gyoza'])

for img in gyoza_imgs_dir:
    gyoza_imgs.append(os.path.join(food_classes_dict['gyoza'], img))

# French Toast
french_toast_imgs_dir = os.listdir(food_classes_dict['french_toast'])

for img in french_toast_imgs_dir:
    french_toast_imgs.append(os.path.join(food_classes_dict['french_toast'], img))

# Steak
steak_imgs_dir = os.listdir(food_classes_dict['steak'])

for img in steak_imgs_dir:
    steak_imgs.append(os.path.join(food_classes_dict['steak'], img))

# Omelette
omelette_imgs_dir = os.listdir(food_classes_dict['omelette'])

for img in omelette_imgs_dir:
    omelette_imgs.append(os.path.join(food_classes_dict['omelette'], img))

# Pancakes
pancakes_imgs_dir = os.listdir(food_classes_dict['pancakes'])

for img in pancakes_imgs_dir:
    pancakes_imgs.append(os.path.join(food_classes_dict['pancakes'], img))

# Chicken Wings
chicken_wings_imgs_dir = os.listdir(food_classes_dict['chicken_wings'])

for img in chicken_wings_imgs_dir:
    chicken_wings_imgs.append(os.path.join(food_classes_dict['chicken_wings'], img))

# Samosa
samosa_imgs_dir = os.listdir(food_classes_dict['samosa'])

for img in samosa_imgs_dir:
    samosa_imgs.append(os.path.join(food_classes_dict['samosa'], img))

# Spaghetti Bolognese
spaghetti_bolognese_imgs_dir = os.listdir(food_classes_dict['spaghetti_bolognese'])

for img in spaghetti_bolognese_imgs_dir:
    spaghetti_bolognese_imgs.append(os.path.join(food_classes_dict['spaghetti_bolognese'], img))

# Pizza
pizza_imgs_dir = os.listdir(food_classes_dict['pizza'])

for img in pizza_imgs_dir:
    pizza_imgs.append(os.path.join(food_classes_dict['pizza'], img))

# Fish and Chips
fish_and_chips_imgs_dir = os.listdir(food_classes_dict['fish_and_chips'])

for img in fish_and_chips_imgs_dir:
    fish_and_chips_imgs.append(os.path.join(food_classes_dict['fish_and_chips'], img))

# Crab Cakes
crab_cakes_imgs_dir = os.listdir(food_classes_dict['crab_cakes'])

for img in crab_cakes_imgs_dir:
    crab_cakes_imgs.append(os.path.join(food_classes_dict['crab_cakes'], img))

# Panna Cotta
panna_cotta_imgs_dir = os.listdir(food_classes_dict['panna_cotta'])

for img in panna_cotta_imgs_dir:
    panna_cotta_imgs.append(os.path.join(food_classes_dict['panna_cotta'], img))

# Baby Back Ribs
baby_back_ribs_imgs_dir = os.listdir(food_classes_dict['baby_back_ribs'])

for img in baby_back_ribs_imgs_dir:
    baby_back_ribs_imgs.append(os.path.join(food_classes_dict['baby_back_ribs'], img))

# Pork Chop
pork_chop_imgs_dir = os.listdir(food_classes_dict['pork_chop'])

for img in pork_chop_imgs_dir:
    pork_chop_imgs.append(os.path.join(food_classes_dict['pork_chop'], img))

# Paella
paella_imgs_dir = os.listdir(food_classes_dict['paella'])

for img in paella_imgs_dir:
    paella_imgs.append(os.path.join(food_classes_dict['paella'], img))

# Bibimbap
bibimbap_imgs_dir = os.listdir(food_classes_dict['bibimbap'])

for img in bibimbap_imgs_dir:
    bibimbap_imgs.append(os.path.join(food_classes_dict['bibimbap'], img))

# Huevos Rancheros
huevos_rancheros_imgs_dir = os.listdir(food_classes_dict['huevos_rancheros'])

for img in huevos_rancheros_imgs_dir:
    huevos_rancheros_imgs.append(os.path.join(food_classes_dict['huevos_rancheros'], img))

# Takoyaki
takoyaki_imgs_dir = os.listdir(food_classes_dict['takoyaki'])

for img in takoyaki_imgs_dir:
    takoyaki_imgs.append(os.path.join(food_classes_dict['takoyaki'], img))

# Seaweed Salad
seaweed_salad_imgs_dir = os.listdir(food_classes_dict['seaweed_salad'])

for img in seaweed_salad_imgs_dir:
    seaweed_salad_imgs.append(os.path.join(food_classes_dict['seaweed_salad'], img))

# Onion Rings
onion_rings_imgs_dir = os.listdir(food_classes_dict['onion_rings'])

for img in onion_rings_imgs_dir:
    onion_rings_imgs.append(os.path.join(food_classes_dict['onion_rings'], img))

# Hummus
hummus_imgs_dir = os.listdir(food_classes_dict['hummus'])

for img in hummus_imgs_dir:
    hummus_imgs.append(os.path.join(food_classes_dict['hummus'], img))

# Foie Gras
foie_gras_imgs_dir = os.listdir(food_classes_dict['foie_gras'])

for img in foie_gras_imgs_dir:
    foie_gras_imgs.append(os.path.join(food_classes_dict['foie_gras'], img))

# Risotto
risotto_imgs_dir = os.listdir(food_classes_dict['risotto'])

for img in risotto_imgs_dir:
    risotto_imgs.append(os.path.join(food_classes_dict['risotto'], img))

# Chicken Curry
chicken_curry_imgs_dir = os.listdir(food_classes_dict['chicken_curry'])

for img in chicken_curry_imgs_dir:
    chicken_curry_imgs.append(os.path.join(food_classes_dict['chicken_curry'], img))

# Croque Madame
croque_madame_imgs_dir = os.listdir(food_classes_dict['croque_madame'])

for img in croque_madame_imgs_dir:
    croque_madame_imgs.append(os.path.join(food_classes_dict['croque_madame'], img))

# Falafel
falafel_imgs_dir = os.listdir(food_classes_dict['falafel'])

for img in falafel_imgs_dir:
    falafel_imgs.append(os.path.join(food_classes_dict['falafel'], img))

# Lobster Roll Sandwich
lobster_roll_sandwich_imgs_dir = os.listdir(food_classes_dict['lobster_roll_sandwich'])

for img in lobster_roll_sandwich_imgs_dir:
    lobster_roll_sandwich_imgs.append(os.path.join(food_classes_dict['lobster_roll_sandwich'], img))

# Peking Duck
peking_duck_imgs_dir = os.listdir(food_classes_dict['peking_duck'])

for img in peking_duck_imgs_dir:
    peking_duck_imgs.append(os.path.join(food_classes_dict['peking_duck'], img))

# Shrimp and Grits
shrimp_and_grits_imgs_dir = os.listdir(food_classes_dict['shrimp_and_grits'])

for img in shrimp_and_grits_imgs_dir:
    shrimp_and_grits_imgs.append(os.path.join(food_classes_dict['shrimp_and_grits'], img))

# Donuts
donuts_imgs_dir = os.listdir(food_classes_dict['donuts'])

for img in donuts_imgs_dir:
    donuts_imgs.append(os.path.join(food_classes_dict['donuts'], img))

# Mussels
mussels_imgs_dir = os.listdir(food_classes_dict['mussels'])

for img in mussels_imgs_dir:
    mussels_imgs.append(os.path.join(food_classes_dict['mussels'], img))

# Edamame
edamame_imgs_dir = os.listdir(food_classes_dict['edamame'])

for img in edamame_imgs_dir:
    edamame_imgs.append(os.path.join(food_classes_dict['edamame'], img))

# Ceviche
ceviche_imgs_dir = os.listdir(food_classes_dict['ceviche'])

for img in ceviche_imgs_dir:
    ceviche_imgs.append(os.path.join(food_classes_dict['ceviche'], img))

# Grilled Salad
grilled_salmon_imgs_dir = os.listdir(food_classes_dict['grilled_salmon'])

for img in grilled_salmon_imgs_dir:
    grilled_salmon_imgs.append(os.path.join(food_classes_dict['grilled_salmon'], img))

# Hot and Sour Soup
hot_and_sour_soup_imgs_dir = os.listdir(food_classes_dict['hot_and_sour_soup'])

for img in hot_and_sour_soup_imgs_dir:
    hot_and_sour_soup_imgs.append(os.path.join(food_classes_dict['hot_and_sour_soup'], img))

# Nachos
nachos_imgs_dir = os.listdir(food_classes_dict['nachos'])

for img in nachos_imgs_dir:
    nachos_imgs.append(os.path.join(food_classes_dict['nachos'], img))

# Ramen
ramen_imgs_dir = os.listdir(food_classes_dict['ramen'])

for img in ramen_imgs_dir:
    ramen_imgs.append(os.path.join(food_classes_dict['ramen'], img))

# Carrot Cake
carrot_cake_imgs_dir = os.listdir(food_classes_dict['carrot_cake'])

for img in carrot_cake_imgs_dir:
    carrot_cake_imgs.append(os.path.join(food_classes_dict['carrot_cake'], img))


In [None]:
all_subclass_imgs = {
    'tacos': tacos_imgs,
    'hamburger': hamburger_imgs,
    'chocolate_cake': chocolate_cake_imgs,
    'bread_pudding': bread_pudding_imgs,
    'creme_brulee': creme_brulee_imgs,
    'fried_rice': fried_rice_imgs,
    'macarons': macarons_imgs,
    'bruschetta': bruschetta_imgs,
    'lobster_bisque': lobster_bisque_imgs,
    'garlic_bread': garlic_bread_imgs,
    'fried_calamari': fried_calamari_imgs,
    'deviled_eggs': deviled_eggs_imgs,
    'gyoza': gyoza_imgs,
    'french_toast': french_toast_imgs,
    'steak': steak_imgs,
    'omelette': omelette_imgs,
    'pancakes': pancakes_imgs,
    'chicken_wings': chicken_wings_imgs,
    'samosa': samosa_imgs,
    'spaghetti_bolognese': spaghetti_bolognese_imgs,
    'pizza': pizza_imgs,
    'fish_and_chips': fish_and_chips_imgs,
    'crab_cakes': crab_cakes_imgs,
    'panna_cotta': panna_cotta_imgs,
    'baby_back_ribs': baby_back_ribs_imgs,
    'pork_chop': pork_chop_imgs,
    'paella': paella_imgs,
    'bibimbap': bibimbap_imgs,
    'huevos_rancheros': huevos_rancheros_imgs,
    'takoyaki': takoyaki_imgs,
    'seaweed_salad': seaweed_salad_imgs,
    'onion_rings': onion_rings_imgs,
    'hummus': hummus_imgs,
    'foie_gras': foie_gras_imgs,
    'risotto': risotto_imgs,
    'chicken_curry': chicken_curry_imgs,
    'croque_madame': croque_madame_imgs,
    'falafel': falafel_imgs,
    'lobster_roll_sandwich': lobster_roll_sandwich_imgs,
    'peking_duck': peking_duck_imgs,
    'shrimp_and_grits': shrimp_and_grits_imgs,
    'donuts': donuts_imgs,
    'mussels': mussels_imgs,
    'edamame': edamame_imgs,
    'ceviche': ceviche_imgs,
    'grilled_salmon': grilled_salmon_imgs,
    'hot_and_sour_soup': hot_and_sour_soup_imgs,
    'nachos': nachos_imgs,
    'ramen': ramen_imgs,
    'carrot_cake': carrot_cake_imgs
}

In [None]:
# from google.colab import files

# with open('all_subclass_imgs.txt', 'w') as f:
#     f.write(json.dumps(all_subclass_imgs))

# files.download('all_subclass_imgs.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
for key, value in all_subclass_imgs.items():
    print("{0} has {1} images.".format(key, len(value)))

# Section 3: Preprocessing for ML Model

## 3.1 Split the data from train into `current_train` and `current_val` (`val_size = 10%`). `random_state` is fixed for reproducibility.

Before proceeding forward and splitting the data, we need to use only the subset of the data containing the subclasses we are going to evaluate. We also need to do the same for the test dataset too.

Let's do this below:

In [None]:
subclasses = ['tacos',
              'hamburger',
              'chocolate_cake',
              'bread_pudding',
              'creme_brulee',
              'fried_rice',
              'macarons',
              'bruschetta',
              'lobster_bisque',
              'garlic_bread',
              'fried_calamari',
              'deviled_eggs',
              'gyoza',
              'french_toast',
              'steak',
              'omelette',
              'pancakes',
              'chicken_wings',
              'samosa',
              'spaghetti_bolognese',
              'pizza',
              'fish_and_chips',
              'crab_cakes',
              'panna_cotta',
              'baby_back_ribs',
              'pork_chop',
              'paella',
              'bibimbap',
              'huevos_rancheros',
              'takoyaki',
              'seaweed_salad',
              'onion_rings',
              'hummus',
              'foie_gras',
              'risotto',
              'chicken_curry',
              'croque_madame',
              'falafel',
              'lobster_roll_sandwich',
              'peking_duck',
              'shrimp_and_grits',
              'donuts',
              'mussels',
              'edamame',
              'ceviche',
              'grilled_salmon',
              'hot_and_sour_soup',
              'nachos',
              'ramen',
              'carrot_cake']

In [None]:
# Create a dictionary where the keys are the indices and the values are the food subclasses
food_class_labels = {i: subclass for i, subclass in enumerate(subclasses)}

# Write the dictionary to a JSON file
with open('subclasses.json', 'w') as f:
    json.dump(food_class_labels, f)

In [None]:
# Training data preprocessing
train = pd.read_csv(train_txt_data, sep=" ", header=None, usecols=[0])
filtered_train_data = [x for x in train[0] if any(subclass in x for subclass in subclasses)]

print("Before filtering, there were {0} observations in the training data set.".format(len(train)))
print("After filtering, there are {0} observations in the training data set.".format(len(filtered_train_data)))

Before filtering, there were 75750 observations in the training data set.
After filtering, there are 37500 observations in the training data set.


In [None]:
# Test data preprocessing
test = pd.read_csv(test_txt_data, sep=" ", header=None, usecols=[0])
filtered_test_data = [x for x in test[0] if any(subclass in x for subclass in subclasses)]

print("Before filtering, there were {0} observations in the training data set.".format(len(test)))
print("After filtering, there are {0} observations in the training data set.".format(len(filtered_test_data)))

Before filtering, there were 25250 observations in the training data set.
After filtering, there are 12500 observations in the training data set.


Great. Now using `filtered_train_data`, we can create the `current_train` and `current_val` variables.

In [None]:
current_train, current_val = train_test_split(filtered_train_data, test_size=0.1, random_state=42)
print("Of the total {0} observations in filtered_train_data, there are {1} observations in current_train and {2} observations in current_val.".format(len(filtered_train_data), len(current_train), len(current_val)))

Of the total 37500 observations in filtered_train_data, there are 33750 observations in current_train and 3750 observations in current_val.


In [None]:
current_train[0:5]

['panna_cotta/3697933',
 'grilled_salmon/937466',
 'carrot_cake/3855103',
 'baby_back_ribs/2511963',
 'hamburger/3759470']

In [None]:
current_val[0:5]

['omelette/210179',
 'hot_and_sour_soup/274720',
 'carrot_cake/1436835',
 'nachos/3517672',
 'samosa/1982708']

# Section 4: Creating Training Directories for Data

Please note: The code in Section 4 only needs to be run once only. Do not run it more than once.

## `current_train` folder + data

In [None]:
# Please update these paths accordingly for your local machine.
images_path = '/content/drive/Othercomputers/My Computer/food-101/images'  # Path to images folder within project directory
project_path = '/content/drive/Othercomputers/My Computer/food-101'  # Path to project directory, food-101 folder

In [None]:
current_train_path = os.path.join(project_path, 'current_train')
os.makedirs(current_train_path, exist_ok=True)

# Create subfolders within current_train_path for each subclass of subclasses variable

for subclass in subclasses:
    current_train_subclass_path = os.path.join(current_train_path, subclass)
    os.makedirs(current_train_subclass_path, exist_ok=True)

In [None]:
# First, let's create a dictionary of values

current_train_dict = {}

for img in current_train:
    subclass_name = img.split('/')[0]
    if subclass_name not in current_train_dict:
        current_train_dict[subclass_name] = []
    current_train_dict[subclass_name].append(img)

for key, value in current_train_dict.items():
    for img in value:
        img_path = os.path.join(images_path, img + '.jpg')
        shutil.copy(img_path, os.path.join(current_train_path, key))

## `current_val` folder + data

In [None]:
current_val_path = os.path.join(project_path, 'current_val')
os.makedirs(current_val_path, exist_ok=True)

# Create subfolders within current_train_path for each subclass of subclasses variable

for subclass in subclasses:
    current_val_subclass_path = os.path.join(current_val_path, subclass)
    os.makedirs(current_val_subclass_path, exist_ok=True)

In [None]:
# First, let's create a dictionary of values

current_val_dict = {}

for img in current_val:
    subclass_name = img.split('/')[0]
    if subclass_name not in current_val_dict:
        current_val_dict[subclass_name] = []
    current_val_dict[subclass_name].append(img)

for key, value in current_val_dict.items():
    for img in value:
        img_path = os.path.join(images_path, img + '.jpg')
        shutil.copy(img_path, os.path.join(current_val_path, key))

## `filtered_test_data` folder + data

In [None]:
filtered_test_data_path = os.path.join(project_path, 'filtered_test_data')
os.makedirs(filtered_test_data_path, exist_ok=True)

# Create subfolders within current_train_path for each subclass of subclasses variable

for subclass in subclasses:
    current_test_subclass_path = os.path.join(filtered_test_data_path, subclass)
    os.makedirs(current_test_subclass_path, exist_ok=True)

In [None]:
# First, let's create a dictionary of values
current_test_data_dict = {}

for img in filtered_test_data:
    subclass_name = img.split('/')[0]
    if subclass_name not in current_test_data_dict:
        current_test_data_dict[subclass_name] = []
    current_test_data_dict[subclass_name].append(img)

for key, value in current_test_data_dict.items():
    for img in value:
        img_path = os.path.join(images_path, img + '.jpg')
        shutil.copy(img_path, os.path.join(filtered_test_data_path, key))

# Section 5: ML Models

In [None]:
current_train_transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    # transforms.Normalize(mean=means,std=stdevs)
])


In [None]:
current_train_path = '/content/drive/Othercomputers/My Computer/food-101/current_train'  # Path to current_train folder that is contained within the Food-101 folder

current_train_temp = datasets.ImageFolder(root = current_train_path, transform=current_train_transform)

current_train_dataloader = torch.utils.data.DataLoader(current_train_temp, batch_size = 128, shuffle = True, pin_memory = True, num_workers = 2)

In [None]:
dataiter = iter(current_train_dataloader)
images, labels = next(dataiter)

## 5.1 ResNet

In [None]:
resnet50 = models.resnet50(pretrained=True)
print(resnet50)

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 89.7MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

### Fine-tuning

Since we are only working with a subset of 50 classes, we can replace the Fully-Connected layer in the output layer with a new Fully-Connected layer containing 50 neurons only, one neuron for each of the subclasses.

In [None]:
# Glorot initialization (https://pytorch.org/docs/stable/nn.init.html)
resnet50.fc = torch.nn.Linear(resnet50.fc.in_features, 50)
torch.nn.init.xavier_normal_(resnet50.fc.weight)

Parameter containing:
tensor([[ 0.0170, -0.0124,  0.0219,  ...,  0.0018, -0.0076,  0.0001],
        [-0.0320,  0.0167,  0.0280,  ..., -0.0133,  0.0163, -0.0001],
        [ 0.0219,  0.0554, -0.0112,  ..., -0.0083,  0.0165, -0.0193],
        ...,
        [ 0.0675,  0.0302,  0.0134,  ...,  0.0272, -0.0175, -0.0418],
        [ 0.0203,  0.0322,  0.0031,  ...,  0.0090,  0.0146,  0.0179],
        [-0.0180,  0.0010, -0.0246,  ..., -0.0181,  0.0038,  0.0392]],
       requires_grad=True)

The next cell can take quite a while to run so I have hardcoded the means and stdevs as well for convenience but you are welcome to rerun the cell below if you want.

In [None]:
means = []
stdevs = []
total_batches = len(current_train_dataloader)

for i, (X, _) in enumerate(current_train_dataloader):
    means.append(X.mean(dim = (0, 2, 3)))
    stdevs.append(X.std(dim = (0, 2, 3)))
    print(f"Processed batch {i+1}/{total_batches}")

mean_ = torch.stack(means).mean(dim = 0)
stdev_ = torch.stack(stdevs).mean(dim = 0)

print("Means:", mean_)
print("Standard Deviations:", stdev_)

In [None]:
means = torch.tensor([0.5447, 0.4402, 0.3355])
stdevs = torch.tensor([0.2713, 0.2733, 0.2785])

# Means: tensor([0.5447, 0.4402, 0.3355])
# Standard Deviations: tensor([0.2713, 0.2733, 0.2785])

Let's update `current_train_transform` and normalize it with the `means` and `stdevs`

In [None]:
current_train_path = '/content/drive/Othercomputers/My Computer/food-101/current_train'  # Path to current_train folder that is contained within the Food-101 folder

current_train_transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=means,std=stdevs)
])

current_train_dataset = datasets.ImageFolder(root = current_train_path, transform = current_train_transform)

current_train_dataloader = torch.utils.data.DataLoader(current_train_dataset, batch_size = 128, shuffle = True, pin_memory = True, num_workers = 2)

Let's also create a `torchvision.transforms` object for `current_val`.

In [None]:
current_val_path = '/content/drive/Othercomputers/My Computer/food-101/current_val'  # Path to current_val folder that is contained within the Food-101 folder

current_val_transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=means,std=stdevs)
])

current_val_dataset = datasets.ImageFolder(root = current_val_path, transform = current_val_transform)

current_val_dataloader = torch.utils.data.DataLoader(current_val_dataset, batch_size = 128, shuffle = True, pin_memory = True, num_workers = 2)

Let's also create a `torchvision.transforms` object for `filtered_test_data`.

In [None]:
filtered_test_data_path = '/content/drive/Othercomputers/My Computer/food-101/filtered_test_data'  # Path to filtered_test_data folder that is contained within the Food-101 folder

test_data_transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor()])

test_dataset = datasets.ImageFolder(root = filtered_test_data_path, transform = test_data_transform)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size = 128, shuffle = True, pin_memory = True, num_workers = 2)

# Section 6: Training ResNet50

In [None]:
!nvidia-smi

Thu Nov  9 19:20:40 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
n_GPUs = torch.cuda.device_count()
print("There are {0} gpus available to use.".format(n_GPUs))

There are 1 gpus available to use.


In [None]:
# With Fine-Tuning
device = 'cuda' if torch.cuda.is_available() else 'cpu'
lr = 1e-5
weight_decay = 5e-4
epochs = 14
criterion = torch.nn.CrossEntropyLoss()
params = [param for name, param in resnet50.named_parameters() if 'fc' not in str(name)]
optimizer = torch.optim.Adam(params, lr=lr, weight_decay=weight_decay)

model_checkpoint = 1  # Save data every epoch

In [None]:
resnet50.to(device)

In [None]:
start = time.time()
print("Training for {0} epochs on {1} GPU(s).".format(epochs, n_GPUs))

for epoch in range(1, epochs+1):
    print("Epoch {0} / {1}".format(epoch, epochs))

    resnet50.train()

    train_loss = torch.tensor(0., device = device)
    train_accuracy = torch.tensor(0., device = device)

    for batch_idx, (data, target) in enumerate(current_train_dataloader):
        data = data.to(device)
        target = target.to(device)  # y

        optimizer.zero_grad()
        output = resnet50(data)  # preds
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            train_loss += loss * current_train_dataloader.batch_size
            train_accuracy += (output.argmax(dim=1) == target).float().sum()

    if current_val_dataloader is not None:
        resnet50.eval()

        valid_loss = torch.tensor(0., device = device)
        valid_accuracy = torch.tensor(0., device = device)

        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(current_val_dataloader):
                data = data.to(device)
                target = target.to(device)  # y
                output = resnet50(data)  # preds
                loss = criterion(output, target)
                valid_loss += loss * current_val_dataloader.batch_size
                valid_accuracy += (output.argmax(dim=1) == target).float().sum()

    print("Training loss: {0}".format(train_loss / len(current_train_dataloader.dataset)))
    print("Training accuracy: {0}".format(train_accuracy / len(current_train_dataloader.dataset)))

    if current_val_dataloader is not None:
        print("Validation loss: {0}".format(valid_loss / len(current_val_dataloader.dataset)))
        print("Validation accuracy: {0}".format(valid_accuracy / len(current_val_dataloader.dataset)))

    if epoch % model_checkpoint == 0:
        checkpoint = {
            'epoch': epoch,
            'resnet50_state_dict': resnet50.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
        }
        torch.save(checkpoint, './checkpoint.pth.tar')

        print()

    end = time.time()

    print("Total training time was {0} seconds.".format(end - start))
    print("")

Training for 14 epochs on 1 GPU(s).
Epoch 1 / 14
Training loss: 2.0432629585266113
Training accuracy: 0.5347259044647217
Validation loss: 1.52080237865448
Validation accuracy: 0.6410666704177856

Total training time was 367.214152097702 seconds.

Epoch 2 / 14
Training loss: 1.229315161705017
Training accuracy: 0.704444408416748
Validation loss: 1.1371982097625732
Validation accuracy: 0.7237333655357361

Total training time was 733.8126406669617 seconds.

Epoch 3 / 14
Training loss: 0.8809308409690857
Training accuracy: 0.7841481566429138
Validation loss: 0.9650610685348511
Validation accuracy: 0.7567999958992004

Total training time was 1100.878668308258 seconds.

Epoch 4 / 14
Training loss: 0.6587837338447571
Training accuracy: 0.8402962684631348
Validation loss: 0.8809050917625427
Validation accuracy: 0.7781333327293396

Total training time was 1465.950237751007 seconds.

Epoch 5 / 14
Training loss: 0.48785531520843506
Training accuracy: 0.8866666555404663
Validation loss: 0.83603173

## Evaluating model on test data

In [None]:
from sklearn import metrics

resnet50.eval()

y_true = []
y_pred = []

with torch.no_grad():
    for batch_idx, (data, target) in enumerate(test_dataloader):
        data = data.to(device)
        target = target.to(device)  # y_true values
        output = resnet50(data)  # y_pred values
        y_true.extend(target.cpu().numpy())
        y_pred.extend(output.argmax(dim=1).cpu().numpy())

accuracy = metrics.accuracy_score(y_true, y_pred)
precision = metrics.precision_score(y_true, y_pred, average='weighted')
recall = metrics.recall_score(y_true, y_pred, average='weighted')
f1 = metrics.f1_score(y_true, y_pred, average='weighted')

print("Test accuracy: {0}".format(accuracy))
print("Precision: {0}".format(precision))
print("Recall: {0}".format(recall))
print("F1 Score: {0}".format(f1))

Test accuracy: 0.57008
Precision: 0.6547480922121868
Recall: 0.57008
F1 Score: 0.5742206075240507


# Save model to Google Drive

In [None]:
resnet50_checkpoint = torch.load('/content/checkpoint.pth.tar', map_location=torch.device('cpu'))

project_path = '/content/drive/MyDrive/Colab Notebooks/Large Scale Machine Learning 2/Final Project'  # Path to project directory, food-101 folder
checkpoint_path = os.path.join(project_path, 'checkpoint.pth.tar')
torch.save(resnet50_checkpoint, checkpoint_path)

# 0.5 marks for answer

please, provide detailed plan (and if possible results) of load testing of your service (latency, rps, what services would you use, how would you measure results, how can you track the effect of the model size/speed/quality)