# Preparing Dataset
---


In [1]:
%matplotlib widget

In [2]:
!ls

00_dataset.ipynb	     ant_transfer_learning_tutorial.ipynb
01_training.ipynb	     artifacts
01_training__output.ipynb    data
01_training_raw_food.ipynb   lightning_logs
02_torch_optimization.ipynb  milestone1_partial_solution_3classes.ipynb
Untitled.ipynb		     requirements.txt
Untitled1.ipynb		     train.py
__pycache__		     utility.py


## Imports

In [3]:
import datetime
import glob
import itertools
import json
import os
import timeit
import warnings

import fiftyone as fo
import joblib
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import sklearn
import sklearn.model_selection
import sklearn.preprocessing
import torch
import torchinfo
import wandb
from matplotlib import pyplot as plt

import utility


In [4]:
# Set the global seed to 99 through PyTorch Lightning; the returned seed is shown as the cell output.
pl.seed_everything(99)

Global seed set to 99


99

## Parameters


In [5]:
# Model architecture identifier. It is not referenced by any other cell visible
# in this notebook — presumably consumed by the training notebooks; TODO confirm.
model_name = 'resnet34'

## Setup

Uncomment the commands in the next cell to download and extract the Food-101 archive.

In [6]:
#!wget http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz
#!tar -xvf food-101.tar.gz

## Prepare dataset

In [7]:
# Load the list of training images; the file has no header and one relative
# path per line, which is read into a single "file_path" column.
df = pd.read_csv(
    "data/food-101/meta/train.txt",
    header=None,
    names=["file_path"],
)
# The class name is the second-to-last "/"-separated component of each entry.
df["label_name"] = [entry.split("/")[-2] for entry in df["file_path"]]
# Turn the relative entry into the path of the image file on disk.
df["file_path"] = "data/food-101/images/" + df["file_path"] + ".jpg"




Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
NumExpr defaulting to 8 threads.


In [8]:
# Preview the first rows to check the file_path and label_name columns.
df.head()

Unnamed: 0,file_path,label_name
0,data/food-101/images/apple_pie/1005649.jpg,apple_pie
1,data/food-101/images/apple_pie/1014775.jpg,apple_pie
2,data/food-101/images/apple_pie/1026328.jpg,apple_pie
3,data/food-101/images/apple_pie/1028787.jpg,apple_pie
4,data/food-101/images/apple_pie/1043283.jpg,apple_pie


In [9]:
# Encoder used below to turn class-name strings into integer labels.
label_encoder = sklearn.preprocessing.LabelEncoder()


In [10]:
# Fit the encoder on the class names found in the training metadata.
label_encoder.fit(df['label_name'])


LabelEncoder()

In [11]:
# Add the integer label for every training row using the fitted encoder.
df['label'] = label_encoder.transform(df['label_name'])


In [12]:
# Load the list of images from the dataset's test.txt; it is kept as a
# separate evaluation frame (df_eval) and built the same way as df.
df_eval = pd.read_csv(
    "data/food-101/meta/test.txt",
    header=None,
    names=["file_path"],
)
# The class name is the second-to-last "/"-separated component of each entry.
df_eval["label_name"] = [entry.split("/")[-2] for entry in df_eval["file_path"]]
# Turn the relative entry into the path of the image file on disk.
df_eval["file_path"] = "data/food-101/images/" + df_eval["file_path"] + ".jpg"
# Reuse the encoder fitted on the training metadata so both frames share label ids.
df_eval["label"] = label_encoder.transform(df_eval["label_name"])


In [13]:
#!ls data/food-101/images

In [14]:
# Preview the first rows of the evaluation frame, including the encoded label column.
df_eval.head()

Unnamed: 0,file_path,label_name,label
0,data/food-101/images/apple_pie/1011328.jpg,apple_pie,0
1,data/food-101/images/apple_pie/101251.jpg,apple_pie,0
2,data/food-101/images/apple_pie/1034399.jpg,apple_pie,0
3,data/food-101/images/apple_pie/103801.jpg,apple_pie,0
4,data/food-101/images/apple_pie/1038694.jpg,apple_pie,0


In [15]:
# Hold out 15% of the rows in df as an internal test set, stratified by label.
# random_state is pinned so that re-running this cell on its own reproduces the
# same partition instead of depending on the current state of the global NumPy
# RNG (an unpinned re-run would silently move rows between train and test).
# NOTE(review): pinning the seed changes row membership relative to splits
# produced by earlier, unpinned runs — regenerate the downstream artifacts.
df_train_val, df_test = sklearn.model_selection.train_test_split(
    df,
    test_size=0.15,
    random_state=99,
    shuffle=True,
    stratify=df['label'],
)

# Split the remaining 85%: 80% for actual training and 20% for validation
# (about 68% and 17% of the rows in df, respectively), again stratified by label.
df_train, df_val = sklearn.model_selection.train_test_split(
    df_train_val,
    test_size=0.20,
    random_state=1,
    stratify=df_train_val['label'],
)

In [16]:
# Report the number of rows in each split.
print({
    split: len(frame)
    for split, frame in (
        ("train", df_train),
        ("test", df_test),
        ("val", df_val),
        ("eval", df_eval),
    )
})

{'train': 51509, 'test': 11363, 'val': 12878, 'eval': 25250}


In [17]:
# Rows per class in the training split, to check that the stratified split kept classes balanced.
df_train['label_name'].value_counts()

prime_rib               510
club_sandwich           510
garlic_bread            510
chicken_wings           510
strawberry_shortcake    510
                       ... 
frozen_yogurt           510
pork_chop               510
omelette                510
beet_salad              510
pancakes                509
Name: label_name, Length: 101, dtype: int64

In [18]:
# Class names known to the fitted encoder.
label_encoder.classes_

array(['apple_pie', 'baby_back_ribs', 'baklava', 'beef_carpaccio',
       'beef_tartare', 'beet_salad', 'beignets', 'bibimbap',
       'bread_pudding', 'breakfast_burrito', 'bruschetta', 'caesar_salad',
       'cannoli', 'caprese_salad', 'carrot_cake', 'ceviche',
       'cheese_plate', 'cheesecake', 'chicken_curry',
       'chicken_quesadilla', 'chicken_wings', 'chocolate_cake',
       'chocolate_mousse', 'churros', 'clam_chowder', 'club_sandwich',
       'crab_cakes', 'creme_brulee', 'croque_madame', 'cup_cakes',
       'deviled_eggs', 'donuts', 'dumplings', 'edamame', 'eggs_benedict',
       'escargots', 'falafel', 'filet_mignon', 'fish_and_chips',
       'foie_gras', 'french_fries', 'french_onion_soup', 'french_toast',
       'fried_calamari', 'fried_rice', 'frozen_yogurt', 'garlic_bread',
       'gnocchi', 'greek_salad', 'grilled_cheese_sandwich',
       'grilled_salmon', 'guacamole', 'gyoza', 'hamburger',
       'hot_and_sour_soup', 'hot_dog', 'huevos_rancheros', 'hummus',
       

## Artifacts

In [19]:
# DataFrame.to_parquet does not create missing parent directories, so make sure
# the output directory exists; exist_ok=True keeps re-runs from failing.
os.makedirs("artifacts/data", exist_ok=True)

# Write each split to its own Parquet file; index=False leaves the row index
# out of the files.
df_train.to_parquet("artifacts/data/df_train.parquet", index=False)
df_test.to_parquet("artifacts/data/df_test.parquet", index=False)
df_val.to_parquet("artifacts/data/df_val.parquet", index=False)
df_eval.to_parquet("artifacts/data/df_eval.parquet", index=False)

In [20]:
# Save the fitted encoder with joblib; the call returns the list of written
# file names, which is shown as the cell output.
joblib.dump(label_encoder, 'artifacts/label_encoder.joblib')


['artifacts/label_encoder.joblib']

In [21]:
# Serialize the encoder's class names as a JSON array before opening the file,
# then write that text to artifacts/classes.txt.
content = json.dumps(label_encoder.classes_.tolist())
with open("artifacts/classes.txt", "w") as f:
    f.write(content)
