In [1]:
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import base64
import csv
import os
import seaborn as sns
import numpy as np
from glob import glob
from shutil import copyfile

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Allow up to 100 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

In [None]:
# Load the tab-separated recipe table. Malformed rows are skipped with a
# warning rather than aborting the load. `on_bad_lines='warn'` is the
# replacement for `error_bad_lines=False`, which was deprecated in pandas 1.3
# and removed in pandas 2.0.
df = pd.read_csv(
    'dataset/recipes_df10-12-2017.csv',
    sep='\t',
    encoding='utf-8',
    on_bad_lines='warn',
)

In [None]:
# Number of distinct recipe ids in the raw table.
len(set(df['recipe_id']))

In [None]:
# Peek at the last five rows of the raw table.
df.tail(5)

In [None]:
# Group the column names: those containing '_ing', those containing '_tag',
# and whatever is left over.
all_cols = df.columns.values.tolist()
ing_cols = [col for col in all_cols if '_ing' in col]
tag_cols = [col for col in all_cols if '_tag' in col]
rest_cols = set(all_cols).difference(ing_cols, tag_cols)
rest_cols

In [None]:
# Keep only the recipes flagged as having a picture; copy so that later
# column assignments do not touch `df`.
df_with_pics = df.loc[df['has_picture'] == 'yes'].copy()

In [None]:
# Column names reused by the following cells: the recipe key (id_v) and the
# dependent variable (dv).
id_v, dv = 'recipe_id', 'printed_per_day'

In [None]:
# First five rows of the key and dependent-variable columns.
df_with_pics.loc[:, [id_v, dv]].head(5)

In [None]:
# Drop rows that repeat an (id, dependent variable) pair, keeping the first
# occurrence of each pair.
df_with_pics = df_with_pics.drop_duplicates(subset=[id_v, dv], keep='first')

In [None]:
# Row count followed by the number of distinct recipe ids; equal numbers mean
# no recipe id occurs more than once.
print(df_with_pics['recipe_id'].shape[0])
print(len(set(df_with_pics['recipe_id'])))

In [None]:
# Histogram of the dependent variable for recipes with pictures.
df_with_pics[dv].plot(kind='hist')

In [None]:
# Binary target stored as the strings 'True' / 'False': 'True' when the
# recipe's dv is at or above the 0.5 quantile of dv, 'False' otherwise.
df_with_pics['label'] = df_with_pics[dv].ge(df_with_pics[dv].quantile(0.5)).astype(str)
df_with_pics['label'].head(5)

In [None]:
# exp() of the dv median over all recipes.
# presumably dv is stored on a log scale -- TODO confirm
np.exp(df[dv].quantile(0.5))

In [None]:
# exp() of the dv median over recipes that have a picture.
# presumably dv is stored on a log scale -- TODO confirm
np.exp(df_with_pics[dv].quantile(0.5))

# Save pictures into folders with category in their names

In [1]:
import os  # local import so this cell also works when run first in a fresh kernel

# Root of the project checkout. Set the FOOD_PREFS_PROJECT_DIR environment
# variable to point somewhere else; when it is unset the original author's
# Windows location is used, so existing behaviour is unchanged.
#current_dir = os.getcwd()
PROJECT_DIR = os.environ.get(
    'FOOD_PREFS_PROJECT_DIR',
    'C:\\Users\\Natalia\\Documents\\GitHub\\Extracting-food-preferences',
)
# Working area for the image classifier (train/valid/sample folders).
CLASSIFICATION_DIR = PROJECT_DIR + '\\classification_with_pics'
# Folder holding the source pictures.
PICTURES_DIR = PROJECT_DIR + '\\pictures\\search_pics'

In [None]:
%cd $CLASSIFICATION_DIR
%mkdir valid
%mkdir sample
%mkdir results
%mkdir -p sample\train
%mkdir -p sample\valid
%mkdir -p sample\results

In [None]:
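# One-off step, kept for reference: copy every picture that has a label into
# train\ under the new name '<label>.<original file name>'; pictures without
# a label are left out.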
#nl = (set(g) - set(df_with_pics[id_v]+'.jpg'))
#g = list(set(g) - set(nl))
#for i in range(len(g)): copyfile(g[i], 
#                                 CLASSIFICATION_DIR + '\\train\\'+ df_with_pics[df_with_pics[id_v]+'.jpg' == g[i]].label.item() + '.' + g[i])

In [None]:
# Set the training data folder as the working directory
%cd $CLASSIFICATION_DIR/train

In [None]:
# Move a random hold-out of 30k pictures from the training set (the current
# working directory) to the validation set.
N_VALID = 30000
g = sorted(glob('*.jpg'))  # sorted so the shuffle does not depend on glob's listing order
if len(g) < N_VALID:
    # Fail before touching any file instead of hitting an IndexError
    # part-way through the move.
    raise ValueError('need at least {} .jpg files in the working directory, found {}'
                     .format(N_VALID, len(g)))
# Dedicated seeded generator: the same file listing always yields the same
# split, and the global NumPy random state is left alone.
shuf = np.random.RandomState(42).permutation(g)
for fname in shuf[:N_VALID]:
    os.rename(fname, os.path.join(CLASSIFICATION_DIR, 'valid', fname))

In [None]:
# Copy 200 random pictures from the training set (the current working
# directory) into sample\train for quick experiments.
N_SAMPLE_TRAIN = 200
g = sorted(glob('*.jpg'))  # sorted so the shuffle does not depend on glob's listing order
if len(g) < N_SAMPLE_TRAIN:
    # Fail up front instead of hitting an IndexError part-way through the copy.
    raise ValueError('need at least {} .jpg files in the working directory, found {}'
                     .format(N_SAMPLE_TRAIN, len(g)))
# Dedicated seeded generator: the same file listing always yields the same
# sample, and the global NumPy random state is left alone.
shuf = np.random.RandomState(42).permutation(g)
for fname in shuf[:N_SAMPLE_TRAIN]:
    copyfile(fname, os.path.join(CLASSIFICATION_DIR, 'sample', 'train', fname))

In [None]:
# Set the validation data folder as the working directory
%cd $CLASSIFICATION_DIR/valid

In [None]:
# Copy 50 random pictures from the validation set (the current working
# directory) into sample\valid.
N_SAMPLE_VALID = 50
g = sorted(glob('*.jpg'))  # sorted so the shuffle does not depend on glob's listing order
if len(g) < N_SAMPLE_VALID:
    # Fail up front instead of hitting an IndexError part-way through the copy.
    raise ValueError('need at least {} .jpg files in the working directory, found {}'
                     .format(N_SAMPLE_VALID, len(g)))
# Dedicated seeded generator: the same file listing always yields the same
# sample, and the global NumPy random state is left alone.
shuf = np.random.RandomState(42).permutation(g)
for fname in shuf[:N_SAMPLE_VALID]:
    copyfile(fname, os.path.join(CLASSIFICATION_DIR, 'sample', 'valid', fname))

# Rearrange image files into their respective directories

In [None]:
#Divide cat/dog images into separate directories
%cd $CLASSIFICATION_DIR/sample/train
%mkdir popular
%mkdir unpopular
!move True.*.jpg popular/
!move False.*.jpg unpopular/

%cd $CLASSIFICATION_DIR/sample/valid
%mkdir popular
%mkdir unpopular
!move True.*.jpg popular/
!move False.*.jpg unpopular/

%cd $CLASSIFICATION_DIR/valid
%mkdir popular
%mkdir unpopular
!move True.*.jpg popular/
!move False.*.jpg unpopular/

%cd $CLASSIFICATION_DIR/train
%mkdir popular
%mkdir unpopular
!move True.*.jpg popular/
!move False.*.jpg unpopular/

# Finetune the VGG

In [3]:
#import modules
# NOTE(review): the wildcard import makes it impossible to tell from this
# notebook which names `utils` provides to the cells below -- consider
# importing the needed names explicitly.
from utils import *
from vgg16 import Vgg16

# Render matplotlib figures inline; this cell can be run in a fresh kernel
# without the first cell, hence the repeated magic.
%matplotlib inline

In [36]:
from datetime import datetime

In [4]:
%cd $CLASSIFICATION_DIR

#Set path to sample/ path if desired
path = CLASSIFICATION_DIR + '\\' + 'sample\\'
#test_path = DATA_HOME_DIR + '/test/' #We use all the test data
results_path=path + 'results\\'
train_path=path + 'train\\'
valid_path=path + 'valid\\'

C:\Users\Natalia\Documents\GitHub\Extracting-food-preferences\classification_with_pics


In [5]:
# Instantiate the Vgg16 helper class (imported from vgg16 in an earlier cell)
vgg = Vgg16()

In [41]:
# Training constants; no_of_epochs is the knob to experiment with when trying
# to improve the model.
batch_size, no_of_epochs = 50, 10

In [42]:
#Finetune the model
# Create training and validation batches from the image folders; the
# validation batches use twice the training batch size.
batches = vgg.get_batches(train_path, batch_size=batch_size)
val_batches = vgg.get_batches(valid_path, batch_size=batch_size*2)
# presumably adapts the network to the classes found in `batches` -- TODO
# confirm against Vgg16.finetune
vgg.finetune(batches)

#Not sure if we set this for all fits
# NOTE(review): this rebinds the optimizer's `lr` attribute to a plain float;
# whether later fit calls pick it up depends on the Keras version -- confirm.
vgg.model.optimizer.lr = 0.01

Found 200 images belonging to 2 classes.
Found 50 images belonging to 2 classes.


In [None]:
def _run_epoch(epoch):
    """Fit a single epoch against the validation batches, save the weights
    as 'ft<epoch>.h5' under results_path and return that file name."""
    print("Running epoch: {}".format(epoch))
    vgg.fit(batches, val_batches, nb_epoch=1)
    weights_filename = 'ft{}.h5'.format(epoch)
    vgg.model.save_weights(results_path+weights_filename)
    return weights_filename


# Train one epoch at a time so the model is validated and checkpointed after
# every epoch; the whole run is timed.
start = datetime.now()
latest_weights_filename = None
for epoch in range(no_of_epochs):
    latest_weights_filename = _run_epoch(epoch)
print("Completed {} fit operations".format(no_of_epochs))
end = datetime.now()
total = end - start
print('Time to run the script on CPU is {}'.format(total))

Running epoch: 0
Epoch 1/1