# Loading Libraries

In [None]:
import getpass
import os
import pickle
import zipfile

import PIL
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import load_img
from tqdm.auto import tqdm

## Mounting Google Drive

In [None]:
# Mount Google Drive at /content/drive; every dataset and pickle path used in this notebook lives under this mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Importing Data from Kaggle

In [None]:
# first import data
# The Kaggle credentials (the username/key pair from kaggle.json) are secrets and
# must never be stored in the notebook itself. Values already present in the
# environment are reused; otherwise they are prompted for (the key without echo).
# NOTE: any key that was ever committed in a notebook should be revoked on kaggle.com.
if not os.environ.get('KAGGLE_USERNAME'):
    os.environ['KAGGLE_USERNAME'] = input('Kaggle username: ')
if not os.environ.get('KAGGLE_KEY'):
    os.environ['KAGGLE_KEY'] = getpass.getpass('Kaggle API key: ')

# create the dataset folder on first run so that os.chdir cannot fail with FileNotFoundError
os.makedirs('/content/drive/My Drive/flickr_30k', exist_ok=True)
os.chdir('/content/drive/My Drive/flickr_30k')

FileNotFoundError: ignored

In [None]:
# Downloads flickr30k.zip (about 8 GB according to the logged output) into the current working directory
!kaggle datasets download -d nunenuh/flickr30k

Downloading flickr30k.zip to /content/drive/My Drive/flickr_30k
100% 8.16G/8.16G [02:07<00:00, 116MB/s]
100% 8.16G/8.16G [02:08<00:00, 68.2MB/s]


In [None]:
# Unpack the downloaded archive next to it on Drive.
# Extraction is done member by member and skips files that are already present
# with the expected size, so the cell can simply be re-run to resume after an
# interrupted run (e.g. an OSError from the Drive mount) instead of starting over.
zip_path = "/content/drive/My Drive/flickr_30k/flickr30k.zip"
extract_dir = "/content/drive/My Drive/flickr_30k/"
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    for member in zip_ref.infolist():
        target = os.path.join(extract_dir, member.filename)
        already_done = (
            not member.is_dir()
            and os.path.isfile(target)
            and os.path.getsize(target) == member.file_size
        )
        if already_done:
            continue
        zip_ref.extract(member, extract_dir)

OSError: ignored

## Extract features from the 31,783 images in the Flickr30k dataset

This code is adapted from https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/


In [None]:
# extract features from each photo in the directory
# splitting into multiple parts so don't hit google limit
def extract_features(direct,start,stop):
  """Run a slice of the images in `direct` through VGG16 and collect the outputs.

  Args:
    direct: path of the folder that contains the image files.
    start: index of the first file (in sorted file-name order) to process.
    stop: index one past the last file to process; None means "to the end".

  Returns:
    dict mapping image id (the file name up to its first '.') to the array
    returned by model.predict for that image.
  """
  # load the model
  model = VGG16()
  # want the second to last layer to be the model output (this is the image features)
  model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
  # os.listdir returns names in arbitrary order, so sort before slicing: the
  # chunks are computed in separate calls/sessions and must neither overlap
  # nor skip files
  names = sorted(os.listdir(direct))[start:stop]
  # extract features from each photo
  features = {}
  for name in tqdm(names):
    # load an image from file, resized to 224x224
    filename = os.path.join(direct, name)
    image = load_img(filename, target_size=(224, 224))
    # preprocess the image: to array, add a leading batch axis of size 1, VGG16 preprocessing
    image = img_to_array(image)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    # get features
    feature = model.predict(image, verbose=0)
    # get image id
    image_id = name.split('.')[0]
    # store feature
    features[image_id] = feature
  return features


In [None]:
# folder with the Flickr30k image files; passed as `direct` to every extract_features call below
directory = '/content/drive/My Drive/flickr_30k/images'

## Because the feature extraction takes so long (~2.5 hr per 5,000 images), I split the data into 7 chunks (six of 5,000 images and a final one of 1,783) so that it did not all have to be run at once. I combine the chunks into a single "features.pkl" file at the end.

In [None]:
# chunk 1: files 0-4999 of the image directory
features_1 = extract_features(directory,0,5000)
# context manager guarantees the pickle is flushed and the file handle closed
with open('/content/drive/My Drive/flickr_30k/features_1.pkl', 'wb') as f:
    pickle.dump(features_1, f)

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [None]:
# chunk 2: files 5000-9999 of the image directory
features_2 = extract_features(directory,5000,10000)
# context manager guarantees the pickle is flushed and the file handle closed
with open('/content/drive/My Drive/flickr_30k/features_2.pkl', 'wb') as f:
    pickle.dump(features_2, f)

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




NameError: ignored

In [None]:
# chunk 3: files 10000-14999 of the image directory
features_3 = extract_features(directory,10000,15000)
# context manager guarantees the pickle is flushed and the file handle closed
with open('/content/drive/My Drive/flickr_30k/features_3.pkl', 'wb') as f:
    pickle.dump(features_3, f)

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




NameError: ignored

In [None]:
# chunk 4: files 15000-19999 of the image directory
features_4 = extract_features(directory,15000,20000)
# context manager guarantees the pickle is flushed and the file handle closed
with open('/content/drive/My Drive/flickr_30k/features_4.pkl', 'wb') as f:
    pickle.dump(features_4, f)

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [None]:
# chunk 5: files 20000-24999 of the image directory
features_5 = extract_features(directory,20000,25000)
# context manager guarantees the pickle is flushed and the file handle closed
with open('/content/drive/My Drive/flickr_30k/features_5.pkl', 'wb') as f:
    pickle.dump(features_5, f)

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [None]:
# chunk 6: files 25000-29999 of the image directory
features_6 = extract_features(directory,25000,30000)
# context manager guarantees the pickle is flushed and the file handle closed
with open('/content/drive/My Drive/flickr_30k/features_6.pkl', 'wb') as f:
    pickle.dump(features_6, f)

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [None]:
# chunk 7: everything from file 30000 to the end of the image directory
# (a stop of None leaves the slice open-ended, so no second directory listing is needed)
features_7 = extract_features(directory,30000,None)
# context manager guarantees the pickle is flushed and the file handle closed
with open('/content/drive/My Drive/flickr_30k/features_7.pkl', 'wb') as f:
    pickle.dump(features_7, f)

HBox(children=(FloatProgress(value=0.0, max=1783.0), HTML(value='')))




In [None]:
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file afterwards."""
    # NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself
    with open(path, "rb") as handle:
        return pickle.load(handle)

# reload the seven per-chunk feature dictionaries saved by the cells above
features_1 = _load_pickle('/content/drive/My Drive/flickr_30k/features_1.pkl')
features_2 = _load_pickle('/content/drive/My Drive/flickr_30k/features_2.pkl')
features_3 = _load_pickle('/content/drive/My Drive/flickr_30k/features_3.pkl')
features_4 = _load_pickle('/content/drive/My Drive/flickr_30k/features_4.pkl')
features_5 = _load_pickle('/content/drive/My Drive/flickr_30k/features_5.pkl')
features_6 = _load_pickle('/content/drive/My Drive/flickr_30k/features_6.pkl')
features_7 = _load_pickle('/content/drive/My Drive/flickr_30k/features_7.pkl')

In [None]:
# merge the seven per-chunk dictionaries into one and save it as features.pkl
feature_parts = (features_1, features_2, features_3, features_4,
                 features_5, features_6, features_7)
features = {}
for part in feature_parts:
    features.update(part)

# dict.update silently overwrites duplicate keys, so compare sizes to notice
# chunks that overlap (which would also mean other images were never processed)
expected_total = sum(len(part) for part in feature_parts)
if len(features) != expected_total:
    print('WARNING: %d image ids appear in more than one chunk' % (expected_total - len(features)))

# context manager guarantees the pickle is flushed and the file handle closed
with open('/content/drive/My Drive/flickr_30k/features.pkl', 'wb') as f:
    pickle.dump(features, f)