In [1]:
import os
import re
import gc
import cv2
import h5py
import torch
import string
import random
import numpy as np
import tensorflow as tf

from tqdm import tqdm
from nltk import tokenize
from sklearn import preprocessing

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Rescaling, Resizing
from keras import Input
from keras.applications.vgg16 import VGG16
from keras.applications.vgg19 import VGG19
from keras.applications.xception import Xception
from tensorflow.keras.applications import ResNet50, ResNet101, ResNet152
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from tensorflow.keras import Model

In [2]:
# Spatial size of the images fed to the network; supplies the first two
# entries of the model input shape.
IMAGE_SIZE = (224, 224)
# Number of colour channels per image; supplies the last entry of the model
# input shape.
NUM_CHANNELS = 3

In [3]:
# HDF5 file handed to load_mvsa_data for the MVSA-Single dataset.
mvsa_single_path = '../input/mvsa-data/mvsa-single-4316.hdf5'
# HDF5 file handed to load_mvsa_data(..., multiple=True) for MVSA-Multiple;
# in that mode the images are read from a sibling '<name>-images.npz' file.
mvsa_multiple_path = '../input/mvsa-data/mvsa-multiple-16387.hdf5'

In [4]:
def read_hdf5(path):
    """Load every top-level dataset of an HDF5 file into memory.

    Parameters
    ----------
    path : str
        Location of the HDF5 file to read.

    Returns
    -------
    list of (str, numpy.ndarray)
        One ``(name, data)`` pair per top-level key, in the order the file
        reports its keys. For object-dtype datasets, byte-string elements
        are decoded as UTF-8; elements that are not bytes are kept as-is.
    """
    loaded_data = []

    # The context manager releases the file handle even if a read fails;
    # each dataset is fully copied into memory (``[:]``) before that happens.
    with h5py.File(path, 'r') as read_file:
        for name in read_file.keys():
            dataset = read_file[name][:]
            if dataset.dtype == np.dtype('object'):
                # Only bytes need decoding; values that are already text
                # would otherwise fail with AttributeError on ``.decode``.
                dataset = np.array([
                    x.decode('UTF-8') if isinstance(x, bytes) else x
                    for x in dataset
                ])
            loaded_data.append((name, dataset))

    return loaded_data

In [5]:
def load_mvsa_data(path, multiple=False):
    """Load the texts, images and label arrays of an MVSA dataset.

    Parameters
    ----------
    path : str
        Path of the HDF5 file, read through ``read_hdf5``.
    multiple : bool, optional
        When true, the images are not taken from the HDF5 file but loaded
        with ``loadz`` from a sibling file named ``<stem>-images.npz``,
        where ``<stem>`` is the file name up to its first ``'.'``.

    Returns
    -------
    tuple
        ``(texts, images, labels, text_labels, image_labels)`` taken from
        the features ``'texts'``, ``'images'``, ``'multimodal-labels'``,
        ``'text-labels'`` and ``'image-labels'`` respectively.

    Raises
    ------
    KeyError
        If any of the required features is unavailable.
    """
    features = dict(read_hdf5(path))

    if multiple:
        directory, file_name = os.path.split(path)
        images_path = os.path.join(directory, file_name.split('.')[0] + '-images.npz')
        features['images'] = loadz(images_path)

    required = ('texts', 'images', 'multimodal-labels', 'text-labels', 'image-labels')
    missing = [name for name in required if name not in features]
    if missing:
        # Fail with the names of the absent features instead of an
        # UnboundLocalError on an unrelated-looking local variable.
        raise KeyError('Missing features in {!r}: {}'.format(path, ', '.join(missing)))

    return tuple(features[name] for name in required)

def loadz(path, key='arr_0'):
    """Read a single array out of a ``.npz`` archive.

    Parameters
    ----------
    path : str
        Location of the ``.npz`` file.
    key : str, optional
        Name of the array inside the archive. The default ``'arr_0'`` is
        the name ``np.savez`` gives to its first positional array.

    Returns
    -------
    numpy.ndarray
        The requested array, fully loaded into memory.
    """
    # np.load keeps the archive open until closed; the context manager
    # releases the file handle once the array has been read.
    with np.load(path) as archive:
        return archive[key]

# Load raw data

In [6]:
# MVSA-Single: texts, images and the three label arrays from the HDF5 file.
(
    mvsa_single_texts,
    mvsa_single_images,
    mvsa_single_multimodal_labels,
    mvsa_single_text_labels,
    mvsa_single_image_labels,
) = load_mvsa_data(mvsa_single_path)
num_mvsa_single = len(mvsa_single_texts)

# MVSA-Multiple: same layout, but loaded with multiple=True.
(
    mvsa_multiple_texts,
    mvsa_multiple_images,
    mvsa_multiple_multimodal_labels,
    mvsa_multiple_text_labels,
    mvsa_multiple_image_labels,
) = load_mvsa_data(mvsa_multiple_path, multiple=True)
num_mvsa_multiple = len(mvsa_multiple_texts)

# ResNet101

In [7]:
# Build ResNet101 for IMAGE_SIZE x NUM_CHANNELS inputs and freeze its
# pre-trained layers so they are never updated.
resnet101 = ResNet101(input_shape=(*IMAGE_SIZE, NUM_CHANNELS))
resnet101.trainable = False

# Truncated copy that stops at the second-to-last layer, i.e. it outputs the
# activations feeding the model's final layer.
resnet101_last = Model(
    inputs=resnet101.input,
    outputs=resnet101.layers[-2].output,
)

2022-06-29 16:09:52.102851: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet101_weights_tf_dim_ordering_tf_kernels.h5


In [8]:
# Feature extractor: raw images -> ResNet preprocessing -> truncated ResNet101.
image_inputs = Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], NUM_CHANNELS))
# The pre-trained ResNet weights expect `resnet.preprocess_input` (RGB -> BGR
# and per-channel ImageNet mean subtraction, without scaling), not a plain
# 1/255 rescale, so the inputs are preprocessed the way the weights require.
# NOTE(review): assumes the stored images are RGB with values in [0, 255],
# as the previous 1/255 rescaling implied - confirm against the dataset.
preprocessed_inputs = tf.keras.applications.resnet.preprocess_input(image_inputs)
outputs = resnet101_last(preprocessed_inputs)
model_resnet101 = Model(inputs=image_inputs, outputs=outputs)

In [9]:
# Extract one ResNet101 feature vector per MVSA-Single image.
print('MVSA-Single: Extracting ResNet101 features of images')
mvsa_single_resnet101 = model_resnet101.predict(mvsa_single_images, verbose=1)
print('ResNet101 last hidden layer dimension:', mvsa_single_resnet101.shape[1])
print('MVSA-Single with ResNet101 last hidden layer:', mvsa_single_resnet101.shape)

# Persist the features, then read them back and report whether the
# round trip reproduced every value exactly.
np.savez('./mvsa-single-resnet101', mvsa_single_resnet101)
x = loadz('./mvsa-single-resnet101.npz')
print(np.all(x == mvsa_single_resnet101))

MVSA-Single: Extracting ResNet101 features of images


2022-06-29 16:10:00.111114: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


ResNet101 last hidden layer dimension: 2048
MVSA-Single with ResNet101 last hidden layer: (4316, 2048)
True


In [10]:
# Extract one ResNet101 feature vector per MVSA-Multiple image.
print('MVSA-Multiple: Extracting ResNet101 features of images')
mvsa_multiple_resnet101 = model_resnet101.predict(mvsa_multiple_images, verbose=1)
print('ResNet101 last hidden layer dimension:', mvsa_multiple_resnet101.shape[1])
print('MVSA-Multiple with ResNet101 last hidden layer:', mvsa_multiple_resnet101.shape)

# Persist the features, then read them back and report whether the
# round trip reproduced every value exactly.
np.savez('./mvsa-multiple-resnet101', mvsa_multiple_resnet101)
x = loadz('./mvsa-multiple-resnet101.npz')
print(np.all(x == mvsa_multiple_resnet101))

MVSA-Multiple: Extracting ResNet101 features of images
ResNet101 last hidden layer dimension: 2048
MVSA-Multiple with ResNet101 last hidden layer: (16387, 2048)
True
