## Show and Tell: A Neural Image Caption Generator

# Data processing

In [1]:
from keras import backend as K
from keras.models import Model, Sequential
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.image import load_img, img_to_array
from keras.applications.vgg19 import VGG19, preprocess_input

import numpy as np
import h5py
import string
import pickle

from os import listdir
from os.path import join, isdir, isfile, exists

Using TensorFlow backend.


In [2]:
# Locations of the Flickr8k images and of the text files (split lists and captions).
meta_info = dict(
    image_dir='Flicker8k_Dataset/',
    train_list='Flickr8k_text/Flickr_8k.trainImages.txt',
    dev_list='Flickr8k_text/Flickr_8k.devImages.txt',
    test_list='Flickr8k_text/Flickr_8k.testImages.txt',
    text_dir='Flickr8k_text/',
)

# Show the first few directory entries to confirm the image path is correct.
image_files = listdir(meta_info['image_dir'])
print(image_files[:5])

['1000268201_693b08cb0e.jpg', '1001773457_577c3a7d70.jpg', '1002674143_1b742ab4b8.jpg', '1003163366_44323f5815.jpg', '1007129816_e794419615.jpg']


## Image preprocessing

In [3]:
""" feature extract CNN model
The paper used GoogLeNet (InceptionV1), which achieved strong results in ImageNet 2014,
but for convenience of implementation I tried several of the models built into keras, including InceptionV3.
My model performed best with VGG19.
"""
def model_select(model_name):
    """Instantiate the keras.applications network called `model_name`.

    Args:
        model_name: one of 'VGG16', 'VGG19', 'ResNet50', 'InceptionV3' or
            'InceptionResNetV2'.

    Returns:
        The requested model, constructed with its default arguments.

    Raises:
        ValueError: if `model_name` is not one of the supported names.
            (Previously an unknown name fell through to `return model` and
            failed with an UnboundLocalError.)
    """
    # Imports are kept local so only the selected architecture's module is loaded.
    # NOTE: every keras.applications module has its own `preprocess_input`.
    # This function does not return it, so callers must make sure the
    # `preprocess_input` they use belongs to the same architecture.
    # The trailing comments give the expected width of the second-to-last layer
    # (the feature vector used later) - per the Keras docs, TODO confirm for the
    # installed version.
    if model_name == 'VGG16':
        from keras.applications.vgg16 import VGG16
        model = VGG16() # 4096
    elif model_name == 'VGG19':
        from keras.applications.vgg19 import VGG19
        model = VGG19() # 4096
    elif model_name == 'ResNet50':
        from keras.applications.resnet50 import ResNet50
        model = ResNet50() # 2048
    elif model_name == 'InceptionV3':
        from keras.applications.inception_v3 import InceptionV3
        model = InceptionV3() # 2048
    elif model_name == 'InceptionResNetV2':
        from keras.applications.inception_resnet_v2 import InceptionResNetV2
        model = InceptionResNetV2() # 1536
    else:
        raise ValueError(
            "unknown model_name %r; expected one of 'VGG16', 'VGG19', "
            "'ResNet50', 'InceptionV3', 'InceptionResNetV2'" % (model_name,)
        )
    return model

In [4]:
model_name = 'VGG19'
base_model = model_select(model_name)

# Cut the network at its second-to-last layer (FC2 for VGG19) so that
# predictions are feature vectors instead of class scores.
feature_layer = base_model.layers[-2]
cnn_model = Model(inputs=base_model.inputs, outputs=feature_layer.output)
cnn_model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
bloc

### Image to feature

In [5]:
"""
The training set is usually the largest split,
so I prefer to test the pipeline on the validation set first.
"""

dev_features = {}  # not filled in this cell; the features are written straight to the HDF5 file
dev_h5 = 'dev_features.h5'

# Unlike other models, inception models use the larger image sizes.
# The size depends only on the model, so it is chosen once instead of per image.
if model_name.find('Inception') != -1:
    target_size = (299, 299)
else:
    target_size = (224, 224)

with h5py.File(dev_h5, 'w') as h5f:
    with open(meta_info['dev_list']) as f:
        c = 0 # count
        for line in f:
            # strip() removes the newline and any '\r' or stray spaces that
            # would otherwise end up in the image path
            img_path = line.strip()
            if img_path == '': # blank line
                continue
            if c % 100 == 0:
                print(c)

            img = load_img(join(meta_info['image_dir'], img_path), target_size=target_size)
            img = img_to_array(img)
            # add the leading batch axis expected by predict()
            img = np.expand_dims(img, axis=0)
            # NOTE: `preprocess_input` is the one imported at the top of the
            # notebook (the VGG19 variant); it must be swapped if `model_name`
            # selects a different architecture.
            img = preprocess_input(img)
            feature = cnn_model.predict(img)
            # one dataset per image, keyed by the file name up to its first '.'
            h5f.create_dataset(img_path.split('.')[0], data=feature)
            c += 1
        print(c) # total number of processed images

0
100
200
300
400
500
600
700
800
900
1000


In [6]:
# Sanity check: read one stored dev feature back and show its values and shape.
with h5py.File('dev_features.h5', 'r') as feature_file:
    sample_feature = feature_file['2090545563_a4e66ec76b'][:]
print(sample_feature)
print(sample_feature.shape)

[[0.        0.        1.0180278 ... 0.        0.        0.       ]]
(1, 4096)


In [7]:
train_features = {}  # not filled in this cell; the features are written straight to the HDF5 file
train_h5 = 'train_features.h5'

# Inception models use larger input images than the other models.
# The size depends only on the model, so it is chosen once instead of per image.
if model_name.find('Inception') != -1:
    target_size = (299, 299)
else:
    target_size = (224, 224)

with h5py.File(train_h5, 'w') as h5f:
    with open(meta_info['train_list']) as f:
        c = 0 # count
        for line in f:
            # strip() removes the newline and any '\r' or stray spaces that
            # would otherwise end up in the image path
            img_path = line.strip()
            if img_path == '': # blank line
                continue
            if c % 1000 == 0:
                print(c)

            img = load_img(join(meta_info['image_dir'], img_path), target_size=target_size)
            img = img_to_array(img)
            # add the leading batch axis expected by predict()
            img = np.expand_dims(img, axis=0)
            # NOTE: `preprocess_input` is the one imported at the top of the
            # notebook (the VGG19 variant); it must be swapped if `model_name`
            # selects a different architecture.
            img = preprocess_input(img)
            feature = cnn_model.predict(img)
            # one dataset per image, keyed by the file name up to its first '.'
            h5f.create_dataset(img_path.split('.')[0], data=feature)
            c += 1
        print(c) # total number of processed images

0
1000
2000
3000
4000
5000
6000


In [8]:
test_features = {}  # not filled in this cell; the features are written straight to the HDF5 file
test_h5 = 'test_features.h5'

# Inception models use larger input images than the other models.
# The size depends only on the model, so it is chosen once instead of per image.
if model_name.find('Inception') != -1:
    target_size = (299, 299)
else:
    target_size = (224, 224)

with h5py.File(test_h5, 'w') as h5f:
    with open(meta_info['test_list']) as f:
        c = 0 # count
        for line in f:
            # strip() removes the newline and any '\r' or stray spaces that
            # would otherwise end up in the image path
            img_path = line.strip()
            if img_path == '': # blank line
                continue
            if c % 100 == 0:
                print(c)

            img = load_img(join(meta_info['image_dir'], img_path), target_size=target_size)
            img = img_to_array(img)
            # add the leading batch axis expected by predict()
            img = np.expand_dims(img, axis=0)
            # NOTE: `preprocess_input` is the one imported at the top of the
            # notebook (the VGG19 variant); it must be swapped if `model_name`
            # selects a different architecture.
            img = preprocess_input(img)
            feature = cnn_model.predict(img)
            # one dataset per image, keyed by the file name up to its first '.'
            h5f.create_dataset(img_path.split('.')[0], data=feature)
            c += 1
        print(c) # total number of processed images

0
100
200
300
400
500
600
700
800
900
1000


## Text preprocessing

In [9]:
""" full captions to dictionary
The dictionary holds the captions of the full dataset (training, validation, and test),
and numbers are removed from all captions (every token that is not purely alphabetic is dropped).
Removing numbers improves performance (by about 3 points in BLEU-1).
"""

captions = dict()  # image id -> list of cleaned caption strings
words = set()      # every distinct word that survives the cleaning

with open(join(meta_info['text_dir'], 'Flickr8k.token.txt')) as f:
    n_captions = 0
    for line in f:
        line = line.rstrip('\r\n')
        if line == '':
            continue
        if n_captions % 10000 == 0:
            print(n_captions)

        # Each line is "<file field>\t<caption>". Splitting on the first tab
        # only keeps a caption intact even if it contains a tab itself.
        file, caption = line.split('\t', 1)

        # Keep purely alphabetic tokens only. This removes numbers, and also
        # punctuation tokens and words that contain punctuation, so no further
        # punctuation stripping is needed on the tokens that remain.
        kept_words = [word for word in caption.split() if word.isalpha()]
        caption = ' '.join(kept_words)

        # the image id is the file field up to its first '.'
        img_id = file.split('.')[0]

        captions.setdefault(img_id, []).append(caption)
        n_captions += 1

        words.update(kept_words)
    print(n_captions) # total number of captions read

print('number of images: %d' % len(captions))
print('number of catpions: %d' % n_captions)
print('number of words: %d' % len(words))

0
10000
20000
30000
40000
40460
number of images: 8092
number of catpions: 40460
number of words: 9068


In [10]:
# Spot-check one image from each split, in the order: train, dev, test.
for sample_id in (
    '2513260012_03d33305cf',
    '2090545563_a4e66ec76b',
    '3385593926_d3e9c21170',
):
    print(captions[sample_id])

['A black dog is running after a white dog in the snow', 'Black dog chasing brown dog through snow', 'Two dogs chase each other across the snowy ground', 'Two dogs play together in the snow', 'Two dogs running through a low lying body of water']
['the boy laying face down on a skateboard is being pushed along the ground by another boy', 'Two girls play on a skateboard in a courtyard', 'Two people play on a long skateboard', 'Two small children in red shirts playing on a skateboard', 'two young children on a skateboard going across a sidewalk']
['The dogs are in the snow in front of a fence', 'The dogs play on the snow', 'Two brown dogs playfully fight in the snow', 'Two brown dogs wrestle in the snow', 'Two dogs playing in the snow']


In [11]:
""" Only dev captions are taken from the full captions set.
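Unlike the captions above, these captions carry start and end markers for the sequence:
[CLS] at the start and [SEP] at the end, named after BERT's special tokens.
Note that keras' Tokenizer strips '<', '>', '[' and ']' by default, so the markers are tokenized as plain 'cls' and 'sep'.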
"""

dev_captions = dict()  # image id -> list of captions wrapped in start/end signs
dev_words = set()      # distinct whitespace-separated words, markers included

# Same list file as the one used for the dev image features.
with open(meta_info['dev_list']) as f:
    n_dev_captions = 0
    for line in f:
        line = line.strip()
        if line == '':
            continue
        if n_dev_captions % 10000 == 0:
            print(n_dev_captions)

        # image id = file name up to its first '.'
        file = line.split('.')[0]

        # `captions` must be the full image-id -> captions dict built earlier.
        for caption in captions[file]:
            # start sign: [CLS]
            # end sign: [SEP]
            caption = '[CLS] ' + caption + ' [SEP]'

            dev_captions.setdefault(file, []).append(caption)
            n_dev_captions += 1

            dev_words.update(caption.split())
    print(n_dev_captions) # total number of dev captions

# The first count is the number of images (dict keys), not of captions.
print('number of images: %d' % len(dev_captions))
print('number of catpions: %d' % n_dev_captions)
print('number of words: %d' % len(dev_words))

0
5000
number of catpions: 1000
number of catpions: 5000
number of words: 3409


In [12]:
# Spot-check: the captions of one dev image, now wrapped in start/end signs.
dev_sample_id = '2090545563_a4e66ec76b'
print(dev_captions[dev_sample_id])

['sssss the boy laying face down on a skateboard is being pushed along the ground by another boy eeeee', 'sssss Two girls play on a skateboard in a courtyard eeeee', 'sssss Two people play on a long skateboard eeeee', 'sssss Two small children in red shirts playing on a skateboard eeeee', 'sssss two young children on a skateboard going across a sidewalk eeeee']


In [13]:
"""
Unlike the dev set, the training set must also record the maximum number of words in a single sentence.
The variable M plays that role.
"""

train_captions = dict()  # image id -> list of captions wrapped in start/end signs
train_words = set()      # distinct whitespace-separated words, markers included

M = 0 # max length in single sentence (start and end signs included)

# Same list file as the one used for the training image features.
with open(meta_info['train_list']) as f:
    n_train_captions = 0
    for line in f:
        line = line.strip()
        if line == '':
            continue
        if n_train_captions % 10000 == 0:
            print(n_train_captions)

        # image id = file name up to its first '.'
        file = line.split('.')[0]

        # `captions` must be the full image-id -> captions dict built earlier.
        for caption in captions[file]:
            caption = '[CLS] ' + caption + ' [SEP]'

            train_captions.setdefault(file, []).append(caption)
            n_train_captions += 1

            t = caption.split()
            M = max(M, len(t))
            train_words.update(t)
    print(n_train_captions) # total number of training captions

# The first count is the number of images (dict keys), not of captions.
print('number of images: %d' % len(train_captions))
print('number of catpions: %d' % n_train_captions)
print('number of words: %d' % len(train_words))

print('max number of words in single sentence: %d' % M)

0
10000
20000
30000
number of catpions: 6000
number of catpions: 30000
number of words: 7816
max number of words in single sentence: 36


In [14]:
# Spot-check: the captions of one training image, now wrapped in start/end signs.
train_sample_id = '2513260012_03d33305cf'
print(train_captions[train_sample_id])

['sssss A black dog is running after a white dog in the snow eeeee', 'sssss Black dog chasing brown dog through snow eeeee', 'sssss Two dogs chase each other across the snowy ground eeeee', 'sssss Two dogs play together in the snow eeeee', 'sssss Two dogs running through a low lying body of water eeeee']


In [15]:
test_captions = dict()  # image id -> list of captions wrapped in start/end signs
test_words = set()      # distinct whitespace-separated words, markers included

# Same list file as the one used for the test image features.
with open(meta_info['test_list']) as f:
    n_test_captions = 0
    for line in f:
        line = line.strip()
        if line == '':
            continue
        if n_test_captions % 10000 == 0:
            print(n_test_captions)

        # image id = file name up to its first '.'
        file = line.split('.')[0]

        # `captions` must be the full image-id -> captions dict built earlier.
        for caption in captions[file]:
            caption = '[CLS] ' + caption + ' [SEP]'

            test_captions.setdefault(file, []).append(caption)
            n_test_captions += 1

            test_words.update(caption.split())
    print(n_test_captions) # total number of test captions

# The first count is the number of images (dict keys), not of captions.
print('number of images: %d' % len(test_captions))
print('number of catpions: %d' % n_test_captions)
print('number of words: %d' % len(test_words))

0
5000
number of catpions: 1000
number of catpions: 5000
number of words: 3266


In [16]:
# Spot-check: the captions of one test image, now wrapped in start/end signs.
test_sample_id = '3385593926_d3e9c21170'
print(test_captions[test_sample_id])

['sssss The dogs are in the snow in front of a fence eeeee', 'sssss The dogs play on the snow eeeee', 'sssss Two brown dogs playfully fight in the snow eeeee', 'sssss Two brown dogs wrestle in the snow eeeee', 'sssss Two dogs playing in the snow eeeee']


In [17]:
""" make tokenizer using keras.
Only the training captions are used to fit the tokenizer.
"""
def make_tokenizer(captions):
    """Fit a keras Tokenizer on every caption in `captions`.

    Args:
        captions: mapping whose values are lists of caption strings
            (the keys are ignored).

    Returns:
        The Tokenizer after `fit_on_texts` has seen all captions, in
        mapping order.
    """
    # Flatten the per-image caption lists into a single corpus.
    corpus = [sentence for sentences in captions.values() for sentence in sentences]
    fitted = Tokenizer()
    fitted.fit_on_texts(corpus)
    return fitted

In [18]:
# Fit the tokenizer on the training captions only.
tokenizer = make_tokenizer(train_captions)
# Word indices start at 1; index 0 is left free, so the vocabulary size
# is one more than the number of known words.
n_vocabs = 1 + len(tokenizer.word_index)
print('number of vocabulary: %d' % (n_vocabs,))
# print(tokenizer.word_index)

number of vocabulary: 7277


In [19]:
# Persist the fitted tokenizer so later notebooks can reuse the same word indices.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
# Reload the tokenizer from disk (checks that the pickle round-trips).
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
# print(len(tokenizer.word_index))

In [21]:
""" Make sequence, Make next word based ground truth.
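If a single sentence consists of N words, N + 1 sequences are created (the extra one predicts the end sign).
Every sequence starts with the start sign and is left-padded to the maximum length.
Ex) Hi, I am a boy.
sequence                          -> next word
[]    []   []  []    []    [CLS]  -> Hi
[]    []   []  []    [CLS] [Hi]   -> I
[]    []   []  [CLS] [Hi]  [I]    -> am
...
[CLS] [Hi] [I] [am]  [a]   [boy]  -> '[SEP]' (end sign)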
"""
train_sequences = list()  # flat list of every padded input sequence
train_next_word = list()  # flat list of the matching one-hot next words

c = 0
train_sequences_h5 = 'train_sequences.h5'
train_next_word_h5 = 'train_next_word.h5'
# `with` closes both files even if a dataset write fails part-way.
with h5py.File(train_sequences_h5, 'w') as h5f1, h5py.File(train_next_word_h5, 'w') as h5f2:
    # The loop variable is deliberately not named `captions`: that would
    # overwrite the notebook-level caption dict and break re-running the
    # earlier caption cells.
    for img_id, caption_list in train_captions.items():
        Xtrain = list()
        ytrain = list()
        for caption in caption_list:
            sequence = tokenizer.texts_to_sequences([caption])[0]

            # i starts at 1 so the start sign is never a prediction target
            for i in range(1, len(sequence)):
                if c % 100000 == 0:
                    print(c)
                # Built once and appended to both the flat and per-image lists.
                padded = pad_sequences([sequence[:i]], M)[0]
                one_hot = to_categorical([sequence[i]], num_classes=n_vocabs)[0]
                train_sequences.append(padded)
                Xtrain.append(padded)
                train_next_word.append(one_hot)
                ytrain.append(one_hot)
                c += 1
        # one dataset per image id in each file
        h5f1.create_dataset(img_id, data=Xtrain)
        h5f2.create_dataset(img_id, data=ytrain)
print(c)

0
100000
200000
300000
352425


In [22]:
# Show the first two (input sequence, one-hot next word) training samples.
for sample_idx in range(2):
    print(train_sequences[sample_idx])
    print(train_next_word[sample_idx])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2]
[0. 1. 0. ... 0. 0. 0.]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1]
[0. 0. 0. ... 0. 0. 0.]


In [23]:
dev_sequences = list()  # flat list of every padded input sequence
dev_next_word = list()  # flat list of the matching one-hot next words

c = 0
dev_sequences_h5 = 'dev_sequences.h5'
dev_next_word_h5 = 'dev_next_word.h5'
# `with` closes both files even if a dataset write fails part-way.
with h5py.File(dev_sequences_h5, 'w') as h5f1, h5py.File(dev_next_word_h5, 'w') as h5f2:
    # The loop variable is deliberately not named `captions`: that would
    # overwrite the notebook-level caption dict and break re-running the
    # earlier caption cells.
    for img_id, caption_list in dev_captions.items():
        Xdev = list()
        ydev = list()
        for caption in caption_list:
            text = tokenizer.texts_to_sequences([caption])[0]

            # i starts at 1 so the first token is never a prediction target
            for i in range(1, len(text)):
                if c % 10000 == 0:
                    print(c)
                # Built once and appended to both the flat and per-image lists.
                padded = pad_sequences([text[:i]], M)[0]
                one_hot = to_categorical([text[i]], num_classes=n_vocabs)[0]
                dev_sequences.append(padded)
                Xdev.append(padded)
                dev_next_word.append(one_hot)
                ydev.append(one_hot)
                c += 1
        # one dataset per image id in each file
        h5f1.create_dataset(img_id, data=Xdev)
        h5f2.create_dataset(img_id, data=ydev)
print(c)

0
10000
20000
30000
40000
50000
58661


In [24]:
test_sequences = list()  # flat list of every padded input sequence
test_next_word = list()  # flat list of the matching one-hot next words

c = 0
test_sequences_h5 = 'test_sequences.h5'
test_next_word_h5 = 'test_next_word.h5'
# `with` closes both files even if a dataset write fails part-way.
with h5py.File(test_sequences_h5, 'w') as h5f1, h5py.File(test_next_word_h5, 'w') as h5f2:
    # The loop variable is deliberately not named `captions`: that would
    # overwrite the notebook-level caption dict and break re-running the
    # earlier caption cells.
    for img_id, caption_list in test_captions.items():
        Xtest = list()
        ytest = list()
        for caption in caption_list:
            text = tokenizer.texts_to_sequences([caption])[0]

            # i starts at 1 so the first token is never a prediction target
            for i in range(1, len(text)):
                if c % 10000 == 0:
                    print(c)
                # Built once and appended to both the flat and per-image lists.
                padded = pad_sequences([text[:i]], M)[0]
                one_hot = to_categorical([text[i]], num_classes=n_vocabs)[0]
                test_sequences.append(padded)
                Xtest.append(padded)
                test_next_word.append(one_hot)
                ytest.append(one_hot)
                c += 1
        # one dataset per image id in each file
        h5f1.create_dataset(img_id, data=Xtest)
        h5f2.create_dataset(img_id, data=ytest)
print(c)

0
10000
20000
30000
40000
50000
58389


# Data processing end.

## The code below is legacy and does not need to be read.

# h5 -> Pickle

In [None]:
train_sequences = list()  # flat list of every padded input sequence
train_next_word = list()  # flat list of the matching one-hot next words

c = 0
train_sequences_pkl = 'train_sequences.pkl'
train_next_word_pkl = 'train_next_word.pkl'

X = dict()  # image id -> list of padded input sequences
Y = dict()  # image id -> list of one-hot next words

# The loop variable is deliberately not named `captions`: that would overwrite
# the notebook-level caption dict and break re-running the earlier caption cells.
for img_id, caption_list in train_captions.items():
    Xtrain = list()
    ytrain = list()
    for caption in caption_list:
        text = tokenizer.texts_to_sequences([caption])[0]

        # i starts at 1 so the first token is never a prediction target
        for i in range(1, len(text)):
            if c % 100000 == 0:
                print(c)
            # Built once and appended to both the flat and per-image lists.
            padded = pad_sequences([text[:i]], M)[0]
            one_hot = to_categorical([text[i]], num_classes=n_vocabs)[0]
            train_sequences.append(padded)
            Xtrain.append(padded)
            train_next_word.append(one_hot)
            ytrain.append(one_hot)
            c += 1
    X[img_id] = Xtrain
    Y[img_id] = ytrain
with open(train_sequences_pkl, 'wb') as f:
    pickle.dump(X, f, protocol=pickle.HIGHEST_PROTOCOL)
with open(train_next_word_pkl, 'wb') as f:
    pickle.dump(Y, f, protocol=pickle.HIGHEST_PROTOCOL)
print(c)

In [None]:
# Read the pickled sequences back and show the entry of one training image.
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open(train_sequences_pkl, 'rb') as pkl_file:
    test = pickle.load(pkl_file)
print(test['2513260012_03d33305cf'])

# not needed

In [23]:
# Two-way lookup between training words and integer ids.
train_id_word = dict()  # id -> word
train_word_id = dict()  # word -> id (was never initialised, so the loop raised NameError)

for i, word in enumerate(train_words):
    train_id_word[i] = word
    train_word_id[word] = i

print(len(train_id_word))
print(len(train_word_id))

8256
8256


In [24]:
# Two-way lookup between dev words and integer ids (ids follow set iteration order).
dev_id_word = dict(enumerate(dev_words))                         # id -> word
dev_word_id = {term: idx for idx, term in dev_id_word.items()}   # word -> id

print(len(dev_id_word))
print(len(dev_word_id))

3523
3523


In [10]:
sequences = list()  # not used in this cell
nextwords = list()  # not used in this cell

# image id -> list of captions, each caption as a list of word ids
data = {}
# The loop variables are deliberately not named `captions`: that would overwrite
# the notebook-level caption dict and break re-running the earlier caption cells.
for img_id, caption_list in train_captions.items():
    data[img_id] = [
        [train_word_id[word] for word in caption.split()]
        for caption in caption_list
    ]
print(len(data))

6000


In [11]:
# Legacy cell: for every image in `data`, build all caption prefixes
# (padded to a fixed length) and the one-hot encoding of the word that follows.
# NOTE(review): `sequence.pad_sequences`, `max_length` and `n_vocab` are not
# defined anywhere in the visible notebook. The only visible assignment to
# `sequence` is a token-id list, which has no `pad_sequences` attribute, so
# this cell presumably relied on imports/variables from an earlier session.
# TODO confirm before running.
id_seq = {}  # image id -> list of padded prefixes
id_y = {}    # image id -> list of one-hot next words
c = 0        # running count of generated samples
for key, value in data.items():
    sub_seqs = []
    # NOTE: rebinds `Y`, which the pickle cell above uses for a dict.
    Y = []
    for seq in value:
        # i starts at 1, so the first word id is never a prediction target
        for i in range(1, len(seq)):
            if c % 100000 == 0:
                print(c)
            sub_seqs.append(sequence.pad_sequences([seq[:i]], max_length)[0])
            # num_classes is n_vocab + 1 here; the dev variant of this cell uses n_vocab
            y = to_categorical([seq[i]], num_classes=n_vocab + 1)
            Y.append(y[0])
            c += 1
            
    id_seq[key] = sub_seqs
    id_y[key] = Y
print(c)
#         print(id_seq)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
383454


In [12]:
# Store the per-image sequences, one HDF5 dataset per image id.
h5file_path = 'train_id_seq.h5'
with h5py.File(h5file_path, mode='w') as seq_store:
    for image_id in id_seq:
        seq_store.create_dataset(image_id, data=id_seq[image_id])
# print(feature_np)
# np.squeeze(feature_np)
# print(feature_np.shape)

In [13]:
# Read one image's stored sequences back as a check.
h5file_path = 'train_id_seq.h5'
with h5py.File(h5file_path, mode='r') as seq_store:
    stored_sequences = seq_store['667626_18933d713e'][:]
print(stored_sequences)
# print(feature_np)
# np.squeeze(feature_np)
# print(feature_np.shape)

[[8133    0    0 ...    0    0    0]
 [8133 4381    0 ...    0    0    0]
 [8133 4381  850 ...    0    0    0]
 ...
 [8133 4752 3548 ...    0    0    0]
 [8133 4752 3548 ...    0    0    0]
 [8133 4752 3548 ...    0    0    0]]


In [15]:
# Store the per-image one-hot targets, one HDF5 dataset per image id.
h5file_path = 'train_id_y.h5'
with h5py.File(h5file_path, mode='w') as target_store:
    for image_id in id_y:
        target_store.create_dataset(image_id, data=id_y[image_id])
# print(feature_np)
# np.squeeze(feature_np)
# print(feature_np.shape)

In [16]:
# Read one image's stored one-hot targets back as a check.
h5file_path = 'train_id_y.h5'
with h5py.File(h5file_path, mode='r') as target_store:
    stored_targets = target_store['667626_18933d713e'][:]
print(stored_targets)
# print(feature_np)
# np.squeeze(feature_np)
# print(feature_np.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [10]:
sequences = list()  # not used in this cell
nextwords = list()  # not used in this cell

# image id -> list of captions, each caption as a list of word ids
data = {}
# The loop variables are deliberately not named `captions`: that would overwrite
# the notebook-level caption dict and break re-running the earlier caption cells.
for img_id, caption_list in dev_captions.items():
    data[img_id] = [
        [dev_word_id[word] for word in caption.split()]
        for caption in caption_list
    ]
print(len(data))

1000


In [11]:
# Legacy cell: for every image in `data`, build all caption prefixes
# (post-padded to a fixed length) and the one-hot encoding of the word that follows.
# NOTE(review): `sequence.pad_sequences`, `max_length` and `n_vocab` are not
# defined anywhere in the visible notebook. The only visible assignment to
# `sequence` is a token-id list, which has no `pad_sequences` attribute, so
# this cell presumably relied on imports/variables from an earlier session.
# TODO confirm before running.
id_seq = {}  # image id -> list of padded prefixes
id_y = {}    # image id -> list of one-hot next words
c = 0        # running count of generated samples
for key, value in data.items():
    sub_seqs = []
    # NOTE: rebinds `Y`, which the pickle cell above uses for a dict.
    Y = []
    for seq in value:
        # i starts at 1, so the first word id is never a prediction target
        for i in range(1, len(seq)):
            if c % 10000 == 0:
                print(c)
            # padding='post' here, while the train variant of this cell uses the default padding
            sub_seqs.append(sequence.pad_sequences([seq[:i]], max_length, padding='post')[0])
            # num_classes is n_vocab here, while the train variant uses n_vocab + 1
            y = to_categorical([seq[i]], num_classes=n_vocab)
            Y.append(y[0])
            c += 1
    id_seq[key] = sub_seqs
    id_y[key] = Y
print(c)
#         print(id_seq)

0
10000
20000
30000
40000
50000
60000
64445


In [12]:
# Store the per-image sequences, one HDF5 dataset per image id.
h5file_path = 'dev_id_seq.h5'
with h5py.File(h5file_path, mode='w') as seq_store:
    for image_id in id_seq:
        seq_store.create_dataset(image_id, data=id_seq[image_id])
# print(feature_np)
# np.squeeze(feature_np)
# print(feature_np.shape)

In [13]:
# Read one image's stored sequences back as a check.
h5file_path = 'dev_id_seq.h5'
with h5py.File(h5file_path, mode='r') as seq_store:
    stored_sequences = seq_store['2090545563_a4e66ec76b'][:]
print(stored_sequences)
# print(feature_np)
# np.squeeze(feature_np)
# print(feature_np.shape)

[[1626    0    0 ...    0    0    0]
 [1626 1622    0 ...    0    0    0]
 [1626 1622  830 ...    0    0    0]
 ...
 [1626 3127 2829 ...    0    0    0]
 [1626 3127 2829 ...    0    0    0]
 [1626 3127 2829 ...    0    0    0]]


In [14]:
# Store the per-image one-hot targets, one HDF5 dataset per image id.
h5file_path = 'dev_id_y.h5'
with h5py.File(h5file_path, mode='w') as target_store:
    for image_id in id_y:
        target_store.create_dataset(image_id, data=id_y[image_id])
# print(feature_np)
# np.squeeze(feature_np)
# print(feature_np.shape)

In [15]:
# Read one image's stored one-hot targets back as a check.
h5file_path = 'dev_id_y.h5'
with h5py.File(h5file_path, mode='r') as target_store:
    stored_targets = target_store['2090545563_a4e66ec76b'][:]
print(stored_targets)
# print(feature_np)
# np.squeeze(feature_np)
# print(feature_np.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
