In [1]:
root_path = '/home/eric/Documents/Hashtag-recommendation-for-social-images/image_text_hashtagging/src/image_text_classification/'
captions_filename = root_path + 'image_text_data.txt'
image_path='/home/eric/data/social_images/'

In [2]:
from collections import Counter
from itertools import chain
import os
import pickle
from string import digits
import time
from tqdm import tqdm
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import h5py
import numpy as np
import pandas as pd
import cv2
start_time = time.monotonic()

  from ._conv import register_converters as _register_converters


In [3]:
print('Loading data ...')
data_filename=captions_filename
data = pd.read_table(data_filename, sep='*')
data = np.asarray(data)
np.random.shuffle(data)
image_files = data[:, 0]
captions = data[:, 2]
tweets=data[:,1]
number_of_captions = image_files.shape[0]
print('Loaded', number_of_captions, 'captions')
print(image_files[5])
print(captions[5])
print(tweets[5])

Loading data ...
Loaded 57177 captions
dataset/design/2018-12-27_16-58-04_UTC.jpg
tbt throwbackthursday art photography gallery ootd floral artofvisuals tb wanderlust interior design classic vintage style
ve always had this feeling no matter where am in my life that it either memory or dream best last for at peggy guggenheim former home palazzo venier dei leoni on the grand canal


In [4]:
def lemmatize_sentence(caption):
    incorrect_chars = digits + ";.,'/*?¿><:{}[\]|+"
    char_translator = str.maketrans('', '', incorrect_chars)
    quotes_translator = str.maketrans('', '', '"')
    clean_caption = caption.strip().lower()
    clean_caption = clean_caption.translate(char_translator)
    clean_caption = clean_caption.translate(quotes_translator)
    clean_caption = clean_caption.split(' ')
    return clean_caption

In [5]:
max_caption_length=70
print('Removing captions longer than', max_caption_length, '...')
reduced_image_files = []
reduced_tweets=[]
reduced_captions = []
previous_file_size = len(captions)
for image_arg, caption in enumerate(captions):
    lemmatized_caption = lemmatize_sentence(caption)
    if (len(lemmatized_caption) <= max_caption_length):
        reduced_captions.append(lemmatized_caption)
        reduced_tweets.append(tweets[image_arg])
        reduced_image_files.append(image_files[image_arg])
captions = reduced_captions
tweets=reduced_tweets
image_files = reduced_image_files
current_file_size = len(captions)
file_difference = previous_file_size - current_file_size
print('Number of files removed:', file_difference)
print('Current number of files:', current_file_size)
initial_number_of_captions = previous_file_size
number_of_captions_removed = file_difference
current_number_of_captions = current_file_size


Removing captions longer than 70 ...
Number of files removed: 4
Current number of files: 57173


In [6]:
word_frequencies = Counter(chain(*captions)).most_common()

In [7]:
word_frequency_treshold=1
#TODO Add option to remove captions that have a words not in vocabulary
print('Removing words with a frequency less than',word_frequency_treshold,'...')
frequent_threshold_arg=len(word_frequencies)  # set default frequent_threshold_arg
for frequency_arg, frequency_data in enumerate(word_frequencies):
    frequency = frequency_data[1]
    if frequency <= word_frequency_treshold:
        frequent_threshold_arg = frequency_arg
        break
previous_vocabulary_size = len(word_frequencies)
if word_frequency_treshold != 0:
    word_frequencies = np.asarray(word_frequencies[0:frequent_threshold_arg])
else:
    word_frequencies = np.asarray(word_frequencies)
current_vocabulary_size = word_frequencies.shape[0]
vocabulary_difference = (previous_vocabulary_size -
                                current_vocabulary_size)
print('Number of words removed:',vocabulary_difference)
print('Current number of words:',current_vocabulary_size)
initial_number_of_words = previous_vocabulary_size
number_of_words_removed = vocabulary_difference
current_number_of_words = current_vocabulary_size

Removing words with a frequency less than 1 ...
Number of words removed: 0
Current number of words: 998


In [8]:
BOS = '<S>' #Beginning Of Sentence
EOS = '<E>' #End Of Sentence
PAD = '<P>'
words = word_frequencies[:, 0]
word_to_id = {PAD:0, BOS:1, EOS:2}
word_to_id.update({word:word_id for word_id, word
                                in enumerate(words, 3)})
id_to_word = {word_id:word for word, word_id
                                in word_to_id.items()}

Extract image features

In [9]:
from keras.applications.inception_v3 import preprocess_input
from keras.applications import InceptionV3
from keras.preprocessing import image
from keras.models import Model

IMG_FEATS = 2048
image_directory=image_path
base_model = InceptionV3(weights='imagenet')
model =  Model(inputs=base_model.input,
                                outputs=base_model.get_layer('avg_pool').output)
extracted_features = []
image_feature_files = list(set(image_files))
print(image_feature_files[:5])
number_of_images = len(image_feature_files)
for image_arg,image_file in tqdm(enumerate(image_feature_files)):
    _image_path = image_directory + image_file
    img = image.load_img(_image_path, target_size=(299, 299))
    img = image.img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img = preprocess_input(img)
    CNN_features = model.predict(img)
    extracted_features.append(np.squeeze(CNN_features))
extracted_features = np.asarray(extracted_features)

Using TensorFlow backend.
0it [00:00, ?it/s]

['dataset/model/2018-11-07_22-20-32_UTC.jpg', 'dataset/pretty/2018-12-23_13-57-00_UTC.jpg', 'dataset/fun/2019-01-02_16-10-00_UTC.jpg', 'dataset/flowers/2018-12-25_09-29-58_UTC.jpg', 'dataset/goodmorning/2019-01-02_18-44-18_UTC.jpg']


57173it [44:50, 21.25it/s]


In [12]:
print(image_feature_files[:5])

['dataset/model/2018-11-07_22-20-32_UTC.jpg', 'dataset/pretty/2018-12-23_13-57-00_UTC.jpg', 'dataset/fun/2019-01-02_16-10-00_UTC.jpg', 'dataset/flowers/2018-12-25_09-29-58_UTC.jpg', 'dataset/goodmorning/2019-01-02_18-44-18_UTC.jpg']


In [14]:
print('Writing image features to h5...')
IMG_FEATS = 2048
cnn_extractor='inception'
dataset_file = h5py.File(cnn_extractor +'_image_name_to_features.h5')
number_of_features = len(image_feature_files)
for image_arg, image_file in tqdm(enumerate(image_feature_files)):
    file_id = dataset_file.create_group(image_file)
    image_data = file_id.create_dataset('image_features',
                                        (IMG_FEATS,), dtype='float32')
    image_data[:] = extracted_features[image_arg,:]
dataset_file.close()

470it [00:00, 2348.58it/s]

Writing image features to h5...


57173it [00:23, 2485.00it/s]


In [18]:
data_file = open('complete_data.txt','w')
data_file.write('image_names*tweets*hashtags\n')
for image_arg, image_name in enumerate(image_files):
    caption = ' '.join(captions[image_arg])
    data_file.write('%s*%s*%s\n' %(image_name,tweets[image_arg],caption))
data_file.close()

In [19]:
pickle.dump(word_to_id, open('word_to_id.p', 'wb'))
pickle.dump(id_to_word, open('id_to_word.p', 'wb'))

In [20]:
elapsed_time = time.monotonic() - start_time

In [22]:
log_file = open('data_parameters.log','w')
log_file.write('data_filename %s \n' %data_filename)
log_file.write('BOS: %s \n' % BOS)
log_file.write('EOS: %s \n' % EOS)
log_file.write('PAD: %s \n' % PAD)
log_file.write('IMG_FEATS: %s \n' %IMG_FEATS)
log_file.write('word_frequency_threshold: %s \n'
                        %word_frequency_treshold)
log_file.write('max_caption_length: %s \n'
                        %max_caption_length)
log_file.write('initial_data_size: %s \n'
                        %initial_number_of_captions)
log_file.write('captions_larger_than_threshold: %s \n'
                        %number_of_captions_removed)
log_file.write('current_data_size: %s \n'
                        %current_number_of_captions)
log_file.write('initial_word_size: %s \n'
                        %initial_number_of_words)
log_file.write('words_removed_by_frequency_threshold %s \n'
                        %number_of_words_removed)
log_file.write('current_word_size: %s \n'
                        %current_number_of_words)
log_file.write('cnn_extractor: %s \n' %cnn_extractor)
log_file.write('elapsed_time: %s' %elapsed_time)
log_file.close()

In [23]:
train_porcentage=0.90
complete_data = pd.read_table('complete_data.txt',sep='*')
data_size = complete_data.shape[0]
 # training_size = int(data_size*train_porcentage)
training_size = int(data_size*1)
complete_training_data = complete_data[0:training_size]
test_data = complete_data[training_size:]
test_data.to_csv('test_data.txt',sep='*',index=False)
# splitting between validation and training 
training_size = int(training_size*train_porcentage)
validation_data = complete_training_data[training_size:]
training_data = complete_training_data[0:training_size]
validation_data.to_csv('validation_data.txt',sep='*',index=False)
training_data.to_csv('training_data.txt',sep='*',index=False)
print('num of training data size:',training_size)
print('num of validation data size:',len(complete_training_data)-training_size)

num of training data size: 51455
num of validation data size: 5718
