In [0]:
# Clone img2vec and bio_image_caption into the current working directory.
# Each clone is skipped when its target directory already exists, so the cell can be
# re-run (e.g. after a kernel restart) without git aborting on the existing checkout.
![ -d img2vec ] || git clone https://github.com/christiansafka/img2vec.git
![ -d bio_image_caption ] || git clone https://github.com/nlpaueb/bio_image_caption.git

In [0]:
import os

import torch

import bio_image_caption.SiVL19.create_vocabulary as vocab
import bio_image_caption.SiVL19.frequency_baseline as freq
import bio_image_caption.SiVL19.onenn_baseline as onenn
import bio_image_caption.SiVL19.wmd_evaluation as wmd
import bio_image_caption.SiVL19.create_json_files as json_file

In [0]:
# create the dataset folder, download the data and split it into train and test
# (done by running the get_iu_xray.py script from the bio_image_caption checkout
# as a separate python process)
!python bio_image_caption/SiVL19/get_iu_xray.py

In [0]:
# Root directory of the IU X-ray dataset, relative to the working directory.
dataset_folder = "iu_xray/"
# The images sit in a fixed sub-directory directly under the dataset root.
images_subdir = "iu_xray_images"
images_path = os.path.join(dataset_folder, images_subdir)

In [0]:
# get statistics and create the vocabulary for the dataset
# the return value is kept as mean_length and later passed to the frequency baseline
# NOTE(review): presumably the mean caption length — confirm in create_vocabulary.py
print("Processing IU X-ray:")
mean_length = vocab.create_vocabulary(dataset_folder)

In [0]:
# frequency baseline: called with the dataset folder and the mean_length value
# returned by vocab.create_vocabulary
# NOTE(review): presumably builds captions from the most frequent words — confirm in frequency_baseline.py
print("Frequency baseline for IU X-ray:")
freq.most_frequent(dataset_folder, mean_length)

In [0]:
# 1-nearest-neighbour baseline, called with the dataset folder and the images folder.
# CUDA is requested only when a GPU is actually visible to torch, so the cell also runs
# on CPU-only runtimes; on a GPU runtime this evaluates to True, the previously
# hard-coded value.
# NOTE(review): assumes `cuda` only selects the device used for image encoding — confirm in onenn_baseline.py
print("1NN baseline for IU X-ray:")
onenn.most_similar(dataset_folder, images_path, cuda=torch.cuda.is_available())

In [0]:
# download pre-trained word embeddings
# -c resumes a partial download and does nothing when the archive is already complete,
# so re-running the cell neither fetches the large file again nor leaves a ".1" copy
# next to the archive that tar would then ignore
!wget -c https://archive.org/download/pubmed2018_w2v_200D.tar/pubmed2018_w2v_200D.tar.gz
# unpack the gzip-compressed tar archive with the word embeddings
!tar xvzf pubmed2018_w2v_200D.tar.gz

In [0]:
# define pretrained embeddings path
# NOTE(review): expected to be the .bin file produced by extracting
# pubmed2018_w2v_200D.tar.gz — confirm the archive layout
embeddings_path = "pubmed2018_w2v_200D/pubmed2018_w2v_200D.bin"

# evaluate results with wmd, using the dataset folder and the embeddings file
# NOTE(review): presumably Word Mover's Distance over the baseline outputs — confirm in wmd_evaluation.py
print("IU X-ray evaluation:")
wmd.evaluate(dataset_folder, embeddings_path)

In [0]:
# create json files with the test images and the results of each baseline,
# to be used for MS COCO evaluation
# NOTE(review): the output location is not visible here — presumably under dataset_folder; confirm in create_json_files.py
json_file.create_jsons(dataset_folder)