This code evaluates the models after different numbers of training epochs to find a good bias/variance trade-off. The models are evaluated on a validation dataset using the BLEU and METEOR translation metrics.


# Update NLTK package

In [None]:
!pip install --upgrade nltk

Collecting nltk
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 4.4MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.5-cp36-none-any.whl size=1434675 sha256=ae09a7444f8f6eca29d72cc5cf6875f851a2d8e0abea4b3a8157c5171a97cafe
  Stored in directory: /root/.cache/pip/wheels/ae/8c/3f/b1fe0ba04555b08b57ab52ab7f86023639a526d8bc8d384306
Successfully built nltk
Installing collected packages: nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.5


# Load Libraries and Datasets

In [None]:
from numpy import argmax
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm
import numpy as np
import pickle
import re
import tensorflow as tf
import keras
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Model

from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint
from keras.utils import plot_model
from numpy import array
from random import sample
import pickle
import nltk
nltk.download('wordnet')
from nltk.translate import meteor_score
from nltk.translate.bleu_score import corpus_bleu

In [None]:
def save_obj(obj, name):
    """Pickle ``obj`` into the project's ``obj`` folder as ``<name>.pkl``.

    The file is opened for binary writing (replacing any existing file of the
    same name) and written with the highest available pickle protocol.
    """
    target = '/content/drive/MyDrive/flickr_30k/RG_trial_models/obj/' + name + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from ``<name>.pkl`` in the project's ``obj`` folder.

    NOTE(review): unpickling can execute arbitrary code, so only load files
    produced by this project.
    """
    source = '/content/drive/MyDrive/flickr_30k/RG_trial_models/obj/' + name + '.pkl'
    with open(source, 'rb') as handle:
        return pickle.load(handle)

In [None]:
# Load the pickled validation data. Both objects are indexed by the same keys
# further down: the descriptions hold the reference captions for each key and
# the features hold the matching image input passed to the model.
validation_descriptions = load_obj('validation_descriptions_11282020')
validation_features = load_obj('validation_features_11282020')

In [None]:
# Load the tokenizer used in part 2 to ensure word tokenization consistency
# (the same word <-> integer mapping is needed to decode model predictions).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/content/drive/MyDrive/flickr_30k/RG_trial_models/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Prepare functions for model performance evaluation

In [None]:
# Fixed sequence length: input sequences are padded to this many tokens and
# caption generation is capped at this many steps.
max_len = 70

def word_for_id(integer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``integer``.

    The vocabulary is scanned in order and the first match is returned;
    ``None`` is returned when no word has that index.
    """
    matches = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matches, None)
 
# generate a description for an image
def generate_desc_argmax(model,  photo):
    """Greedily decode a caption for one image.

    Starting from 'startseq', the caption built so far is converted to token
    ids, padded to ``max_len`` and passed to ``model.predict`` together with
    ``photo``; the highest-scoring index is mapped back to a word and appended.

    Decoding stops when 'endseq' has been appended, after ``max_len`` steps,
    or when the predicted index has no word in the tokenizer vocabulary.

    Args:
        model: model whose ``predict`` accepts ``[photo, padded_sequence]``.
        photo: image input for ``model``.

    Returns:
        The caption as a space-separated string beginning with 'startseq'
        (and ending with 'endseq' when the model produced it).
    """
    in_text_argmax = 'startseq'
    for _ in range(max_len):
        sequence = tokenizer.texts_to_sequences([in_text_argmax])[0]
        sequence = pad_sequences([sequence], maxlen=max_len)

        yhat = model.predict([photo, sequence], verbose=0)
        word_argmax = word_for_id(argmax(yhat))

        # word_for_id returns None for an index that is absent from
        # tokenizer.word_index; concatenating None would raise TypeError,
        # so end the caption here instead of crashing.
        if word_argmax is None:
            break

        # append as input for generating the next word
        in_text_argmax += ' ' + word_argmax
        # stop if we predict the end of the sequence
        if word_argmax == 'endseq':
            break
    return in_text_argmax

def generate_desc_beam(model, photo, beam_size):
    """Decode a caption for one image with beam search.

    Each hypothesis is a list of token ids plus a cumulative score. At every
    step each unfinished hypothesis is extended with its ``beam_size`` most
    probable next tokens and only the ``beam_size`` best hypotheses are kept.

    Scores are sums of log-probabilities, so a hypothesis is ranked by the
    product of its token probabilities (summing raw probabilities does not
    rank sequences by likelihood). A hypothesis that has produced 'endseq' is
    carried forward unchanged instead of being extended and scored further.
    The search ends when every kept hypothesis is finished or sequences reach
    ``max_len`` tokens.

    Args:
        model: model whose ``predict`` accepts ``[photo, padded_sequence]``.
        photo: image input for ``model``.
        beam_size: number of hypotheses kept after each step.

    Returns:
        The best caption as a space-separated string beginning with
        'startseq', cut after the first 'endseq' if one was produced.
    """
    start_tokens = tokenizer.texts_to_sequences(['startseq'])[0]
    end_id = tokenizer.word_index.get('endseq')

    def _finished(tokens):
        return bool(tokens) and tokens[-1] == end_id

    beams = [[start_tokens, 0.0]]
    for _ in range(max_len - len(start_tokens)):
        candidates = []
        for tokens, score in beams:
            if _finished(tokens):
                # Completed caption: keep it as-is so later steps cannot
                # change its score.
                candidates.append([tokens, score])
                continue
            # NOTE(review): padding='post' here, while generate_desc_argmax
            # uses the pad_sequences default — confirm which one matches training.
            padded = pad_sequences([tokens], maxlen=max_len, padding='post')
            probabilities = model.predict([photo, padded], verbose=0)[0]
            for token_id in np.argsort(probabilities)[-beam_size:]:
                # Small epsilon keeps log() finite for zero probabilities.
                log_prob = float(np.log(probabilities[token_id] + 1e-12))
                candidates.append([tokens + [int(token_id)], score + log_prob])
        # Ascending sort, so the best hypothesis is last.
        beams = sorted(candidates, key=lambda item: item[1])[-beam_size:]
        if all(_finished(tokens) for tokens, _ in beams):
            break

    best_tokens = beams[-1][0]
    final_text = 'startseq'
    for token_id in best_tokens[1:]:
        word = word_for_id(token_id)
        # An id without a vocabulary word cannot be rendered; stop there
        # rather than fail on str + None.
        if word is None:
            break
        final_text += ' ' + word
        if word == 'endseq':
            break
    return final_text

In [None]:
# trying to add beam search to model evaluation
def evaluate_model(model, descriptions, photos):
    """Generate argmax and beam-3 captions for every image in ``descriptions``.

    Args:
        model: captioning model passed through to the caption generators.
        descriptions: mapping of image key -> list of reference caption strings.
        photos: mapping of image key -> image input for ``model``.

    Returns:
        A tuple ``(actual, predicted_argmax, predicted_beam3)`` of parallel
        lists: the tokenized reference captions for each image, and the
        tokenized argmax and beam-search (width 3) captions.
    """
    actual = []
    predicted_argmax = []
    predicted_beam3 = []
    # step over the whole set
    for key, desc_list in tqdm(descriptions.items()):
        actual.append([caption.split() for caption in desc_list])

        greedy_caption = generate_desc_argmax(model, photos[key])
        predicted_argmax.append(greedy_caption.split())

        beam_caption = generate_desc_beam(model, photos[key], 3)
        predicted_beam3.append(beam_caption.split())

    return actual, predicted_argmax, predicted_beam3

In [None]:
# need to evaluate the models on the validation set
# 3 different models (128, 256, 512), 7 epochs per model
# and 5 different fits for each model

# Run first set of models on validation data

In [None]:
# to save time only using first 1,000 images 
# Keys of the first 1,000 validation images, in dictionary order.
list1 = list(validation_descriptions.keys())[:1000]
# Restrict both the image features and the reference captions to those keys.
mini_validation_features = {key: validation_features[key] for key in list1}
mini_validation_descriptions = {key: validation_descriptions[key] for key in list1}

In [None]:
# reload previous results, because the models have to be run over several days
all_results = load_obj('validate_prediction_11282020')

model_dir = '/content/drive/MyDrive/flickr_30k/RG_trial_models/model_'
# Checkpoint filename prefix per layer size.
# NOTE(review): the 128-unit checkpoints are looked up under '126_...';
# presumably that is how the files are named on disk — verify.
prefix_by_size = {
    256: 'v1_111920_',
    128: '126_112820_',
    512: '512_112820_',
}

for i in [256, 128, 512]:
    model_loc = model_dir + prefix_by_size[i]
    # Create the per-size dict on first use instead of raising KeyError.
    size_results = all_results.setdefault(i, {})
    for j in tqdm(range(13, 20, 2)):
        # The run is spread over several days: skip checkpoints whose results
        # were already stored so a restart resumes instead of recomputing.
        # Delete the entry (or the pickle) to force a re-evaluation.
        if j in size_results:
            continue
        model_loc2 = model_loc + str(j) + '.h5'
        curr_model = tf.keras.models.load_model(model_loc2)
        a, b, c = evaluate_model(curr_model, mini_validation_descriptions, mini_validation_features)
        size_results[j] = [a, b, c]
        # Save after every checkpoint so an interrupted run loses at most one model.
        save_obj(all_results, 'validate_prediction_11282020')

  0%|          | 0/2 [00:00<?, ?it/s]
  0%|          | 0/1000 [00:00<?, ?it/s][A
  0%|          | 1/1000 [00:14<3:57:10, 14.24s/it][A
  0%|          | 2/1000 [00:21<3:21:43, 12.13s/it][A
  0%|          | 3/1000 [00:28<2:56:32, 10.62s/it][A
  0%|          | 4/1000 [00:35<2:40:09,  9.65s/it][A
  0%|          | 5/1000 [00:43<2:27:36,  8.90s/it][A
  1%|          | 6/1000 [00:50<2:19:17,  8.41s/it][A
  1%|          | 7/1000 [00:57<2:14:23,  8.12s/it][A
  1%|          | 8/1000 [01:05<2:10:01,  7.86s/it][A
  1%|          | 9/1000 [01:12<2:06:33,  7.66s/it][A
  1%|          | 10/1000 [01:19<2:04:31,  7.55s/it][A
  1%|          | 11/1000 [01:26<2:02:51,  7.45s/it][A
  1%|          | 12/1000 [01:33<2:00:50,  7.34s/it][A
  1%|▏         | 13/1000 [01:41<2:00:23,  7.32s/it][A
  1%|▏         | 14/1000 [01:48<2:00:41,  7.34s/it][A
  2%|▏         | 15/1000 [01:55<2:00:56,  7.37s/it][A
  2%|▏         | 16/1000 [02:03<2:01:46,  7.43s/it][A
  2%|▏         | 17/1000 [02:10<2:01:32,  7.42s

In [None]:
# Reload the saved round-1 evaluation outputs: all_results[size][epoch] is
# [references, argmax captions, beam-3 captions].
all_results = load_obj('validate_prediction_11282020')

# Prepare model outputs for analysis using BLEU and METEOR

In [None]:
# in order to evaluate the results using METEOR and BLEU the outputs of the model must be converted to proper format and 'startseq' and 'endseq' tokens removed
def _strip_seq_markers(tokens):
    """Return ``tokens`` without a leading 'startseq' and a trailing 'endseq'.

    Markers are removed only when present: a generated caption that reached
    the length limit has no 'endseq', and a blind ``[1:-1]`` slice would drop
    its last real word.
    """
    start = 1 if tokens[:1] == ['startseq'] else 0
    stop = len(tokens) - 1 if tokens[-1:] == ['endseq'] else len(tokens)
    return tokens[start:stop]


# BLEU_dict holds token lists (the input format of corpus_bleu); METEOR_dict
# holds the same captions joined into strings. Keys are '<size>_<epoch>_<kind>'.
BLEU_dict = {}
METEOR_dict = {}
for i in [128, 256, 512]:
    for j in range(13, 20, 2):
        # Position k in all_results[i][j]: 0 = references, 1 = argmax, 2 = beam3.
        for k, name in enumerate(['_actual', '_argmax', '_beam3']):
            curr_model = str(i) + '_' + str(j) + name
            bleu_entries = []
            meteor_entries = []
            for picture in all_results[i][j][k]:
                if k == 0:
                    # References: one list of captions per picture.
                    stripped_refs = [_strip_seq_markers(desc) for desc in picture]
                    bleu_entries.append(stripped_refs)
                    meteor_entries.append([' '.join(ref) for ref in stripped_refs])
                else:
                    # Predictions: a single caption per picture.
                    stripped = _strip_seq_markers(picture)
                    bleu_entries.append(stripped)
                    meteor_entries.append(' '.join(stripped))
            BLEU_dict[curr_model] = bleu_entries
            METEOR_dict[curr_model] = meteor_entries

In [None]:
# Result: layer size 128 at epoch 13 ranks highest on BLEU-1, BLEU-2, BLEU-3, BLEU-4, and METEOR
# Print BLEU-1..4 and the mean METEOR score of the argmax captions for every
# layer size / epoch pair. Only argmax captions are scored in this cell; the
# beam-3 captions are stored under the '<size>_<epoch>_beam3' keys.
bleu_settings = [
    ('BLEU-1, argmax: %f', (1.0, 0, 0, 0)),
    ('BLEU-2, argmax: %f', (0.5, 0.5, 0, 0)),
    ('BLEU-3, argmax: %f', (0.33, 0.33, 0.33, 0)),
    ('BLEU-4, argmax: %f', (0.25, 0.25, 0.25, 0.25)),
]
for i in [128, 256, 512]:
    print('Size: ', i)
    for j in range(13, 20, 2):
        prefix = str(i) + '_' + str(j)
        curr_actual = prefix + '_actual'
        curr_argmax = prefix + '_argmax'
        curr_beam = prefix + '_beam3'
        print('Epoch: ', j)
        for template, weights in bleu_settings:
            print(template % corpus_bleu(BLEU_dict[curr_actual], BLEU_dict[curr_argmax], weights=weights))
        # METEOR is computed per picture and then averaged over all pictures.
        argmax_scores = [
            meteor_score.meteor_score(references, METEOR_dict[curr_argmax][idx])
            for idx, references in enumerate(METEOR_dict[curr_actual])
        ]
        print('METEOR SCORE, argmax:', sum(argmax_scores) / len(argmax_scores))

Size:  128
Epoch:  13
BLEU-1, argmax: 0.481402
BLEU-2, argmax: 0.270718
BLEU-3, argmax: 0.154571
BLEU-4, argmax: 0.083933
METEOR SCORE, argmax: 0.2682418315834853
Epoch:  15
BLEU-1, argmax: 0.467202
BLEU-2, argmax: 0.263137
BLEU-3, argmax: 0.150437
BLEU-4, argmax: 0.080573
METEOR SCORE, argmax: 0.2677250449345576
Epoch:  17
BLEU-1, argmax: 0.473735
BLEU-2, argmax: 0.266067
BLEU-3, argmax: 0.149640
BLEU-4, argmax: 0.078844
METEOR SCORE, argmax: 0.26542692880908725
Epoch:  19
BLEU-1, argmax: 0.461498
BLEU-2, argmax: 0.259307
BLEU-3, argmax: 0.147672
BLEU-4, argmax: 0.079723
METEOR SCORE, argmax: 0.2636230948048126
Size:  256
Epoch:  13
BLEU-1, argmax: 0.466737
BLEU-2, argmax: 0.263972
BLEU-3, argmax: 0.151717
BLEU-4, argmax: 0.082430
METEOR SCORE, argmax: 0.26166125510593097
Epoch:  15
BLEU-1, argmax: 0.466362
BLEU-2, argmax: 0.264531
BLEU-3, argmax: 0.151298
BLEU-4, argmax: 0.081339
METEOR SCORE, argmax: 0.26260502277624137
Epoch:  17
BLEU-1, argmax: 0.458908
BLEU-2, argmax: 0.258737
BL

Layer size 128 at epoch 13 fit best on all metrics. Beam search (width 3) performed slightly better than argmax on METEOR, while argmax performed better on BLEU.

# Run second set of models on validation dataset

In [None]:
# want to make subset of 500 validation images to use (for speed purposes)
# Keys of the first 500 validation images, in dictionary order.
list2 = list(validation_descriptions.keys())[:500]
# Restrict both the image features and the reference captions to those keys.
mini_validation_features2 = {key: validation_features[key] for key in list2}
mini_validation_descriptions2 = {key: validation_descriptions[key] for key in list2}

In [None]:
# Round 2: evaluate the checkpoints for epochs 11-14 on the 500-image subset.
round2_results = {}

model_loc = '/content/drive/MyDrive/flickr_30k/RG_trial_models/model_126_112820_'
for j in tqdm(range(11, 15)):
    checkpoint_path = model_loc + str(j) + '.h5'
    curr_model = tf.keras.models.load_model(checkpoint_path)
    outputs = evaluate_model(curr_model, mini_validation_descriptions2, mini_validation_features2)
    # Stored as [references, argmax captions, beam-3 captions].
    round2_results[j] = list(outputs)
    # Save after every epoch so finished work survives an interruption.
    save_obj(round2_results, 'validate_prediction_part2_11302020')

  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/500 [00:00<?, ?it/s][A
  0%|          | 1/500 [00:14<2:04:31, 14.97s/it][A
  0%|          | 2/500 [00:22<1:46:04, 12.78s/it][A
  1%|          | 3/500 [00:30<1:33:29, 11.29s/it][A
  1%|          | 4/500 [00:38<1:24:34, 10.23s/it][A
  1%|          | 5/500 [00:45<1:18:05,  9.46s/it][A
  1%|          | 6/500 [00:53<1:13:49,  8.97s/it][A
  1%|▏         | 7/500 [01:01<1:10:12,  8.54s/it][A
  2%|▏         | 8/500 [01:08<1:07:52,  8.28s/it][A
  2%|▏         | 9/500 [01:16<1:06:31,  8.13s/it][A
  2%|▏         | 10/500 [01:24<1:04:42,  7.92s/it][A
  2%|▏         | 11/500 [01:31<1:03:53,  7.84s/it][A
  2%|▏         | 12/500 [01:39<1:02:37,  7.70s/it][A
  3%|▎         | 13/500 [01:46<1:01:31,  7.58s/it][A
  3%|▎         | 14/500 [01:53<1:01:08,  7.55s/it][A
  3%|▎         | 15/500 [02:01<1:00:38,  7.50s/it][A
  3%|▎         | 16/500 [02:08<59:44,  7.41s/it]  [A
  3%|▎         | 17/500 [02:15<59:50,  7.43s/it][A
  4%|▎      

In [None]:
# Reload the saved round-2 outputs: results_2[epoch] is
# [references, argmax captions, beam-3 captions].
results_2 = load_obj('validate_prediction_part2_11302020')

In [None]:
def _strip_seq_markers(tokens):
    """Return ``tokens`` without a leading 'startseq' and a trailing 'endseq'.

    Markers are removed only when present: a generated caption that reached
    the length limit has no 'endseq', and a blind ``[1:-1]`` slice would drop
    its last real word.
    """
    start = 1 if tokens[:1] == ['startseq'] else 0
    stop = len(tokens) - 1 if tokens[-1:] == ['endseq'] else len(tokens)
    return tokens[start:stop]


# BLEU_dict_2 holds token lists (the input format of corpus_bleu);
# METEOR_dict_2 holds the same captions joined into strings.
# Keys are '128_<epoch>_<kind>'.
BLEU_dict_2 = {}
METEOR_dict_2 = {}
for j in range(11, 15, 1):
    # Position k in results_2[j]: 0 = references, 1 = argmax, 2 = beam3.
    for k, name in enumerate(['_actual', '_argmax', '_beam3']):
        curr_model = str(128) + '_' + str(j) + name
        bleu_entries = []
        meteor_entries = []
        for picture in results_2[j][k]:
            if k == 0:
                # References: one list of captions per picture.
                stripped_refs = [_strip_seq_markers(desc) for desc in picture]
                bleu_entries.append(stripped_refs)
                meteor_entries.append([' '.join(ref) for ref in stripped_refs])
            else:
                # Predictions: a single caption per picture.
                stripped = _strip_seq_markers(picture)
                bleu_entries.append(stripped)
                meteor_entries.append(' '.join(stripped))
        BLEU_dict_2[curr_model] = bleu_entries
        METEOR_dict_2[curr_model] = meteor_entries

In [None]:
# BLEU 1 ranking 128, epoch 13 is highest same for BLEU 2, same for BLEU 3 and BLEU 4 and for METEOR

# Print BLEU-1..4 and the mean METEOR score for argmax and beam-3 captions
# at each epoch of the 128-unit model.
bleu_weights = [
    (1.0, 0, 0, 0),
    (0.5, 0.5, 0, 0),
    (0.33, 0.33, 0.33, 0),
    (0.25, 0.25, 0.25, 0.25),
]
for j in range(11, 15, 1):
    prefix = str(128) + '_' + str(j)
    curr_actual = prefix + '_actual'
    curr_argmax = prefix + '_argmax'
    curr_beam = prefix + '_beam3'
    # (label used in BLEU lines, label used in METEOR lines, dictionary key)
    candidates = [
        ('argmax', 'argmax', curr_argmax),
        ('beam3', 'beam', curr_beam),
    ]
    print('Epoch: ', j)
    for order, weights in enumerate(bleu_weights, start=1):
        for bleu_label, _, key in candidates:
            score = corpus_bleu(BLEU_dict_2[curr_actual], BLEU_dict_2[key], weights=weights)
            print('BLEU-%d, %s: %f' % (order, bleu_label, score))
    # METEOR is computed per picture and then averaged over all pictures.
    for _, meteor_label, key in candidates:
        scores = [
            meteor_score.meteor_score(references, METEOR_dict_2[key][idx])
            for idx, references in enumerate(METEOR_dict_2[curr_actual])
        ]
        print('METEOR SCORE, ' + meteor_label + ':', sum(scores) / len(scores))

Epoch:  11
BLEU-1, argmax: 0.477450
BLEU-1, beam3: 0.465224
BLEU-2, argmax: 0.260847
BLEU-2, beam3: 0.261913
BLEU-3, argmax: 0.142888
BLEU-3, beam3: 0.150393
BLEU-4, argmax: 0.074038
BLEU-4, beam3: 0.079238
METEOR SCORE, argmax: 0.2586564562422373
METEOR SCORE, beam: 0.264917304799644
Epoch:  12
BLEU-1, argmax: 0.480256
BLEU-1, beam3: 0.470010
BLEU-2, argmax: 0.265444
BLEU-2, beam3: 0.268449
BLEU-3, argmax: 0.149209
BLEU-3, beam3: 0.154209
BLEU-4, argmax: 0.079218
BLEU-4, beam3: 0.081095
METEOR SCORE, argmax: 0.2645584134598919
METEOR SCORE, beam: 0.2711168777718717
Epoch:  13
BLEU-1, argmax: 0.481982
BLEU-1, beam3: 0.468658
BLEU-2, argmax: 0.264255
BLEU-2, beam3: 0.267188
BLEU-3, argmax: 0.147560
BLEU-3, beam3: 0.155807
BLEU-4, argmax: 0.079184
BLEU-4, beam3: 0.083854
METEOR SCORE, argmax: 0.26720390341594824
METEOR SCORE, beam: 0.2691128631534178
Epoch:  14
BLEU-1, argmax: 0.479378
BLEU-1, beam3: 0.462986
BLEU-2, argmax: 0.264308
BLEU-2, beam3: 0.264318
BLEU-3, argmax: 0.148301
BLEU-

# Validation round 3: beam search width

In [None]:

# for round 3 look at different beam widths 
# trying to add beam search to model evaluation
def evaluate_model_multibeam(model, descriptions, photos):
    """Generate argmax captions and beam-search captions of widths 2 to 5.

    Args:
        model: captioning model passed through to the caption generators.
        descriptions: mapping of image key -> list of reference caption strings.
        photos: mapping of image key -> image input for ``model``.

    Returns:
        A 6-tuple of parallel lists: the tokenized references, the tokenized
        argmax captions, then the tokenized beam-search captions for widths
        2, 3, 4 and 5 (in that order).
    """
    beam_widths = (2, 3, 4, 5)
    actual = []
    predicted_argmax = []
    predicted_by_width = {width: [] for width in beam_widths}
    # step over the whole set
    for key, desc_list in tqdm(descriptions.items()):
        actual.append([caption.split() for caption in desc_list])

        greedy_caption = generate_desc_argmax(model, photos[key])
        predicted_argmax.append(greedy_caption.split())

        for width in beam_widths:
            beam_caption = generate_desc_beam(model, photos[key], width)
            predicted_by_width[width].append(beam_caption.split())

    return (actual, predicted_argmax) + tuple(predicted_by_width[width] for width in beam_widths)



In [None]:
# Round 3: compare beam widths on the epoch-12 checkpoint (500-image subset).
round3_results = {}

model_loc = '/content/drive/MyDrive/flickr_30k/RG_trial_models/model_126_112820_'
for j in [12]:
    checkpoint_path = model_loc + str(j) + '.h5'
    curr_model = tf.keras.models.load_model(checkpoint_path)
    outputs = evaluate_model_multibeam(curr_model, mini_validation_descriptions2, mini_validation_features2)
    # Stored as [references, argmax, beam2, beam3, beam4, beam5].
    round3_results[j] = list(outputs)
    save_obj(round3_results, 'validate_prediction_part3_12012020')

100%|██████████| 500/500 [4:34:45<00:00, 32.97s/it]


NameError: ignored

In [None]:
# Reload the saved round-3 outputs: round3_results[epoch] is
# [references, argmax, beam2, beam3, beam4, beam5].
round3_results = load_obj('validate_prediction_part3_12012020')

In [None]:
def _strip_seq_markers(tokens):
    """Return ``tokens`` without a leading 'startseq' and a trailing 'endseq'.

    Markers are removed only when present: a generated caption that reached
    the length limit has no 'endseq', and a blind ``[1:-1]`` slice would drop
    its last real word.
    """
    start = 1 if tokens[:1] == ['startseq'] else 0
    stop = len(tokens) - 1 if tokens[-1:] == ['endseq'] else len(tokens)
    return tokens[start:stop]


# BLEU_dict_3 holds token lists (the input format of corpus_bleu);
# METEOR_dict_3 holds the same captions joined into strings.
# Keys are '128_12_<kind>'.
BLEU_dict_3 = {}
METEOR_dict_3 = {}
j = 12
# Position k in round3_results[j] follows this order.
kinds = ['_actual', '_argmax', '_beam2', '_beam3', '_beam4', '_beam5']
for k, name in enumerate(kinds):
    curr_model = str(128) + '_' + str(j) + name
    bleu_entries = []
    meteor_entries = []
    for picture in round3_results[j][k]:
        if k == 0:
            # References: one list of captions per picture.
            stripped_refs = [_strip_seq_markers(desc) for desc in picture]
            bleu_entries.append(stripped_refs)
            meteor_entries.append([' '.join(ref) for ref in stripped_refs])
        else:
            # Predictions: a single caption per picture.
            stripped = _strip_seq_markers(picture)
            bleu_entries.append(stripped)
            meteor_entries.append(' '.join(stripped))
    BLEU_dict_3[curr_model] = bleu_entries
    METEOR_dict_3[curr_model] = meteor_entries


# Dictionary keys used by the metric cell that follows.
curr_actual = '128_12_actual'
curr_argmax = '128_12_argmax'
curr_beam2 = '128_12_beam2'
curr_beam3 = '128_12_beam3'
curr_beam4 = '128_12_beam4'
curr_beam5 = '128_12_beam5'



In [None]:
# Print BLEU-1..4 and the mean METEOR score for argmax decoding and for beam
# widths 2-5, in that order for every metric.
methods = [
    ('argmax', curr_argmax),
    ('beam2', curr_beam2),
    ('beam3', curr_beam3),
    ('beam4', curr_beam4),
    ('beam5', curr_beam5),
]
bleu_weights = [
    (1.0, 0, 0, 0),
    (0.5, 0.5, 0, 0),
    (0.33, 0.33, 0.33, 0),
    (0.25, 0.25, 0.25, 0.25),
]
for order, weights in enumerate(bleu_weights, start=1):
    for label, key in methods:
        score = corpus_bleu(BLEU_dict_3[curr_actual], BLEU_dict_3[key], weights=weights)
        print('BLEU-%d, %s: %f' % (order, label, score))

# METEOR is computed per picture and then averaged over all pictures.
count = 0
meteor_totals = {label: 0 for label, _ in methods}
for idx, actual in enumerate(METEOR_dict_3[curr_actual]):
    for label, key in methods:
        meteor_totals[label] += meteor_score.meteor_score(actual, METEOR_dict_3[key][idx])
    count += 1
for label, _ in methods:
    print('METEOR SCORE, ' + label + ':', meteor_totals[label] / count)


BLEU-1, argmax: 0.480256
BLEU-1, beam2: 0.481259
BLEU-1, beam3: 0.470010
BLEU-1, beam4: 0.463103
BLEU-1, beam5: 0.455649
BLEU-2, argmax: 0.265444
BLEU-2, beam2: 0.270318
BLEU-2, beam3: 0.268449
BLEU-2, beam4: 0.264282
BLEU-2, beam5: 0.263057
BLEU-3, argmax: 0.149209
BLEU-3, beam2: 0.153330
BLEU-3, beam3: 0.154209
BLEU-3, beam4: 0.151427
BLEU-3, beam5: 0.153592
BLEU-4, argmax: 0.079218
BLEU-4, beam2: 0.081473
BLEU-4, beam3: 0.081095
BLEU-4, beam4: 0.077514
BLEU-4, beam5: 0.080880
METEOR SCORE, argmax: 0.2645584134598919
METEOR SCORE, beam2: 0.2692041976187466
METEOR SCORE, beam3: 0.2711168777718717
METEOR SCORE, beam4: 0.2649889153602954
METEOR SCORE, beam5: 0.26519733085178016


# beam2 scores highest on BLEU-1, BLEU-2 and BLEU-4; beam3 is highest on BLEU-3 and METEOR
# overall, a layer size of 128, epoch 12 and the beam2 method produce the highest-quality results
