In [None]:
import torch
from transformers import GenerationConfig

from datetime import datetime
import json
import numpy as np
import math
import tqdm

from T2CEvaluator import T2CEvaluator

from prompter import Prompter
# Module-level Prompter instance; generate_test_prompt() reads this global
# to turn a test record into the prompt text.
prompter = Prompter()

def generate_test_prompt(data_point):
    """Build the inference-time prompt for a single test record.

    Args:
        data_point: mapping with an ``"instruction"`` entry and an optional
            ``"input"`` entry. Any other entries are ignored here.

    Returns:
        Whatever ``prompter.generate_prompt`` returns. It is called with
        ``instruction`` only, plus ``input`` when the record carries a
        non-empty one. No label/output is ever passed.
    """
    # A missing key and an empty value are treated the same: no extra context.
    has_context = "input" in data_point and bool(data_point["input"])

    prompt_kwargs = {"instruction": data_point["instruction"]}
    if has_context:
        prompt_kwargs["input"] = data_point["input"]

    return prompter.generate_prompt(**prompt_kwargs)
   
# def generate_test_prompt(data_point, train = False):
#     # To decrease expectations of results :)
#     assert train == False
#     # sorry about the formatting disaster gotta move fast
#     if data_point["input"]:
#         return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
# ### Instruction:
# {data_point["instruction"]}

# ### Input:
# {data_point["input"]}

# ### Response:
# {data_point["output"] if train else ''}"""
#     else:
#         return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
# ### Instruction:
# {data_point["instruction"]}

# ### Response:
# {data_point["output"] if train else ''}"""


class EvaluateTestSet:
    """Generate predictions for a JSON test set and score them.

    ``evaluate`` runs the whole pipeline: load the test records, build the
    prompts, generate in batches, clean the decoded text, write one
    prediction per line to a timestamped file, and hand that file together
    with the reference file to ``T2CEvaluator.calculate_metrics``.
    """

    def __init__(self,
                 generation_config = None,
                 fn_test_data = "../data/t2c_answers.json",
                 fn_etalon = "/root/data/answers.json",
                 batch_size = 10,
                 verbose = False,
                 results_dir = "/root/results",
                 device = 'cuda'
                ):
        """Store the evaluation settings.

        Args:
            generation_config: object forwarded to ``model.generate``. When
                ``None`` a fresh ``GenerationConfig(max_new_tokens = 128)``
                is created for this instance.
            fn_test_data: path of the JSON file holding the test records.
            fn_etalon: path of the reference answers passed to the scorer.
            batch_size: number of prompts sent to ``model.generate`` at once.
            verbose: when true, print batch boundaries and cleaned predictions.
            results_dir: directory where the predictions file is written.
            device: device the tokenized batch is moved to before generation.
        """
        # Build the default per instance: a default evaluated in the signature
        # would be one shared object for every EvaluateTestSet.
        if generation_config is None:
            generation_config = GenerationConfig(max_new_tokens = 128)
        self.generation_config = generation_config

        self.fn_test_data = fn_test_data
        self.fn_etalon = fn_etalon

        self.batch_size = batch_size
        self.verbose = verbose

        self.results_dir = results_dir
        self.device = device

    def preprocess(self, s):
        """Reduce one decoded generation to a single-line prediction.

        Keeps only the text after the last ``'### Response:\\n'`` marker,
        turns newlines and ``<unk>`` into spaces, keeps the first 100
        space-separated pieces, collapses runs of spaces and drops one
        leading space.

        Args:
            s: decoded model output (prompt plus continuation).

        Returns:
            The cleaned prediction string (possibly empty).
        """
        # ToDo: rewrite it using the Prompt Template
        # prompter.get_response(s)
        s = s.split('### Response:\n')[-1]
        s = s.replace('\n', '  ')
        s = s.replace('<unk>', " ")
        # NOTE: the 100-piece cut happens before spaces are collapsed, so empty
        # pieces produced by consecutive spaces count towards the limit. Kept
        # as is so that scores stay comparable with earlier runs.
        s = ' '.join(s.split(' ')[:100])
        while '  ' in s:
            s = s.replace('  ', ' ')

        if s.startswith(' '):
            s = s[1:]

        if self.verbose:
            print(s)

        return s

    def clean_results(self, res_list):
        """Apply ``preprocess`` to every decoded generation.

        Args:
            res_list: iterable of decoded model outputs.

        Returns:
            List of cleaned prediction strings, in the same order.
        """
        return [self.preprocess(s) for s in tqdm.tqdm(res_list)]

    def get_raw_results(self, model, tokenizer, prompts):
        """Generate continuations for ``prompts`` batch by batch.

        Args:
            model: object exposing ``generate(**inputs, generation_config=...)``.
            tokenizer: callable tokenizer that also offers ``batch_decode``.
            prompts: sliceable sequence of prompt strings.

        Returns:
            List of decoded outputs (special tokens kept), one per prompt.

        Raises:
            ValueError: if ``self.batch_size`` is not a positive number.
        """
        batch_size = self.batch_size
        if not batch_size or batch_size < 1:
            raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

        res_list = []
        n_batches = math.ceil(len(prompts) / batch_size)

        for ind in tqdm.tqdm(range(n_batches)):
            start = ind * batch_size
            stop = start + batch_size
            current_prompts = prompts[start:stop]
            if self.verbose:
                print(start, stop, len(current_prompts))

            tokenized_inputs = tokenizer(list(current_prompts),
                                         padding=True,
                                         truncation=True,
                                         return_tensors="pt"
                                        ).to(self.device)

            # Inference only: no gradients are needed.
            with torch.no_grad():
                full_output = model.generate(
                    **tokenized_inputs,
                    generation_config=self.generation_config
                )

            res_list.extend(tokenizer.batch_decode(full_output, skip_special_tokens=False))

        return res_list

    def save_results(self, predict_list):
        """Write the predictions, one per line, to a timestamped text file.

        Empty predictions are written as ``'-'`` so that every prediction
        still occupies exactly one line.

        Args:
            predict_list: iterable of cleaned prediction strings.

        Returns:
            Path of the file that was written.
        """
        import os  # local import: `os` is not among this file's top-level imports

        # Same name as before: YYYY-MM-DD-HH_MM_SS.txt
        output_filename = datetime.now().strftime('%Y-%m-%d-%H_%M_%S') + '.txt'
        if self.results_dir:
            os.makedirs(self.results_dir, exist_ok=True)
        fn_output = os.path.join(self.results_dir, output_filename)

        res = '\n'.join([i if i != '' else '-' for i in predict_list])
        with open(fn_output, "w", encoding='utf-8') as f:
            f.write(res)
        return fn_output

    def evaluate(self, model, tokenizer):
        """Run generation on the test set and compute the metrics.

        The model is put in eval mode for generation and afterwards returned
        to the mode it was in when this method was called, even if generation
        raises.

        Args:
            model: model to evaluate; must expose ``training``, ``eval``,
                ``train`` and ``generate``.
            tokenizer: tokenizer matching ``model``.

        Returns:
            The value returned by ``T2CEvaluator.calculate_metrics``.

        Side effects:
            Writes the predictions file and stores its path in
            ``self.fn_output``.
        """
        with open(self.fn_test_data, 'rb') as f:
            inputs = json.load(f)
        prompts = np.array([generate_test_prompt(inp) for inp in inputs])

        was_training = model.training
        model.eval()
        try:
            res_list = self.get_raw_results(model = model,
                                            tokenizer = tokenizer,
                                            prompts = prompts)
        finally:
            # Restore the caller's mode instead of unconditionally forcing train mode.
            model.train(was_training)

        predict_list = self.clean_results(res_list)

        self.fn_output = self.save_results(predict_list)

        t2c_evaluator = T2CEvaluator()
        metric_res = t2c_evaluator.calculate_metrics(fn_answers = self.fn_etalon,
                                                     fn_predictions = self.fn_output
                                                     )
        return metric_res

In [None]:
from utils import load_model_tokenizer_from_pretrained
import pandas as pd
from transformers import GenerationConfig
from tqdm import tqdm_notebook

# Identifier of the base model handed to load_model_tokenizer_from_pretrained
# (presumably a Hugging Face hub id — confirm against utils).
default_model = "decapoda-research/llama-7b-hf"
# Experiment directory: passed to the loader and later stored in the metrics
# dict under 'experiment_name'.
experiment_name = "/root/experiments/t2c_concode_220428_v19/"

In [None]:
# Load the tokenizer and the model for the configured base model / experiment.
tokenizer, model = load_model_tokenizer_from_pretrained(
    default_model=default_model,
    experiment_name=experiment_name,
)

In [None]:
# Evaluate on the default test set; generation is capped at 49 new tokens.
evaluator = EvaluateTestSet(
    generation_config=GenerationConfig(max_new_tokens=49),
    # fn_test_data = "temp/t2c_answers.json",
    # fn_etalon = "temp/answers.json"
    batch_size=10,
)

metric_res = evaluator.evaluate(model=model, tokenizer=tokenizer)
# for key, val in generation_config_dict.items():
#     assert key not in metric_res
#     metric_res[key] = val

# Tag the metrics with the experiment they belong to, then display them.
metric_res['experiment_name'] = experiment_name
metric_res

In [None]:
# max_length = 1024

# max_length = 400, 
# {'EM': 0.0,
#  'BLEU': 0.20770774117978485,
#  'brevity_penalty': 0.7887852726079982,
#  'ratio': 0.8082368082368082,
#  'translation_length': 2512,
#  'reference_length': 3108,
#  'precisions_0': 0.4341424592120971,
#  'precisions_1': 0.3033568172399503,
#  'precisions_2': 0.2197195070123247,
#  'precisions_3': 0.1661578717836895,
#  'experiment_name': '/root/experiments/t2c_concode_220428_v19/'
# }

# max_length = 256, BLEU =  7.106139847448361e-12


# title = "best result ever", max_length = 512
# {'EM': 0.0,
#  'BLEU': 0.19442009681675598,
#  'brevity_penalty': 1.0,
#  'ratio': 1.4996782496782497,
#  'translation_length': 4661,
#  'reference_length': 3108,
#  'precisions_0': 0.3344058344058344,
#  'precisions_1': 0.22709338009644892,
#  'precisions_2': 0.1598479767493852,
#  'precisions_3': 0.1177007299270073,
#  'max_length': 512,
#  'experiment_name': '/root/experiments/t2c_concode_220428_v19/'}

# title = "best result ever", max_length = 512
# {'EM': 0.0,
#  'BLEU': 0.3069243409072005,
#  'brevity_penalty': 0.8743219831966469,
#  'ratio': 0.8815958815958816,
#  'translation_length': 2740,
#  'reference_length': 3108,
#  'precisions_0': 0.5618387449835827,
#  'precisions_1': 0.40931465354032565,
#  'precisions_2': 0.2975206611570248,
#  'precisions_3': 0.22194922194922195,
#  'max_new_tokens': 49,
#  'experiment_name': '/root/experiments/t2c_concode_220428_v19/'
# }


# title = "best result ever", batch_size = 20
# {'EM': 0.0,
#  'BLEU': 0.3069243409072005,
#  'brevity_penalty': 0.8743219831966469,
#  'ratio': 0.8815958815958816,
#  'translation_length': 2740,
#  'reference_length': 3108,
#  'precisions_0': 0.5618387449835827,
#  'precisions_1': 0.40931465354032565,
#  'precisions_2': 0.2975206611570248,
#  'precisions_3': 0.22194922194922195,
#  'max_new_tokens': 49,
#  'experiment_name': '/root/experiments/t2c_concode_220428_v19/'
# }

# title = "best result ever", batch_size = 5
# {'EM': 0.0,
#  'BLEU': 0.3094605290687326,
#  'brevity_penalty': 0.8757688775902636,
#  'ratio': 0.8828828828828829,
#  'translation_length': 2744,
#  'reference_length': 3108,
#  'precisions_0': 0.5632058287795992,
#  'precisions_1': 0.4109640831758034,
#  'precisions_2': 0.29901768172888016,
#  'precisions_3': 0.22526573998364677,
#  'max_new_tokens': 49,
#  'experiment_name': '/root/experiments/t2c_concode_220428_v19/'
# }

# title = ("best result ever"|"t2c_concode_220428_v19"), batch_size = {None|10}
# {'EM': 0.0,
#  'BLEU': 0.31417807176317536,
#  'brevity_penalty': 0.8993798379997678,
#  'ratio': 0.9041184041184042,
#  'translation_length': 2810,
#  'reference_length': 3108,
#  'precisions_0': 0.5592315901814301,
#  'precisions_1': 0.40575433419402435,
#  'precisions_2': 0.2949061662198391,
#  'precisions_3': 0.22253184713375795,
#  'max_new_tokens': 49,
#  'experiment_name': '/root/experiments/t2c_concode_220428_v19/'}