In [1]:
import json
from tqdm import tqdm
import os,sys,inspect
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)
from reward_model.citation_intent_classification.src.BertClassifier.model import CitationIntentClassifier
from rouge_score import rouge_scorer
from transformers import Trainer, AutoConfig, AutoModelForCausalLM, AutoTokenizer,  \
                          TrainingArguments, logging, \
                          BitsAndBytesConfig, TrainerCallback
import torch
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


[2023-06-26 14:33:32,374] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
class RewardCal:
    """Scores a generated citation sentence on intent, keywords, fluency and ROUGE.

    Holds three scorers built in ``__init__``:

    * ``self.rouge`` - a ``RougeScorer`` for rouge1 / rouge2 / rougeL (with stemming),
    * ``self.intent_classifier`` - a ``CitationIntentClassifier`` on top of SciBERT,
    * ``self.lm`` / ``self.tokenizer`` - a 4-bit causal LM used for the fluency score.
    """

    # Constant subtracted from the mean log-probability gain before the sigmoid
    # that produces ``fluency_score`` in ``reward_fn``.
    FLUENCY_OFFSET = 4

    def __init__(self, intent_classifier_model_path, pretrained_lm_path, device_index ):
        """Load all scorers and pre-compute the LM's next-token distribution after BOS.

        Args:
            intent_classifier_model_path: checkpoint path passed to ``CitationIntentClassifier``.
            pretrained_lm_path: name or path of the causal LM and its tokenizer.
            device_index: device index used for both the classifier and the LM.
        """
        self.rouge = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
        self.intent_classifier = CitationIntentClassifier( intent_classifier_model_path,  "allenai/scibert_scivocab_uncased",  device_index )
        self.lm = AutoModelForCausalLM.from_pretrained(pretrained_lm_path, load_in_4bit = True, device_map={"":device_index})
        self.lm.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_lm_path)

        # Log-probabilities of every vocabulary item as the first token after BOS.
        # reward_fn uses this vector as a context-free prior for each generated token.
        bos_ids = torch.LongTensor( [ self.tokenizer.bos_token_id ] ).unsqueeze(0).to( self.lm.device )
        with torch.no_grad():
            log_prior_distribution = self.lm(input_ids=bos_ids
                   )["logits"].to(torch.float32).log_softmax(-1)[0,0].detach().cpu().numpy()
        self.log_prior_distribution = log_prior_distribution

    def sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``."""
        return 1/(1+np.exp( -x ))

    def reward_fn(self, gen_citation,  given_intent, given_keywords, given_citation  ):
        """Score one generated citation.

        Args:
            gen_citation: generated citation text.
            given_intent: intent label looked up in the classifier's score dict.
            given_keywords: keyword string compared against the generation with ROUGE-L recall.
            given_citation: reference citation text used for the ROUGE scores.

        Returns:
            A dict with keys ``"intent_score"``, ``"keywords_score"``,
            ``"fluency_score"`` and ``"rouge_score"`` (the raw result of
            ``self.rouge.score``). The same dict shape is returned for an empty
            generation, with the three scalar scores set to 0.0, so callers can
            always index the result by key.
        """
        if gen_citation.strip() == "":
            # Previously this returned a bare 0.0, which broke callers that index
            # the result by key. The ROUGE entry is still produced by the scorer;
            # it is expected to be all zeros for an empty prediction.
            return { "intent_score": 0.0,
                     "keywords_score": 0.0,
                     "fluency_score": 0.0,
                     "rouge_score": self.rouge.score( given_citation, gen_citation )
                   }

        reward_intent = self.intent_classifier.get_intent_scores( gen_citation ).get(given_intent, 0.0)
        # Keywords are the ROUGE target, so recall is the share of keywords covered.
        reward_keywords = self.rouge.score( given_keywords, gen_citation )["rougeL"].recall

        # BOS is prepended by hand below, so the tokenizer must not add its own
        # special tokens (some tokenizers would otherwise insert a second BOS).
        cit_token_ids = np.array(self.tokenizer.encode( gen_citation, add_special_tokens=False ), dtype=np.int64)

        if cit_token_ids.size == 0:
            # Nothing to score; avoid the NaN that a mean over zero tokens would give.
            reward_fluency = 0.0
        else:
            prior_cit_log_probs = self.log_prior_distribution[ cit_token_ids ]

            input_ids = torch.LongTensor([self.tokenizer.bos_token_id] + cit_token_ids.tolist() ).unsqueeze(0).to( self.lm.device )
            with torch.no_grad():
                # Logits at position i predict token i+1, so the last position is dropped.
                cit_log_probs = self.lm(input_ids = input_ids)["logits"][0][:-1].to(torch.float32).log_softmax(-1).detach().cpu().numpy()
                cit_log_probs = cit_log_probs[ np.arange(len(cit_token_ids)), cit_token_ids ]
            # Mean per-token gain of the in-context log-probability over the prior,
            # shifted by FLUENCY_OFFSET and squashed into (0, 1).
            reward_fluency = self.sigmoid( (np.mean(cit_log_probs - prior_cit_log_probs ) - self.FLUENCY_OFFSET) )

        reward_groundness = self.rouge.score( given_citation, gen_citation )

        return { "intent_score": reward_intent,
                 "keywords_score":reward_keywords,
                 "fluency_score":reward_fluency,
                 "rouge_score":reward_groundness
               }

In [5]:
# Shared scorer instance used by the evaluation cells below: intent classifier
# checkpoint, BLOOM-560M as the language model, everything placed on device 0.
reward_cal = RewardCal(
    intent_classifier_model_path = "../reward_model/citation_intent_classification/model/BertClassifier/5_5_0.05_0.01/model_batch_515.pt",
    pretrained_lm_path = "bigscience/bloom-560m",
    device_index = 0,
)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/scieditor/anaconda3/envs/trl/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Loading binary /home/scieditor/anaconda3/envs/trl/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so...


  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)
  warn(msg)


In [6]:
# Loads the "rouge" metric through the `evaluate` library.
# NOTE(review): no other cell visible here references `rouge`; the `eval`
# function scores with `reward_cal.rouge` instead - confirm whether this is still needed.
rouge = evaluate.load("rouge")

In [7]:
def _generated_citation_text( gen_cit_info ):
    """Return the generated citation string from one ``generated_citations`` entry.

    The ``"generation"`` field is either a dict holding the text under
    ``"citation"`` or the text itself; both shapes are accepted.
    """
    generation = gen_cit_info["generation"]
    if isinstance( generation, dict ):
        return generation["citation"]
    return generation


def _mean_as_str( values, scale, digits ):
    """Format ``mean(values) * scale`` rounded to ``digits`` decimals."""
    return "%.*f" % ( digits, np.round( np.mean( values ) * scale, digits ) )


def _rouge_row( r1_list, r2_list, rl_list ):
    """Return the three ROUGE F-measure means as percentage strings."""
    return [ _mean_as_str( scores, 100, 2 ) for scores in ( r1_list, r2_list, rl_list ) ]


def eval( flag, data_path ):
    """Evaluate one results file and return a LaTeX table row.

    Each line of ``data_path`` is a JSON object with a reference ``"citation"``
    and a ``"generated_citations"`` list whose entries 0, 1 and 2 are scored as:

    * entry 0 - ROUGE only,
    * entry 1 - ROUGE only; must carry an intent and no keywords,
    * entry 2 - ROUGE plus intent / keywords / fluency scores from
      ``reward_cal.reward_fn``; must carry both an intent and keywords.

    Relies on the module-level ``reward_cal`` instance. Note that this function
    shadows the builtin ``eval``; the name is kept because other cells call it.

    Args:
        flag: row label placed in the first column.
        data_path: path to the JSONL results file.

    Returns:
        ``flag`` followed by 12 `` & ``-separated values (three ROUGE
        percentages per entry, then the intent, keywords and fluency means)
        and a trailing LaTeX line break.
    """
    # Close the file deterministically and tolerate blank lines in the JSONL.
    with open( data_path ) as corpus_file:
        corpus = [ json.loads( line ) for line in corpus_file if line.strip() ]

    res = []

    # Entries 0 and 1 (uncontrolled / intent-controlled) only need ROUGE.
    for mode_index in ( 0, 1 ):
        r1_list, r2_list, rl_list = [], [], []
        for example in tqdm( corpus ):
            gen_cit_info = example["generated_citations"][mode_index]
            if mode_index == 1:
                assert gen_cit_info["given_citation_intent"] is not None and gen_cit_info["given_keywords"] is None

            score = reward_cal.rouge.score( example["citation"], _generated_citation_text( gen_cit_info ) )

            r1_list.append( score["rouge1"].fmeasure )
            r2_list.append( score["rouge2"].fmeasure )
            rl_list.append( score["rougeL"].fmeasure )

        res.append( _rouge_row( r1_list, r2_list, rl_list ) )

    # Entry 2 (intent and keywords controlled) also gets the reward-model scores.
    intent_score_list = []
    keywords_score_list = []
    fluency_score_list = []
    r1_list, r2_list, rl_list = [], [], []

    for example in tqdm( corpus ):
        ref_cit_text = example["citation"]
        gen_cit_info = example["generated_citations"][2]
        assert gen_cit_info["given_citation_intent"] is not None and gen_cit_info["given_keywords"] is not None
        gen_cit_text = _generated_citation_text( gen_cit_info )

        score = reward_cal.reward_fn( gen_cit_text,
                        gen_cit_info["given_citation_intent"],
                        "; ".join( gen_cit_info["given_keywords"] ),
                        ref_cit_text
                        )
        if not isinstance( score, dict ):
            # reward_fn may hand back a bare scalar for an empty generation;
            # count it as zero on every axis instead of crashing on score[...].
            score = { "intent_score": 0.0,
                      "keywords_score": 0.0,
                      "fluency_score": 0.0,
                      "rouge_score": reward_cal.rouge.score( ref_cit_text, gen_cit_text )
                    }

        intent_score_list.append( score["intent_score"] )
        keywords_score_list.append( score["keywords_score"] )
        fluency_score_list.append( score["fluency_score"] )
        r1_list.append( score["rouge_score"]["rouge1"].fmeasure )
        r2_list.append( score["rouge_score"]["rouge2"].fmeasure )
        rl_list.append( score["rouge_score"]["rougeL"].fmeasure )

    res.append( _rouge_row( r1_list, r2_list, rl_list ) + [
        _mean_as_str( intent_score_list, 1, 4 ),
        _mean_as_str( keywords_score_list, 1, 4 ),
        _mean_as_str( fluency_score_list, 1, 4 ),
    ] )

    return  flag + " & " + " & ".join( res[0] + res[1] + res[2] ) + " \\\\"

In [6]:
# (row label, results file) pairs for the test-set comparison table.
test_runs = [
    ( "BART-base-140M", "../results/sft_model/bart-base/test_with_citations.jsonl"  ),
    ( "BART-large-400M", "../results/sft_model/bart-large/test_with_citations.jsonl"  ),
    ( "GPT-Neo-125M", "../results/sft_model/gpt-neo-125m-hf/test_with_citations.jsonl"  ),
    ( "GPT-Neo-1.3B", "../results/sft_model/gpt-neo-1.3b-hf/test_with_citations.jsonl"  ),
    ( "Galactica-125M", "../results/sft_model/galactica-125m-ct2/test_with_citations.jsonl"  ),
    ( "Galactica-125M-PPO", "../results/ppo_model/galactica-125m-ct2/test_with_citations.jsonl"  ),
    ( "Galactica-1.3B", "../results/sft_model/galactica-1.3b-ct2/test_with_citations.jsonl"  ),
    ( "Galactica-6.7B", "../results/sft_model/galactica-6.7b-ct2/test_with_citations.jsonl"  ),
    ( "Galactica-6.7B-PPO", "../results/ppo_model/galactica-6.7b-ct2/test_with_citations.jsonl"  ),
    ( "LLaMa-7B", "../results/sft_model/llama-7b-ct2/test_with_citations.jsonl"  ),
    ( "LLaMa-7B-PPO", "../results/ppo_model/llama-7b-ct2/test_with_citations.jsonl"  ),
    ( "GPT-3.5-turbo", "../zero_shot_GPT-3.5-turbo/results/test_with_chatgpt_citations_merged.jsonl"  ),
]

# One LaTeX row per model, in the order listed above.
eval_results = []
for flag, data_path in test_runs:
    row = eval( flag, data_path )
    eval_results.append( row )

print("\n".join(eval_results))

100%|██████████████████████████████████████████████████████████████████████████████████| 1080/1080 [00:00<00:00, 1333.74it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1080/1080 [00:00<00:00, 1348.59it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1080/1080 [01:00<00:00, 17.88it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1080/1080 [00:00<00:00, 1275.27it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1080/1080 [00:00<00:00, 1271.60it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1080/1080 [00:58<00:00, 18.38it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1080/1080 [00:00<00:00, 1372.04it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1080/1080 [00:00<00:00, 1335.

BART-base-140M & 25.49 & 4.26 & 18.28 & 26.05 & 4.52 & 18.71 & 31.63 & 8.79 & 22.74 & 0.6789 & 0.6444 & 0.7156 \\
BART-large-400M & 27.39 & 5.67 & 19.85 & 27.90 & 6.00 & 20.17 & 32.33 & 9.12 & 23.20 & 0.6521 & 0.5877 & 0.7510 \\
GPT-Neo-125M & 23.54 & 3.67 & 17.58 & 23.62 & 3.69 & 17.59 & 30.48 & 9.44 & 22.83 & 0.6252 & 0.6793 & 0.7996 \\
GPT-Neo-1.3B & 28.48 & 6.12 & 20.78 & 29.04 & 6.39 & 21.28 & 36.26 & 13.48 & 26.81 & 0.7018 & 0.7936 & 0.7595 \\
Galactica-125M & 28.03 & 5.77 & 20.23 & 28.70 & 6.27 & 20.96 & 35.67 & 13.07 & 26.50 & 0.7037 & 0.7914 & 0.7540 \\
Galactica-125M-PPO & 27.97 & 5.72 & 20.27 & 28.81 & 6.12 & 20.97 & 36.49 & 13.55 & 27.09 & 0.7273 & 0.8313 & 0.7651 \\
Galactica-1.3B & 30.07 & 7.34 & 22.06 & 30.66 & 7.62 & 22.64 & 38.06 & 15.21 & 28.50 & 0.6925 & 0.8299 & 0.7399 \\
Galactica-6.7B & 30.61 & 7.97 & 22.59 & 30.89 & 8.03 & 22.87 & 38.29 & 15.58 & 28.70 & 0.6734 & 0.8150 & 0.7468 \\
LLaMa-7B & 30.19 & 7.28 & 22.13 & 30.49 & 7.46 & 22.32 & 37.71 & 14.80 & 28.30 & 0




In [32]:
# (row label, results file) pairs for the beam-size comparison on the validation set.
beam_runs = [
    ( "Galactica-125M-beam1", "../results/sft_model/galactica-125m-ct2/val_with_citations_beam_size_1.jsonl"  ),
    ( "Galactica-125M-beam2", "../results/sft_model/galactica-125m-ct2/val_with_citations_beam_size_2.jsonl"  ),
    ( "Galactica-125M-beam4", "../results/sft_model/galactica-125m-ct2/val_with_citations_beam_size_4.jsonl"  ),
    ( "Galactica-125M-beam8", "../results/sft_model/galactica-125m-ct2/val_with_citations_beam_size_8.jsonl"  ),
]

# One LaTeX row per beam size, in the order listed above.
eval_results = []
for flag, data_path in beam_runs:
    row = eval( flag, data_path )
    eval_results.append( row )

print("\n".join(eval_results))

100%|██████████████████████████████████████████████████████████████████████████████████| 1299/1299 [00:00<00:00, 1358.02it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1299/1299 [00:00<00:00, 1329.36it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1299/1299 [01:48<00:00, 11.96it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1299/1299 [00:00<00:00, 1329.40it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1299/1299 [00:00<00:00, 1301.24it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1299/1299 [01:09<00:00, 18.56it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1299/1299 [00:00<00:00, 1301.20it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1299/1299 [00:00<00:00, 1308.

Galactica-125M-beam1 & 27.93 & 6.00 & 20.39 & 28.67 & 6.41 & 21.09 & 35.85 & 13.44 & 26.88 & 0.7128 & 0.7667 & 0.7539 \\
Galactica-125M-beam2 & 27.26 & 5.80 & 19.61 & 28.00 & 6.26 & 20.24 & 36.00 & 13.81 & 26.68 & 0.6946 & 0.7865 & 0.7526 \\
Galactica-125M-beam4 & 27.15 & 6.01 & 19.50 & 27.69 & 6.29 & 19.98 & 35.44 & 13.59 & 26.13 & 0.6872 & 0.7656 & 0.7466 \\
Galactica-125M-beam8 & 26.47 & 6.03 & 18.87 & 26.91 & 6.38 & 19.44 & 34.99 & 13.67 & 25.91 & 0.6724 & 0.7400 & 0.7425 \\



