# Install required libraries

In [None]:
# Install the notebook's dependencies into the current environment.
# NOTE(review): peft and transformers are installed from the tip of their git
# repositories (no tag/commit), so those two are not pinned like the others.
!pip install -U pip
!pip install accelerate==0.18.0
!pip install appdirs==1.4.4
!pip install bitsandbytes==0.37.2
!pip install datasets==2.10.1
!pip install fire==0.5.0
!pip install git+https://github.com/huggingface/peft.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install torch==2.0.0
!pip install sentencepiece==0.1.97
!pip install tensorboardX==2.6
!pip install gradio==3.23.0 

# Login to HuggingFace Account

In [None]:
# Interactive Hugging Face login prompt.
# NOTE(review): presumably required because the Llama-2 checkpoint loaded below is gated
# and because the model is pushed to the Hub later — confirm.
from huggingface_hub import interpreter_login
interpreter_login()

# Import required libraries

In [None]:
import transformers
import textwrap
from transformers import LlamaTokenizer, LlamaForCausalLM
import os
import sys
from typing import List
import gc

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
#     prepare_model_for_int8_training,
    prepare_model_for_kbit_training,
)

# import fire
import torch
from datasets import load_dataset
import pandas as pd

# import matplotlib.pyplot as plt
# import matplotlib as mpl
# # import seaborn as sns
# from pylab import rcParams

# %matplotlib inline
# # sns.set(rc={'figure.figsize':(10, 7)})
# # sns.set(rc={'figure.dpi':100})
# # sns.set(style='white', palette='muted', font_scale=1.2)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE  # bare expression so the notebook displays the selected device

In [None]:
# !pip install --upgrade peft
# pip install -i https://test.pypi.org/simple/ bitsandbytes
# !pip install -i https://pypi.org/simple/ bitsandbytes
# !python -m bitsandbytes
# !pip install -i https://pypi.org/simple/ bitsandbytes

# Load the Llama-2-13b-chat-hf model and the corresponding tokenizer

In [None]:
# Hugging Face model id of the base checkpoint that is fine-tuned below.
BASE_MODEL = "meta-llama/Llama-2-13b-chat-hf"
# BASE_MODEL = "meta-llama/Llama-2-7b-chat-hf"
model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=True, # load the weights with 8-bit quantization to reduce GPU memory usage
    torch_dtype=torch.float16,
    device_map="auto", # let the library decide where to place the model on the available devices
)

In [None]:
# model.save_pretrained("mregexperiments-13b-32-syn2")

In [None]:
CACHE_DIR="CACHE_DIR"  # NOTE(review): not referenced by any other visible cell
# Tokenizer that matches the base checkpoint.
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
tokenizer.pad_token_id = (
    0  # unk. we want this to be different from the eos token
)
# Pad on the left side of each sequence.
tokenizer.padding_side = "left"
# tokenizer.save_pretrained("mregexperiments-13b-32-syn")


In [None]:
# !pip install fsspec==2023.9.2 
# !pip install -U datasets

# Load the data

In [None]:
# Read the local JSON file; load_dataset exposes its records under the "train" split.
data = load_dataset("json", data_files="Synth_HumanGenMRE.json")
data["train"]  # display the split summary in the notebook

# Create prompts from the loaded dataset and tokenize them

In [None]:
CUTOFF_LEN=2000  # maximum number of tokens kept per prompt (passed as max_length in tokenize)

# takes a data point from the dataset and generates a prompt by combining the instruction, 
# input, and output values

def generate_prompt(data_point):
    """Render one dataset record as the full training prompt.

    Args:
        data_point: mapping with the keys "instruction", "input" and "output".

    Returns:
        The prompt text: a fixed preamble followed by the "### Instruction:",
        "### Input:" and "### Response:" sections, each on its own line.

    Note:
        The trailing "  # noqa: E501" is part of the preamble string itself (it sits
        inside the string literal), so it is kept verbatim to leave the prompt unchanged.
    """
    preamble = (
        "Below is an instruction that describes a task, paired with an input "
        "that provides further context. Write a response that appropriately "
        "completes the request.  # noqa: E501"
    )
    sections = [
        preamble,
        "### Instruction:",
        f'{data_point["instruction"]}',
        "### Input:",
        f'{data_point["input"]}',
        "### Response:",
        f'{data_point["output"]}',
    ]
    return "\n".join(sections)

# takes the generated prompt and tokenizes it using the tokenizer defined earlier. 
# It also adds an end-of-sequence token to the input sequence and sets the label to 
# be the same as the input sequence.

def tokenize(prompt, add_eos_token=True):
    """Tokenize a prompt and attach labels for causal-LM training.

    Args:
        prompt: the prompt text to encode with the module-level ``tokenizer``.
        add_eos_token: when true, append the EOS token if the encoded sequence
            does not already end with it and is shorter than ``CUTOFF_LEN``.

    Returns:
        The tokenizer output with an extra "labels" entry that is a copy of
        "input_ids".
    """
    encoded = tokenizer(
        prompt,
        max_length=CUTOFF_LEN,
        truncation=True,
        padding=True,
        return_tensors=None,
    )
    token_ids = encoded["input_ids"]

    # Only add EOS when it is missing and there is still room below the cutoff.
    needs_eos = (
        token_ids[-1] != tokenizer.eos_token_id
        and len(token_ids) < CUTOFF_LEN
        and add_eos_token
    )
    if needs_eos:
        token_ids.append(tokenizer.eos_token_id)
        encoded["attention_mask"].append(1)

    # The labels are the inputs themselves (the whole prompt is trained on).
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

# combines the first two functions to generate and tokenize the prompt in one step

def generate_and_tokenize_prompt(data_point):
    """Build the prompt for one dataset record and tokenize it in a single step.

    Args:
        data_point: record passed through to ``generate_prompt``.

    Returns:
        Whatever ``tokenize`` returns for the generated prompt.
    """
    return tokenize(generate_prompt(data_point))



# Splitting the dataset into training, validation and testing sets

In [None]:
# First split: 80% of the records for training, 20% held out.
train_val = data["train"].train_test_split(
    test_size=0.2, shuffle=True, seed=42
)
# Second split of the held-out 20%: 80% of it becomes validation and 20% becomes test,
# i.e. roughly 16% / 4% of the full dataset.
val_test = train_val["test"].train_test_split(
    test_size=0.2, shuffle=True, seed=42
)
# Tokenize every split with the same prompt builder.
train_data = (
    train_val["train"].map(generate_and_tokenize_prompt)
)
val_data = (
    val_test["train"].map(generate_and_tokenize_prompt)
)
test_data = (
    val_test["test"].map(generate_and_tokenize_prompt)
)

train_data,val_data, test_data  # display the three split summaries


# Training

In [None]:
# The training process requires several parameters

# LoRA adapter settings (passed to LoraConfig in a later cell).
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT= 0.05
# Only the query and value projections get adapters; key/output projections are disabled.
LORA_TARGET_MODULES = [
    "q_proj",
#     "k_proj",
    "v_proj",
#     "o_proj",
]
BATCH_SIZE = 4 # number of prompt's chunks
MICRO_BATCH_SIZE = 4
# With the values above this evaluates to 1 (4 // 4), i.e. no gradient accumulation.
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
LEARNING_RATE = 3e-4 #1e-5
# Used as max_steps for the Trainer: these are optimizer steps, not epochs.
TRAIN_STEPS = 300
# EPOCHS=1
# NOTE(review): the directory name says "300epochs" although TRAIN_STEPS counts steps.
OUTPUT_DIR = "MREG-Orig-LORA8-13b-4batches-300epochs-q8-3e-4"

In [None]:
# Fine-tune with LoRA (low-rank adaptation): trainable adapter matrices are attached to the
# modules listed in LORA_TARGET_MODULES. LoRA itself is not a quantization method; the
# 8-bit quantization comes from loading the base model with load_in_8bit=True.

from accelerate import Accelerator  # NOTE(review): Accelerator is not used in the visible cells
# Prepare the quantized base model for training before the adapters are attached.
model=prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model with the LoRA adapters and report how many parameters are trainable.
model = get_peft_model(model, config)
model.print_trainable_parameters()


In [None]:
# Settings and hyperparameters for the Hugging Face Trainer.
training_arguments = transformers.TrainingArguments(
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,# number of batches whose gradients are accumulated before one optimizer update
    warmup_steps=100,# number of learning-rate warmup steps (100 of the 300 training steps)
    max_steps=TRAIN_STEPS, # total number of training steps to perform
#     num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE, # learning rate for the optimizer
    fp16=True, # use 16-bit (mixed) precision for training
    logging_steps=10,
    prediction_loss_only=True,
    optim="adamw_torch",
    evaluation_strategy="steps",
    save_strategy="steps",
    # Evaluate and checkpoint on the same 10-step cadence as logging.
    eval_steps=10,
    save_steps=10,
    output_dir=OUTPUT_DIR,
    save_total_limit=3,
    load_best_model_at_end=True,
    report_to="tensorboard"
)



In [None]:
# Collator that batches the tokenized examples, padding each batch (to a multiple of 8)
# and returning PyTorch tensors.
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True)

In [None]:
# Train with the Trainer class from the Hugging Face Transformers library.
trainer = transformers.Trainer(
    model=model,
#     compute_metrics = compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_arguments,
    data_collator=data_collator
)
# Disable the generation cache while training.
model.config.use_cache = False
old_state_dict = model.state_dict  # NOTE(review): not used by any other visible cell

# NOTE(review): `model` is rebound to the compiled wrapper only after the Trainer was
# constructed with the original object, so the Trainer still holds the un-compiled model —
# confirm whether compilation is meant to affect training.
if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)

# Clean up the memory using the garbage cleaner
gc.collect()
torch.cuda.empty_cache()

trainer.train()

# Persist the result twice into the same directory: once via the model, once via the Trainer.
model.save_pretrained(OUTPUT_DIR)
trainer.save_model(OUTPUT_DIR)


In [None]:
# Evaluate on the validation split (the Trainer's eval_dataset) and derive perplexity
# as exp(eval_loss).
import math
eval_output = trainer.evaluate()
print("Validation_output: ",eval_output)
perplexity = math.exp(eval_output["eval_loss"])
print('\nEvaluate Perplexity: {:10,.2f}'.format(perplexity))

In [None]:
 pre= trainer.predict(test_data)
print("PredictionOutput: ",pre)
test_perplexity = math.exp(pre[2]['test_loss'])
print('\nTest Perplexity: {:10,.2f}'.format(test_perplexity))

# Plotting loss and perplexity

In [None]:
import io
import os
import math
import torch
import warnings
from tqdm.notebook import tqdm
from ml_things import plot_dict, fix_text
# Keep track of train and evaluate loss.
loss_history = {'train_loss':[], 'eval_loss':[]}


# Keep track of train and evaluate perplexity.
# This is a metric useful to track for language models.
perplexity_history = {'train_perplexity':[], 'eval_perplexity':[]}

# Loop through each log history.
# Each entry is a dict; entries with a 'loss' key are treated as training logs and
# entries with an 'eval_loss' key as evaluation logs. Perplexity is exp(loss).
for log_history in trainer.state.log_history:

    if 'loss' in log_history.keys():
    # Deal with training loss.
        loss_history['train_loss'].append(log_history['loss'])
        perplexity_history['train_perplexity'].append(math.exp(log_history['loss']))

    elif 'eval_loss' in log_history.keys():
    # Deal with eval loss.
        loss_history['eval_loss'].append(log_history['eval_loss'])
        perplexity_history['eval_perplexity'].append(math.exp(log_history['eval_loss']))

# Plot Losses.
# The x axis starts at and advances by logging_steps, so it is labelled in training steps.
plot_dict(loss_history, start_step=training_arguments.logging_steps, 
          step_size=training_arguments.logging_steps, use_title='Loss', 
          use_xlabel='Train Steps', use_ylabel='Values', magnify=2)

print()

# Plot Perplexities.
plot_dict(perplexity_history, start_step=training_arguments.logging_steps, 
          step_size=training_arguments.logging_steps, use_title='Perplexity', 
          use_xlabel='Train Steps', use_ylabel='Values', magnify=2)

In [None]:
# Push the fine-tuned model to the Hugging Face Hub, authenticating with the stored login token.
model.push_to_hub("Nadahass/MREG-Orig-LORA8-13b-4batches-300epochs-q8-3e-4", use_auth_token=True)

# Interpretations
The learning curve shows a moderately high training loss at the beginning, which gradually decreases as more training examples are added and then flattens out, indicating that adding more training examples no longer improves the model's performance on the training data.

In addition, the training and validation losses converge toward each other, which indicates that the number of training data points is reasonable.

# Inference and Test Evaluation

In [None]:
# Two alternative ways to obtain the model for inference.
# NOTE(review): when the cell runs as written, BOTH loads execute and the second assignment
# replaces the first, so only the local checkpoint is actually used afterwards. Comment out
# the variant that is not needed.
from ctransformers import AutoModelForCausalLM
# from HF
model = AutoModelForCausalLM.from_pretrained("Nadahass/MREG-13B-GGUF")
# from local
model = LlamaForCausalLM.from_pretrained("MREG-Orig-LORA8-13b-4batches-300epochs-q8-3e-4", load_in_8bit=False,torch_dtype=torch.float16, device_map="auto")

In [None]:
# Reload the tokenizer from the Hub repo and apply the same padding setup used for training.
tokenizer = LlamaTokenizer.from_pretrained("Nadahass/MREG-13B")
tokenizer.pad_token_id = (0  # unk. we want this to be different from the eos token
                         )
tokenizer.padding_side = "left"

In [None]:
from peft import PeftModel
from transformers.generation.utils import GreedySearchDecoderOnlyOutput
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig


# Inference-time prompt skeleton. It mirrors the text produced by generate_prompt up to and
# including the "### Response:" line; "[instruction]" and "[input]" are placeholders.
# The "  # noqa: E501" text is inside the string literal and therefore part of the prompt.
PROMPT_TEMPLATE="""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.  # noqa: E501
### Instruction:
[instruction]
### Input:
[input]
### Response:
"""

def create_prompt(instruction: str, inputs:str) -> str:
    """Fill PROMPT_TEMPLATE with an instruction and its input.

    The template is cut at its two placeholders and the pieces are concatenated
    around the arguments. Unlike chained ``str.replace`` calls, this never
    rewrites text that comes from the arguments themselves (for example an
    instruction that happens to contain the literal "[input]").

    Args:
        instruction: text placed under "### Instruction:".
        inputs: text placed under "### Input:".

    Returns:
        The complete prompt, ending right after the "### Response:" line.
    """
    before_instruction, remainder = PROMPT_TEMPLATE.split("[instruction]", 1)
    before_input, after_input = remainder.split("[input]", 1)
    return before_instruction + instruction + before_input + inputs + after_input
 

def generate_response(prompt: str, model: PeftModel) -> GreedySearchDecoderOnlyOutput:
    """Run generation for a single prompt.

    Args:
        prompt: full prompt text (see ``create_prompt``).
        model: model whose ``generate`` method is called.

    Returns:
        The object returned by ``model.generate`` with
        ``return_dict_in_generate=True`` and ``output_scores=True``.
    """
    encoding = tokenizer(prompt, return_tensors="pt")
    # Use the device selected in DEVICE instead of hard-coding 'cuda', so the cell does not
    # crash on a CPU-only host.
    input_ids = encoding["input_ids"].to(DEVICE)
    # Pass the attention mask explicitly rather than letting generate() infer it.
    attention_mask = encoding["attention_mask"].to(DEVICE)

    # NOTE(review): do_sample is not set here, so presumably decoding is greedy and
    # temperature/top_p have no effect — confirm against the GenerationConfig defaults.
    generation_config = GenerationConfig(
        temperature=0.1,
        top_p=0.75,
        repetition_penalty=1.1,
    )
    with torch.inference_mode():
        return model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=1000,
        )
    
def format_response(response: GreedySearchDecoderOnlyOutput) -> str:
    """Extract the generated answer from a generation result.

    Decodes the first returned sequence, keeps the text that follows the first
    "### Response:" marker (up to any further marker), strips it and re-wraps it
    with ``textwrap`` defaults.

    Args:
        response: generation output exposing ``sequences``.

    Returns:
        The wrapped answer text, lines joined with newlines.

    Raises:
        IndexError: if the decoded text contains no "### Response:" marker.
    """
    full_text = tokenizer.decode(response.sequences[0])
    answer = full_text.split("### Response:")[1].strip()
    # textwrap.fill is shorthand for "\n".join(textwrap.wrap(...)).
    return textwrap.fill(answer)


def ask_Diana(instr: str, prompt: str, model: PeftModel = model) -> str:
    """Answer one instruction/input pair end to end.

    Builds the prompt, generates with ``model`` and returns the formatted answer.

    Args:
        instr: instruction text.
        prompt: input text that accompanies the instruction.
        model: model used for generation. The default is the object bound to the
            global ``model`` at the moment this function is defined, so later
            reassignments of ``model`` are not picked up unless passed explicitly.

    Returns:
        The formatted response text.
    """
    return format_response(generate_response(create_prompt(instr, prompt), model))

In [None]:
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,log_loss

# Result lists filled by the evaluation loop below.
# refinput / references / predictions receive one entry for EVERY evaluated sample.
refinput=[]
references=[]
predictions=[]

# Per-field lists: they only keep samples whose prediction splits into exactly four
# comma-separated fields (speech, demonstrative, modality, position), so index k refers to
# the same sample in every ref_* / pred_* list.
pred_speech=[]
ref_speech=[]

ref_mod=[]
pred_mod=[]

ref_pos=[]
pred_pos=[]

ref_dem=[]
pred_dem=[]

ref_ids=[]
pred_ids=[]

# Evaluate at most 887 samples (the limit this notebook used), but never index past the
# end of test_data when the split is smaller.
num_eval_samples = min(887, len(test_data))

for count in range(num_eval_samples):
    # Fetch the record and split the reference tuple once instead of on every use.
    sample = test_data[count]
    ref_parts = str(sample['output']).split(',')

    # storing references for test_data
    print("Count#: ",count+1)
    print("ref_input: ", sample['input'])
    print("ref_output: ", sample['output'])
    print("ref part: ", ref_parts)

    refinput.append(sample['input'])
    references.append(sample['output'])
    ref_dem.append(ref_parts[1])
    ref_mod.append(ref_parts[2])
    ref_pos.append(ref_parts[3])
    ref_speech.append(ref_parts[0])

    # storing predictions for test_data prompts
    # Drop the end-of-sequence marker and undo the line wrapping added by format_response.
    string = ask_Diana(sample['instruction'], sample['input'])
    string = string.replace("</s>", "").replace("\n", " ")
    print("predicted_output: ", string)
    pred_ids.append(tokenizer(string).input_ids)
    predictions.append(string)

    pred_parts = string.split(',')
    if len(pred_parts) == 4:
        pred_speech.append(pred_parts[0])
        pred_dem.append(pred_parts[1])
        pred_mod.append(pred_parts[2])
        pred_pos.append(pred_parts[3])
    else:
        # Malformed prediction: remove the reference fields stored above so the
        # per-field lists stay aligned.
        del ref_dem[-1]
        del ref_mod[-1]
        del ref_pos[-1]
        del ref_speech[-1]

    print()
    



In [None]:
# Number of evaluated samples whose prediction split into exactly four fields
# (only those are kept in ref_pos and the other per-field lists).
length= len(ref_pos)
print(length)

In [None]:
import csv
# Export one CSV row per well-formed prediction.
# refinput / references / predictions contain EVERY evaluated sample, whereas the per-field
# lists (ref_speech, pred_speech, ...) skip samples whose prediction did not split into
# four fields. Indexing all of them with one counter would pair fields from different
# samples after the first malformed prediction, so `count` only advances for samples that
# are present in the per-field lists.
count =0
with open('MREG-Orig-LORA8-13b-4batches-300-q8-3e-4_SynTesting.csv', 'w', newline='') as csvfile:
    fieldnames = ['References_input', 'References_output', 'Predictions','Ref_speech','PredSpeech', 'Ref_mod', 'Pred_mod', 
                  'Ref_dem', 'Pred_dem', 'Ref_pos', 'pred_pos']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for ref_in, ref_out, pred in zip(refinput, references, predictions):
        # Same criterion the evaluation loop used to keep or drop a sample.
        if len(pred.split(',')) != 4:
            continue
        # Stop rather than crash if the per-field lists are shorter than expected.
        if count >= len(ref_speech):
            break
        writer.writerow({'References_input':ref_in,'References_output':ref_out, 'Predictions':pred,'Ref_speech':ref_speech[count],
                         'PredSpeech':pred_speech[count], 'Ref_mod':ref_mod[count],'Pred_mod':pred_mod[count],
                         'Ref_dem':ref_dem[count], 'Pred_dem':pred_dem[count], 'Ref_pos':ref_pos[count], 
                         'pred_pos':pred_pos[count]})
        count=count+1

In [None]:
def search_word_in_list(sentence, word_list):
    """Tell whether a sentence contains any word or phrase from a list.

    Matching is on whole, whitespace-delimited words and is case-sensitive.
    Single-word entries match when the sentence contains that exact word.
    Multi-word entries (for example "edge of") match when their words appear
    consecutively in the sentence; a plain per-word membership test can never
    match such entries because no single word equals the whole phrase.

    Args:
        sentence: text to inspect.
        word_list: iterable of words/phrases to look for.

    Returns:
        True if at least one entry occurs in the sentence, otherwise False.
    """
    words_in_sentence = sentence.split()

    # A plain string is not a list of entries; keep the per-word containment test for it.
    if isinstance(word_list, str):
        return any(word in word_list for word in words_in_sentence)

    # Normalize whitespace and pad with spaces so only whole words can match.
    padded_sentence = " " + " ".join(words_in_sentence) + " "
    for entry in word_list:
        if not isinstance(entry, str):
            continue
        entry_words = entry.split()
        if not entry_words:
            continue
        if " " + " ".join(entry_words) + " " in padded_sentence:
            return True
    return False

In [None]:
from datasets import load_metric
import importlib.metadata as importlib_metadata
# Number of samples used as the denominator when averaging metrics further down.
testsamples=length

# Modality counts over the predictions.
pointing=0
speech=0
multi=0

# Modality counts over the references.
rpointing=0
rspeech=0
rmulti=0

# Reference/prediction modality pair counts, named <reference>_<prediction>.
pointing_pointing=0
speech_speech=0
multi_multi=0
pointing_speech=0
speech_pointing=0
multi_pointing=0
pointing_multi=0
speech_multi=0
multi_speech=0

# Relational-speech counts per modality, for references and predictions.
ref_multi_rel=0
ref_speech_rel=0

pred_multi_rel=0
pred_speech_rel=0

# Attributive-speech counts per modality, for references and predictions.
ref_multi_att=0
ref_speech_att=0

pred_multi_att=0
pred_speech_att=0


# Human-readable descriptions of samples whose position field was predicted wrongly.
incor_positions=[]

# Pairs in which neither the reference nor the prediction is "No speech".
ref_spch_spch=[]
pred_spch_spch=[]

ref_relational=[]
pred_relational=[]

ref_attributive=[]
pred_attributive=[]

# Indices of relational / attributive utterances; suffix 1 = predictions, 2 = references.
relat_index1=[]
alt_index1=[]

relat_index2=[]
alt_index2=[]

# Words and phrases that mark a spoken expression as relational
# (looked up through search_word_in_list).
relations = ["touching",
                "in",
                "on",
                "top", 
                "at",
                "behind",
                "in front of",
                "beside",
                "near",
                "left of",
                "right of",
                "center of",
                "edge of",
                "under",
                "underneath",
                "below",
                "against",
                "here",
                "there",
                "right",
                "left",
                 "next"]

# ..................................................  modalities statistics ..................................................
# Count how often each modality label occurs in the predictions ...
pointing += sum(1 for label in pred_mod if "pointing only" in label)
speech += sum(1 for label in pred_mod if "speech only" in label)
multi += sum(1 for label in pred_mod if "multimodal" in label)

# ... and in the references.
rpointing += sum(1 for label in ref_mod if "pointing only" in label)
rspeech += sum(1 for label in ref_mod if "speech only" in label)
rmulti += sum(1 for label in ref_mod if "multimodal" in label)

# Reference/prediction pair counts; each counter is named <reference>_<prediction>.
pointing_pointing += sum(1 for ref, pred in zip(ref_mod, pred_mod) if "pointing only" in ref and "pointing only" in pred)
speech_speech += sum(1 for ref, pred in zip(ref_mod, pred_mod) if "speech only" in ref and "speech only" in pred)
multi_multi += sum(1 for ref, pred in zip(ref_mod, pred_mod) if "multimodal" in ref and "multimodal" in pred)
pointing_speech += sum(1 for ref, pred in zip(ref_mod, pred_mod) if "pointing only" in ref and "speech only" in pred)
speech_pointing += sum(1 for ref, pred in zip(ref_mod, pred_mod) if "speech only" in ref and "pointing only" in pred)
multi_pointing += sum(1 for ref, pred in zip(ref_mod, pred_mod) if "multimodal" in ref and "pointing only" in pred)
pointing_multi += sum(1 for ref, pred in zip(ref_mod, pred_mod) if "pointing only" in ref and "multimodal" in pred)
speech_multi += sum(1 for ref, pred in zip(ref_mod, pred_mod) if "speech only" in ref and "multimodal" in pred)
multi_speech += sum(1 for ref, pred in zip(ref_mod, pred_mod) if "multimodal" in ref and "speech only" in pred)
        
 # ..................................................  speech statistics ..................................................


# Keep only the pairs in which both the reference and the prediction contain speech.
for ref_text, pred_text in zip(ref_speech, pred_speech):
    if "No speech" in ref_text or "No speech" in pred_text:
        continue
    ref_spch_spch.append(ref_text)
    pred_spch_spch.append(pred_text)

 # ..................................................  relational and attributive speech statistics ..................................................

# Classify every spoken expression as relational (it contains an entry of `relations`) or
# attributive (it has speech but no relation entry) and remember its position.
# enumerate() supplies the position of the CURRENT utterance; list.index() would return the
# first occurrence of an equal string, so repeated utterances would all be attributed to the
# first one when the modality is looked up by index afterwards.
for ref_idx, re1 in enumerate(ref_speech):
    if search_word_in_list(re1, relations):
        ref_relational.append(re1)
        relat_index2.append(ref_idx)
    elif "No speech" not in re1:
        ref_attributive.append(re1)
        alt_index2.append(ref_idx)


for pred_idx, re2 in enumerate(pred_speech):
    if search_word_in_list(re2, relations):
        pred_relational.append(re2)
        relat_index1.append(pred_idx)
    elif "No speech" not in re2:
        pred_attributive.append(re2)
        alt_index1.append(pred_idx)
        

# ref multimodal -relational
# ref speech only -relational
# Relational utterances in the references, by modality.
ref_multi_rel += sum(1 for idx in relat_index2 if "multimodal" in ref_mod[idx])
ref_speech_rel += sum(1 for idx in relat_index2 if "speech only" in ref_mod[idx])

# Relational utterances in the predictions, by modality.
pred_multi_rel += sum(1 for idx in relat_index1 if "multimodal" in pred_mod[idx])
pred_speech_rel += sum(1 for idx in relat_index1 if "speech only" in pred_mod[idx])

# Attributive utterances in the references, by modality.
ref_multi_att += sum(1 for idx in alt_index2 if "multimodal" in ref_mod[idx])
ref_speech_att += sum(1 for idx in alt_index2 if "speech only" in ref_mod[idx])

# Attributive utterances in the predictions, by modality.
pred_multi_att += sum(1 for idx in alt_index1 if "multimodal" in pred_mod[idx])
pred_speech_att += sum(1 for idx in alt_index1 if "speech only" in pred_mod[idx])

        
#   ......................................Position investigation........................................ 

# Record every sample whose predicted position differs from the reference.
# The sample is addressed by its own index; looking it up with ref_pos.index(p1) would pick
# the FIRST sample that has the same reference position and report that one instead.
for pos_idx, (p1, p2) in enumerate(zip(ref_pos, pred_pos)):
    if p1 != p2:
        incor_positions.append(
            " REF: " + ref_speech[pos_idx] + " , " + ref_mod[pos_idx] + " , " + p1
            + " PRED: " + pred_speech[pos_idx] + " , " + pred_mod[pos_idx] + " , " + p2
        )
    
# sacrebleu = evaluate.load("sacrebleu")
# sacrebleu_results=sacrebleu.compute(predictions=pred_speech, references=ref_speech)

# rouge = evaluate.load('rouge')
# rouge_results=rouge.compute(predictions=pred_speech, references=ref_speech)

# METEOR = evaluate.load('meteor')
# METEOR_results=METEOR.compute(predictions=pred_speech, references=ref_speech)


# BERTScore over all kept samples ...
bertscore_metric = load_metric('bertscore')
bert_scores = bertscore_metric.compute(predictions=pred_speech, references=ref_speech, lang="en")


# ... and over the pairs in which both sides contain speech.
bert_scores2 = bertscore_metric.compute(predictions=pred_spch_spch, references=ref_spch_spch, lang="en")

# bleurt_metric = load_metric('bleurt')
# bleurt_scores = bleurt_metric.compute(predictions=pred_speech, references=ref_speech)

# accuracy_results=accuracy_score(predictions, references)
# precision_results=precision_score(references, predictions,average = "micro",zero_division=1)
# recall_results=recall_score(references, predictions,average ="micro",zero_division=1)
# f1 =f1_score(references, predictions, average = "micro")

# Exact-match classification metrics per output field; each distinct string is a class.
# NOTE(review): with average="micro" on single-label data, precision, recall and F1
# presumably all coincide with accuracy — confirm before reporting them separately.
sp_accuracy_results=accuracy_score(ref_speech, pred_speech)
sp_precision_results=precision_score(ref_speech, pred_speech,average = "micro",zero_division=1)
sp_recall_results=recall_score(ref_speech, pred_speech,average ="micro",zero_division=1)
sp_f1 =f1_score(ref_speech, pred_speech, average = "micro")

dem_accuracy_results=accuracy_score(ref_dem, pred_dem)
dem_precision_results=precision_score(ref_dem, pred_dem,average = "micro",zero_division=1)
dem_recall_results=recall_score(ref_dem, pred_dem,average ="micro",zero_division=1)
dem_f1 =f1_score(ref_dem, pred_dem, average = "micro")

mod_accuracy_results=accuracy_score(ref_mod, pred_mod)
mod_precision_results=precision_score(ref_mod, pred_mod,average = "micro",zero_division=1)
mod_recall_results=recall_score(ref_mod, pred_mod,average ="micro",zero_division=1)
mod_f1 =f1_score(ref_mod, pred_mod, average = "micro")

pos_accuracy_results=accuracy_score(ref_pos, pred_pos)
# NOTE(review): zero_division is 0 here but 1 in every other precision/recall call — confirm intent.
pos_precision_results=precision_score(ref_pos, pred_pos,average = "micro",zero_division=0)
pos_recall_results=recall_score(ref_pos, pred_pos,average ="micro",zero_division=1)
pos_f1 =f1_score(ref_pos, pred_pos, average = "micro")



# ---- Report: modality counts for references and predictions ----
print("Number of reference pointing only REs: ", rpointing)
print("Number of reference speech only REs: ", rspeech)
print("Number of reference multimodal only REs: ", rmulti)
print()
print("Number of predicted pointing only REs: ", pointing)
print("Number of predicted speech only REs: ", speech)
print("Number of predicted multimodal only REs: ", multi)
print()
# Reference/prediction modality pair counts (agreements first, then the confusions).
print("Number of identical pointing only REs for ref aand pred: ", pointing_pointing)
print("Number of identical speech only REs for ref aand pred: ", speech_speech)
print("Number of identical multimodal  REs for ref aand pred: ", multi_multi)
print()
print("Number of pointing speech REs for ref aand pred: ", pointing_speech)
print("Number of speech_pointing REs for ref aand pred: ", speech_pointing)
print()
print("Number of multi_pointing REs for ref aand pred: ", multi_pointing)
print("Number of pointing_multi REs for ref aand pred: ", pointing_multi)
print()
print("Number of speech_multi REs for ref aand pred: ", speech_multi)
print("Number of multi_speech REs for ref aand pred: ", multi_speech)
print()
# Relational / attributive speech counts per modality.
print("ref_multi_rel: ", ref_multi_rel)
print("ref_speech_rel: ", ref_speech_rel)

print("pred_multi_rel: ", pred_multi_rel)
print("pred_speech_rel: ", pred_speech_rel)
print()

print("ref_multi_att: ", ref_multi_att)
print("ref_speech_att: ", ref_speech_att)
print()

print("pred_multi_att: ", pred_multi_att)
print("pred_speech_att: ", pred_speech_att)
print()

print()
print("Evaluation Using Testing Data")
print()

# N-gram based metrics only measure word overlap, so they cannot capture the semantics of a
# whole sentence and do not give a fair comparison on their own.
print("N-gram based metrics")
print("--------------------------------------------------------------------------------------------------")
print()

# print("Bleu: ", sacrebleu_results["score"])
# print("Rouge: ", rouge_results)
# print("METEOR: ", METEOR_results)
print()
# print("Tuple Accuracy: ", accuracy_results)
# print("Tuple Precision: ", precision_results)
# print("Tuple Recall: ", recall_results)
# print("Tuple F1: ", f1)
print()
# Exact-match metrics for each field of the output tuple.
print("Speech Accuracy: ", sp_accuracy_results)
print("Speech Precision: ", sp_precision_results)
print("Speech Recall: ", sp_recall_results)
print("Speech F1: ", sp_f1)
print()
print("Dem Accuracy: ", dem_accuracy_results)
print("Dem Precision: ", dem_precision_results)
print("Dem Recall: ", dem_recall_results)
print("Dem F1: ", dem_f1)
print()
print("Mod Accuracy: ", mod_accuracy_results)
print("Mod Precision: ", mod_precision_results)
print("Mod Recall: ", mod_recall_results)
print("Mod F1: ", mod_f1)
print()
print("Pos Accuracy: ", pos_accuracy_results)
print("Pos Precision: ", pos_precision_results)
print("Pos Recall: ", pos_recall_results)
print("Pos F1: ", pos_f1)

# One line per sample whose position field was predicted incorrectly.
print("Incorrect postion predition details: ")
for inc in incor_positions:
    print(inc)


print()
print("MODEL BASED METRICS")
print("--------------------------------------------------------------------------------------------------")
print()
# To capture the semantics of a sentence, BERTScore uses embeddings from the
# Transformer-based BERT model. After feeding the candidate and reference sentences through
# the model, it computes their similarity with cosine similarity and uses importance
# weighting to obtain the final scores.
# Precision, recall and F1 are returned for each data point.
#print("Bert_score (): ", bert_scores['precision'][0])

def _mean_score(values):
    """Return the arithmetic mean of `values`, or NaN when there are none.

    Averaging over the scores' own length avoids a ZeroDivisionError when no
    sample qualified (for example when no pair contains speech on both sides).
    """
    values = list(values)
    return sum(values) / len(values) if values else float("nan")


# Print each BERTScore result followed by the average of its precision, recall and F1.
# First over all kept samples ("All"), then only over the pairs in which both the reference
# and the prediction contain speech ("speech only"). A single loop replaces six copies of
# the same averaging code and no longer overwrites the module-level names `pre`, `re`
# and `f1` with loop values.
for score_label, score_set in (("All", bert_scores), ("speech only", bert_scores2)):
    print("Bert_score : ", score_set)
    for score_key in ("precision", "recall", "f1"):
        print()
        print(score_label + "_Bert_" + score_key + " average: ", _mean_score(score_set[score_key]))

# BLEURT is a BERT-based model that is pre-trained on synthetic data and fine-tuned on a
# small rating dataset to acquire knowledge of human judgement.
print()
# total2=0
# print("Bleurt_score: ", bleurt_scores)
# for bleu in bleurt_scores['scores']:
#     total2= total2 + bleu    
# average2=total2/testsamples
# print()
# print("Bleurt_score average: ", average2)

# from datasets import list_metrics
# metrics_list = list_metrics()
# print("metrics_list:" ,metrics_list)

In [None]:
# Intersection over Union (IoU) score
def calculate_iou(sentence1, sentence2):
    """Compute the word-level Intersection over Union of two sentences.

    Each sentence is split on whitespace and reduced to its set of distinct
    words; the score is the number of shared words divided by the number of
    distinct words overall.

    Args:
        sentence1: first sentence.
        sentence2: second sentence.

    Returns:
        The IoU as a float, or 0 when neither sentence contains any word.
    """
    words_a = set(sentence1.split())
    words_b = set(sentence2.split())

    all_words = words_a | words_b
    if not all_words:
        # Nothing to compare: avoid dividing by zero.
        return 0
    return len(words_a & words_b) / len(all_words)


In [None]:
# Write the IoU of every (prediction, reference) pair to a CSV file and report the mean.
total3=0
scored_pairs=0  # number of pairs actually scored; denominator of the average
with open('IoU-Orig-LORA8-13b-4batches-300epochs-q8-3e-4_synTesting.csv', 'w', newline='') as csvfile:
    fieldnames = ['References', 'Predictions','IoU']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    
    for p,r in zip(predictions, references):
        iou_score = calculate_iou(p, r)
        writer.writerow({'References':r, 'Predictions':p,'IoU':iou_score})
        total3=total3+iou_score
        scored_pairs=scored_pairs+1

# Average over the pairs that were summed. `length` only counts samples with a well-formed
# prediction, while `predictions` holds every sample, so dividing by `length` inflates the
# mean whenever a prediction was malformed. An empty run yields 0 instead of crashing.
iou_avg=total3/scored_pairs if scored_pairs else 0
print("Average of (IoU) score:", iou_avg)


In [None]:
# Testing the model in gguf format either from HF or local
# Two alternative ways to load the GGUF model: from the Hugging Face Hub (ctransformers)
# or from a local file (llama_cpp).
# NOTE(review): when the cell runs as written, BOTH loads execute and the second assignment
# replaces the first, so only the local llama_cpp model is used afterwards.
from llama_cpp import Llama
from ctransformers import AutoModelForCausalLM
# Huggingface
model = AutoModelForCausalLM.from_pretrained("Nadahass/MREG-13B-GGUF", model_file="mreg-13b-32-hf.gguf", model_type="llama", gpu_layers=50)
#local
model = Llama(model_path="mreg-13b-32-hf.gguf")

In [None]:
# Build one example prompt and call the loaded GGUF model on it directly.
prompt =create_prompt("Generate a referring expression for an object.","PinkBlock2 , (0.295275000: 1.224503000: 0.150793400) , None , ( 0.295275: 1.149781: 0.1507934) ) , None , None ,  put put grasp put put put put put put grasp put put put grasp put put put put , PinkBlock2 BlueBlock1 BlueBlock1 RedBlock2 BlueBlock2 YellowBlock1 YellowBlock2 YellowBlock1 RedBlock2 RedBlock2 RedBlock2 BlueBlock2 GreenBlock2 GreenBlock2 PinkBlock2 BlueBlock2 PinkBlock2 BlueBlock1")
print(model(prompt))