# GENERAL LIBRARIES

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
!pip install torch
!pip install transformers[torch]
!pip install transformers peft

# !pip install accelerate -U

In [None]:
# NEEDS TO BE PARENT DIRECTORY OF TRAINING
dir= '/content/drive/MyDrive/BridgeAthletics/Proj1'
sub_dir='/Training/AdvancedData2/'
data_sub_dir='/Dataset2_allparams'

In [None]:
import json
import os
from tqdm import tqdm
import sys
import torch
#import accelerate
#from accelerate import Accelerator
from torch.utils.data import Dataset, DataLoader
sys.path.append(dir)
import pandas as pd
import numpy as np
from peft import get_peft_model, LoraConfig, LoraModel

#CUSTOM FUNCTIONS FROM FUNCTIONS.PY
from Training.Functions import *

In [None]:
from transformers import Trainer, TrainingArguments, Seq2SeqTrainer, Seq2SeqTrainingArguments


In [None]:
# Run on the first CUDA device when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")

# MODEL + TOKENIZER LIBRARY

In [None]:
#MODEL SELECTION: GPT2
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, BitsAndBytesConfig

In [None]:
#MODEL SELECTION: T5
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, BitsAndBytesConfig

# DATASET CLASS - FORMATTING + PREPARING FOR TRAINING

In [None]:
class InstructionDataset_GPT2(Dataset):
    """Pre-tokenized prompt+response samples for causal-LM fine-tuning.

    Every item of ``data`` is rendered with ``format_model_input`` (a helper
    defined outside this cell), followed by a ``### Response:`` section that
    holds ``item['output']``. The whole text is tokenized once up front.

    Attributes:
        data: the raw samples, kept so ``len()`` can report their count.
        encoded_texts: token ids of prompt + response, one list per sample.
        instruction_lengths: number of prompt tokens per sample.
    """

    def __init__(self, data, tokenizer):
        self.data = data
        self.encoded_texts = []
        self.instruction_lengths = []
        for sample in data:
            prompt = format_model_input(sample)
            completion = f"\n\n### Response:\n{sample['output']}"
            # Encode the full text first, then the prompt alone for its length.
            self.encoded_texts.append(tokenizer.encode(prompt + completion))
            self.instruction_lengths.append(len(tokenizer.encode(prompt)))

    def __getitem__(self, index):
        # For the custom collate function return
        # (self.instruction_lengths[index], self.encoded_texts[index]) instead.
        token_ids = self.encoded_texts[index]  # shape expected by the transformers collator
        return token_ids

    def __len__(self):
        return len(self.data)

class InstructionDataset_T5(Dataset):
    """Pre-tokenized (prompt, response) pairs for seq2seq fine-tuning.

    The prompt produced by ``format_model_input`` (a helper defined outside
    this cell) becomes the encoder input; the ``### Response:`` section holding
    ``item['output']`` becomes the label sequence.

    Attributes:
        data: the raw samples, kept so ``len()`` can report their count.
        inputs: token ids of each prompt.
        labels: token ids of each response section.
    """

    def __init__(self, data, tokenizer):
        self.data = data
        self.inputs = []
        self.labels = []
        for sample in data:
            prompt = format_model_input(sample)
            target = f"\n\n### Response:\n{sample['output']}"

            prompt_ids = tokenizer.encode(prompt)
            target_ids = tokenizer.encode(target)

            self.inputs.append(prompt_ids)
            self.labels.append(target_ids)

    def __getitem__(self, index):
        """Return one sample as a dict of int64 tensors."""
        sample = {}
        sample['input_ids'] = torch.tensor(self.inputs[index], dtype=torch.long)
        sample['labels'] = torch.tensor(self.labels[index], dtype=torch.long)
        return sample

    def __len__(self):
        return len(self.data)

# CUSTOM COLLATE FUNCTION IF NEEDED

In [None]:
def collated_fromMLMtoCLM(labels, instr_len, eos_token_id=None):
    """Shift collated labels left by one token and add an end-of-text id.

    The first column of ``labels`` is dropped, and an end-of-text id is
    inserted in each row right after the last non-ignored token, i.e. at the
    start of the trailing run of ``-100`` entries (or at the end when there is
    none). The result has the same width as the input.

    Args:
        labels: 2-D integer tensor of label ids, with ``-100`` marking
            positions ignored by the loss.
        instr_len: per-row instruction lengths. Only needed for the optional
            instruction masking noted below; currently unused.
        eos_token_id: id to insert. Defaults to the notebook-level
            ``end_of_text_token_id``.

    Returns:
        Tensor with the dtype and device of ``labels``.
    """
    if eos_token_id is None:
        eos_token_id = end_of_text_token_id

    shifted = labels[:, 1:]
    new_labels = labels.new_full((shifted.size(0), shifted.size(1) + 1), -100)

    for i, row in enumerate(shifted.tolist()):
        # Walk back over the trailing padding so the EOS id lands directly
        # after the last real token, even when earlier positions are -100 too.
        pad_start = len(row)
        while pad_start > 0 and row[pad_start - 1] == -100:
            pad_start -= 1
        row.insert(pad_start, eos_token_id)

        new_labels[i] = torch.tensor(row, dtype=labels.dtype, device=labels.device)
        # To mask the instruction in the loss, enable:
        #   new_labels[i, :instr_len[i] - 1] = -100

    return new_labels


In [None]:
def _default_clm_tokenizer():
    """Load the fallback GPT-2 tokenizer on first use and reuse it afterwards."""
    cached = getattr(_default_clm_tokenizer, "_cached", None)
    if cached is None:
        cached = GPT2Tokenizer.from_pretrained('gpt2', padding_side="right", add_eos_token=True, add_bos_token=False)
        _default_clm_tokenizer._cached = cached
    return cached


def CLM_Collator(tokenized_data_input_tuple, tokenizer=None):
    """Collate ``(instruction_length, token_ids)`` pairs into a CLM batch.

    Args:
        tokenized_data_input_tuple: batch items; element 0 of each item is the
            instruction length and element 1 the token ids.
        tokenizer: tokenizer used for padding. When omitted, the stock 'gpt2'
            tokenizer is loaded lazily on the first call, so defining this
            function no longer triggers a tokenizer load.

    Returns:
        The batch built by ``DataCollatorForLanguageModeling(mlm=False)``, with
        its ``labels`` replaced by the output of ``collated_fromMLMtoCLM``.
    """
    if tokenizer is None:
        tokenizer = _default_clm_tokenizer()
    # Pad with the EOS token.
    tokenizer.pad_token = tokenizer.eos_token

    tokenized_data_input = [item[1] for item in tokenized_data_input_tuple]
    instr_lengths = [item[0] for item in tokenized_data_input_tuple]

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    collated_samples = data_collator(tokenized_data_input)
    collated_samples['labels'] = collated_fromMLMtoCLM(collated_samples['labels'], instr_lengths)
    return collated_samples

# def CLM_Collator(tokenized_data_input, tokenizer=GPT2Tokenizer.from_pretrained('gpt2',padding_side="right", add_eos_token=True, add_bos_token=False)):
#     tokenizer.pad_token = tokenizer.eos_token
#     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
#     collated_samples=data_collator(tokenized_data_input)
#     collated_samples['labels'] = collated_fromMLMtoCLM(collated_samples['labels'])
#     return collated_samples

# TEST FUNCTIONS (DATASET-BATCHES-COLLATE)

In [None]:
def check_input_label_shapes(train_loader):
    """Print the input/label shapes of every batch, then the last batch's first row.

    Args:
        train_loader: iterable of batches indexable by 'input_ids' and 'labels'.
    """
    last_batch = None
    for last_batch in train_loader:
        print(last_batch['input_ids'].shape, last_batch['labels'].shape)

    # An empty loader leaves nothing to show; previously this raised NameError.
    if last_batch is None:
        print("train_loader produced no batches")
        return

    print(last_batch['input_ids'][0])
    print(last_batch['labels'][0])

In [None]:
def collator_decoder_test(train_loader):
    """Decode and print the first row of the first batch (inputs, then labels).

    Positions equal to -100 are dropped before decoding with the notebook-level
    ``tokenizer``. Nothing is printed when the loader yields no batch.
    """
    first_batch = next(iter(train_loader), None)
    if first_batch is None:
        return

    sections = (("Decoded Input:", 'input_ids'), ("\nDecode Label:", 'labels'))
    for heading, key in sections:
        first_row = first_batch[key][0]
        kept_ids = first_row[first_row != -100].tolist()
        decoded_string = tokenizer.decode(kept_ids, skip_special_tokens=False)
        print(heading)
        print(decoded_string)

# DATA + TRAIN_TEST_VAL SPLIT

In [None]:
data = download_data(dir+data_sub_dir+'/finaldataset_shortblocks.json')
data = remove_extra_quotes(data)

print("Number Of Samples:",len(data),"\n")
print("Initial Sample Example:\n",data[400],"\n")

In [None]:
train_data,test_data,val_data = train_test_val_split(data,0.9,0.05)

In [None]:
print("Training set length:", len(train_data), "//Validation set length:", len(val_data),"//Test set length:", len(test_data))

# MODEL + TOKENIZER DOWNLOAD (RUN ONE CELL ONLY)

## GPT2:

In [None]:
#BASE GPT2 MODEL FROM HUGGING FACE
original_model_name="gpt2-medium"
model = GPT2LMHeadModel.from_pretrained(original_model_name)

#TOKENIZER + COLLATOR
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
end_of_text_token_id = tokenizer.encode("<|endoftext|>")[0]
tokenizer.pad_token = tokenizer.eos_token
data_collator_fn = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [None]:
#SAVED GPT2 MODEL POST FINE-TUNING (IF EXISTS)
original_model_name="gpt2-medium"
model_name=dir+sub_dir+'/final_model_'+original_model_name
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
end_of_text_token_id = tokenizer.encode("<|endoftext|>")[0]
tokenizer.pad_token = tokenizer.eos_token
data_collator_fn = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
def check_unknown_tokens(texts, tok=None):
    """Return the set of unknown-token ids produced when encoding ``texts``.

    Args:
        texts: string to encode.
        tok: tokenizer to use; defaults to the notebook-level ``tokenizer``.

    Returns:
        A set holding the tokenizer's unknown-token id if it occurs in the
        encoding, otherwise an empty set.
    """
    tok = tokenizer if tok is None else tok
    # Ask the tokenizer for its own unknown-token id; fall back to the id this
    # cell originally hard-coded (50256) when none is defined.
    unk_id = getattr(tok, "unk_token_id", None)
    if unk_id is None:
        unk_id = 50256
    return {token for token in tok.encode(texts) if token == unk_id}

In [None]:
# Scan every training sample (the loop used to re-check train_data[10] each time).
for sample in train_data:
  texts = str(sample['output'])
  unknown_tokens = check_unknown_tokens(texts)
  if unknown_tokens:
    print("Unknown tokens:", unknown_tokens)
    print(sample['output'])
    # Round-trip through the tokenizer to show what survives encoding.
    print(tokenizer.decode(tokenizer.encode(texts)))

In [None]:
#ADD NEW TOKENS
new_token = [""]
num_added_toks = tokenizer.add_tokens(new_token)
print(f"Added {num_added_toks} new regular token.")
model.resize_token_embeddings(len(tokenizer))

## T5:

In [None]:
#BASE T5 MODEL
original_model_name = "t5-base"
model = T5ForConditionalGeneration.from_pretrained(original_model_name)
tokenizer = T5Tokenizer.from_pretrained(original_model_name)
tokenizer.pad_token = tokenizer.eos_token
data_collator_fn = DataCollatorForSeq2Seq(tokenizer, model=model)

special_tokens_dict = {'additional_special_tokens': ['{', '}']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))


In [None]:
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

In [None]:
#SAVED T5 MODEL POST FINE-TUNING (IF EXISTS)
original_model_name = "t5-base"
model_name = dir+sub_dir+'/final_model_'+original_model_name
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
data_collator_fn = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
#AWS MODEL
original_model_name = "t5-base"
model_name = dir+sub_dir+'/final_model_'+original_model_name+"AWS"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
data_collator_fn = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
#LoRA MODEL
original_model_name = "t5-base"
model_name = dir+sub_dir+'/final_model_'+original_model_name+"LoRA"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
data_collator_fn = DataCollatorForSeq2Seq(tokenizer, model=model)

### CHECK IF T5 MODEL HAS ALL TOKENS

In [None]:
def check_unknown_tokens(texts, tok=None):
    """Return the set of unknown-token ids produced when encoding ``texts``.

    Args:
        texts: string to encode.
        tok: tokenizer to use; defaults to the notebook-level ``tokenizer``.

    Returns:
        A set holding the tokenizer's unknown-token id if it occurs in the
        encoding, otherwise an empty set.
    """
    tok = tokenizer if tok is None else tok
    # Ask the tokenizer for its own unknown-token id; fall back to the id this
    # cell originally hard-coded (2) when none is defined.
    unk_id = getattr(tok, "unk_token_id", None)
    if unk_id is None:
        unk_id = 2
    return {token for token in tok.encode(texts) if token == unk_id}

In [None]:
# Scan every training sample (the loop used to re-check train_data[10] each time).
for sample in train_data:
  texts = str(sample['output'])
  unknown_tokens = check_unknown_tokens(texts)
  if unknown_tokens:
    print("Unknown tokens:", unknown_tokens)
    print(sample['output'])
    # Round-trip through the tokenizer to show what survives encoding.
    print(tokenizer.decode(tokenizer.encode(texts)))


In [None]:
#ADD NEW TOKENS
new_tokens = ["{", "}"]
num_added_tokens = tokenizer.add_tokens(new_tokens)
print(f"Added {num_added_tokens} new regular token.")
model.resize_token_embeddings(len(tokenizer))


### T5 LoRA MODEL

In [None]:
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.01,
)

In [None]:
model = get_peft_model(model, lora_config)

## Load Model To Device

In [None]:
# `original_model_name` is set by every model-loading cell; `model_name` only
# exists once a fine-tuned checkpoint cell has been run.
print(f"Num of param for {original_model_name}:",sum(p.numel() for p in model.parameters()))
# Not every config is guaranteed to expose n_positions, so read it defensively.
print(f"Max Length: {getattr(model.config, 'n_positions', None)}")
model.to(device)  # Move the model to the appropriate device
print("")

In [None]:
model.print_trainable_parameters()

### Model Size Reduction

In [None]:
def get_model_size(model):
    """Return the combined size of a model's parameters and buffers in MiB."""
    total_bytes = 0
    for tensor in model.parameters():
        total_bytes += tensor.numel() * tensor.element_size()
    for tensor in model.buffers():
        total_bytes += tensor.numel() * tensor.element_size()
    return total_bytes / 1024**2

original_model_size = get_model_size(model)
print(original_model_size, "MB")

In [None]:
#reduce model to half ONLY FOR INFERENCE

if device.type=="cuda":
  model.half().to(device)

def is_fp16(model):
    """Return True when every parameter of ``model`` is stored as float16."""
    return all(param.dtype == torch.float16 for param in model.parameters())

if is_fp16(model):
    print("The model is in FP16 precision.")
else:
    print("The model is not in FP16 precision.")

new_model_size = get_model_size(model)

print(new_model_size, "MB")

# TOKENIZER + INSTRUCTION-DATASET + COLLATOR INITIALIZATION FOR TRAINING



In [None]:
#TRAINING AND DATA SETTINGS

# Choose the dataset wrapper from the base architecture name. `original_model_name`
# is set by every model-loading cell, whereas `model_name` only exists after a
# fine-tuned checkpoint cell has been run.
if "gpt2" in original_model_name:
  dataset_cls = InstructionDataset_GPT2
elif "t5" in original_model_name:
  dataset_cls = InstructionDataset_T5
else:
  sys.exit("Error: model not defined")

num_workers = 0
batch_size = 8
epochs=9

torch.manual_seed(123)

train_dataset = dataset_cls(train_data, tokenizer)
val_dataset = dataset_cls(val_data, tokenizer)
test_dataset = dataset_cls(test_data, tokenizer)

#FOR TESTING PURPOSES:
train_loader = DataLoader(train_dataset,batch_size=batch_size,collate_fn=data_collator_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

In [None]:
check_input_label_shapes(train_loader)
print('\n\n\n')
collator_decoder_test(train_loader)

# TRAINING

## TRAINING HYPERPARAMS

In [None]:
# Build the TrainingArguments and Trainer used when fine-tuning a GPT-2 model.
if "gpt2" in original_model_name:
  batch_size = 4
  epochs=0.5
  training_args = TrainingArguments(
      output_dir=(dir+sub_dir+'results_'+original_model_name),
      num_train_epochs=epochs,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=2*batch_size,
      # 10% of epochs * (samples // batch_size).
      # NOTE(review): gradient_accumulation_steps=2 below is not part of this
      # formula - confirm the warmup length is the intended one.
      warmup_steps=int(0.1* epochs* (len(train_dataset)//batch_size)),
      weight_decay=0.1,
      logging_dir=dir+sub_dir+'logs_'+original_model_name,
      logging_steps=100,
      do_train=True,
      do_eval=True,
      eval_strategy="steps",
      # NOTE(review): derived from the number of samples, not the number of
      # steps; evaluates to 0 for fewer than 10 samples - confirm.
      eval_steps=int(len(train_dataset)/10),
      save_strategy="steps",
      # Kept at twice eval_steps.
      save_steps=2*int(len(train_dataset)/10),
      save_total_limit=3,
      load_best_model_at_end=True,
      # NOTE(review): verify this flag is honoured when trainer.train() is
      # called without a resume_from_checkpoint argument.
      resume_from_checkpoint=True,
      lr_scheduler_type='linear',
      gradient_accumulation_steps=2,
      max_grad_norm=1.0,
      learning_rate=7.5e-5,
  )

  trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset= val_dataset,
    data_collator=data_collator_fn,
  )


# Build the Seq2SeqTrainingArguments and Seq2SeqTrainer used when fine-tuning T5.
if "t5" in original_model_name:
  batch_size = 8
  epochs=15
  # Mixed-precision training needs a CUDA device; stay in full precision on the
  # CPU fallback instead of failing when the arguments are validated.
  fp16_bool=torch.cuda.is_available()
  training_args = Seq2SeqTrainingArguments(
      output_dir=(dir+sub_dir+'results_'+original_model_name),
      num_train_epochs=epochs,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=2*batch_size,
      # 10% of epochs * (samples // batch_size).
      # NOTE(review): gradient_accumulation_steps=2 below is not part of this
      # formula - confirm the warmup length is the intended one.
      warmup_steps=int(0.1* epochs* (len(train_dataset)//batch_size)),
      weight_decay=0.1,
      logging_dir=dir+sub_dir+'logs_'+original_model_name,
      logging_steps=100,
      do_train=True,
      do_eval=True,
      eval_strategy="steps",
      eval_steps=int(len(train_dataset)/10),
      save_strategy="steps",
      # Kept at twice eval_steps.
      save_steps=2*int(len(train_dataset)/10),
      save_total_limit=3,
      load_best_model_at_end=True,
      resume_from_checkpoint=True,
      lr_scheduler_type='linear',
      gradient_accumulation_steps=2,
      max_grad_norm=1.0,
      learning_rate=9e-5,
      remove_unused_columns=False,
      fp16=fp16_bool
  )

  trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset= val_dataset,
    data_collator=data_collator_fn,
  )

## TRAINING + EVAL

### GPT2

In [None]:
trainer.train()

In [None]:
trainer.evaluate(eval_dataset=test_dataset)

### T5

In [None]:
trainer.train()

In [None]:
trainer.evaluate(eval_dataset=train_dataset)

# SAVING THE MODEL

In [None]:
append_filename="LoRA" #AWS or LoRA ETC
filename = dir+sub_dir+'final_model_'+original_model_name+append_filename

# A new path is written straight away; an existing one needs an explicit 'yes'.
should_save = True
if os.path.exists(filename):
    m=input("are you sure you want to overwrite file? reply with 'yes' or 'no'")
    should_save = m.lower()=='yes'

if should_save:
    model.save_pretrained(filename,safe_serialization=False)
    tokenizer.save_pretrained(filename)

# INFERENCE

### GPT2 FINETUNED OUTPUT EXAMPLES

In [None]:
model.eval()
model_outputs=[]
data_to_use = train_data[74:75]
stop_sequence = "### Response"

for sample in tqdm(data_to_use):
  sample_out = sample['output']
  in_test = format_model_input(sample)
  input_ids = tokenizer.encode(in_test, return_tensors="pt").to(device)

  # input_ids has shape (1, prompt_length): the budget must be based on the
  # last dimension. len(input_ids) is the batch size (1), which capped the
  # whole sequence, prompt included, at 301 tokens.
  # Beam search is used here; drop num_beams for plain greedy decoding.
  output = model.generate(
      input_ids=input_ids,
      eos_token_id=50256,
      max_length=input_ids.shape[-1] + 300,
      num_beams=3,
      num_return_sequences=1,
      early_stopping=True,
      pad_token_id=50256,
      #repetition_penalty=1.5,  #TO EXPERIMENT WITH
  )

  decoded_output=tokenizer.decode(output[0], skip_special_tokens=True)

  # Keep everything up to (not including) a second "### Response" marker.
  stop_index = decoded_output.find(stop_sequence, decoded_output.find(stop_sequence) + len(stop_sequence))
  if stop_index != -1:
      trimmed_output = decoded_output[:stop_index]
  else:
      trimmed_output = decoded_output

  print('expected output:',sample_out,'model output:',trimmed_output,'\n',sep='\n')
  model_outputs.append(trimmed_output)

### T5 FINETUNED OUTPUT EXAMPLES

In [None]:
# Hand-written prompts used to spot-check the fine-tuned T5 model.
l = [{'input': prompt} for prompt in ('foam roll', "warmup", 'foam roll warmup')]

In [None]:
import time

model.eval()
model_outputs=[]
inference_times=[]  # one generation time per sample
data_to_use = l
stop_sequence = "### Response"

for sample in tqdm(data_to_use):
  in_test=format_model_input(sample)
  #print("in_test:\n",in_test) #UNCOMMENT TO PRINT
  input_ids = tokenizer.encode(in_test, return_tensors="pt").to(device)

  start_time = time.perf_counter()

  with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        eos_token_id=1,
        max_length= 400,
        num_beams=2,
        num_return_sequences=1,
        early_stopping=True,
        repetition_penalty=3.0,  #TO EXPERIMENT WITH
    )
  inference_times.append(time.perf_counter() - start_time)

  # NOTE(review): if '{' and '}' were registered as *special* tokens in the
  # model-loading cell, skip_special_tokens=True may drop them here - verify.
  decoded_output=tokenizer.decode(output[0], skip_special_tokens=True)

  # Keep everything up to (not including) a second "### Response" marker.
  stop_index = decoded_output.find(stop_sequence, decoded_output.find(stop_sequence) + len(stop_sequence))
  if stop_index != -1:
      trimmed_output = decoded_output[:stop_index]
  else:
      trimmed_output = decoded_output

  print('model output:',trimmed_output,'\n',sep='\n') #UNCOMMENT TO SEE OUTPUT
  model_outputs.append(trimmed_output)

# Report over all samples; previously only the last sample was timed in the
# printout and an empty input list raised NameError.
inference_time = sum(inference_times)
if inference_times:
  print(f" model inference time: {inference_time:.2f} seconds total, {inference_time / len(inference_times):.2f} seconds per sample")
else:
  print(" model inference time: no samples were generated")

### SAVE MODEL OUTPUTS

In [None]:
# Choose the file suffix for the split the generated outputs belong to.
if len(model_outputs)==len(train_data):
  output_suffix = '_model_outputs_train.json'
elif data_to_use==test_data:
  output_suffix = '_model_outputs_test.json'
elif data_to_use==val_data:
  output_suffix = '_model_outputs_val.json'
else:
  sys.exit("Error: lengths do not match with original dataset")

output_file=dir+sub_dir+'final_model_'+original_model_name+output_suffix

with open(output_file, 'w') as f:
  json.dump(model_outputs, f)

# TESTING + DATA ANALYSIS

In [None]:
import ast
import re

### TESTING + TRANSFORMATION FUNCTIONS

In [None]:
# Every exception ast.literal_eval is documented to raise on malformed input.
_LITERAL_EVAL_ERRORS = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)
# Marks "could not be parsed" without clashing with a legitimately parsed None.
_UNPARSEABLE = object()
# Optional "<pad>" tokens and "### Response:" marker at the start of a T5 output.
_T5_PREFIX = re.compile(r'^\s*(?:<pad>\s*)*(?:#*\s*Response\s*:?\s*)?')


def _safe_literal_eval(text):
    """Parse ``text`` as a Python literal, returning ``_UNPARSEABLE`` on failure."""
    try:
        return ast.literal_eval(text)
    except _LITERAL_EVAL_ERRORS:
        return _UNPARSEABLE


def _clean_gpt2_output(generated):
    """Isolate the response payload of a GPT-2 generation, cut after the first ']}]'."""
    text = generated
    start_index = text.find("### Response")
    if start_index != -1:
        text = text[start_index:]
    text = text.replace("### Response:", "").strip()
    end_index = text.find(']}]')  # POST PROCESSING
    if end_index != -1:
        text = text[:end_index + 3]
    return text


def _clean_t5_output(generated):
    """Remove '</s>' plus the leading '<pad>' / '### Response:' markers of a T5 generation."""
    text = generated.replace("</s>", "").strip()
    return _T5_PREFIX.sub('', text, count=1).strip()


def _close_after_last_dict(text):
    """Cut a truncated list after its last '}' and close it with ']'."""
    last_bracket_index = text.rfind('}')
    if last_bracket_index == -1:
        return text
    return text[:last_bracket_index + 1] + ']'


def convert_output_to_list(model_outputs,model_name):
    """Parse generated strings into lists of dicts, dropping invalid outputs.

    Args:
        model_outputs: iterable of generated strings.
        model_name: name or path containing "gpt2" or "t5"; selects the
            clean-up rules applied before parsing.

    Returns:
        The list of successfully parsed outputs (each a list of dicts), or the
        string "ERROR: Get Model Name" when ``model_name`` matches neither
        family (kept for backward compatibility).

    Outputs that cannot be parsed, or that are not a list of dicts, are
    reported on stdout and skipped; they no longer abort the conversion.
    """
    if "gpt2" in model_name:
        clean, repair = _clean_gpt2_output, None
    elif "t5" in model_name:
        clean, repair = _clean_t5_output, _close_after_last_dict
    else:
        return "ERROR: Get Model Name"

    model_outputs_list = []
    for i, generated in enumerate(model_outputs):
        candidate = clean(generated)
        parsed = _safe_literal_eval(candidate)

        if parsed is _UNPARSEABLE and repair is not None:  # POST PROCESSING
            candidate = repair(candidate)
            parsed = _safe_literal_eval(candidate)

        if parsed is _UNPARSEABLE:
            print(f"ERROR: Output at index {i} is invalid and cannot be parsed.")
            parsed = candidate  # show the offending text in the message below

        if isinstance(parsed, list) and all(isinstance(item, dict) for item in parsed):
            model_outputs_list.append(parsed)
        else:
            print(i, parsed, "ERROR: not dict or not list")

    print(f"{len(model_outputs_list)} outputs converted correctly to list of dicts out of {len(model_outputs)} model outputs")
    return model_outputs_list


In [None]:
def check_parameters_correctness(model_outputs_list, param_list):
    """Report exercise dicts whose keys break the expected schema.

    A dict must contain 'exercise' and 'sets', at least one key from
    ``param_list``, and no key outside those two groups. The first violated
    rule is printed for each dict, followed by a summary line.
    """
    required_keys = {'exercise', 'sets'}

    def describe_problem(i, j, dictionary):
        # Rules are checked in order; only the first failure is reported.
        if not required_keys.issubset(dictionary.keys()):
            return f"ERROR: Dictionary at index [{i}][{j}] must have at least the keys {required_keys}."
        if not any(param in dictionary for param in param_list):
            return f"ERROR: Dictionary at index [{i}][{j}] must have at least one of the keys from {param_list}."
        allowed_keys = required_keys.union(param_list)
        if not set(dictionary.keys()).issubset(allowed_keys):
            return f"ERROR: Dictionary at index [{i}][{j}] must only have keys from {allowed_keys}."
        return None

    flagged = 0
    inspected = 0
    for i, outer_list in enumerate(model_outputs_list):
        for j, dictionary in enumerate(outer_list):
            inspected += 1
            problem = describe_problem(i, j, dictionary)
            if problem is not None:
                print(problem)
                flagged += 1
    print(f"inconsistency in keys for {flagged} out of {inspected} ")

In [None]:
def check_set_and_param_consistency(model_outputs_list,param_list): #MODIFY WHEN ADDING PARAMETERS OTHER THAN REPS (for all params other than exercise, #sets must be == len(param))
  """Report dicts whose 'sets' value differs from the length of a parameter list.

  For every dict and every name in ``param_list`` that the dict contains, the
  'sets' value must equal ``len(dictionary[param])``. A missing 'sets' key or
  a parameter value without a length is counted as an inconsistency instead
  of aborting the whole check.

  Prints one line per inconsistency and a final summary; the summary's
  denominator counts every (dict, param) pair, present or not.
  """
  c=0
  m=0
  for i, outer_list in enumerate(model_outputs_list):
      for j, dictionary in enumerate(outer_list):
        sets = dictionary.get('sets')
        for param in param_list:
            m=m+1
            if param not in dictionary:
                continue
            try:
                n_values = len(dictionary[param])
            except TypeError:
                n_values = None  # e.g. a bare int where a list was expected
            if sets is None or n_values is None or sets != n_values:
                c += 1
                print(f"ERROR: Dictionary at index [{i}][{j}] has #sets != len({param})")

  print(f"inconsistency in sets and params for {c} out of {m} ")


In [None]:
def no_consecutive_same_exercise(model_outputs_list):
  """Report blocks in which two neighbouring entries name the same exercise.

  Each offending block is printed and counted once, however many repeated
  pairs it contains; a summary line is printed at the end.
  """
  flagged_blocks = 0
  for i, outer_list in enumerate(model_outputs_list):
    already_reported = False
    for current, following in zip(outer_list, outer_list[1:]):
      if current['exercise'] != following['exercise']:
        continue
      if not already_reported:
        print(f"ERROR: Duplicated exercise at model output index {i}")
        flagged_blocks += 1
        already_reported = True

  print(f"same exercise repeated consecutively for {flagged_blocks} out of {len(model_outputs_list)} ")



In [None]:
def remove_consecutive_duplicates(model_outputs_list):
    """Return copies of the blocks with consecutive repeats of an exercise dropped.

    An entry is kept when it is the first of its block or when its 'exercise'
    differs from that of the entry just before it. The input is not modified.
    """
    cleaned_outputs = []
    for block in model_outputs_list:
        kept = [
            entry
            for position, entry in enumerate(block)
            if position == 0 or entry['exercise'] != block[position - 1]['exercise']
        ]
        cleaned_outputs.append(kept)
    return cleaned_outputs

In [None]:
def count_param_frequency(model_outputs_list, param_list):
    """Count in how many blocks each parameter appears at least once.

    Args:
        model_outputs_list: list of blocks, each a list of exercise dicts.
        param_list: parameter names to look for.

    Returns:
        ``(param_counts, param_percentages)``: two dicts keyed by parameter
        name. Percentages are relative to the number of blocks and are 0.0
        for every parameter when there are no blocks (this used to raise
        ZeroDivisionError).
    """
    param_counts = {
        param: sum(
            1 for outer_list in model_outputs_list
            if any(param in dictionary for dictionary in outer_list)
        )
        for param in param_list
    }

    total_outer_lists = len(model_outputs_list)
    param_percentages = {
        param: (count / total_outer_lists) * 100 if total_outer_lists else 0.0
        for param, count in param_counts.items()
    }

    return param_counts, param_percentages

In [None]:
def output_list_to_df(model_outputs_list,param_list):
    """Flatten blocks of exercise dicts into one DataFrame row per exercise.

    Columns are 'block' (index of the block), 'exercise', 'sets' and one
    column per name in ``param_list``; a parameter missing from a dict is
    stored as None.
    """
    flat_data = [
        [block_index, entry['exercise'], entry['sets'], *(entry.get(param, None) for param in param_list)]
        for block_index, block in enumerate(model_outputs_list)
        for entry in block
    ]
    columns = ['block', 'exercise', 'sets'] + param_list

    return pd.DataFrame(flat_data, columns=columns)

### CHOOSE MODEL AND DATASET OUTPUTS. !!!!!!DO NOT RUN FIRST CELL IF YOU WANT THE VALUES FROM "{MODEL}FINETUNED OUTPUT EXAMPLES" EXECUTION!!!!!!

#### PARAM LIST: MODIFY IF NEW PARAMS ADDED

In [None]:
param_list=['reps','time','distance']

#### SELECT DATA

In [None]:
#MODEL OUTPUTS
file_to_read = dir+sub_dir+'final_model_t5-base_model_outputs_train.json'
with open(file_to_read, 'r') as f:
    model_outputs = json.load(f)
print(len(model_outputs))

In [None]:
using_data=train_data
data_outputs = [using_data[i]['output'] for i in range(len(using_data))]
print(len(data_outputs))
main_dataset_df = output_list_to_df(data_outputs,param_list)
main_dataset_df

In [None]:
model_outputs_list=[]
for i in data:
  model_outputs_list.append(i['output'])

### GPT2 RUN TESTS + TRANSFORMATION

In [None]:
model_outputs_list=convert_output_to_list(model_outputs,model_name)

In [None]:
# param_list is a required argument of the checker.
check_parameters_correctness(model_outputs_list, param_list)

In [None]:
# param_list is a required argument of the checker.
check_set_and_param_consistency(model_outputs_list, param_list)

In [None]:
no_consecutive_same_exercise(model_outputs_list)

### GPT2 DATA ANALYSIS

In [None]:
# param_list is required: it defines the parameter columns of the frame.
gpt2df=output_list_to_df(model_outputs_list, param_list)
gpt2df

In [None]:
#DATA ANALYSIS ON EXERCISES
print("##############GPT2 OUTPUT##############")
total_number_of_ex = gpt2df['exercise'].count()
max_number_of_ex_per_block = gpt2df.groupby('block')['exercise'].count().max()
average_number_of_ex_per_outer_list = gpt2df.groupby('block')['exercise'].count().mean()
median_number_of_ex_per_outer_list = gpt2df.groupby('block')['exercise'].count().median()
std_dev_of_ex_per_outer_list = gpt2df.groupby('block')['exercise'].count().std()

print(f"Total number of exercises: {total_number_of_ex}")
print(f"Max number of exercises per block: {max_number_of_ex_per_block}")
print(f"Average number of exercises per block: {average_number_of_ex_per_outer_list}")
print(f"Median number of exercises per block: {median_number_of_ex_per_outer_list}")
print(f"Standard deviation of exercises per block: {std_dev_of_ex_per_outer_list}")

print("\n##############TRAINING DATASET OUTPUT##############")
total_number_of_ex = main_dataset_df['exercise'].count()
max_number_of_ex_per_block = main_dataset_df.groupby('block')['exercise'].count().max()
average_number_of_ex_per_outer_list = main_dataset_df.groupby('block')['exercise'].count().mean()
median_number_of_ex_per_outer_list = main_dataset_df.groupby('block')['exercise'].count().median()
std_dev_of_ex_per_outer_list = main_dataset_df.groupby('block')['exercise'].count().std()

print(f"Total number of exercises: {total_number_of_ex}")
print(f"Max number of exercises per block: {max_number_of_ex_per_block}")
print(f"Average number of exercises per block: {average_number_of_ex_per_outer_list}")
print(f"Median number of exercises per block: {median_number_of_ex_per_outer_list}")
print(f"Standard deviation of exercises per block: {std_dev_of_ex_per_outer_list}")

In [None]:
#DATA ANALYSIS ON SETS
print("##############GPT2 OUTPUT##############")
total_sets = gpt2df['sets'].sum()
max_sets_per_block = gpt2df.groupby('block')['sets'].sum().max()
average_sets_per_outer_list = gpt2df.groupby('block')['sets'].sum().mean()
median_sets_per_outer_list = gpt2df.groupby('block')['sets'].sum().median()
std_dev_sets_per_outer_list = gpt2df.groupby('block')['sets'].sum().std()

print(f"Total number of sets: {total_sets}")
print(f"Max number of sets per block: {max_sets_per_block}")
print(f"Average number of sets per block: {average_sets_per_outer_list}")
print(f"Median number of sets per block: {median_sets_per_outer_list}")
print(f"Standard deviation of sets per block: {std_dev_sets_per_outer_list}")

print("\n##############TRAINING DATASET OUTPUT##############")
total_sets = main_dataset_df['sets'].sum()
max_sets_per_block = main_dataset_df.groupby('block')['sets'].sum().max()
average_sets_per_outer_list = main_dataset_df.groupby('block')['sets'].sum().mean()
median_sets_per_outer_list = main_dataset_df.groupby('block')['sets'].sum().median()
std_dev_sets_per_outer_list = main_dataset_df.groupby('block')['sets'].sum().std()

print(f"Total number of sets: {total_sets}")
print(f"Max number of sets per block: {max_sets_per_block}")
print(f"Average number of sets per block: {average_sets_per_outer_list}")
print(f"Median number of sets per block: {median_sets_per_outer_list}")
print(f"Standard deviation of sets per block: {std_dev_sets_per_outer_list}")

In [None]:
#DATA ANALYSIS ON REPS
print("##############GPT2 OUTPUT##############")
# Rows without a 'reps' entry hold None (see output_list_to_df), so each
# aggregate treats a missing or empty list as 0 instead of failing.
gpt2df['max_reps'] = gpt2df['reps'].apply(lambda x: max(x) if x else 0)
gpt2df['mean_reps'] = gpt2df['reps'].apply(lambda x: sum(x) / len(x) if x else 0)
gpt2df['total_reps'] = gpt2df['reps'].apply(lambda x: sum(x) if x else 0)

mean_reps = gpt2df['mean_reps'].mean()
median_reps = gpt2df['mean_reps'].median()
total_reps = gpt2df['total_reps'].sum()
max_reps_per_block = gpt2df.groupby('block')['total_reps'].sum().max()
average_reps_per_outer_list = gpt2df.groupby('block')['total_reps'].sum().mean()
median_reps_per_outer_list = gpt2df.groupby('block')['total_reps'].sum().median()
std_dev_reps_per_outer_list = gpt2df.groupby('block')['total_reps'].sum().std()

print(f"Mean number of average number of reps per exercise: {mean_reps}")
print(f"Median number of average number of reps per exercise: {median_reps}")
print(f"Total number of reps: {total_reps}")
print(f"Max number of reps per block: {max_reps_per_block}")
print(f"Average number of reps per block: {average_reps_per_outer_list}")
print(f"Median number of reps per block: {median_reps_per_outer_list}")
print(f"Standard deviation of reps per block: {std_dev_reps_per_outer_list}")

print("\n##############TRAINING DATASET OUTPUT##############")

# Rows without a 'reps' entry hold None (see output_list_to_df), so each
# aggregate treats a missing or empty list as 0 instead of failing.
main_dataset_df['max_reps'] = main_dataset_df['reps'].apply(lambda x: max(x) if x else 0)
main_dataset_df['mean_reps'] = main_dataset_df['reps'].apply(lambda x: sum(x) / len(x) if x else 0)
main_dataset_df['total_reps'] = main_dataset_df['reps'].apply(lambda x: sum(x) if x else 0)

mean_reps = main_dataset_df['mean_reps'].mean()
median_reps = main_dataset_df['mean_reps'].median()
total_reps = main_dataset_df['total_reps'].sum()
max_reps_per_block = main_dataset_df.groupby('block')['total_reps'].sum().max()
average_reps_per_outer_list = main_dataset_df.groupby('block')['total_reps'].sum().mean()
median_reps_per_outer_list = main_dataset_df.groupby('block')['total_reps'].sum().median()
std_dev_reps_per_outer_list = main_dataset_df.groupby('block')['total_reps'].sum().std()

print(f"Mean number of average number of reps per exercise: {mean_reps}")
print(f"Median number of average number of reps per exercise: {median_reps}")
print(f"Total number of reps: {total_reps}")
print(f"Max number of reps per block: {max_reps_per_block}")
print(f"Average number of reps per block: {average_reps_per_outer_list}")
print(f"Median number of reps per block: {median_reps_per_outer_list}")
print(f"Standard deviation of reps per block: {std_dev_reps_per_outer_list}")

### T5 RUN TESTS

In [None]:
model_name="t5-base"
model_outputs_list=convert_output_to_list(model_outputs,model_name)

In [None]:
check_parameters_correctness(model_outputs_list,param_list)

In [None]:
check_set_and_param_consistency(model_outputs_list,param_list)

In [None]:
no_consecutive_same_exercise(model_outputs_list)

In [None]:
model_outputs_list = remove_consecutive_duplicates(model_outputs_list)

### T5 DATA ANALYSIS

In [None]:
param_counts, param_percentages = count_param_frequency(model_outputs_list, param_list)
print("MODEL Parameter Counts:", param_counts)
print("MODEL Parameter Percentages:", param_percentages)
print()
param_counts, param_percentages = count_param_frequency(data_outputs, param_list)
print("DATASET Parameter Counts:", param_counts)
print("DATASET Parameter Percentages:", param_percentages)

In [None]:
t5df=output_list_to_df(model_outputs_list,param_list)
t5df

In [None]:
#DATA ANALYSIS ON EXERCISES
# Prints the same exercise-count statistics for the T5 output and for the training dataset.
# Each section groups its own frame by 'block' once and derives every per-block figure from
# that Series, so the two sections cannot accidentally read each other's data.

print("##############T5 OUTPUT##############")
# Number of (non-null) exercises in each generated block.
t5_ex_per_block = t5df.groupby('block')['exercise'].count()

total_number_of_ex = t5df['exercise'].count()
max_number_of_ex_per_block = t5_ex_per_block.max()
average_number_of_ex_per_outer_list = t5_ex_per_block.mean()
median_number_of_ex_per_outer_list = t5_ex_per_block.median()
std_dev_of_ex_per_outer_list = t5_ex_per_block.std()
# Fixed: this previously counted single-exercise blocks in main_dataset_df, so the T5
# section reported the dataset's figure instead of the model's.
number_of_blocks_with_one_exercise = (t5_ex_per_block == 1).sum()

print(f"Total number of exercises: {total_number_of_ex}")
print(f"Max number of exercises per block: {max_number_of_ex_per_block}")
print(f"Average number of exercises per block: {average_number_of_ex_per_outer_list}")
print(f"Median number of exercises per block: {median_number_of_ex_per_outer_list}")
print(f"Standard deviation of exercises per block: {std_dev_of_ex_per_outer_list}")
# Denominator is the number of generated blocks (model_outputs_list holds one entry per block).
print(f"Percentage of blocks with only one exercise: {number_of_blocks_with_one_exercise/len(model_outputs_list) *100}%")

print("\n##############TRAINING DATASET OUTPUT##############")
# Number of (non-null) exercises in each dataset block.
dataset_ex_per_block = main_dataset_df.groupby('block')['exercise'].count()

total_number_of_ex = main_dataset_df['exercise'].count()
max_number_of_ex_per_block = dataset_ex_per_block.max()
average_number_of_ex_per_outer_list = dataset_ex_per_block.mean()
median_number_of_ex_per_outer_list = dataset_ex_per_block.median()
std_dev_of_ex_per_outer_list = dataset_ex_per_block.std()
number_of_blocks_with_one_exercise = (dataset_ex_per_block == 1).sum()

print(f"Total number of exercises: {total_number_of_ex}")
print(f"Max number of exercises per block: {max_number_of_ex_per_block}")
print(f"Average number of exercises per block: {average_number_of_ex_per_outer_list}")
print(f"Median number of exercises per block: {median_number_of_ex_per_outer_list}")
print(f"Standard deviation of exercises per block: {std_dev_of_ex_per_outer_list}")
# Fixed: the dataset percentage was divided by the number of MODEL blocks. It is now divided
# by the number of blocks present in main_dataset_df itself.
print(f"Percentage of blocks with only one exercise: {number_of_blocks_with_one_exercise/len(dataset_ex_per_block) *100}%")


In [None]:
#DATA ANALYSIS ON SETS
# Same five figures for the model output and for the training data. The per-block totals
# are computed once per frame and then summarised.

print("##############T5 OUTPUT##############")
total_sets = t5df['sets'].sum()
t5_sets_by_block = t5df.groupby('block')['sets'].sum()
max_sets_per_block = t5_sets_by_block.max()
average_sets_per_outer_list = t5_sets_by_block.mean()
median_sets_per_outer_list = t5_sets_by_block.median()
std_dev_sets_per_outer_list = t5_sets_by_block.std()

print(f"Total number of sets: {total_sets}")
print(f"Max number of sets per block: {max_sets_per_block}")
print(f"Average number of sets per block: {average_sets_per_outer_list}")
print(f"Median number of sets per block: {median_sets_per_outer_list}")
print(f"Standard deviation of sets per block: {std_dev_sets_per_outer_list}")

print("\n##############TRAINING DATASET OUTPUT##############")
total_sets = main_dataset_df['sets'].sum()
dataset_sets_by_block = main_dataset_df.groupby('block')['sets'].sum()
max_sets_per_block = dataset_sets_by_block.max()
average_sets_per_outer_list = dataset_sets_by_block.mean()
median_sets_per_outer_list = dataset_sets_by_block.median()
std_dev_sets_per_outer_list = dataset_sets_by_block.std()

print(f"Total number of sets: {total_sets}")
print(f"Max number of sets per block: {max_sets_per_block}")
print(f"Average number of sets per block: {average_sets_per_outer_list}")
print(f"Median number of sets per block: {median_sets_per_outer_list}")
print(f"Standard deviation of sets per block: {std_dev_sets_per_outer_list}")

In [None]:
#DATA ANALYSIS ON REPS
# Adds max/mean/total rep columns to each frame, then prints the same summary for the model
# output and for the training data.


def _max_or_zero(reps):
    """Largest value in reps, or 0 when reps is empty/falsy."""
    return max(reps) if reps else 0


def _mean_or_zero(reps):
    """Arithmetic mean of reps, or 0 when reps is empty/falsy."""
    return sum(reps) / len(reps) if reps else 0


print("##############T5 OUTPUT##############")
t5df['max_reps'] = t5df['reps'].apply(_max_or_zero)
t5df['mean_reps'] = t5df['reps'].apply(_mean_or_zero)
t5df['total_reps'] = t5df['reps'].apply(sum)

mean_reps = t5df['mean_reps'].mean()
median_reps = t5df['mean_reps'].median()
total_reps = t5df['total_reps'].sum()
# Total reps per block, computed once and summarised four ways.
t5_reps_by_block = t5df.groupby('block')['total_reps'].sum()
max_reps_per_block = t5_reps_by_block.max()
average_reps_per_outer_list = t5_reps_by_block.mean()
median_reps_per_outer_list = t5_reps_by_block.median()
std_dev_reps_per_outer_list = t5_reps_by_block.std()

print(f"Mean number of average number of reps per exercise: {mean_reps}")
print(f"Median number of average number of reps per exercise: {median_reps}")
print(f"Total number of reps: {total_reps}")
print(f"Max number of reps per block: {max_reps_per_block}")
print(f"Average number of reps per block: {average_reps_per_outer_list}")
print(f"Median number of reps per block: {median_reps_per_outer_list}")
print(f"Standard deviation of reps per block: {std_dev_reps_per_outer_list}")

print("\n##############TRAINING DATASET OUTPUT##############")

main_dataset_df['max_reps'] = main_dataset_df['reps'].apply(_max_or_zero)
main_dataset_df['mean_reps'] = main_dataset_df['reps'].apply(_mean_or_zero)
main_dataset_df['total_reps'] = main_dataset_df['reps'].apply(sum)

mean_reps = main_dataset_df['mean_reps'].mean()
median_reps = main_dataset_df['mean_reps'].median()
total_reps = main_dataset_df['total_reps'].sum()
dataset_reps_by_block = main_dataset_df.groupby('block')['total_reps'].sum()
max_reps_per_block = dataset_reps_by_block.max()
average_reps_per_outer_list = dataset_reps_by_block.mean()
median_reps_per_outer_list = dataset_reps_by_block.median()
std_dev_reps_per_outer_list = dataset_reps_by_block.std()

print(f"Mean number of average number of reps per exercise: {mean_reps}")
print(f"Median number of average number of reps per exercise: {median_reps}")
print(f"Total number of reps: {total_reps}")
print(f"Max number of reps per block: {max_reps_per_block}")
print(f"Average number of reps per block: {average_reps_per_outer_list}")
print(f"Median number of reps per block: {median_reps_per_outer_list}")
print(f"Standard deviation of reps per block: {std_dev_reps_per_outer_list}")

# EVALUATION


In [None]:
!pip install rouge_score

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

## Functions

In [None]:
def calculate_bleu(reference, hypothesis):
    """Score one generated string against one reference string with sentence-level BLEU.

    Both strings are tokenized with ``nltk.word_tokenize``. The reference is wrapped in a
    list because ``sentence_bleu`` expects a list of reference token lists.
    ``SmoothingFunction().method4`` is passed as the smoothing function.

    Args:
        reference: Ground-truth text.
        hypothesis: Generated text to evaluate.

    Returns:
        The value returned by ``sentence_bleu``.
    """
    references = [nltk.word_tokenize(reference)]
    candidate = nltk.word_tokenize(hypothesis)
    return sentence_bleu(
        references,
        candidate,
        smoothing_function=SmoothingFunction().method4,
    )

def calculate_rouge(reference, hypothesis):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L for one reference/hypothesis pair.

    A fresh ``RougeScorer`` (with stemming enabled) is built on every call.

    Args:
        reference: Ground-truth text.
        hypothesis: Generated text to evaluate.

    Returns:
        The mapping returned by ``RougeScorer.score``, keyed by 'rouge1', 'rouge2' and
        'rougeL'. Callers in this notebook read ``.fmeasure`` from each entry.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(rouge_types, use_stemmer=True).score(reference, hypothesis)

## Evaluation

In [None]:
def _strip_generation_wrapper(text, pad_token="<pad>", response_marker="### Response:"):
    """Remove decoder artifacts around one generated string.

    Strips surrounding whitespace, any leading ``pad_token`` occurrences, one leading
    ``response_marker`` and any trailing ``pad_token`` occurrences. Whole tokens are matched,
    unlike ``str.strip(chars)``, which removes any of the individual characters and can
    therefore eat the start or end of the real payload.
    """
    text = text.strip()
    while text.startswith(pad_token):
        text = text[len(pad_token):].lstrip()
    if text.startswith(response_marker):
        text = text[len(response_marker):].lstrip()
    while text.endswith(pad_token):
        text = text[:-len(pad_token)].rstrip()
    return text


# Reference strings come from the training examples' 'output' field.
reference_texts = [str(i['output']) for i in train_data]
# Printed so a length mismatch between references and generations is visible before scoring.
print(len(reference_texts), len(model_outputs))
# Normalise each generation in place: drop the prompt wrapper, then adjust brace and comma
# spacing (same replacements as before) so it can be compared with the reference text.
for idx, raw_output in enumerate(model_outputs):
    cleaned = _strip_generation_wrapper(raw_output)
    model_outputs[idx] = cleaned.replace(" { ", "{").replace(" }", "}").replace(",",", ").replace("} ","}")


In [None]:
# Score every (reference, generation) pair once. zip stops at the shorter of the two lists.
pair_scores = [
    (calculate_bleu(ref, gen), calculate_rouge(ref, gen))
    for ref, gen in zip(reference_texts, model_outputs)
]

# Split the paired results into one list per metric. ROUGE is reported as F-measure.
bleu_scores = [bleu for bleu, _ in pair_scores]
rouge1_scores = [rouge['rouge1'].fmeasure for _, rouge in pair_scores]
rouge2_scores = [rouge['rouge2'].fmeasure for _, rouge in pair_scores]
rougeL_scores = [rouge['rougeL'].fmeasure for _, rouge in pair_scores]

# Mean and (population) standard deviation per metric.
bleu_mean, bleu_std = np.mean(bleu_scores), np.std(bleu_scores)
rouge1_mean, rouge1_std = np.mean(rouge1_scores), np.std(rouge1_scores)
rouge2_mean, rouge2_std = np.mean(rouge2_scores), np.std(rouge2_scores)
rougeL_mean, rougeL_std = np.mean(rougeL_scores), np.std(rougeL_scores)

print(f"Average BLEU Score: {bleu_mean:.4f} ± {bleu_std:.4f}")
print(f"Average ROUGE-1 F1 Score: {rouge1_mean:.4f} ± {rouge1_std:.4f}")
print(f"Average ROUGE-2 F1 Score: {rouge2_mean:.4f} ± {rouge2_std:.4f}")
print(f"Average ROUGE-L F1 Score: {rougeL_mean:.4f} ± {rougeL_std:.4f}")

In [None]:
# Scan every generated block and flag those where the block's FIRST exercise appears again
# with a different 'sets' value. Only the first exercise is used as the reference; other
# repeated exercises are not compared.
c = 0  # number of flagged blocks
d = 0  # number of blocks scanned
for block in model_outputs_list:
    d += 1
    if not block:
        # An empty block has no first exercise to compare against (block[0] would raise
        # IndexError). It still counts towards d.
        continue
    first = block[0]
    # any() stops at the first mismatch, so each block is printed and counted at most once.
    if any(
        entry['exercise'] == first['exercise'] and entry['sets'] != first['sets']
        for entry in block
    ):
        print(block,'\n')
        c += 1
print(c,d)