In [None]:
# Mount Google Drive; the data, hyperparameter and output paths used in this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
 !pip install transformers datasets sentencepiece optuna

In [None]:
pip install -U kaleido

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [None]:
import json
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments,EarlyStoppingCallback
import torch
from transformers import Seq2SeqTrainer
from torch.utils.data import Dataset, DataLoader
from itertools import product
import optuna
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import random
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy

# Load the spaCy English pipeline once at module level; post_process() uses this global `nlp` for lemmatization.
nlp = spacy.load("en_core_web_sm")
# download necessary resources
# stopwords: read by post_process() via stopwords.words('english')
nltk.download('stopwords')
# punkt: tokenizer data (post_process() calls nltk.word_tokenize)
nltk.download('punkt')
# wordnet: not referenced by the code visible in this notebook - TODO confirm it is still needed
nltk.download('wordnet')
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:

# Read the training split (JSON) from Drive.
with open('/content/drive/MyDrive/MTP Project/t5_data/train_data.json') as train_fh:
    train_data = json.load(train_fh)

# Read the validation split (JSON) from Drive.
with open('/content/drive/MyDrive/MTP Project/t5_data/val_data.json') as val_fh:
    val_data = json.load(val_fh)

# Report how many samples each split contains.
print(f"Total Training dataset samples : {len(train_data)}")
print(f"Total validation set samples : {len(val_data)}")

# DataFrame views of both splits; rows are tokenized one by one later on.
df_train = pd.DataFrame(train_data)
df_val = pd.DataFrame(val_data)

Total Training dataset samples : 77796
Total validation set samples : 15000


In [None]:
def preprocess(row,max_input_len,max_target_len,tok):
    """Tokenize one example into model inputs plus labels.

    Args:
        row: Mapping with an 'input' (source text) and an 'output' (target text) entry.
        max_input_len: Maximum length passed to the tokenizer for the source text.
        max_target_len: Maximum length passed to the tokenizer for the target text.
        tok: Tokenizer callable; called once with the source text and once with
            ``text_target=`` for the target text, both with truncation enabled.

    Returns:
        The tokenizer's result for the source text, with the target's
        'input_ids' stored under the 'labels' key.
    """
    source_text, target_text = row['input'], row['output']

    # Encode the source side.
    encoded = tok(source_text, max_length=max_input_len, truncation=True)

    # Encode the target side and keep only its token ids as the labels.
    encoded_target = tok(text_target=target_text, max_length=max_target_len, truncation=True)
    encoded['labels'] = encoded_target['input_ids']

    return encoded
class MyDataset(Dataset):
    """Dataset wrapper around an indexable collection of tokenized examples.

    Each stored example must provide 'input_ids', 'attention_mask' and 'labels'.
    """

    def __init__(self, data):
        # Keep a reference to the examples; nothing is copied.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Expose only the three fields needed for training, in a fixed order.
        example = self.data[idx]
        return {
            field: example[field]
            for field in ('input_ids', 'attention_mask', 'labels')
        }



In [None]:
# Read the tuned hyperparameters (stored as JSON) and show them.
with open("/content/drive/MyDrive/MTP Project/out_attr/hyperparameters-attr/final_params_dup.json") as params_fh:
    params = json.load(params_fh)
print(params)

{'learning_rate': 5.285698854152201e-05, 'weight_decay': 0.00025829870123799276, 'num_train_epochs': 19, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16, 'input_seq_length': 256, 'output_seq_length': 14, 'dropout_rate': 0.1133866716350046}


In [None]:
import optuna.visualization as vis
import plotly.express as px

In [None]:
# Restore the study object saved during hyperparameter tuning.
# NOTE: joblib.load unpickles the file, so only load study files from a trusted source.
import joblib
study = joblib.load("/content/drive/MyDrive/MTP Project/out_attr/hyperparameters-attr/study_dup.pkl")

# Summarize the best trial: objective value followed by each parameter.
best_trial = study.best_trial
print("Best trial until now:")
print(" Value: ", best_trial.value)
print(" Params: ")
for param_name, param_value in best_trial.params.items():
    print(f"    {param_name}: {param_value}")

Best trial until now:
 Value:  0.2896277606487274
 Params: 
    learning_rate: 5.285698854152201e-05
    weight_decay: 0.00025829870123799276
    num_train_epochs: 19
    per_device_train_batch_size: 16
    per_device_eval_batch_size: 16
    input_seq_length: 256
    output_seq_length: 14
    dropout_rate: 0.1133866716350046


In [None]:
# Show the hyperparameter-importance plot for the loaded study (rendered as the cell's output).
vis.plot_param_importances(study)

In [None]:
# Build the parallel-coordinate plot for the loaded study and render it.
fig = vis.plot_parallel_coordinate(study)
fig.show()

In [None]:

# Checkpoint that is fine-tuned in this notebook.
model_name = "t5-base"

# Seq2seq model, created with the tuned dropout rate.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    dropout_rate=params['dropout_rate'],
)

# Tokenizer belonging to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Collator that turns lists of tokenized examples into batches for the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
def _tokenize_frame(frame):
    """Run preprocess() on every row of `frame` and wrap the results in a MyDataset."""
    # Row-wise apply yields a Series holding one tokenized example per row.
    encoded_rows = frame.apply(
        lambda row: preprocess(
            row,
            params['input_seq_length'],
            params['output_seq_length'],
            tokenizer,
        ),
        axis=1,
    )
    # MyDataset indexes a plain list of examples.
    return MyDataset(encoded_rows.to_list())


# Tokenized training and validation sets consumed by the trainer.
train_dataset = _tokenize_frame(df_train)
val_dataset = _tokenize_frame(df_val)

In [None]:
# Directory that receives the training checkpoints.
# NOTE: this module-level name shadows the builtin dir(); kept because it is a notebook global.
dir='/content/drive/MyDrive/MTP Project/out_attr/checkpoints/'

training_args = Seq2SeqTrainingArguments(
    output_dir=dir,
    # Evaluate, checkpoint and log once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy='epoch',
    # Tuned optimisation hyperparameters.
    learning_rate=params['learning_rate'],
    weight_decay=params['weight_decay'],
    num_train_epochs=params['num_train_epochs'],
    per_device_train_batch_size=params['per_device_train_batch_size'],
    per_device_eval_batch_size=params['per_device_eval_batch_size'],
    # Limit how many checkpoints are kept on disk.
    save_total_limit=1,
    # Evaluate on generated sequences so compute_metrics can decode them.
    predict_with_generate=True,
    # At the end of training, restore the checkpoint with the lowest eval_loss.
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

In [None]:
# English stopwords, built once instead of on every post_process() call.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def post_process(text):
    """Normalize a decoded prediction or label string into comparison tokens.

    Pipeline: lowercase -> NLTK word tokenization -> stopword removal ->
    spaCy lemmatization (module-level ``nlp``) -> removal of characters that
    are neither alphanumeric nor whitespace -> NLTK word tokenization.

    This cell previously defined the function twice; the first copy was dead
    code (immediately shadowed) and reloaded the spaCy model on every call.
    Both copies also computed the stopword-filtered token list but then joined
    the unfiltered tokens, so stopwords were never actually removed. Metrics
    computed with this version therefore differ from values logged earlier.

    Args:
        text: Decoded text (str).

    Returns:
        list[str]: The normalized tokens; may be empty.
    """
    # Lowercase, tokenize, and strip any spaces that ended up inside a token.
    tokens = [tok.replace(" ", "") for tok in nltk.word_tokenize(text.lower())]

    # Drop English stopwords.
    content_tokens = [tok for tok in tokens if tok not in _ENGLISH_STOP_WORDS]

    # Lemmatize with the spaCy pipeline loaded at the top of the notebook.
    doc = nlp(' '.join(content_tokens))
    lemmatized_text = " ".join(token.lemma_ for token in doc)

    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', lemmatized_text)

    # Tokenize again to obtain the final token list.
    return nltk.word_tokenize(cleaned_text)

In [None]:
# English stopwords, built once instead of on every post_process() call.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


#function defined to apply postprocessing to predicted and label text
def post_process(text):
    """Normalize a decoded prediction or label string into comparison tokens.

    Pipeline: lowercase -> NLTK word tokenization -> stopword removal ->
    spaCy lemmatization (module-level ``nlp``) -> removal of characters that
    are neither alphanumeric nor whitespace -> NLTK word tokenization.

    The earlier version computed the stopword-filtered token list but then
    joined the unfiltered tokens, so stopwords were never actually removed.
    Metrics computed with this version therefore differ from values logged
    earlier.

    Args:
        text: Decoded text (str).

    Returns:
        list[str]: The normalized tokens; may be empty.
    """
    # Lowercase, tokenize, and strip any spaces that ended up inside a token.
    tokens = [tok.replace(" ", "") for tok in nltk.word_tokenize(text.lower())]

    # Drop English stopwords.
    content_tokens = [tok for tok in tokens if tok not in _ENGLISH_STOP_WORDS]

    # Lemmatize with the spaCy pipeline loaded at the top of the notebook.
    doc = nlp(' '.join(content_tokens))
    lemmatized_text = " ".join(token.lemma_ for token in doc)

    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', lemmatized_text)

    # Tokenize again to obtain the final token list.
    return nltk.word_tokenize(cleaned_text)

In [None]:
def compute_metrics(eval_pred):
    """Compute relaxed token-overlap precision, recall and F1 for an evaluation run.

    Every decoded prediction and label is normalized with ``post_process``.
    For one sample, the overlap is the number of distinct tokens shared by the
    prediction and the label; precision is overlap / len(prediction tokens) and
    recall is overlap / len(label tokens), each 0 when its token list is empty.
    The per-sample values are averaged over the set, and F1 is the harmonic
    mean of the two averages (0 when both are 0).

    Args:
        eval_pred: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may be
            a tuple, in which case its first element is used.

    Returns:
        dict with the keys 'Relaxed Precision', 'Relaxed Recall' and 'F1-score'.
    """
    # Loaded locally so the function does not depend on a notebook-level tokenizer.
    tokenizer = AutoTokenizer.from_pretrained("t5-base")
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker that cannot be decoded; map it to the pad
    # token in both arrays before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    sample_precisions, sample_recalls = [], []
    for pred_text, label_text in zip(decoded_preds, decoded_labels):
        pred_tokens = post_process(pred_text)
        label_tokens = post_process(label_text)

        # Number of distinct tokens present in both prediction and label.
        overlap = len(set(pred_tokens).intersection(label_tokens))

        # Relaxed precision / recall, defined as 0 for an empty token list.
        sample_precisions.append(overlap / len(pred_tokens) if pred_tokens else 0)
        sample_recalls.append(overlap / len(label_tokens) if label_tokens else 0)

    # Averaging an empty list would yield NaN; report 0 instead.
    precision = np.mean(sample_precisions) if sample_precisions else 0.0
    recall = np.mean(sample_recalls) if sample_recalls else 0.0

    # Guard against 0/0 when nothing overlaps anywhere in the set.
    denominator = precision + recall
    f1 = 2 * precision * recall / denominator if denominator > 0 else 0.0

    return {'Relaxed Precision': precision, 'Relaxed Recall': recall, 'F1-score': f1}





In [None]:
# Assemble the trainer from the model, arguments, datasets, collator and metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned value is displayed as the cell output.
trainer.train()

In [None]:
# Display the log records the trainer accumulated during training.
trainer.state.log_history

In [None]:
# Keep a handle on the log history so it can be printed and written to disk.
logs=trainer.state.log_history

In [None]:
# Print the raw log records (training loss and evaluation metrics per epoch).
print(logs)

[{'loss': 0.6005, 'learning_rate': 5.0075041776178746e-05, 'epoch': 1.0, 'step': 4863}, {'eval_loss': 0.3017459511756897, 'eval_Relaxed Precision': 0.910667380952381, 'eval_Relaxed Recall': 0.9117305291005291, 'eval_F1-score': 0.9111986449174556, 'eval_runtime': 355.6786, 'eval_samples_per_second': 42.173, 'eval_steps_per_second': 2.637, 'epoch': 1.0, 'step': 4863}, {'loss': 0.299, 'learning_rate': 4.7293095010835485e-05, 'epoch': 2.0, 'step': 9726}, {'eval_loss': 0.25741755962371826, 'eval_Relaxed Precision': 0.9245016666666667, 'eval_Relaxed Recall': 0.9249411375661376, 'eval_F1-score': 0.9247213499021161, 'eval_runtime': 357.2206, 'eval_samples_per_second': 41.991, 'eval_steps_per_second': 2.626, 'epoch': 2.0, 'step': 9726}, {'loss': 0.2168, 'learning_rate': 4.451114824549222e-05, 'epoch': 3.0, 'step': 14589}, {'eval_loss': 0.24358727037906647, 'eval_Relaxed Precision': 0.9315002525252525, 'eval_Relaxed Recall': 0.9352176719576719, 'eval_F1-score': 0.9333552607697361, 'eval_runtime'

In [None]:
import json
# Write the trainer's log history to Drive as JSON.
with open('/content/drive/MyDrive/MTP Project/out_attr/logs_new.json', 'w') as log_fh:
    json.dump(logs, log_fh)


In [None]:
# Save the fine-tuned model to Drive so it can be reloaded for inference.
model.save_pretrained('/content/drive/MyDrive/MTP Project/out_attr/t5_attr_model')

In [None]:
# Save the tokenizer to Drive; it is reloaded together with the model for inference.
tokenizer.save_pretrained('/content/drive/MyDrive/MTP Project/out_attr/t5_attr_tokenizer')

('/content/drive/MyDrive/MTP Project/out_attr/t5_attr_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/MTP Project/out_attr/t5_attr_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/MTP Project/out_attr/t5_attr_tokenizer/spiece.model',
 '/content/drive/MyDrive/MTP Project/out_attr/t5_attr_tokenizer/added_tokens.json',
 '/content/drive/MyDrive/MTP Project/out_attr/t5_attr_tokenizer/tokenizer.json')

In [None]:
# Evaluate the trainer's model on the validation set and print the returned metrics.
eval_result = trainer.evaluate(eval_dataset=val_dataset)
print(eval_result)

In [None]:
# Report only the validation loss from the evaluation result.
print(f"Evaluation loss : {eval_result['eval_loss']}")

Evaluation loss : 0.23476430773735046


In [None]:
# Load the saved tokenizer and fine-tuned model for inference.
tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/MTP Project/out_attr/t5_attr_tokenizer')
model = AutoModelForSeq2SeqLM.from_pretrained('/content/drive/MyDrive/MTP Project/out_attr/t5_attr_model')

# from_pretrained() leaves the model on the CPU, while the inference cells move
# their inputs to the GPU. Put the model on the GPU too (when one is available)
# so generate() does not fail with a device mismatch.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the test data with 25 samples of each seq length.
with open('/content/drive/MyDrive/MTP Project/t5_data/test_data_100_seperate.json') as test_fh:
    rule_test = json.load(test_fh)

# Number of loaded test samples (shown as the cell output).
len(rule_test)

100

In [None]:
# Turn every test record into a prompt for the model, keeping the reference
# attribute and the original fields alongside it for the comparison printout.
test_json = [
    {
        'input': sample['sentence']+" "+ f"What is the attribute related to entity {sample['entity']} with quantity {sample['quantity']}?",
        'output': sample['measuring attribute'],
        'entity': sample['entity'],
        'quantity': sample['quantity'],
        'sen': sample['sentence'],
    }
    for sample in rule_test
]
len(test_json)

In [None]:
# Predict the attribute given sentence, entity and quantity.
# `res` collects one dictionary per sample with the keys 'sentence' and 'attribute'.
res=[]
for sample in test_json:
    # Named `prompt` rather than `input`, which shadowed the builtin.
    prompt = sample['input']

    # 256 equals the input_seq_length the model was trained with.
    tokenized_inputs = tokenizer(prompt, max_length=256, truncation=True, return_tensors='pt')
    # Send the inputs to the device the model actually lives on; a hard-coded
    # 'cuda' fails whenever the model is on the CPU.
    tokenized_inputs = tokenized_inputs.to(model.device)
    output_sequence = model.generate(**tokenized_inputs)

    print(sample['sen'])
    print("------------------------------Rule based result-------------------------------")
    print(f" Entity : {sample['entity']}")
    print(f" Quantity : {sample['quantity']} ")
    print(f" Attribute : {sample['output']} ")
    print("-------------------------------- T5 Result------------------------------------")
    # Decode the generated ids back into text.
    output_text = tokenizer.decode(output_sequence[0], skip_special_tokens=True)
    print(f" Attribute : {output_text}")
    print('\n\n')

    res.append({'sentence': sample['sen'], 'attribute': output_text})

In [None]:
import json
# Write the predictions collected in `res` to Drive as JSON.
with open('/content/drive/MyDrive/MTP Project/out_attr/output_test_100.json', 'w') as out_fh:
    json.dump(res, out_fh)
#stored as test_25samples_prediction.json

In [None]:
 # Load the new test data according to distribution.
 with open('/content/drive/MyDrive/MTP Project/t5_data/new_test_data_100.json') as test_fh:
     rule_test = json.load(test_fh)
 # Number of loaded test samples (shown as the cell output).
 len(rule_test)

100

In [None]:
# Turn every record of the new test set into a prompt for the model, keeping the
# reference attribute and the original fields for the comparison printout.
test_json=[]
for sample in rule_test:
    prompt = sample['sentence']+" "+ f"What is the attribute related to entity {sample['entity']} with quantity {sample['quantity']}?"

    test_json.append({
        'input': prompt,
        'output': sample['measuring attribute'],
        # Each element of rule_test is already the record dict, so read its
        # fields directly. The previous rule_test[i][0] / rule_test[i][2]
        # indexed the list with a dict and raised TypeError.
        'entity': sample['entity'],
        'quantity': sample['quantity'],
        # Store the sentence text, not the whole record.
        'sen': sample['sentence'],
    })
len(test_json)

In [None]:
# Predict the attribute for every sample of the new test set.
# `res` collects one dictionary per sample with the keys 'sentence' and 'attribute'.
res=[]
for sample in test_json:
    # Named `prompt` rather than `input`, which shadowed the builtin.
    prompt = sample['input']

    # 256 equals the input_seq_length the model was trained with.
    tokenized_inputs = tokenizer(prompt, max_length=256, truncation=True, return_tensors='pt')
    # Send the inputs to the device the model actually lives on; a hard-coded
    # 'cuda' fails whenever the model is on the CPU.
    tokenized_inputs = tokenized_inputs.to(model.device)
    output_sequence = model.generate(**tokenized_inputs)

    print(sample['sen'])
    print("------------------------------Rule based result-------------------------------")
    print(f" Entity : {sample['entity']}")
    print(f" Quantity : {sample['quantity']} ")
    print(f" Attribute : {sample['output']} ")
    print("-------------------------------- T5 Result------------------------------------")
    # Decode the generated ids back into text.
    output_text = tokenizer.decode(output_sequence[0], skip_special_tokens=True)
    print(f" Attribute : {output_text}")
    print('\n\n')

    res.append({'sentence': sample['sen'], 'attribute': output_text})

100

In [None]:
import json
# Write the predictions collected in `res` to Drive as JSON.
with open('/content/drive/MyDrive/MTP Project/out_attr/output_new_test_100.json', 'w') as out_fh:
    json.dump(res, out_fh)
#stored as test_distribution_prediction.json