<a href="https://colab.research.google.com/github/mjahid183e/Bangla-Text-summarization-Model/blob/main/BestTrainedVersion_CapstoneDesign_CSE499_Bangla_Text_summarizer_UsingGenerativeAIModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Install libraries**

In [None]:
# Mount Google Drive at /content/drive so that files stored there are reachable
# from this Colab runtime (later cells read and write under /content/drive/MyDrive).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the Drive notebook folder the working directory for every following cell.
%cd '/content/drive/MyDrive/Colab Notebooks/'

/content/drive/MyDrive/Colab Notebooks


In [None]:
!pip install accelerate
!pip install transformers==4.28.0 -q
!pip install transformers[torch]
!pip install torchdata==0.7.0
!pip install datasets sentencepiece evaluate rouge_score -q
# !pip install evaluate==0.4.0
!pip install rouge_score==0.1.2
!pip install --upgrade accelerate -q
!pip install loralib==0.1.1
!pip install peft==0.3.0 --quiet

Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/270.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/270.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m76.2 MB/s[0m eta

In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoConfig
import torch
import time
# import evalute
import pandas as pd
import numpy as np


# **XL-sum dataset**

Here we use XL-Sum Bengali dataset from Hugging Face Library, which is the annotated article-summary pairs from BBC news corpus.

In [None]:
# Download the 'bengali' configuration of the XL-Sum dataset from the Hugging Face Hub.
huggingface_dataset_name = "csebuetnlp/xlsum"
ds = load_dataset(huggingface_dataset_name, 'bengali')
# Last expression of the cell: display the loaded dataset object.
ds

In [None]:
# Test-set rows used as running examples throughout the notebook.
example_indices = [40, 200]
# Horizontal rule (99 dashes) reused by every printing cell below.
dash_line = '-' * 99

# Print each example article followed by its human-written reference summary.
for i, index in enumerate(example_indices):
  sample = ds['test'][index]
  report_lines = [
      dash_line,
      f'Example  {i + 1}',
      dash_line,
      'Input News: ',
      sample['text'],
      dash_line,
      'BaseLine human summary: ',
      sample['summary'],
      dash_line,
      '',
  ]
  print('\n'.join(report_lines))

# Preprocessing, and converting the Dataset into Instruction Dataset (prompt-response)

- To generate inputs for fine-tuning, tokenize texts into ids.
- First, we prepare a **tokenizer** in the pre-trained mT5 model.

In [None]:
# Load the tokenizer that belongs to the pre-trained "google/mt5-small" checkpoint.
t5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]



**Convert the text to token ids (sequences of integers) as follows. The generated inputs (`tokenized_datasets`) hold the token ids of the article text in `input_ids` and the token ids of the summary text in `labels`.**

In [None]:
def tokenize_function(example):
  """Turn a batch of XL-Sum rows into instruction-style model inputs.

  Each article is wrapped as
  ``'Summarize the following text:\\n' + text + '\\nSummary: '`` and tokenized
  (truncated to 1024 tokens); each reference summary is tokenized as the
  target (truncated to 128 tokens).

  No padding is applied here on purpose. Padding at map time stores pad tokens
  inside ``input_ids`` without an attention mask and leaves the label padding
  as the pad id instead of -100. Leaving sequences unpadded lets the
  ``DataCollatorForSeq2Seq`` instances used later in this notebook pad each
  mini-batch dynamically, building the attention mask and padding labels with
  its ``label_pad_token_id``.

  Args:
    example: a batch mapping with at least the list columns ``"text"`` and
      ``"summary"`` (this function is used with ``ds.map(..., batched=True)``).

  Returns:
    The same mapping with two added columns, ``input_ids`` and ``labels``,
    each a list of variable-length token-id lists.
  """
  start_prompt = 'Summarize the following text:\n'
  end_prompt = '\nSummary: '
  prompt = [start_prompt + text + end_prompt for text in example["text"]]

  example['input_ids'] = t5_tokenizer(prompt, max_length=1024, truncation=True).input_ids
  example['labels'] = t5_tokenizer(example["summary"], max_length=128, truncation=True).input_ids

  return example

# Tokenize every split in batches, then drop the raw columns so that only the
# model inputs produced by tokenize_function remain.
tokenized_datasets = (
    ds.map(tokenize_function, batched=True)
      .remove_columns(['id', 'url', 'title', 'summary', 'text'])
)

# Report the (rows, columns) shape of each split, then the dataset summary itself.
print("Shapes of the datasets:")
for split_label, split_name in (("Training", "train"), ("Validation", "validation"), ("Test", "test")):
  print(f"{split_label}: {tokenized_datasets[split_name].shape}")

print(tokenized_datasets)


# **Practice of an example of tokenization (Encoding-Decoding)**

In [None]:
# Round-trip one Bangla sentence through the tokenizer: text -> token ids -> text.
sentence = "বাংলাদেশে করোনাভাইরাসের কারণে নয় মাসেরও বেশি সময় ধরে বন্ধ রয়েছে সব ধরণের শিক্ষা প্রতিষ্ঠান। মনোবিজ্ঞানীরা বলছেন, দীর্ঘ সময় স্কুলের বাইরে থাকার কারণে অনেক শিশুর মধ্যেই আচরণগত পরিবর্তন আসতে পারে।"
sentence_encoded = t5_tokenizer(sentence, return_tensors='pt')

# The encoding is batched (return_tensors='pt'), so row 0 is our single sentence.
sentence_decoded = t5_tokenizer.decode(
    sentence_encoded["input_ids"] [0],
    skip_special_tokens = True
)

print('Encoded Sentence: ')
print(sentence_encoded["input_ids"] [0])
print('\nDecoded Sentence: ')
print(sentence_decoded)
# The two stray non-Python lines that used to follow were removed: they made
# the whole cell fail with "SyntaxError: invalid syntax" before anything ran.

SyntaxError: invalid syntax (<ipython-input-1-0897cf33d2c8>, line 14)

# **Pretrained model**




In [None]:
# Checkpoint of the pre-trained (not yet fine-tuned) model.
model_name = 'google/mt5-small'
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the checkpoint's configuration, overriding the text-generation settings
# listed below.
mt5_config = AutoConfig.from_pretrained(
  model_name,
  max_length=128,
  length_penalty=1,
  no_repeat_ngram_size=2,
  num_beams=15,
)

# Instantiate the seq2seq model with that configuration and move it to `device`.
model = (AutoModelForSeq2SeqLM.from_pretrained(model_name, config=mt5_config).to(device))


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# **Datacollator:**

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator used to assemble mini-batches for the base model
# (passed as collate_fn to the DataLoader that scores the pre-trained model).
data_collator = DataCollatorForSeq2Seq(
    t5_tokenizer,
    model=model,
    max_length=128,
    return_tensors="pt"
)

# **The number of model parameters and finding out how many of them are trainable**

In [None]:
def print_number_of_trainable_model_parameters(model):
  """Summarize how many parameters a model has and how many are trainable.

  Args:
    model: any object exposing ``named_parameters()`` that yields
      ``(name, parameter)`` pairs (e.g. a ``torch.nn.Module``).

  Returns:
    A two-line string: the number of parameters with ``requires_grad`` set,
    then the total number of parameters.
  """
  trainable_model_params = 0
  all_model_params = 0
  for _, param in model.named_parameters():
    all_model_params += param.numel()
    if param.requires_grad:
      trainable_model_params += param.numel()
  # Return only after the whole model has been walked. Returning from inside
  # the loop reported just the first trainable tensor, and yielded None when
  # no parameter was trainable.
  return f"Trainable model parameters: {trainable_model_params}\nAll model parameters: {all_model_params}"

# Show the parameter counts of the pre-trained base model.
print(print_number_of_trainable_model_parameters(model))

**Now let's explore how well the base LLM summarizes a long Bangla news article without any prompt engineering. Prompt engineering is the practice of a human changing the prompt (input) to improve the response for a given task.**

In [None]:
# Baseline: feed the raw article (no instruction prompt) to the pre-trained
# model and compare its output with the human-written summary.
for i, index in enumerate(example_indices):
  bangla_text_corpus = ds['test'] [index] ['text']
  summary = ds['test'] [index] ['summary']

  inputs = t5_tokenizer(bangla_text_corpus, return_tensors='pt')
  output = t5_tokenizer.decode(
      model.generate(
          # The model lives on `device`; the token ids must be moved there too,
          # otherwise generation fails on a GPU runtime.
          inputs["input_ids"].to(device),
          generation_config=GenerationConfig(max_new_tokens=50, do_sample=True, temperature=1.5, num_beams=10),
      )[0],
      skip_special_tokens = True
  )

  print(dash_line)
  print('Example ', i + 1)
  print(dash_line)

  print(f'Input Prompt:\n{bangla_text_corpus}')
  print(dash_line)

  print(f'Baseline Human Summary:\n{summary}')
  print(dash_line)

  print(f'Model Generated Summary - Withouth Any Prompt Engineering/Fine Tuning:\n{output}\n')


# **Zero Shot Inference with an Instruction Prompt:**

In [None]:
# Zero-shot inference: wrap each article in an instruction prompt and let the
# pre-trained model complete it.
for i, index in enumerate(example_indices):
  bangla_text_corpus = ds['test'] [index] ['text']
  summary = ds['test'] [index] ['summary']

  prompt = f"""
summarize the following bangla text corpus.
{bangla_text_corpus}

summary:
  """

  # Input constructed prompt instead of the bangla text corpus
  inputs = t5_tokenizer(prompt, return_tensors='pt')
  output = t5_tokenizer.decode(
      model.generate(
          # Move the token ids to the device the model was placed on;
          # leaving them on the CPU breaks generation on a GPU runtime.
          inputs["input_ids"].to(device),
          max_new_tokens = 50, temperature = 1.5,
      )[0],
      skip_special_tokens = True
  )

  print(dash_line)
  print('Example ', i + 1)

  print(dash_line)
  print(f'Input Prompt:\n{prompt}')
  print(dash_line)

  print(f'Baseline Human Generated Summary:\n{summary}')
  print(dash_line)

  print(f'Model Generated Summary with Zero Shot Inference/In Context Learning:\n{output}\n')


# Let's try with different **prompt**
This is the prompt engineering side of these large language models where we're trying to find the best prompt to pass and seeing if the model does any better with slightly different phrases and in this case just zero-shot inference. No fine tuning.

In [None]:
# Zero-shot inference again, this time with a differently worded prompt
# (article first, question last).
for i, index in enumerate(example_indices):
  bangla_text_corpus = ds['test'] [index] ['text']
  summary = ds['test'] [index] ['summary']

  prompt = f"""
  bangla_text_corpus:
  {bangla_text_corpus}

  #Create a summary of the given Bangali text.
  #প্রবন্ধ থেকে একটি প্রবন্ধ সংক্ষেপ তৈরি করুন: একটি প্রবন্ধ দেওয়া হয়েছে।"
  #what was happening during covid-19 in Bangladesh?
  #করুনা চলাকালীন সময়ে তখন কি হয়েছিল?
  What happened during the Covid-19?
  """

  # Input constructed prompt instead of the bangla text corpus
  inputs = t5_tokenizer(prompt, return_tensors='pt')
  output = t5_tokenizer.decode(
      model.generate(
          # Move the token ids to the device the model was placed on;
          # leaving them on the CPU breaks generation on a GPU runtime.
          inputs["input_ids"].to(device),
          max_new_tokens = 50, temperature = 1.5,
      )[0],
      skip_special_tokens = True
  )

  print(dash_line)
  print('Example ', i + 1)

  print(dash_line)
  print(f'Input Prompt:\n{prompt}')
  print(dash_line)

  print(f'Baseline Human Generated Summary:\n{summary}')
  print(dash_line)

  print(f'Model Generated Summary with Zero Shot Inference/In Context Learning:\n{output}\n')


# Summarize Bangla text corpus with one shot and few shot Inference

*   **One Shot Inference:**



In [None]:
def  make_prompt(example_indices_full, example_index_to_summarize):
  """Build an in-context-learning prompt from solved examples plus one query.

  Args:
    example_indices_full: indices into ``ds['test']`` whose article AND
      reference summary are included as worked examples (one index gives a
      one-shot prompt; an empty list gives a zero-shot prompt).
    example_index_to_summarize: index into ``ds['test']`` of the article the
      model should summarize; its summary is left open at the end.

  Returns:
    The full prompt string, ending with an open ``Summary:`` line.
  """
  prompt = ''
  for index in example_indices_full:
    bangla_text_corpus = ds['test'][index] ['text']
    summary = ds['test'][index]['summary']

    # The stop sequence '{summary}\n\n\n' is important for mt5. Other models may have their own preferred stop sequence.
    prompt += f"""
Summarize the following text:
{bangla_text_corpus}

# What was happen at covid-19 pandmic?
#What was going on?
Summary:
{summary}
"""

  # The query is appended once, AFTER all worked examples. It used to sit
  # inside the loop together with the return, so only the first example was
  # ever used and an empty example list returned None.
  bangla_text_corpus = ds['test'] [example_index_to_summarize] ['text']
  prompt += f"""
Summarize the following text:
{bangla_text_corpus}

# What was happen at covid-19 pandmic?
#What was going on?
Summary:
"""
  return prompt


# Construct the prompt to perform one shot inference:

In [None]:
# One worked example (test row 40) followed by the article to summarize (test row 200).
example_indices_full = [40]
example_index_to_summarize = 200
one_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

# Show the assembled prompt so it can be inspected before it is sent to the model.
print(one_shot_prompt)

**Now pass this prompt to perform the one shot inference:**

In [None]:
# Reference summary of the article being summarized, for side-by-side comparison.
summary = ds['test'] [example_index_to_summarize] ['summary']

inputs = t5_tokenizer(one_shot_prompt, return_tensors='pt')
output = t5_tokenizer.decode(
    model.generate(
        # Move the token ids to the device the model was placed on;
        # leaving them on the CPU breaks generation on a GPU runtime.
        inputs["input_ids"].to(device),
        max_new_tokens = 50, temperature = 1.5,
    )[0],
    skip_special_tokens = True
)

print(dash_line)
print(f'Baseline Human Generated Summary:\n{summary}\n')

print(dash_line)
print(f'Model Generated Summary with One shot Learning:\n{output}')


# Few Shot Inference
Let's explore few shot inference by adding two more full BanglaText-summary pairs to the prompt.

In [None]:
def  make_prompt(example_indices_full, example_index_to_summarize):
  """Build a few-shot prompt: one solved example per index, then the query.

  Note: this definition replaces the earlier ``make_prompt`` of the one-shot
  section and uses a different prompt wording.

  Args:
    example_indices_full: indices into ``ds['test']``; each contributes exactly
      one article/summary pair to the prompt, in the given order.
    example_index_to_summarize: index into ``ds['test']`` of the article the
      model should summarize; no summary is appended for it.

  Returns:
    The full prompt string, ending with the open question line.
  """
  prompt = ''
  for index in example_indices_full:
    bangla_text_corpus = ds['test'][index] ['text']
    summary = ds['test'][index]['summary']

    # The stop sequence '{summary}\n\n\n' is important for mt5. Other models may have their own preferred stop sequence.
    # Each example is added ONCE. The block used to be pasted three times,
    # which repeated the same article instead of adding different ones.
    prompt += f"""
bangla_text_corpus:
{bangla_text_corpus}

# What was happen at covid-19 pandmic?
What was going on?
{summary}
"""

  # Append the query after the loop. With the return inside the loop, only the
  # first index of example_indices_full was ever used.
  bangla_text_corpus = ds['test'] [example_index_to_summarize] ['text']
  prompt += f"""
bangla_text_corpus:
{bangla_text_corpus}

# What was happen at covid-19 pandmic?
What was going on?
"""
  return prompt


In [None]:
# Test rows offered as worked examples, and the test row to summarize.
example_indices_full = [56, 58, 32]
example_index_to_summarize = 200

few_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

# Show the assembled prompt so it can be inspected before it is sent to the model.
print(few_shot_prompt)

Pass this prompt to the model to perform few-shot inference:

In [None]:
# Reference summary of the article being summarized. The column name must be
# the string 'summary'; indexing with the variable `summary` looked up
# whatever text that variable happened to hold and raised a KeyError.
summary = ds['test'] [example_index_to_summarize] ['summary']

inputs = t5_tokenizer(few_shot_prompt, return_tensors='pt')
output = t5_tokenizer.decode(
    model.generate(
        # Move the token ids to the device the model was placed on;
        # leaving them on the CPU breaks generation on a GPU runtime.
        inputs["input_ids"].to(device),
        max_new_tokens = 50,
    )[0],
    skip_special_tokens = True
)

print(dash_line)
print(f'Baseline Human Generated Summary:\n{summary}\n')

print(dash_line)
print(f'Model Generated Summary with Few Shot Learning:\n{output}')


**Playing with Generative Configuration Parameters for Inference and Generated output**



In [None]:
# Sampling configuration under test. Swap in the commented line to compare a
# low temperature with the high one used here.
# generation_config = GenerationConfig(max_new_tokens = 50, do_sample = True, temperature = 0.5)
generation_config = GenerationConfig(max_new_tokens = 40, do_sample = True, temperature = 2.0) # Tempurature value can be 0.1-2 max

inputs = t5_tokenizer(few_shot_prompt, return_tensors='pt')
output = t5_tokenizer.decode(
    model.generate(
        # Move the token ids to the device the model was placed on;
        # leaving them on the CPU breaks generation on a GPU runtime.
        inputs["input_ids"].to(device),
        generation_config = generation_config,

    )[0],
    skip_special_tokens = True
)

print(dash_line)
print(f'Model Generated summary with few shot:\n{output}')
print(dash_line)
print(f'Baseline Human Generated Summary:\n{summary}\n')


**ROUGE: a set of metrics for evaluating automatic text summarization.**
Measuring the quality of generated text is much harder than scoring a classification task. For instance, the two sentences “This is my book” and “The book is mine” share no token at the same position, yet they have a similar meaning.

In [None]:
import evaluate
from nltk.tokenize import RegexpTokenizer

# ROUGE scorer shared by metrics_func below.
rouge_metric = evaluate.load("rouge")

# Function for custom tokenization
def tokenize_sentence(arg):
  """Split ``arg`` into the token strings produced by the mT5 tokenizer.

  The text is first encoded to token ids and the ids are then mapped back to
  their token strings; this function is handed to the ROUGE scorer as its
  tokenizer.
  """
  token_ids = t5_tokenizer(arg)["input_ids"]
  return t5_tokenizer.convert_ids_to_tokens(token_ids)

# Function to get ROUGE scores with custom tokenization
def metrics_func(eval_arg):
  """Compute ROUGE between generated summaries and reference summaries.

  Args:
    eval_arg: a pair ``(preds, labels)`` of token-id arrays; label positions
      holding -100 are treated as padding.

  Returns:
    The result of ``rouge_metric.compute`` using the mT5-based
    ``tokenize_sentence`` tokenizer.
  """
  preds, labels = eval_arg
  sentence_endings = ("!", "!", "?", "?", "।")
  sent_tokenizer_bn = RegexpTokenizer(u'[^!!??।]*[!!??।]')

  def one_sentence_per_line(text):
    # Make sure the text ends with a sentence terminator, then split it into
    # sentences and put each (stripped) sentence on its own line for ROUGE.
    terminated = text if text.endswith(sentence_endings) else text + "।"
    return "\n".join(np.char.strip(sent_tokenizer_bn.tokenize(terminated)))

  # -100 cannot be decoded, so swap it for the tokenizer's pad id first
  labels = np.where(labels != -100, labels, t5_tokenizer.pad_token_id)

  # Token ids -> text
  decoded_preds = t5_tokenizer.batch_decode(preds, skip_special_tokens=True)
  decoded_labels = t5_tokenizer.batch_decode(labels, skip_special_tokens=True)

  # ROUGE with the custom tokenizer
  return rouge_metric.compute(
    predictions=[one_sentence_per_line(p) for p in decoded_preds],
    references=[one_sentence_per_line(l) for l in decoded_labels],
    tokenizer=tokenize_sentence
  )

# **ROUGE scores of the mT5 pretrained model**

In [None]:
from torch.utils.data import DataLoader

# Iterate over the tokenized test split in batches of 10, collated by the
# base-model collator.
sample_dataloader = DataLoader(

  tokenized_datasets["test"].with_format("torch"),

  collate_fn = data_collator,
  batch_size = 10)

# Generate summaries for the FIRST batch only (note the `break`): this is a
# quick spot check of the pre-trained model, not a full test-set evaluation.
for batch in sample_dataloader:
  with torch.no_grad():
    preds = model.generate(
        batch["input_ids"].to(device),
        generation_config=GenerationConfig(max_new_tokens=128, num_beams=15, do_sample=True,
                                           length_penalty=1.0, temperature=1.5, output_attentions=True),
        num_return_sequences=1,
        no_repeat_ngram_size=1,
        remove_invalid_values=True,
        #max_length=128,
        #typical_p=1.0,
        #top_k=50,
        #top_p=1.0,
    )

  labels = batch["labels"]
  break

# ROUGE of those generated summaries against the references of the same batch.
metrics_func([preds, labels])

# **Load fully fine-tuned instruction model**


In [None]:
import os
from transformers import AutoConfig

# Folder on Google Drive holding a previously saved fine-tuned model and its
# training log (the save cell further down writes to this same path).
load_dir = '/content/drive/MyDrive/Colab Notebooks/Instruction_model/instruction_trained_model'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the saved configuration, overriding the settings listed below.
mt5_config = AutoConfig.from_pretrained(
  load_dir,
  output_attentions = True,
  chunk_size_feed_forward = 3,
  max_length=128,
  length_penalty=1.0,
  no_repeat_ngram_size=2,
  num_beams=10,
  finetuning_task = "Bangla Text Summarization"
)

#Instruction Model
instruct_model = (AutoModelForSeq2SeqLM.from_pretrained(load_dir, config=mt5_config).to(device))
#Training log
# Read from disk at this point, i.e. BEFORE the training cell below runs.
df = pd.read_csv(os.path.join(load_dir, 'training_log.csv'))


# Datacollator for the instruction model.

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator for the instruction model: label padding uses -100, the value that
# metrics_func later swaps back to the pad id before decoding.
data_collator1 = DataCollatorForSeq2Seq(
    t5_tokenizer,
    model=instruct_model,
    #padding = True,
    max_length = 128,
    label_pad_token_id = -100,
    return_tensors="pt")

# Fine-Tune the Model with the Preprocessed Dataset

- Here we prepare HuggingFace training arguments.

In [None]:
from transformers import Seq2SeqTrainingArguments

# Number of passes over the training split for this run.
epochs = 1
# Hyper-parameters for the Seq2SeqTrainer: evaluation and logging every 100
# steps, a checkpoint every 500 steps (keeping at most 3), and the best
# checkpoint reloaded when training ends.
training_args = Seq2SeqTrainingArguments(
  output_dir = "bangla-text-summarizer",
  log_level = "error",
  num_train_epochs = epochs,
  learning_rate = 4e-4,
  warmup_steps = 70,
  optim = "adafactor",
  weight_decay = 0.01,
  per_device_train_batch_size = 8, # 2, 16
  per_device_eval_batch_size = 8, # 1, 8
  gradient_accumulation_steps = 1,
  auto_find_batch_size = True,
  eval_delay =0,
  max_grad_norm = 1,
  lr_scheduler_type = "linear",
  #use_cpu = False,
  #seed = 42,
  evaluation_strategy = "steps",
  eval_steps = 100,
  # Evaluate by generating text so that metrics_func receives generated token ids.
  predict_with_generate=True,
  generation_max_length = 128,
  save_strategy = "steps",
  save_steps = 500,
  save_total_limit = 3,
  load_best_model_at_end = True,
  logging_steps = 100, # 500, 10
  logging_nan_inf_filter = True,
  remove_unused_columns = True,
  push_to_hub = False
)

# **Trainer Class**

In [None]:
from transformers import Seq2SeqTrainer

# Fine-tune the instruction model on the full training split. Evaluation uses
# only the first 30 validation rows.
trainer = Seq2SeqTrainer(

  model = instruct_model,
  args = training_args,
  data_collator = data_collator1,
  compute_metrics = metrics_func,

  train_dataset = tokenized_datasets['train'],
  eval_dataset = tokenized_datasets['validation'].select(range(30)),
  tokenizer = t5_tokenizer,
)

# Start training.
trainer.train()


## **Save the trained model**

In [None]:
import os
# import pandas as pd

# Destination folder on Google Drive for the fine-tuned model and its log.
save_dir = '/content/drive/MyDrive/Colab Notebooks/Instruction_model/instruction_trained_model'
os.makedirs(save_dir, exist_ok=True)

# Persist the trainer's log history as CSV next to the model.
log_df = pd.DataFrame(trainer.state.log_history)
log_df.to_csv(os.path.join(save_dir, 'training_log.csv'))

# Save the fine-tuned weights. If the trainer's model exposes a `.module`
# attribute (i.e. it is a wrapper), save the wrapped model instead.
model_to_save = getattr(trainer.model, "module", trainer.model)
model_to_save.save_pretrained(save_dir)


Performance Analysis:

In [None]:
# Keep only the evaluation columns of the training log and drop rows where any
# of them is missing.
# NOTE(review): `df` was read from training_log.csv in the model-loading cell,
# before trainer.train() ran, so these rows appear to describe the previously
# saved run rather than the training performed above - confirm, and re-read
# the CSV here if the current run is what should be analysed.
df_eval = df[[
    'epoch',
    'step',
    'eval_loss',
    'eval_rouge1',
    'eval_rouge2',
    'eval_rougeL',
    'eval_rougeLsum',
    ]].dropna()
df_eval

# ROUGE score for the trained mT5 model:

In [None]:
from torch.utils.data import DataLoader

# Iterate over the tokenized test split in batches of 10, collated by the
# instruction-model collator.
sample_dataloader = DataLoader(

  tokenized_datasets["test"].with_format("torch"),
  collate_fn = data_collator1,
  batch_size = 10)

# Generate summaries with the fine-tuned model for the FIRST batch only
# (note the `break`).
for batch in sample_dataloader:
  with torch.no_grad():

    preds = instruct_model.generate(
        batch["input_ids"].to(device),
        remove_invalid_values=True,
        generation_config=GenerationConfig(max_new_tokens=128, num_beams=15, do_sample=True, top_k=50, top_p=1.0,
                                           typical_p=1.0, temperature=1.5, use_cache=True, output_attentions=True),
        #typical_p=1.0,
        #top_k=50,
        #top_p=1.0,
        #max_length = 128,
        length_penalty=1.0,
        no_repeat_ngram_size=2,
        num_return_sequences=1,
    )

  labels = batch["labels"]
  break

# ROUGE of those generated summaries against the references of the same batch.
metrics_func([preds, labels])


Plots

In [None]:
from  matplotlib import pyplot as plt

In [None]:
# Training-loss values from the log, skipping rows that carry no loss.
df_loss = df['loss'].dropna()
df_loss

In [None]:
# Distinct step numbers from the log, keeping the first occurrence of each.
df_step = df['step'].drop_duplicates(keep='first')
df_step

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

# Take step and loss from the SAME log rows. Building x from the de-duplicated
# steps of every row and y from only the rows that have a loss can give two
# series of different length (rows without a loss, such as evaluation rows,
# still contribute steps), which makes the plot call fail.
train_log = df[['step', 'loss']].dropna(subset=['loss'])
df_step = train_log['step']
df_loss = train_log['loss']

ax.plot(df_step, df_loss)
ax.legend(['Training Loss'], loc='upper right', fontsize=14)
ax.set_xlabel('Steps', fontsize=14)
ax.set_ylabel('Loss', fontsize=14)

# Write the image before showing the figure so the saved file always contains
# the fully drawn plot.
fig.savefig(os.path.join(load_dir, 'training_loss.png'), bbox_inches='tight')
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

# Validation loss against training step, drawn through the axes object.
ax.plot(df_eval['step'], df_eval['eval_loss'], color='green')
ax.legend(['Validation Loss'], loc='upper right', fontsize='14')
ax.set_xlabel('Steps', fontsize='14')
ax.set_ylabel('Loss', fontsize='14')
plt.show()

# Store the figure next to the loaded model.
fig = ax.get_figure()
fig.savefig(os.path.join(load_dir, 'validation_loss.png'), bbox_inches='tight')

Total Training Time

In [None]:
df_backup = df

# Training time: 'train_runtime' (seconds) is read from the last row of the log
last_row = df_backup.iloc[-1]
column_value = last_row['train_runtime']

# Split the seconds into whole hours, minutes and seconds
hours, seconds_remaining = divmod(column_value, 3600)
minutes, seconds = divmod(seconds_remaining, 60)
hours, minutes, seconds = int(hours), int(minutes), int(seconds)

print(f"Training Time: {hours}:{minutes}:{seconds}")

In [None]:
df_backup = df

# Total training time: 'train_runtime' (seconds) from the last row of the log
last_row = df_backup.iloc[-1]
total_runtime = last_row['train_runtime']

# Break the total into hours / minutes / seconds
total_hours, total_seconds_remaining = divmod(total_runtime, 3600)
total_minutes, total_seconds = divmod(total_seconds_remaining, 60)
total_hours, total_minutes, total_seconds = int(total_hours), int(total_minutes), int(total_seconds)

# Report the total as H:MM:SS
total_training_time = f"Total Training Time: {total_hours}:{total_minutes:02d}:{total_seconds:02d}"
print(total_training_time)

# Number of epochs the logged run is assumed to have had.
# NOTE(review): the training cell above sets epochs = 1 - confirm that 10
# matches the run stored in the log.
num_epochs = 10

# Average runtime of a single epoch
epoch_runtime = total_runtime / num_epochs

# Break the per-epoch runtime into hours / minutes / seconds
epoch_hours, epoch_seconds_remaining = divmod(epoch_runtime, 3600)
epoch_minutes, epoch_seconds = divmod(epoch_seconds_remaining, 60)
epoch_hours, epoch_minutes, epoch_seconds = int(epoch_hours), int(epoch_minutes), int(epoch_seconds)

# Report the per-epoch runtime as H:MM:SS
epoch_time = f"Individual Runtime per Epoch: {epoch_hours}:{epoch_minutes:02d}:{epoch_seconds:02d}"
print(epoch_time)

# **Evaluation of the Model Qualitatively (Human Evaluation)**

In [None]:
from torch.utils.data import DataLoader

# Prediction with test data: only the first batch (10 rows) is generated,
# because the loop below stops after one iteration.
sample_dataloader = DataLoader(
  tokenized_datasets["test"].with_format("torch"),
  collate_fn = data_collator1,
  batch_size=10)

for batch in sample_dataloader:
  with torch.no_grad():
    preds = instruct_model.generate(
        batch["input_ids"].to(device),
        remove_invalid_values=True,
        generation_config=GenerationConfig(max_new_tokens=128, num_beams=15, do_sample=True, top_k=50, top_p=1.0, typical_p=1.0, temperature=1, use_cache=True, output_attentions=True),
        #typical_p=1.0,
        #top_k=50,
        #top_p=1.0,
        #max_length = 128,
        length_penalty=1.0,
        no_repeat_ngram_size=2,
        num_return_sequences=1,
    )

  labels = batch["labels"]
  break

# Not the last expression of the cell, so its return value is not displayed.
metrics_func([preds, labels])


# Replace -100 (the label padding value) with the pad id so the labels can be decoded
labels = np.where(labels != -100, labels, t5_tokenizer.pad_token_id)

# Convert id tokens to text
text_preds = t5_tokenizer.batch_decode(preds, skip_special_tokens=True)
text_labels = t5_tokenizer.batch_decode(labels, skip_special_tokens=True)

# Show the third row of the batch (index 2): article, reference, model output.
print("Input Text-Bangla Text Corpus: ")
print(ds["test"]["text"][2])

print("Baseline Human Summary: ")
print(text_labels[2])

print("Instruct Model Generated Summary: ")
print(text_preds[2])


# Evaluate the New Instruction Fine Tuned Model (Human Evaluation:)

In [None]:
# Test row used for this single-example comparison.
index = 125
text_corpus1 = ds['test'] [index] ['text']
human_baseline_summary = ds['test'] [index] ['summary']

# Instruction prompt wrapped around the article.
prompt = f"""
summarize the following text.
{text_corpus1}

Summary:
"""
# return_tensors="pt" is passed after the prompt so the tokenizer returns PyTorch tensors
input_ids = t5_tokenizer(prompt, return_tensors="pt").input_ids

#original_model_outputs = model.generate(input_ids = input_ids, generation_config=GenerationConfig(max_new_tokens=128, do_sample=True, temperature=1, num_beams=10))
#original_model_text_output = t5_tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

# Generate with the fine-tuned model; the token ids are moved to its device first.
instruct_model_outputs = instruct_model.generate(
    input_ids=input_ids.to(device),
    remove_invalid_values=True,
    generation_config=GenerationConfig(max_new_tokens=128, num_beams=15, do_sample=True, top_k=50,
                                       top_p=1.0, typical_p=1.0, temperature=1.5, use_cache=True, output_attentions=True),
    #typical_p=1.0,
    #top_k=50,
    #top_p=1.0,
    #max_length = 128,
    length_penalty=1.0,
    no_repeat_ngram_size=2,
    num_return_sequences=1,
)
instruct_model_text_output = t5_tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

# Print the article, the human reference and the model output.
print("Input Text Corpus: ")
print(ds["test"][index]["text"])
#print(dash_line)
print(f'Baseline Human Summary:\n {human_baseline_summary}')
#print(dash_line)
#print(f'Original Model summary:\n{original_model_text_output}')
#print(dash_line)
print(f'Instruct Model Generated Summary:\n {instruct_model_text_output}')


# Evaluate the Model Quantitatively (with ROUGE Metric)

The ROUGE metric helps quantify the validity of summarizations produced by models. It compares summaries to a 'Baseline' summary which is usually created by a human.

In [None]:
# Five test articles (rows 10-14) and their human-written summaries.
text_corpus2 = ds['test'] [10:15] ['text']
human_baseline_Summaries = ds['test'] [10:15] ['summary']

original_model_summaries = []
instruction_model_summaries = []

# Summarize every article with both the pre-trained and the fine-tuned model.
for text in text_corpus2:
  prompt = f"""
summarize the following text.
{text}

summary: """
  # Both models were moved to `device`, so the token ids must live there too;
  # leaving them on the CPU breaks generation on a GPU runtime.
  input_ids = t5_tokenizer(prompt, return_tensors="pt").input_ids.to(device)

  original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=128, do_sample=True, temperature=1, num_beams=5))
  original_model_text_output = t5_tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
  original_model_summaries.append(original_model_text_output)

  instruction_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=128, do_sample=True, temperature=1, num_beams=5))
  instruction_model_text_output = t5_tokenizer.decode(instruction_model_outputs[0], skip_special_tokens=True)
  instruction_model_summaries.append(instruction_model_text_output)

# One row per article: human reference, base-model summary, fine-tuned-model summary.
zipped_summaries = list(zip(human_baseline_Summaries, original_model_summaries, instruction_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_Summaries', 'original_model_summaries', 'instruction_model_summaries'])
df


# Evaluate the models computing ROUGE metrics. Notice the improvement in the results!

In [None]:
rouge_metric = evaluate.load("rouge")
print(len(original_model_summaries))
print(human_baseline_Summaries)
print(instruction_model_summaries)

sentence_endings = ("!", "!", "?", "?", "।")
sent_tokenizer_bn = RegexpTokenizer(u'[^!!??।]*[!!??।]')

def _one_sentence_per_line(texts):
  """Ensure each text ends with a sentence terminator and put one sentence per line."""
  terminated = [(t if t.endswith(sentence_endings) else t + "।") for t in texts]
  return ["\n".join(np.char.strip(sent_tokenizer_bn.tokenize(t))) for t in terminated]

# Apply the SAME normalization to all three sets of summaries. The instruct
# model's summaries used to be skipped, so the two models were scored on
# differently formatted text.
original_model_summaries = _one_sentence_per_line(original_model_summaries)
instruction_model_summaries = _one_sentence_per_line(instruction_model_summaries)
human_baseline_Summaries = _one_sentence_per_line(human_baseline_Summaries)

# Use the mT5-based tokenizer, as metrics_func does. The scorer's default
# tokenizer keeps only lowercase ASCII letters and digits, which removes
# Bengali text entirely; the English stemmer option is therefore not passed.
original_model_results = rouge_metric.compute(
    predictions = original_model_summaries,
    references = human_baseline_Summaries,
    use_aggregator = True,
    tokenizer = tokenize_sentence,
)

# Score the instruct model against the HUMAN references. It used to be
# compared with its own predictions, which always yields a perfect score.
instruction_model_results = rouge_metric.compute(
    predictions = instruction_model_summaries,
    references = human_baseline_Summaries,
    use_aggregator = True,
    tokenizer = tokenize_sentence,
)

print('Original Model ROUGE Scores:')
print(original_model_results)
print('Instruct Model ROUGE Scores:')
print(instruction_model_results)


In [None]:
# Stand-alone ROUGE demonstration on two hand-written prediction/reference pairs.
rouge = evaluate.load('rouge')
predictions = ["গত বেশ কিছুদিন", "বাংলাদেশে স্বাস্থ্য অধিদপ্তরের তথ্যে দেখা গেছে, গত সপ্তাহে যেখানে নমুনা পরীক্ষার বিচারে কোভিড-১৯ রোগী শনাক্তের হার ছিল ২৩ শতাংশ, কয়েকদিনের ব্যবধানে সেটি এখন ১৩ শতাংশে নেমে এসেছে।"]
references = ["গত বেশ কিছুদিন বলছে", "general kenobi"]
# use_aggregator=False is passed so the scores are not aggregated across pairs.
results = rouge.compute(predictions=predictions,
                        references=references,
                        use_aggregator=False)
# Names of the computed metrics, then the ROUGE-1 result.
print(list(results.keys()))
print(results["rouge1"])

In [None]:
# The figures below are (instruct model score - original model score), so the
# heading names the original model as the baseline, not the human references.
print("Absolute percentage improvement of Instruct model over Original model")

# Subtract metric by metric using the key, rather than relying on both result
# dictionaries listing their metrics in the same order.
improvement = np.array([
    instruction_model_results[key] - original_model_results[key]
    for key in instruction_model_results
])
for key, value in zip(instruction_model_results.keys(), improvement):
  print(f'{key}: {value*100:.2f}%')



In [None]:
# Extra packages for the demo UI below. gradio is imported by the next cell.
# NOTE(review): no use of lida is visible in this notebook - confirm it is still needed.
!pip install lida==0.0.4
!pip install gradio

In [None]:
import gradio as gr
from torch.utils.data import DataLoader
import torch
def summary(text):
    """Summarize ``text`` with the fine-tuned instruction model.

    The text is tokenized (truncated to 1024 tokens), summarized with beam
    sampling, and the first decoded sequence is returned as a string.
    NOTE(review): the text is passed to the model as-is, without the
    instruction prompt that tokenize_function adds to training inputs -
    confirm this is intended.
    """
    # Token ids as a (1, sequence_length) tensor: a batch holding one sequence.
    token_ids = t5_tokenizer(text, truncation=True, max_length=1024)["input_ids"]
    input_ids = torch.tensor(token_ids).unsqueeze(0)

    decoding_config = GenerationConfig(
        max_new_tokens=128,
        num_beams=15,
        do_sample=True,
        top_k=50,
        top_p=1.0,
        temperature=1,
        use_cache=True,
        output_attentions=True,
    )

    with torch.no_grad():
        preds = instruct_model.generate(
            input_ids.to(device),
            remove_invalid_values=True,
            generation_config=decoding_config,
            length_penalty=1.0,
            no_repeat_ngram_size=2,
            num_return_sequences=1,
        )

    # Only one sequence is requested, so return the first decoded string.
    return t5_tokenizer.batch_decode(preds, skip_special_tokens=True)[0]


In [None]:
# Gradio UI: a multi-line input box wired to summary(), a multi-line output
# box, and flagging options that let users rate the generated summary.
interface = gr.Interface(
    fn = summary,
    inputs =gr.Textbox(lines=10, placeholder="Enter Your Text Here ..."),
    outputs =gr.Textbox(lines=10, placeholder=""),
    # outputs="text",
    title='Bangla Text Summarization',

    description = "Flag if you find any erroneous output",
    flagging_options = ['Excellent summary', 'Nice summary', 'Unclear summary', 'Need to improve a litte'],
    theme = 'darkpeach'
)

In [None]:
import os
from getpass import getpass

# Never keep login credentials in the notebook: it is shared on GitHub and
# anyone who can read it could log in to the demo. Read them from environment
# variables, or ask for them interactively when the variables are not set.
gradio_username = os.environ.get("GRADIO_USERNAME") or input("Gradio username: ")
gradio_password = os.environ.get("GRADIO_PASSWORD") or getpass("Gradio password: ")

interface.launch(auth=(gradio_username, gradio_password)) #debug = True

In [None]:
# Disabled snippet: the code below sits inside a string literal, so it is not
# executed. As written it would zip the models folder with shutil.make_archive.
"""import shutil

path = '/content/drive/MyDrive/Colab Notebooks/NLP/models'
source_file = path
destination_file = path

shutil.make_archive(destination_file, 'zip', source_file)
"""

In [None]:
# Display row 350 of the test split.
ds['test'][350]