In [None]:
from typing import List, Optional

from datasets import load_dataset, DatasetDict
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

### Loading dataset from Hugging Face 

In [None]:
"""
The dataset consists of 14k+ dialogues and their summaries.
"""
# Identifier passed to load_dataset below.
HUGGINGFACE_DATASET_NAME: str = "knkarthick/dialogsum"

# The loaded object is kept in `dataset`; the later cells read its "test" split.
dataset = load_dataset(HUGGINGFACE_DATASET_NAME)

In [None]:
def print_example_dialogue(dialogue_index: int, dialogue_dataset: DatasetDict) -> None:
    """
    Print one dialogue from the "test" split together with its baseline summary.

    Both the dialogue and the summary are read from ``dialogue_dataset`` so the
    function does not depend on any notebook-level variable.

    Args:
        dialogue_index: Position of the example inside ``dialogue_dataset["test"]``.
        dialogue_dataset: Dataset whose "test" split holds records with
            "dialogue" and "summary" fields.
    """
    # 99-character separator line.
    dash_line: str = "-" * 99
    # Look the record up once so the dialogue and summary always come from the same example.
    example = dialogue_dataset["test"][dialogue_index]

    print("Example ", dialogue_index)
    print(dash_line)
    print("INPUT DIALOGUE:")
    print(example["dialogue"])
    print(dash_line)
    print("BASELINE HUMAN SUMMARY:")
    print(example["summary"], "\n")
    print(dash_line)

In [None]:
# Print two sample dialogues (indices 40 and 200) together with their summaries.
EXAMPLE_INDICIES: List[int] = [40, 200]

for idx in EXAMPLE_INDICIES:
    print_example_dialogue(dialogue_index=idx, dialogue_dataset=dataset)

### Loading model from Hugging Face

In [None]:
"""
WARNING: FLAN-T5-base has ~250M parameters and its weights take up ~1GB
(use SageMaker Studio Lab instead of running it locally)
"""
# Checkpoint name; the same name is reused in a later cell to load the tokenizer.
model_name = "google/flan-t5-base"

# Load the sequence-to-sequence model for that checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Create the tokenizer from the same checkpoint name as the model (`model_name`).
# use_fast=True asks for the fast tokenizer implementation when one is available.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [None]:
# Tokenizer round trip: text -> token ids -> text.
sentence: str = "What time is it, Tom?"

sentence_encoded = tokenizer(sentence, return_tensors="pt")

# Only the first sequence of the encoded output is shown and decoded.
first_sequence_ids = sentence_encoded["input_ids"][0]
sentence_decoded = tokenizer.decode(first_sequence_ids, skip_special_tokens=True)

print(
    "ENCODED SENTENCE:",
    first_sequence_ids,
    "\nDECODED SENTENCE:",
    sentence_decoded,
    sep="\n",
)

### Using LLM without Prompt-Engineering

In [None]:
example_indices = [40, 200]
# 99-character separator line.
dash_line: str = "-" * 99

# Feed the raw dialogue to the model, with no instruction around it.
for i, index in enumerate(example_indices):
    test_example = dataset["test"][index]
    dialogue = test_example["dialogue"]
    summary = test_example["summary"]

    # Tokenise, generate at most 50 new tokens, then decode the first generated sequence.
    inputs = tokenizer(dialogue, return_tensors="pt")
    generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print(dash_line)
    print("Example ", i + 1)
    for section in (
        dash_line,
        f"INPUT PROMPT:\n{dialogue}",
        dash_line,
        f"BASELINE HUMAN SUMMARY:\n{summary}",
        dash_line,
        f"MODEL GENERATION - WITHOUT PROMPT ENGINEERING:\n{output}\n",
    ):
        print(section)

### Zero-shot Inference

In [None]:
# Zero-shot inference: the prompt states what we want to achieve without giving any examples.
for i, index in enumerate(example_indices):
    test_example = dataset["test"][index]
    dialogue = test_example["dialogue"]
    summary = test_example["summary"]

    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
    """

    # Feed the constructed prompt (not the bare dialogue) to the model.
    inputs = tokenizer(prompt, return_tensors="pt")
    generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print(dash_line)
    print("Example ", i + 1)
    for section in (
        dash_line,
        f"INPUT PROMPT:\n{prompt}",
        dash_line,
        f"BASELINE HUMAN SUMMARY:\n{summary}",
        dash_line,
        f"MODEL GENERATION - ZERO SHOT:\n{output}\n",
    ):
        print(section)

### One-shot Inference

In [None]:
# One-shot inference: the prompt contains one worked example of the desired input and output.

def make_prompt(
    example_indices_full: List[int],
    example_index_to_summarize: int,
    dialogue_dataset: Optional[DatasetDict] = None,
) -> str:
    """
    Build a one-shot / few-shot summarisation prompt from the "test" split.

    Every index in ``example_indices_full`` contributes a solved example
    (dialogue followed by its summary). The dialogue at
    ``example_index_to_summarize`` is appended last with the summary left out.

    Args:
        example_indices_full: Indices of the examples to include with their summaries.
        example_index_to_summarize: Index of the dialogue whose summary is requested.
        dialogue_dataset: Dataset whose "test" split holds records with "dialogue"
            and "summary" fields. When omitted, the notebook-level ``dataset`` is used.

    Returns:
        The assembled prompt text.
    """
    if dialogue_dataset is None:
        # Keep the two-argument call working: fall back to the notebook-level dataset.
        dialogue_dataset = dataset
    test_split = dialogue_dataset["test"]

    prompt_parts: List[str] = []
    for index in example_indices_full:
        example = test_split[index]
        dialogue = example["dialogue"]
        summary = example["summary"]

        # The stop sequence '{summary}\n\n\n' is important for FLAN-T5.
        # Other models may have their own preferred stop sequence.
        prompt_parts.append(f"""
Dialogue:

{dialogue}

What was going on?
{summary}


""")

    dialogue = test_split[example_index_to_summarize]["dialogue"]

    # Final, unsolved example: the summary is left out.
    prompt_parts.append(f"""
Dialogue:

{dialogue}

What was going on?
""")

    return "".join(prompt_parts)

In [None]:
# One solved example (index 40) is placed before the dialogue to summarise (index 200).
example_indices_full = [40]
example_index_to_summarize = 200

one_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

# Show the full prompt that will be sent to the model.
print(one_shot_prompt)

In [None]:
# Baseline summary stored in the dataset for the dialogue being summarised.
summary = dataset["test"][example_index_to_summarize]["summary"]

# Tokenise the one-shot prompt, generate at most 50 new tokens, then decode the first sequence.
inputs = tokenizer(one_shot_prompt, return_tensors="pt")
generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

for section in (
    dash_line,
    f"BASELINE HUMAN SUMMARY:\n{summary}\n",
    dash_line,
    f"MODEL GENERATION - ONE SHOT:\n{output}",
):
    print(section)

### Few-shot Inference

In [None]:
# Few-shot inference: the prompt contains several solved examples (three here) before the
# dialogue to summarise.
# NOTE: if the model still does not give the desired output after 4-5 examples, fine-tune the model.
example_indices_full = [40, 80, 120]
example_index_to_summarize = 200

few_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

# Show the full prompt that will be sent to the model.
print(few_shot_prompt)

In [None]:
# Baseline summary stored in the dataset for the dialogue being summarised.
summary = dataset["test"][example_index_to_summarize]["summary"]

# Tokenise the few-shot prompt, generate at most 50 new tokens, then decode the first sequence.
inputs = tokenizer(few_shot_prompt, return_tensors="pt")
generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

for section in (
    dash_line,
    f"BASELINE HUMAN SUMMARY:\n{summary}\n",
    dash_line,
    f"MODEL GENERATION - FEW SHOT:\n{output}",
):
    print(section)