# Prompt Engineering

## 1. Import necessary libraries
- datasets: For loading and managing datasets from Hugging Face
- transformers: For accessing pre-trained language models and tokenizers

In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig

## 2. Load the DialogSum dataset from Hugging Face
This dataset contains dialogues paired with their summaries

In [None]:
# Hub identifier of the DialogSum dataset (dialogues paired with summaries).
huggingface_dataset_name = "knkarthick/dialogsum"
# Load the dataset; later cells read examples from its "test" split.
dataset = load_dataset(path=huggingface_dataset_name)

## 3. Select specific examples from the test set to demonstrate summarization

In [None]:
# Positions (within the test split) of the dialogues used as running examples.
example_indices = [40, 100]
# Horizontal rule printed between sections of output: a run of 99 dashes.
dash_line = "-" * 99

## 4. Display the selected examples with their human-written summaries
This helps establish a baseline for what good summaries look like

In [None]:
# Print every selected test example followed by its human-written summary.
test_split = dataset["test"]
for position, test_index in enumerate(example_indices, start=1):
    record = test_split[test_index]
    # One print call; sep="\n" puts each piece on its own line, and the
    # final empty string produces the blank line that separates examples.
    print(
        dash_line,
        f"Example {position}",
        dash_line,
        "Input dialog:",
        record["dialogue"],
        dash_line,
        "Baseline human summary:",
        record["summary"],
        dash_line,
        "",
        sep="\n",
    )

## 5. Load the pre-trained FLAN-T5 model and its associated tokenizer
FLAN-T5 is a fine-tuned version of T5 with improved instruction-following capabilities.

In [None]:
# Checkpoint shared by the tokenizer and the sequence-to-sequence model.
model_name = "google/flan-t5-base"
# The two loads are independent of each other; the tokenizer is requested
# in its "fast" variant.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

## 6. Demonstrate basic tokenization process with a simple example

In [None]:
# Round-trip a short sentence through the tokenizer: text -> ids -> text.
sentence = "What time is it, Tom?"
print(f"Original Sentence : {sentence}")
print(dash_line)

# Encoding: the tokenizer output holds the token ids under "input_ids".
sentence_encoded = tokenizer(sentence, return_tensors="pt")
print(f"Sentence Encoded : {sentence_encoded}")
print(dash_line)

# Decoding: take the first (and only) sequence of ids and turn it back into
# text, leaving out the tokenizer's special tokens.
first_sequence_ids = sentence_encoded["input_ids"][0]
sentence_decoded = tokenizer.decode(first_sequence_ids, skip_special_tokens=True)
print(f"Sentence Decoded : {sentence_decoded}")
print(dash_line)


## 7. APPROACH 1: No-prompt summarization
This approach feeds the dialogue directly to the model without any instructions.
It relies on the model's pre-training to generate a summary.

In [None]:
# Baseline: hand the raw dialogue to the model with no instruction text.
for index in example_indices:
    sample = dataset["test"][index]
    dialogue, summary = sample["dialogue"], sample["summary"]
    print(dialogue)

    # Encode the dialogue, generate at most 50 new tokens, and keep the
    # first (only) generated sequence.
    inputs = tokenizer(dialogue, return_tensors="pt")
    generated = model.generate(inputs["input_ids"], max_new_tokens=50)
    model_output = generated[0]
    outputs = tokenizer.decode(model_output, skip_special_tokens=True)

    # Show the model's attempt next to the human reference.
    print(f"Model output - {outputs}")
    print(f"Ground truth - {summary}")
    print(dash_line)

## 8. APPROACH 2: Zero-shot prompting
This approach includes an instruction to "Summarize the following conversation".
Zero-shot means we don't provide any examples of summaries.

In [None]:
# Zero-shot: prepend a one-line instruction to each dialogue, with no worked
# examples of summaries.
for i, index in enumerate(example_indices):
    dialogue = dataset["test"][index]["dialogue"]
    summary = dataset["test"][index]["summary"]
    # Build the prompt with explicit "\n" escapes instead of an indented
    # triple-quoted string: inside triple quotes the loop body's indentation
    # becomes part of the text, which would put stray spaces in front of the
    # dialogue and leave a whitespace-only line at the end of the prompt.
    prompt = f"Summarize the following conversation:\n{dialogue}\n"
    print(prompt)

    # Tokenize the prompt and generate a summary of at most 50 new tokens.
    inputs = tokenizer(prompt, return_tensors="pt")
    model_output = model.generate(
        inputs["input_ids"],
        max_new_tokens=50,
        # Start decoding explicitly from the tokenizer's pad token.
        decoder_start_token_id=tokenizer.pad_token_id,
    )[0]
    outputs = tokenizer.decode(model_output, skip_special_tokens=True)
    print(f"Model output - {outputs}")
    print(f"Ground truth - {summary}")
    print(dash_line)

## 9. APPROACH 3: Few-shot prompting

This function creates a prompt that includes example dialogues and their summaries, followed by the new dialogue to summarize.

In [None]:
def make_prompt(example_indices_full, example_index_to_summarize, examples=None):
    """Build a few-shot summarization prompt.

    Each demonstration contributes its dialogue, the question
    "What is going on?" and its summary. The dialogue to summarize is
    appended last with the same question and no answer, so the model is
    left to complete it.

    Args:
        example_indices_full: Indices of the records used as worked
            demonstrations (dialogue plus summary).
        example_index_to_summarize: Index of the record whose dialogue
            the model should summarize.
        examples: Indexable collection of records with "dialogue" and
            "summary" keys. Defaults to the global ``dataset["test"]``.

    Returns:
        The assembled prompt string.
    """
    if examples is None:
        examples = dataset["test"]

    # The pieces use explicit "\n" escapes rather than indented
    # triple-quoted strings, so source indentation never leaks into the
    # prompt and demonstrations and query share one layout.
    parts = []
    for index in example_indices_full:
        shot = examples[index]
        parts.append(
            f"\nDialogue:\n{shot['dialogue']}\n\n"
            f"What is going on?\n{shot['summary']}\n\n"
        )

    # The query repeats the demonstration layout but stops after the question.
    target = examples[example_index_to_summarize]
    parts.append(f"\nDialogue:\n{target['dialogue']}\n\nWhat is going on?\n")
    return "".join(parts)

In [None]:
# Few-shot run: test examples 40 and 100 act as worked demonstrations and
# example 200 is the dialogue to summarize. The prompt variable is called
# `one_shot_prompt` even though it carries two demonstrations.
example_indices_full = [40, 100]
example_index_to_summarize = 200
one_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

# Human-written reference for the dialogue being summarized.
summary = dataset["test"][example_index_to_summarize]["summary"]

# Encode the prompt, generate at most 50 new tokens, and decode the first
# generated sequence.
inputs = tokenizer(one_shot_prompt, return_tensors="pt")
generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print(f"Ground truth - {summary}")
print(dash_line)
print(f"Model output - {output}")

## 10. APPROACH 4: Few-shot prompting with custom generation parameters
Create a generation configuration with temperature control for more controlled output.

In [None]:
# Same few-shot prompt as before, now decoded by sampling at a low temperature.
# The length limit is given as max_new_tokens=50, matching the other
# approaches, so every approach is compared under the same output budget
# (max_length instead bounds the total sequence length).
generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=0.1)
inputs = tokenizer(one_shot_prompt, return_tensors="pt")
sampled_ids = model.generate(inputs["input_ids"], generation_config=generation_config)
output = tokenizer.decode(sampled_ids[0], skip_special_tokens=True)
print(dash_line)
print(f"Model output with gen config- {output}")