# Load data

In [3]:
from dataset_utils import load_dataset_from_csv

ds_type = "c2s"
setting = "masked"
num_examples = "n6"
output_name = f"{ds_type}-{setting}"

dataset = load_dataset_from_csv(ds_type, setting)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 1049
    })
    validation: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 134
    })
    test: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 116
    })
})


# Load the model

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

torch.cuda.empty_cache()
# LLAMA 3.2 3B
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-3B-Instruct', cache_dir="../models/llama/") 
model =  AutoModelForCausalLM.from_pretrained('meta-llama/Llama-3.2-3B-Instruct', cache_dir="../models/llama/", device_map ={'':torch.cuda.current_device()})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
ZERO_SHOT_SYSTEM_PROMPT = """You are an expert in generating exactly one short explanatory sentence (made up of around 10 words or fewer) in a plain English for a given context text. 
Your task is to provide additional information related to the complex statement, term, action, or concept that is semantically missing from the context text.
You may do this by offering a definition, examples, background knowledge, general statements, a description of the flow of actions, or an explanation of the reason or result of the target action.
The tone should be plain and simple!
Return only ONE short concise explanatory sentence!
"""

In [3]:
# text with elaboration sentence masked -> elaboration sentence
SYSTEM_PROMPT_MASKED = """You are an expert in generating exactly one short explanatory sentence (made up of around 10 words or fewer) in a plain English for a given context text. 
Your task is to provide additional information related to the complex statement, term, action, or concept that is semantically missing from the context text.
You may do this by offering a definition, examples, background knowledge, general statements, a description of the flow of actions, or an explanation of the reason or result of the target action.
Your task is to replace the `<explanatory sentence>` tag in the provided text with the explanation sentence you generate.
Return only the explanation sentence itself, without any tags, formatting, or additional text.
The tone should be plain and simple!
Return only ONE short concise explanatory sentence!
"""

In [3]:
# text with elaboration sentence masked -> text with filled-out elaboration sentence
SYSTEM_PROMPT_MASKED2 = """You are an expert in generating exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in a plain English for a given context text. 
Your task is to replace the `<explanatory sentence>` tag in the provided text with an explanation sentence that adds relevant information to clarify a complex statement, term, action, or concept that is semantically missing from the text.
The tone should be plain and simple!
"""

In [5]:
SYSTEM_PROMPT_SHORT = """You are an expert in generating exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in a plain English for a given context text. 
The tone should be plain and simple!"""

In [5]:
model, tokenizer = setup_chat_format(model, tokenizer)
formatting_func = conversations_formatting_function(tokenizer, "messages")

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [30]:
formatted_example = formatting_func(formatted_train_dataset[0])
print(formatted_example)

<|im_start|>system
You are an expert in generating exactly one short explanatory sentence (made up of around 10 words or fewer) in a plain English for a given context text. 
Your task is to provide additional information related to the complex statement, term, action, or concept that is semantically missing from the context text.
You may do this by offering a definition, examples, background knowledge, general statements, a description of the flow of actions, or an explanation of the reason or result of the target action.
Your task is to replace the `<explanatory sentence>` tag in the provided text with the explanation sentence you generate.
Return only the explanation sentence itself, without any tags, formatting, or additional text.
The tone should be plain and simple!
Return only ONE short concise explanatory sentence!
<|im_end|>
<|im_start|>user
Context: A watermark is an image that can be seen in the paper when you hold it up to the light. Investigators say Kellogg tried to copy t

In [19]:
formatted_train_dataset[0]["messages"]

[{'content': 'You are an expert in generating exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in a plain English for a given context text. \nThe tone should be plain and simple!',
  'role': 'system'},
 {'content': ": Return the explanation sentence for the following context text: 'A watermark is an image that can be seen in the paper when you hold it up to the light. Investigators say Kellogg tried to copy the watermark.'. The subject of the explanation sentence should be: 'Investigators'.",
  'role': 'user'},
 {'content': "Here's how they say he did it.", 'role': 'assistant'}]

In [33]:
print(tokenizer.eos_token)

<|im_end|>


In [18]:
def create_user_message_subject(example):
    return f": Return the explanation sentence for the following context text: '{example['source_text']}'. The subject of the explanation sentence should be: '{example['subject']}'."


def format_example(example):
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT_SHORT},
            {"role": "user", "content": create_user_message_subject(example)}, # Context or Text
            {"role": "assistant", "content":  f"{example['elaboration_sentence']}"}
        ]
    }
    

def format_test_example(example):
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT_SHORT},
            {"role": "user", "content": create_user_message_subject(example)},
             {"role": "assistant", "content":""}
        ]
    }

formatted_train_dataset = dataset["train"].map(format_example)
formatted_validation_dataset = dataset["validation"].map(format_example)
formatted_test_dataset = dataset["test"].map(format_test_example)

Map:   0%|          | 0/1046 [00:00<?, ? examples/s]

Map:   0%|          | 0/132 [00:00<?, ? examples/s]

Map:   0%|          | 0/116 [00:00<?, ? examples/s]

In [20]:
def calculate_mean_message_length(dataset):
    total_length = 0
    message_count = 0

    for entry in dataset:
        messages = entry.get('messages', [])
        for message in messages:
            content = message.get('content', "")
            total_length += len(content.split())  # Count words in the content
            message_count += 1

    return total_length / message_count if message_count > 0 else 0

mean_length = calculate_mean_message_length(formatted_train_dataset)
print(f"Mean length of messages: {mean_length}")

Mean length of messages: 28.91459528362014


# Alpaca format

#### Examples

In [56]:
from prompt_utils import examples_dict
print(examples_dict.keys())

dict_keys(['Definition', 'Example', 'Analogy', 'Background', 'Reason', 'Contrast', 'Result', 'Speculation', 'Supplementation'])


In [4]:
from prompt_utils import formatting_prompt_func 

formatted_test_dataset = formatting_prompt_func(dataset["test"], setting, num_examples)
print(formatted_test_dataset[0])

### User: You are an expert in clarifying unclear or complex terms and concepts in a given text. Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in plain English for a given context text. The tone should be plain and simple! Do not add any comments to your answer! 
For example:

context text: 'She teaches at the University of Utah. In 1974, Wiessner recorded conversations among the Ju/'hoansi Bushmen. <explanatory sentence> They live in a vast area of 124 miles in southwestern Africa. Their lives have changed since the 1970s.'
Assistant: 'The Bushmen are a group of people who hunt animals and gather wild berries and plants to eat.'

context text: 'There are differences in how the increases would work. The differences have to do with how the cost of living would be measured. <explanatory sentence> The minimum wage in Alaska would be based on prices in Alaska. South Dakota would raise the minimum wage based on changes to a nat

### BASE

#### Short version

In [3]:
from prompt_utils import examples_dict, base_prompt, insert_examples, create_user_message
# set the pad_token for llama 3.2 3B
tokenizer.pad_token = tokenizer.eos_token
EOS_TOKEN = tokenizer.eos_token

"""def create_user_message(context):
    return f"Return an explanation sentence for the following context text: '{context}'." 
    """


test_alpaca_prompt = """### User: You are an expert in clarifying unclear or complex terms and concepts in a given text. Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in plain English for a given context text. The tone should be plain and simple! Do not add any comments to your answer! 
For example: 
context text: 'The environment is essential for sustaining life, providing clean air, water, and fertile soil. Protecting it ensures a healthier planet for future generations. '
assistant: 'This includes reducing pollution and conserving resources.'

context text: 'Japan is known for its rich cultural heritage and advanced technology. Its landscapes range from serene cherry blossom gardens to towering Mount Fuji.'
assistant: 'Mount Fuji is the tallest mountain in Japan.'

{}\n### Assistant:"""

versions = {
    "n3":['Definition','Example','Background'],
    "n6":['Definition','Example','Background', 'Supplementation', 'Analogy', 'Speculation'],
    "n9":['Definition', 'Example', 'Analogy', 'Background', 'Reason', 'Contrast', 'Result', 'Speculation', 'Supplementation']
}

filtered_dict = {key: value for key, value in examples_dict.items() if key in versions[num_examples]}

def formatting_test_prompts_func(examples):
    
    contexts = examples["source_text"]
    texts = []
    for context in contexts:
        # must add EOS_TOKEN, otherwise the generation wont stop
        text = base_prompt.format(insert_examples(filtered_dict, setting),create_user_message(context, setting)) 
        texts.append(text)
    return texts

formatted_test_dataset = formatting_test_prompts_func(dataset["test"])

In [4]:
print(formatted_test_dataset[0])

### User: You are an expert in clarifying unclear or complex terms and concepts in a given text. Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in plain English for a given context text. The tone should be plain and simple! Do not add any comments to your answer! 
For example:

context text: 'She teaches at the University of Utah. In 1974, Wiessner recorded conversations among the Ju/'hoansi Bushmen. They live in a vast area of 124 miles in southwestern Africa. Their lives have changed since the 1970s.'
target_phrase='Bushmen'
Assistant: 'The Bushmen are a group of people who hunt animals and gather wild berries and plants to eat.'

context text: 'There are differences in how the increases would work. The differences have to do with how the cost of living would be measured. The minimum wage in Alaska would be based on prices in Alaska. South Dakota would raise the minimum wage based on changes to a national measure of the c

#### Long version

In [17]:
from prompt_utils import examples_dict, base_prompt, insert_examples, examples_dict_from_validation_dataset

# set the pad_token for llama 3.2 3B
tokenizer.pad_token = tokenizer.eos_token
EOS_TOKEN = tokenizer.eos_token

def create_user_message(context):
    return f"Return an explanation sentence for the following context text: '{context}'."


def formatting_test_prompts_func(examples):
    contexts = examples["source_text"]
    texts = []
    for context in contexts:
        # must add EOS_TOKEN, otherwise the generation wont stop
        text = base_prompt.format(insert_examples(examples_dict_from_validation_dataset),create_user_message(context))
        texts.append(text)
    return texts

formatted_test_dataset = formatting_test_prompts_func(dataset["test"])
print(formatted_test_dataset[1])

### User: You are an expert in clarifying unclear or complex terms and concepts in a given text. Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in plain English for a given context text. The tone should be plain and simple! Do not add any comments to your answer! 
For example:

context text: 'Together they slowed down the large group of trucks carrying the machine parts. They were not able to stop the trucks for long, though. This one was carrying a giant water evaporator. After being blocked, it continued on its way when police arrested 20 of the protesters.'
Assistant: 'Trucks that travel in groups are known as a convoy.'

context text: 'There are differences in how the increases would work. The differences have to do with how the cost of living would be measured. The minimum wage in Alaska would be based on prices in Alaska. South Dakota would raise the minimum wage based on changes to a national measure of the cost of l

#### Random version

In [25]:
from prompt_utils import examples_dict, base_prompt, insert_random_examples

# set the pad_token for llama 3.2 3B
tokenizer.pad_token = tokenizer.eos_token
EOS_TOKEN = tokenizer.eos_token

def create_user_message(context):
    return f"Return an explanation sentence for the following context text: '{context}'."


def formatting_test_prompts_func(examples):
    contexts = examples["source_text"]
    texts = []
    for context in contexts:
        # must add EOS_TOKEN, otherwise the generation wont stop
        text = base_prompt.format(insert_random_examples(examples_dict,num_examples=2),create_user_message(context)) 
        texts.append(text)
    return texts

formatted_test_dataset = formatting_test_prompts_func(dataset["test"])
print(formatted_test_dataset[2])

### User: You are an expert in clarifying unclear or complex terms and concepts in a given text. Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in plain English for a given context text. The tone should be plain and simple! Do not add any comments to your answer! 
For example:

context text: 'Japan is known for its rich cultural heritage and advanced technology. Its landscapes range from cherry blossom gardens to towering Mount Fuji.'
Assistant: 'Mount Fuji is the tallest mountain in Japan.'

context text: 'One of the most thrilling events in winter sports is ski jumping. Ski jumping is a winter sport where athletes glide down a ramp and jump to achieve maximum distance.'
Assistant: 'As they glide down, they gain speed, which helps them jump higher into the air.'

Return an explanation sentence for the following context text: 'WASHINGTON – At least four people died in Midwest floods this Spring. But the death toll could hav

### Masked version

In [138]:
# set the pad_token for llama 3.2 3B
tokenizer.pad_token = tokenizer.eos_token
EOS_TOKEN = tokenizer.eos_token

def create_user_message(context):
    return f"Return the explanation sentence that could replace the `<explanatory sentence>` tag in the following text: '{context}'."

test_alpaca_prompt = """### User: Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in a plain English that could replace the <explanatory sentence> tag in a given context text. The tone should be plain and simple! Do not add any comments to your answer! 
For example: 
context text: 'The environment is essential for sustaining life, providing clean air, water, and fertile soil. <explanatory sentence> Protecting the environment ensures a healthier planet for future generations. '
assistant: 'Fertile soil is ideal for growing plants.'

context text: 'Japan is known for its rich cultural heritage and advanced technology. Its landscapes range from serene cherry blossom gardens to towering Mount Fuji. <explanatory sentence>'
assistant: 'Mount Fuji is the tallest mountain in Japan.'

{}\n### Assistant:"""

def formatting_test_prompts_func(examples):
    contexts = examples["source_text"]
    texts = []
    for context in contexts:
        # must add EOS_TOKEN, otherwise the generation wont stop
        text = test_alpaca_prompt.format(create_user_message(context)) 
        texts.append(text)
    return texts

formatted_test_dataset = formatting_test_prompts_func(dataset["test"])

In [139]:
formatted_test_dataset[0]

'### User: Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in a plain English that could replace the <explanatory sentence> tag in a given context text. The tone should be plain and simple! Do not add any comments to your answer! \nFor example: \ncontext text: \'The environment is essential for sustaining life, providing clean air, water, and fertile soil. <explanatory sentence> Protecting the environment ensures a healthier planet for future generations. \'\nassistant: \'Fertile soil is ideal for growing plants.\'\n\ncontext text: \'Japan is known for its rich cultural heritage and advanced technology. Its landscapes range from serene cherry blossom gardens to towering Mount Fuji. <explanatory sentence>\'\nassistant: \'Mount Fuji is the tallest mountain in Japan.\'\n\nReturn the explanation sentence that could replace the `<explanatory sentence>` tag in the following text: \'They did not need special skills or a college edu

### Specifying subject

In [83]:
# set the pad_token for llama 3.2 3B
tokenizer.pad_token = tokenizer.eos_token
EOS_TOKEN = tokenizer.eos_token

def create_user_message_subject(context, subject):
    return f"Return the explanation sentence for the following context text: '{context}'. The explanation sentence should refer to the {subject}."
print(tokenizer.eos_token )

test_alpaca_prompt = """### User: Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in a plain English for a given context text. The tone should be plain and simple! Do not add any comments to your answer! 
For example: 
context text: 'The environment is essential for sustaining life, providing clean air, water, and fertile soil. Protecting it ensures a healthier planet for future generations. '
subject='protecting the environment'
assistant: 'This includes reducing pollution and conserving resources.'

context text: 'Japan is known for its rich cultural heritage and advanced technology. Its landscapes range from serene cherry blossom gardens to towering Mount Fuji.'
subject='Mount Fuji'
assistant: 'Mount Fuji is the tallest mountain in Japan.'

{}\n### Assistant:"""

def formatting_test_prompts_func(examples):
    contexts = examples["source_text"]
    subjects = examples["subject"]
    texts = []
    for context, subject in zip(contexts, subjects):
        # must add EOS_TOKEN, otherwise the generation wont stop
        text = test_alpaca_prompt.format(create_user_message_subject(context, subject)) 
        texts.append(text)
    return texts

formatted_test_dataset = formatting_test_prompts_func(dataset["test"])

<|eot_id|>


In [84]:
formatted_test_dataset[0]

'### User: Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in a plain English for a given context text. The tone should be plain and simple! Do not add any comments to your answer! \nFor example: \ncontext text: \'The environment is essential for sustaining life, providing clean air, water, and fertile soil. Protecting it ensures a healthier planet for future generations. \'\nsubject=\'protecting the environment\'\nassistant: \'This includes reducing pollution and conserving resources.\'\n\ncontext text: \'Japan is known for its rich cultural heritage and advanced technology. Its landscapes range from serene cherry blossom gardens to towering Mount Fuji.\'\nsubject=\'Mount Fuji\'\nassistant: \'Mount Fuji is the tallest mountain in Japan.\'\n\nReturn the explanation sentence for the following context text: \'Factories have closed and their low-skill manufacturing jobs are long gone. The new companies in town require workers w

### Specifying the target phrase

In [5]:
from prompt_utils import examples_dict, base_prompt, insert_examples, create_user_message

# set the pad_token for llama 3.2 3B
tokenizer.pad_token = tokenizer.eos_token
EOS_TOKEN = tokenizer.eos_token

def create_user_message_target(context, target):
    return f"Return the explanation sentence for the following context text: '{context}'. The explanation sentence should specifically clarify the {target}."

test_alpaca_prompt = """### User: You are an expert in clarifying unclear or complex terms and concepts in a given text. Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in plain English for a given context text. The tone should be plain and simple! Do not add any comments to your answer! 
For example: 
context text: 'Japan is known for its rich cultural heritage and advanced technology. Its landscapes range from cherry blossom gardens to towering Mount Fuji.'
target_phrase='cultural heritage'
Assistant: This heritage includes traditional arts like tea ceremony or calligraphy. 

context text: 'One of the most thrilling events in winter sports is ski jumping. Ski jumping is a winter sport where athletes glide down a ramp and jump to achieve maximum distance.'
target_phrase='glide down'
Assistant: As they glide down, they gain speed, which helps them jump higher into the air.

{}\n### Assistant:"""

versions = {
    "n3":['Definition','Example','Background'],
    "n6":['Definition','Example','Background', 'Supplementation', 'Analogy', 'Speculation'],
    "n9":['Definition', 'Example', 'Analogy', 'Background', 'Reason', 'Contrast', 'Result', 'Speculation', 'Supplementation']
}

filtered_dict = {key: value for key, value in examples_dict.items() if key in versions[num_examples]}

def formatting_test_prompts_func(examples):
    contexts = examples["source_text"]
    targets = examples["target_sentence_target"]
    texts = []
    for context, target in zip(contexts, targets):
        # must add EOS_TOKEN, otherwise the generation won't stop
        text = base_prompt.format(insert_examples(filtered_dict, setting), create_user_message(context, setting, target)) 
        texts.append(text)
    return texts

formatted_test_dataset = formatting_test_prompts_func(dataset["test"])

In [6]:
print(formatted_test_dataset[0])

### User: You are an expert in clarifying unclear or complex terms and concepts in a given text. Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in plain English for a given context text. The tone should be plain and simple! Do not add any comments to your answer! 
For example:

context text: 'She teaches at the University of Utah. In 1974, Wiessner recorded conversations among the Ju/'hoansi Bushmen. They live in a vast area of 124 miles in southwestern Africa. Their lives have changed since the 1970s.'
target_phrase='Bushmen'
Assistant: 'The Bushmen are a group of people who hunt animals and gather wild berries and plants to eat.'

context text: 'There are differences in how the increases would work. The differences have to do with how the cost of living would be measured. The minimum wage in Alaska would be based on prices in Alaska. South Dakota would raise the minimum wage based on changes to a national measure of the c

### Specifying the target sentence for clarification

In [37]:
# set the pad_token for llama 3.2 3B
tokenizer.pad_token = tokenizer.eos_token
EOS_TOKEN = tokenizer.eos_token

def create_user_message_target(context, target):
    return f"Return the explanation sentence for the following context text: '{context}'. The explanation sentence should specifically clarify the target_sentence={target}"

test_alpaca_prompt = """### User: You are an expert in clarifying unclear or complex terms and concepts in a given text. Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in plain English for a given context text. The tone should be plain and simple! Do not add any comments to your answer! 
For example: 
context text: 'Japan is known for its rich cultural heritage and advanced technology. Its landscapes range from cherry blossom gardens to towering Mount Fuji.'
target_sentence='Japan is known for its rich cultural heritage and advanced technology.'
Assistant: This heritage includes traditional arts like tea ceremony or calligraphy. 

context text: 'One of the most thrilling events in winter sports is ski jumping. Ski jumping is a winter sport where athletes glide down a ramp and jump to achieve maximum distance.'
target_sentence='Ski jumping is a winter sport where athletes glide down a ramp and jump to achieve maximum distance.'
Assistant: As they glide down, they gain speed, which helps them jump higher into the air.

{}\n### Assistant:"""

def formatting_test_prompts_func(examples):
    contexts = examples["source_text"]
    targets = examples["target_sentence_4o"]
    texts = []
    for context, target in zip(contexts, targets):
        # must add EOS_TOKEN, otherwise the generation wont stop
        text = test_alpaca_prompt.format(create_user_message_target(context, target)) 
        texts.append(text)
    return texts

formatted_test_dataset = formatting_test_prompts_func(dataset["test"])

In [38]:
print(formatted_test_dataset[1])

### User: You are an expert in clarifying unclear or complex terms and concepts in a given text. Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in plain English for a given context text. The tone should be plain and simple! Do not add any comments to your answer! 
For example: 
context text: 'Japan is known for its rich cultural heritage and advanced technology. Its landscapes range from cherry blossom gardens to towering Mount Fuji.'
target_sentence='Japan is known for its rich cultural heritage and advanced technology.'
Assistant: This heritage includes traditional arts like tea ceremony or calligraphy. 

context text: 'One of the most thrilling events in winter sports is ski jumping. Ski jumping is a winter sport where athletes glide down a ramp and jump to achieve maximum distance.'
target_sentence='Ski jumping is a winter sport where athletes glide down a ramp and jump to achieve maximum distance.'
Assistant: As they g

### Specifying both the target phrase/ subject and the target sentence

In [48]:
# set the pad_token for llama 3.2 3B
tokenizer.pad_token = tokenizer.eos_token
EOS_TOKEN = tokenizer.eos_token

def create_user_message_target(context, target,target_sentence):
    return f"Return the explanation sentence for the following context text: '{context}'. The explanation sentence should specifically clarify the target_sentence='{target_sentence}' by referring to the {target}."

test_alpaca_prompt = """### User: You are an expert in clarifying unclear or complex terms and concepts in a given text. Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in plain English for a given context text. The tone should be plain and simple! Do not add any comments to your answer! 
For example: 
context text: 'Japan is known for its rich cultural heritage and advanced technology. Its landscapes range from cherry blossom gardens to towering Mount Fuji.'
target_sentence='Japan is known for its rich cultural heritage and advanced technology.'
target_phrase='cultural heritage'
Assistant: This heritage includes traditional arts like tea ceremony or calligraphy. 

context text: 'One of the most thrilling events in winter sports is ski jumping. Ski jumping is a winter sport where athletes glide down a ramp and jump to achieve maximum distance.'
target_sentence='Ski jumping is a winter sport where athletes glide down a ramp and jump to achieve maximum distance.'
target_phrase='glide down'
Assistant: As they glide down, they gain speed, which helps them jump higher into the air.

{}\n### Assistant:"""

def formatting_test_prompts_func(examples):
    contexts = examples["source_text"]
    targets = examples["target_sentence_target"] # target_sentence_target subject
    target_sents = examples["target_sentence_4o"]
    texts = []
    for context, target, target_sent in zip(contexts, targets, target_sents):
        text = test_alpaca_prompt.format(create_user_message_target(context, target, target_sent)) 
        texts.append(text)
    return texts

formatted_test_dataset = formatting_test_prompts_func(dataset["test"])

In [49]:
print(formatted_test_dataset[0])

### User: You are an expert in clarifying unclear or complex terms and concepts in a given text. Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in plain English for a given context text. The tone should be plain and simple! Do not add any comments to your answer! 
For example: 
context text: 'Japan is known for its rich cultural heritage and advanced technology. Its landscapes range from cherry blossom gardens to towering Mount Fuji.'
target_sentence='Japan is known for its rich cultural heritage and advanced technology.'
target_phrase='cultural heritage'
Assistant: This heritage includes traditional arts like tea ceremony or calligraphy. 

context text: 'One of the most thrilling events in winter sports is ski jumping. Ski jumping is a winter sport where athletes glide down a ramp and jump to achieve maximum distance.'
target_sentence='Ski jumping is a winter sport where athletes glide down a ramp and jump to achieve maxim

### Specifying both the target phrase (from the target sentence) and the target sentence

In [5]:
# set the pad_token for llama 3.2 3B
tokenizer.pad_token = tokenizer.eos_token
EOS_TOKEN = tokenizer.eos_token

alpaca_prompt = """### User: Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in a plain English for a given context text. The tone should be plain and simple! {}\n### Assistant: {}"""

def create_user_message_target(context, target, target_sentence):
    return f"Return the explanation sentence for the following context text: '{context}'. The explanation sentence should specifically clarify the {target_sentence} by referring to the {target}."
print(tokenizer.eos_token )

def formatting_prompts_func(examples):
    contexts = examples["source_text"]
    targets = examples["target_sentence_target"]
    target_sents = examples["target_sentence_4o"]
    elab_sentences = examples["elaboration_sentence"]
    texts = []
    for context, target, target_sent, elab_sent in zip(contexts, targets, target_sents, elab_sentences):
        # must add EOS_TOKEN, otherwise the generation wont stop
        text = alpaca_prompt.format(create_user_message_target(context, target, target_sent), elab_sent) + EOS_TOKEN
        texts.append(text)
    return texts

test_alpaca_prompt = """### User: Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in a plain English for a given context text. The tone should be plain and simple! {}\n### Assistant:"""

def formatting_test_prompts_func(examples):
    contexts = examples["source_text"]
    targets = examples["target_sentence_target"]
    target_sents = examples["target_sentence_4o"]
    texts = []
    for context, target, target_sent in zip(contexts, targets, target_sents):
        # must add EOS_TOKEN, otherwise the generation wont stop
        text = test_alpaca_prompt.format(create_user_message_target(context, target, target_sent)) 
        texts.append(text)
    return texts

formatted_test_dataset = formatting_test_prompts_func(dataset["test"])

<|end_of_text|>


In [9]:
formatted_test_dataset[0]

"### User: Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in a plain English for a given context text. The tone should be plain and simple! Return the explanation sentence for the following context text: 'New companies have come that need skilled workers with more education. New Haven youth want those jobs, but they do not have the education or the skills.'. The explanation sentence should specifically clarify the target_sentence='New Haven youth want those jobs, but they do not have the education or the skills.' by referring to the target_phrase='do not have the education or the skills'.\n### Assistant:"

### Target sentence -> elaboration

In [56]:
# set the pad_token for llama 3.2 3B
tokenizer.pad_token = tokenizer.eos_token
EOS_TOKEN = tokenizer.eos_token

def create_user_message(context):
    return f"Return the clarification sentence for the {context}. "

test_alpaca_prompt = """### User: Your task is to generate exactly ONE short concise clarification sentence (made up of around 10 words or fewer) in a plain English for a given sentence. The tone should be plain and simple! Do not add any comments to your answer! 
For example:
target_sentence='Japan\'slandscapes range from serene cherry blossom gardens to towering Mount Fuji.'
Assistant: Mount Fuji is the tallest mountain in Japan.

target_sentence='Ski jumping is a winter sport where athletes glide down a ramp and jump to achieve maximum distance.'
Assistant: As they glide down, they gain speed, which helps them jump higher into the air.

{}\n### Assistant:"""

def formatting_test_prompts_func(examples):
    contexts = examples["target_sentence_4o"]
    texts = []
    for context in contexts:
        # must add EOS_TOKEN, otherwise the generation wont stop
        text = test_alpaca_prompt.format(create_user_message(context)) 
        texts.append(text)
    return texts

formatted_test_dataset = formatting_test_prompts_func(dataset["test"])

In [57]:
formatted_test_dataset[0]

"### User: Your task is to generate exactly ONE short concise clarification sentence (made up of around 10 words or fewer) in a plain English for a given sentence. The tone should be plain and simple! Do not add any comments to your answer! \nFor example:\ntarget_sentence='Japan'slandscapes range from serene cherry blossom gardens to towering Mount Fuji.'\nAssistant: Mount Fuji is the tallest mountain in Japan.\n\ntarget_sentence='Ski jumping is a winter sport where athletes glide down a ramp and jump to achieve maximum distance.'\nAssistant: As they glide down, they gain speed, which helps them jump higher into the air.\n\nReturn the clarification sentence for the target_sentence='New Haven needs to invest in educating its youth so they will be qualified to do those high-skilled jobs when they become adults.'. \n### Assistant:"

### Target sentence + target phrase/subject -> elaboration

In [74]:
# set the pad_token for llama 3.2 3B
tokenizer.pad_token = tokenizer.eos_token
EOS_TOKEN = tokenizer.eos_token

def create_user_message(context, target):
    return f"Return the clarification sentence for the {context}. The explanation sentence should refer to the {target}." 

test_alpaca_prompt = """### User: Your task is to generate exactly ONE short concise clarification sentence (made up of around 10 words or fewer) in a plain English for a given sentence. The tone should be plain and simple! Do not add any comments to your answer! 
For example:
target_sentence='Japan\'slandscapes range from serene cherry blossom gardens to towering Mount Fuji.'
target_phrase='Mount Fuji'
Assistant: Mount Fuji is the tallest mountain in Japan.

target_sentence='Ski jumping is a winter sport where athletes glide down a ramp and jump to achieve maximum distance.'
target_phrase='glide down'
Assistant: As they glide down, they gain speed, which helps them jump higher into the air.

{}\n### Assistant:"""


def formatting_test_prompts_func(examples):
    contexts = examples["target_sentence_4o"]
    targets = examples["target_sentence_target"] # target_sentence_target subject
    texts = []
    for context, target in zip(contexts,targets):
        # must add EOS_TOKEN, otherwise the generation wont stop
        text = test_alpaca_prompt.format(create_user_message(context, target)) 
        texts.append(text)
    return texts

formatted_test_dataset = formatting_test_prompts_func(dataset["test"])

In [75]:
formatted_test_dataset[0]

"### User: Your task is to generate exactly ONE short concise clarification sentence (made up of around 10 words or fewer) in a plain English for a given sentence. The tone should be plain and simple! Do not add any comments to your answer! \nFor example:\ntarget_sentence='Japan'slandscapes range from serene cherry blossom gardens to towering Mount Fuji.'\ntarget_phrase='Mount Fuji'\nAssistant: Mount Fuji is the tallest mountain in Japan.\n\ntarget_sentence='Ski jumping is a winter sport where athletes glide down a ramp and jump to achieve maximum distance.'\ntarget_phrase='glide down'\nAssistant: As they glide down, they gain speed, which helps them jump higher into the air.\n\nReturn the clarification sentence for the target_sentence='New Haven needs to invest in educating its youth so they will be qualified to do those high-skilled jobs when they become adults.'. The explanation sentence should refer to the target_phrase='educating its youth'.\n### Assistant:"

#### For testing the model trained with the subject or target info  

In [9]:
def create_user_message(context):
    return f"Return the explanation sentence for the following context text: '{context}'."

def formatting_test_prompts_func(examples):
    contexts = examples["source_text"]
    texts = []
    for context in contexts:
        # must add EOS_TOKEN, otherwise the generation wont stop
        text = test_alpaca_prompt.format(create_user_message(context)) 
        texts.append(text)
    return texts

formatted_test_dataset = formatting_test_prompts_func(dataset["test"])

In [10]:
formatted_test_dataset[0]

"### User: Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in a plain English for a given context text. \nThe tone should be plain and simple! Return the explanation sentence for the following context text: 'They did not need special skills or a college education to work there. Those factories are gone now. New companies have come that need skilled workers with more education. New Haven youth want those jobs, but they do not have the education or the skills.'.\n### Assistant:"

In [12]:
def print_tokens_with_ids(txt):
    tokens = tokenizer.tokenize(txt, add_special_tokens=False)
    token_ids = tokenizer.encode(txt, add_special_tokens=False)
    print(list(zip(tokens, token_ids)))

formatted_ds = formatting_prompts_func(dataset["train"])
prompt = formatted_ds[0]
print_tokens_with_ids(prompt) 

[('###', 14711), ('ĠUser', 2724), (':', 25), ('ĠYour', 4718), ('Ġtask', 3465), ('Ġis', 374), ('Ġto', 311), ('Ġgenerate', 7068), ('Ġexactly', 7041), ('ĠONE', 25002), ('Ġshort', 2875), ('Ġconcise', 64694), ('Ġexplanation', 16540), ('Ġsentence', 11914), ('Ġ(', 320), ('made', 28010), ('Ġup', 709), ('Ġof', 315), ('Ġaround', 2212), ('Ġ', 220), ('10', 605), ('Ġwords', 4339), ('Ġor', 477), ('Ġfewer', 17162), (')', 8), ('Ġin', 304), ('Ġa', 264), ('Ġplain', 14733), ('ĠEnglish', 6498), ('Ġfor', 369), ('Ġa', 264), ('Ġgiven', 2728), ('Ġcontext', 2317), ('Ġtext', 1495), ('.', 13), ('ĠThe', 578), ('Ġtone', 16630), ('Ġshould', 1288), ('Ġbe', 387), ('Ġplain', 14733), ('Ġand', 323), ('Ġsimple', 4382), ('!', 0), ('ĠReturn', 3494), ('Ġthe', 279), ('Ġexplanation', 16540), ('Ġsentence', 11914), ('Ġfor', 369), ('Ġthe', 279), ('Ġfollowing', 2768), ('Ġcontext', 2317), ('Ġtext', 1495), (':', 25), ("Ġ'", 364), ('A', 32), ('Ġwatermark', 89106), ('Ġis', 374), ('Ġan', 459), ('Ġimage', 2217), ('Ġthat', 430), ('Ġcan'

In [13]:
response_template = "### Assistant:"
print_tokens_with_ids(response_template) 

[('###', 14711), ('ĠAssistant', 22103), (':', 25)]


In [7]:
formatted_test_dataset = formatting_test_prompts_func(dataset["test"])
formatted_test_dataset[0]

"### User: Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in a plain English for a given context text. \nThe tone should be plain and simple! Return the explanation sentence for the following context text: 'New companies have come that need skilled workers with more education. New Haven youth want those jobs, but they do not have the education or the skills. That is where New Haven Promise comes in. It will make a difference by paying for college.'. The subject of the explanation sentence should refer to : 'Many (youth)'.\n### Assistant:"

# Clean-up

In [18]:
from model_utils import clear_directory
import torch

torch.cuda.empty_cache()
del model, trainer, tokenizer, data_collator
clear_directory(logs_dir)

Directory does not exist: ../models/llama3.2-news-ft/logs/logs-c2osp


# Generate predictions

https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Mistral/Supervised_fine_tuning_(SFT)_of_an_LLM_using_Hugging_Face_tooling.ipynb

In [8]:
from transformers import pipeline, StoppingCriteria, StoppingCriteriaList
import torch

class RefinedEndSentenceStoppingCriteria(StoppingCriteria):
    def __init__(self, tokenizer, sentence_end_tokens):
        super().__init__()
        self.tokenizer = tokenizer
        self.sentence_end_token_ids = [
            self.tokenizer.convert_tokens_to_ids(token) for token in sentence_end_tokens
        ]
        self.eos_token_id = tokenizer.eos_token_id  # Include eos_token_id

    def is_valid_stop(self, input_ids):
        # Get the last token and the one before it
        if len(input_ids[0]) < 2:
            return False  # Not enough tokens to decide
        last_token_id = input_ids[0, -1].item()
        second_last_token_id = input_ids[0, -2].item()

        # Decode tokens to check context
        last_token = self.tokenizer.decode([last_token_id])
        second_last_token = self.tokenizer.decode([second_last_token_id])

        # Stop if it's a sentence-ending token and not part of an abbreviation
        if (
            last_token in [".", "!", "?"]  # Check if it's a sentence-ending token
            and len(second_last_token) > 1  # Ensure not part of an abbreviation
            and not second_last_token.isupper()  # Ensure it's not "U.S." or similar
        ):
            return True

        # Include end-of-sequence token
        return last_token_id == self.eos_token_id

    def __call__(self, input_ids, scores, **kwargs):
        return self.is_valid_stop(input_ids)


sentence_end_tokens = [".","\n","!", "?"]
stopping_criteria = StoppingCriteriaList([RefinedEndSentenceStoppingCriteria(tokenizer, sentence_end_tokens)])

### Generate with alpaca

In [8]:
import torch
import random 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

example = random.choice(formatted_test_dataset)

inputs = tokenizer(
    example, #input_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512  # Adjust max_length as needed
).to(device)

with torch.no_grad():
    output_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=32,  # 32 for elaboration-only generation
        min_length=10,
        do_sample=False,  # Greedy decoding
        temperature=None,  # not used in greedy decoding
        top_p=None,  # not used in greedy decoding
        num_return_sequences=1,
        no_repeat_ngram_size=3,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        stopping_criteria=stopping_criteria
    )

generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)
response = extract_response(generated_text)
print("Extracted Response:", response)

### User: Your task is to generate exactly ONE short concise explanation sentence (made up of around 10 words or fewer) in a plain English for a given context text. 
The tone should be plain and simple! Return the explanation sentence for the following context text: 'Brown was a black teenager without a weapon who was shot by a white police officer. He was killed in August in Ferguson, Missouri, near St. Louis. The shooting set off nearly nightly protests and violence. The black community felt that Brown wouldn't have been killed if he was white.'.
### Assistant: The officer was not charged with a crime.
Extracted Response: The officer was not charged with a crime.


## Generate predictions

In [7]:
tokenizer.model_max_length

131072

In [7]:
from tqdm.notebook import tqdm
import pandas as pd
from dataset_utils import create_results_df

def extract_response(text, prefix = "### Assistant:"):
    if prefix in text:
        return text.split(prefix, 1)[1].strip()
    return None

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
model.config.use_cache = True

sentence_end_tokens = [".","\n","!", "?"]
stopping_criteria = StoppingCriteriaList([RefinedEndSentenceStoppingCriteria(tokenizer, sentence_end_tokens)])

def generate_predictions(dataset, formatted_test_dataset, ds_type, setting, num_examples):

    output_name = f"{ds_type}-{setting}"
    search_type = {"beam-search":{"num_beams":4, "early_stopping":True, 
                              "filename":f"../data/gen_predictions/predictions_llama-instruct-few-shot-{output_name}-{num_examples}.csv"},
              "greedy":{"num_beams":1, "early_stopping":False,
                        "filename":f"../data/gen_predictions/predictions_llama-instruct-few-shot-{output_name}-greedy-{num_examples}.csv"}
    }

    for search_t in search_type.keys():
    
        df_results = create_results_df(dataset)
    
        for idx, row in tqdm(df_results.iterrows(),total=len(df_results)):
            if row["pred_elaboration"]=="":
                inputs = tokenizer(
                    formatted_test_dataset[idx], #input_text,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=2500 # 1024 for short, 2500 for long
                ).to(device)
                
                with torch.no_grad():
                    output_ids = model.generate(
                        input_ids=inputs["input_ids"],
                        attention_mask=inputs["attention_mask"],
                        max_new_tokens=32,  # 32 for elaboration-only generation
                        min_length=10,
                        do_sample=False, 
                        temperature=None,  # not used in greedy decoding
                        top_p=None,# not used in greedy decoding
                        num_beams = search_type[search_t]["num_beams"],
                        early_stopping = search_type[search_t]["early_stopping"],
                        num_return_sequences=1,
                        no_repeat_ngram_size=3,
                        eos_token_id=tokenizer.eos_token_id,
                        pad_token_id=tokenizer.eos_token_id,
                        stopping_criteria=stopping_criteria
                    )
                
                generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
                response = extract_response(generated_text) #extract_assistant_response(generated_text) -> chatML
                df_results.at[idx,"pred_elaboration"] = response
        
        df_results.to_csv(search_type[search_t]["filename"], index=False)
        print(f"Saved {search_type[search_t]['filename']}")

In [None]:
from dataset_utils import load_dataset_from_csv
from prompt_utils import formatting_prompt_func

setting_ds_dict = {
    "base": ["c2o","c2op"],
    "masked": ["c2s","c2sp","c4s","c4sp"],
    "target-phrase":["c2s","c2sp","c4s","c4sp","c2o","c2op"],
    "target-sent":["c2s","c2sp","c4s","c4sp","c2o","c2op"],
    "target-sent-target":["c2s","c2sp","c4s","c4sp","c2o","c2op"],
}

example_num_versions = ["n3","n6"]

for setting, ds_types in setting_ds_dict.items():
    for ds_type in ds_types:
        dataset = load_dataset_from_csv(ds_type, setting)
        for num_examples in example_num_versions:
            formatted_test_dataset = formatting_prompt_func(dataset["test"], setting, num_examples)  
            generate_predictions(dataset, formatted_test_dataset, ds_type, setting, num_examples)

In [1]:
!python generate_elaborations.py --model llama-instruct

Using model: llama-instruct
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:03<00:00,  1.62s/it]
Processing setting: base, dataset type: c2o
Formatting prompts with n3 examples.
  0%|          | 0/116 [00:00<?, ?it/s]
Saved ../data/gen_predictions/predictions_llama-instruct-few-shot-c2o-base-n3.csv
  0%|          | 0/116 [00:00<?, ?it/s]
Saved ../data/gen_predictions/predictions_llama-instruct-few-shot-c2o-base-greedy-n3.csv
Formatting prompts with n6 examples.
  0%|          | 0/116 [00:00<?, ?it/s]
Saved ../data/gen_predictions/predictions_llama-instruct-few-shot-c2o-base-n6.csv
  0%|          | 0/116 [00:00<?, ?it/s]
Saved ../data/gen_predictions/predictions_llama-instruct-few-shot-c2o-base-greedy-n6.csv
Processing setting: base, dataset type: c2op
Formatting prompts with n3 examples.
  0%|          | 0/116 [00:00<?, ?it/s]
Saved ../data/gen_predictions/predictions_llama-instruct-few-shot-c2op-base-n3.csv
  0%|          | 0/116 [00:00<?, ?it/s]
Saved ../data/gen_predicti