# Load model and tokenizer

In [1]:
from model_utils import LlamaAssistant

# Model identifier handed to the project-local LlamaAssistant wrapper.
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
assistant = LlamaAssistant(model_name=MODEL_NAME)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Load datasets

In [56]:
import os
import pandas as pd

# Root folder of the elaboration datasets.
data_path = "../data/elaborations"

# File names of the available test splits.
# NOTE(review): the suffixes (c2s, c2sp, c4osp, ...) presumably encode the
# context configuration of each split -- confirm against the data docs.
test_ds_c2s = "test_ds_c2s.csv"
test_ds_c2sp = "test_ds_c2sp.csv"
test_ds_c2os = "test_ds_c2os.csv"
test_ds_c2osp = "test_ds_c2osp.csv"
test_ds_c4s = "test_ds_c4s.csv"
test_ds_c4sp = "test_ds_c4sp.csv"
test_ds_c4osp = "test_ds_c4osp.csv"

# Test split used for this run.
test_csv = os.path.join(data_path, "test", test_ds_c4osp)
test_df = pd.read_csv(test_csv)

# Previously generated elaboration-target predictions.
target_c2s = "elab-target-predictions_llama-instr-test_ds_c2s-context_elab-together.csv"
target_c2sp = "elab-target-predictions_llama-instr-test_ds_c2sp-context_elab-together.csv"

# NOTE(review): the targets are read from the c2sp predictions while the test
# split above is c4osp; only the row counts are compared below -- confirm the
# rows of both files really describe the same examples.
target_csv = os.path.join("../data", "gen_predictions", target_c2sp)
target_df = pd.read_csv(target_csv)

# Later cells look targets up by the row index of test_df, so both frames
# need the same number of rows.
print(len(test_df) == len(target_df))

True


# Create Prompt 

In [3]:
# System prompt for the "context only" setting: the model receives just the
# context and must return one short explanatory sentence together with the
# explanation target, formatted as {"sentence": ..., "target": ...}.
# The text is sent to the model verbatim, so wording and whitespace are part
# of the experiment and must not be reformatted.
SYSTEM_PROMPT = """You are a helpful assistant who generates exactly one short, simple explanatory sentence ( made up of around 10 words or fewer) in a plain English for a given context. 
Your task is to provide additional information related to a complex statement, term, action, or concept that is semantically missing from the context document.
You may do this by offering a definition, examples, background knowledge, general statements, a description of the flow of actions, or an explanation of the reason or result of the target action.
Also, specify the target of your explanation as found in the context text. The "explanation target" should be written as a simple, concise noun phrase, not as a complete sentence.
Return your answer in the following format:
{"sentence": "<explanatory sentence text>", "target": "<explanation target>"}
"""

In [5]:
# System prompt for target identification when the context and the
# elaboration sentence are supplied as separate inputs: the model must name
# what the given explanation sentence explains and echo the sentence back as
# {"sentence": ..., "target": ...}.
# The text is sent to the model verbatim; keep wording and whitespace as is.
SYSTEM_PROMPT_TARGET = """You are a helpful assistant whose task is to determine what the explanation sentence provided is explaining within the context text.
Guidelines: 
- The "explanation target" should be a specific phrase, term, action, or concept in the context text provided. 
- The "explanation target" should be written as a noun phrase, not as a complete sentence.
- If there are multiple possible targets, select the first target that you think is most appropriate.

Return your answer in the following format:
{"sentence":"<provided explanation sentence>", "target": "<explanation target>"}
"""

In [4]:
# System prompt for target identification when the elaboration sentence is
# embedded in the context ("label_text"): the model must return the subject
# of the explanation sentence, resolving pronouns against the context.
# Unlike the other prompts, the requested answer contains only a "target"
# field.
# NOTE(review): guideline 5 lacks the "." after its number and the requested
# return format is not wrapped in braces, unlike the other prompts -- confirm
# both are intentional before changing them, since edits alter model output.
SYSTEM_PROMPT_TARGET_WITH_LABEL_TEXT = """You are a helpful assistant whose task is to identify the subject of the provided explanation sentence. 

Guidelines:
1. Return the subject of the explanation sentence as the "explanation target."
2. If the subject of the explanation sentence is a **pronoun (e.g., "it," "they," "he," "she")**, determine what the pronoun is referring to within the context text and return the reference as the "explanation target."
3. The "explanation target" should be written as a **noun phrase**, not as a complete sentence.
4. If there are multiple possible subjects, select the first one you think is most appropriate.
5 Do not write any comments to the text!

Return your answer in the following format:
"target": "<explanation target>"
"""

In [4]:
# System prompt for the "context + elaboration target" setting: the target is
# given to the model, which must return a single short explanatory sentence
# about it, formatted as {"sentence": ...}.
# The text is sent to the model verbatim; keep wording and whitespace as is.
SYSTEM_PROMPT_WITH_ELAB_TARGET_PROVIDED = """You are a helpful assistant who generates exactly one short, simple explanatory sentence in plain English (approximately 10 words or fewer) for a given context.

Your task is to provide additional information specifically related to the "explanation target" provided. 

Guidelines:
1. Keep the explanation concise and directly relevant to the "explanation target."
2. You may provide additional information by:
   - Offering a definition.
   - Giving examples.
   - Providing background knowledge.
   - Stating a general fact.
   - Describing a sequence of actions.
   - Explaining a reason or result connected to the "explanation target."
3. Do not restate the "explanation target." Focus only on elaborating it.

Return your answer in the following format:
{"sentence": "<explanatory sentence text>"}
"""

# Generate 

## Get the elaboration targets 

1) Providing the context and elaboration sentence separately.
2) Providing the context and elaboration sentence in one text, pointing out the elaboration sentence.
3) Identify which two sentences are linked by the elaboration sentence.
4) Identify the subject in each elaboration sentence.

In [8]:
# Sanity-check the target extraction on a single training example.
# NOTE(review): `dataset` is not defined in any cell shown in this notebook;
# it must be loaded beforehand for this cell to run on a fresh kernel.
example = dataset["train"][10]
context = example["label_text"]
elab = example["elaboration_sentence"]
print("Context text: ", context, end="\n\n")
print("Elaboration sentence: ", elab,end="\n\n")

# Pre-bind `response` so the error handler can always print it, even when the
# model call itself is what failed.
response = None
try:
    response = assistant.find_explanation_target(SYSTEM_PROMPT_TARGET_WITH_LABEL_TEXT , context, elab)
    # NOTE(review): the batch cell that uses this same prompt assigns the
    # result of extract_response to a single name; confirm that two values
    # are really returned here.
    sentence, target = assistant.extract_response(response)
except Exception as exc:
    # Catch Exception (not a bare except) so KeyboardInterrupt still stops the
    # cell, and report the cause alongside the raw model output.
    print("Error: ", response)
    print(f"Exception: {exc}")
else:
    print("Generated Sentence:", sentence)
    print("Explanation Target:", target)

Context text:  Collins wrote that he realized he was gay when he was 12. His twin brother was getting interested in girls, but he was not. His aunt was the first relative with whom he shared his secret. She was not surprised. She accepted him lovingly.

Elaboration sentence:  His aunt was the first relative with whom he shared his secret.

Error:  [{'generated_text': 'target": Collins\' aunt'}]


## Identify the elaboration target when provided within the context text

In [26]:
import pandas as pd
from tqdm import tqdm

# Start from an empty target column; an empty string marks a row that failed.
test_df["elaboration_target"] = ""

# Ask the model for the target of every elaboration sentence, using the label
# text (which contains the elaboration) as context.
for row_id, record in tqdm(test_df.iterrows(), total=len(test_df)):
    label_text = record["label_text"]
    elaboration = record["elaboration_sentence"]
    try:
        raw_output = assistant.find_explanation_target(
            SYSTEM_PROMPT_TARGET_WITH_LABEL_TEXT, label_text, elaboration
        )
        test_df.at[row_id, "elaboration_target"] = assistant.extract_response(raw_output)
    except Exception as err:
        # Report the failing row and keep its target empty for a later retry.
        print(f"Error ID: {row_id}")
        print(f"Exception: {err}")
        test_df.at[row_id, "elaboration_target"] = ""

100%|█████████████████████████████████████████| 116/116 [00:22<00:00,  5.10it/s]


In [27]:
# Number of rows whose target is still the empty-string placeholder.
len(test_df.loc[test_df["elaboration_target"].eq("")])

0

In [21]:
import pandas as pd
from tqdm import tqdm

# Second pass: query the model again only for rows without a target.
for row_id, record in tqdm(test_df.iterrows(), total=len(test_df)):
    if record["elaboration_target"] != "":
        continue  # already has a target

    label_text = record["label_text"]
    elaboration = record["elaboration_sentence"]
    try:
        raw_output = assistant.find_explanation_target(
            SYSTEM_PROMPT_TARGET_WITH_LABEL_TEXT, label_text, elaboration
        )
        test_df.at[row_id, "elaboration_target"] = assistant.extract_response(raw_output)
    except Exception as err:
        # Still failing: report it and leave the placeholder in place.
        print(f"Error ID: {row_id}")
        print(f"Exception: {err}")
        test_df.at[row_id, "elaboration_target"] = ""

100%|████████████████████████████████████████| 116/116 [00:00<00:00, 229.17it/s]


In [22]:
# Rows that still have no target after the retry pass.
len(test_df.loc[test_df["elaboration_target"].eq("")])

0

In [28]:
# Display the frame, which now includes the "elaboration_target" column.
test_df

Unnamed: 0,doc_num,source_text,label_text,elaboration_sentence,contextual_specificity_rating,elaboration_target
0,6,They did not need special skills or a college ...,They did not need special skills or a college ...,Many do not have the money to get the training...,1,New Haven youth
1,28,WASHINGTON – At least four people died in Midw...,WASHINGTON – At least four people died in Midw...,A gauge is a kind of measuring stick.,0,a gauge
2,28,WASHINGTON – At least four people died in Midw...,WASHINGTON – At least four people died in Midw...,It sits in the water.,0,a gauge
3,34,Rescue crews swarmed into the ruins at Plaza T...,Rescue crews swarmed into the ruins at Plaza T...,They raced against the setting sun to search t...,1,rescue crews
4,67,"One half of Istanbul lies in Europe, while the...","One half of Istanbul lies in Europe, while the...",Turkey is larger than the state of Texas.,0,state of Turkey
...,...,...,...,...,...,...
111,1140,Some people are talking about it even more tha...,Some people are talking about it even more tha...,"Like many mysteries, this one may not be solve...",2,the mystery
112,1147,Should kids play tackle football? Football is ...,Should kids play tackle football? Football is ...,Players get bounced around.,2,kids
113,1163,Barnett made big changes at Capitol Records. H...,Barnett made big changes at Capitol Records. H...,Then the companies sell and promote the records.,0,record companies
114,1166,They also think it is wrong to force her to pe...,They also think it is wrong to force her to pe...,There are certainly good reasons to be worried.,2,good reasons


In [29]:
# Persist the target predictions (without the pandas index).
# NOTE(review): the dataset tag in the file name is hard-coded -- confirm it
# matches the split that was actually loaded into test_df for this run.
target_predictions_csv = os.path.join(
    "../data",
    "gen_predictions",
    "elab-target-predictions_llama-instr-test_ds_c4sp-context_elab-together.csv",
)
test_df.to_csv(target_predictions_csv, index=False)

## Identify the elaboration target when provided separately

In [28]:
import pandas as pd
from tqdm import tqdm

# Start from an empty target column; an empty string marks a row that failed.
test_df["elaboration_target"] = ""

# Here the source text and the elaboration sentence are passed to the model
# as separate inputs.
for row_id, record in tqdm(test_df.iterrows(), total=len(test_df)):
    source_text = record["source_text"]
    elaboration = record["elaboration_sentence"]
    try:
        raw_output = assistant.find_explanation_target(SYSTEM_PROMPT_TARGET, source_text, elaboration)
        # The response is unpacked into two values; only the target is stored.
        _echoed_sentence, predicted_target = assistant.extract_response(raw_output)
        test_df.at[row_id, "elaboration_target"] = predicted_target
    except Exception as err:
        # Report the failing row and keep its target empty for a later retry.
        print(f"Error ID: {row_id}")
        print(f"Exception: {err}")
        test_df.at[row_id, "elaboration_target"] = ""

 41%|█████████████████▍                        | 48/116 [00:31<00:49,  1.37it/s]

Error ID: 47
Exception: 'NoneType' object has no attribute 'start'


100%|█████████████████████████████████████████| 116/116 [01:14<00:00,  1.56it/s]


In [29]:
# Number of rows for which no target could be extracted.
len(test_df.loc[test_df["elaboration_target"].eq("")])

1

### Fill empty fields (retry failed rows)

In [32]:
import pandas as pd
from tqdm import tqdm

# Second pass: query the model again only for rows without a target.
for row_id, record in tqdm(test_df.iterrows(), total=len(test_df)):
    if record["elaboration_target"] != "":
        continue  # already has a target

    source_text = record["source_text"]
    elaboration = record["elaboration_sentence"]
    try:
        raw_output = assistant.find_explanation_target(SYSTEM_PROMPT_TARGET, source_text, elaboration)
        # The response is unpacked into two values; only the target is stored.
        _echoed_sentence, predicted_target = assistant.extract_response(raw_output)
        test_df.at[row_id, "elaboration_target"] = predicted_target
    except Exception as err:
        # Still failing: report it and leave the placeholder in place.
        print(f"Error ID: {row_id}")
        print(f"Exception: {err}")
        test_df.at[row_id, "elaboration_target"] = ""

100%|████████████████████████████████████████| 116/116 [00:00<00:00, 133.09it/s]


In [33]:
# Rows that still have no target after the retry pass.
len(test_df.loc[test_df["elaboration_target"].eq("")])

0

## Generate elaboration example

In [16]:
# Try the "target provided" prompt on a single row before running the batch.
idx = 11
context = test_df.loc[idx, "source_text"]
target = target_df.loc[idx, "elaboration_target"]

# Show the inputs that are sent to the model.
print("Context text: ", context, end="\n\n")
print("Target: ", target)

# Generate the elaboration and pull the sentence out of the raw response.
sentence = assistant.extract_response(
    assistant.generate_explanation(SYSTEM_PROMPT_WITH_ELAB_TARGET_PROVIDED, context, target)
)

print("Generated Sentence:", sentence)

Context text:  The Internet helped Mark Zuckerberg start Facebook while still a college student.

Target:  Mark Zuckerberg
Generated Sentence: He created the social networking site after dropping out of Harvard.


## Context + elaboration target

In [57]:
import pandas as pd
from tqdm import tqdm

# Output columns; an empty string marks a row that has not been generated.
test_df["pred_elaboration"] = ""
test_df["elaboration_target"] =""

# Generate one elaboration per row, conditioning on the target that was
# predicted earlier (looked up in target_df by the same row index).
for row_id, record in tqdm(test_df.iterrows(), total=len(test_df)):
    source_text = record["source_text"]
    given_target = target_df.loc[row_id, "elaboration_target"]

    try:
        raw_output = assistant.generate_explanation(
            SYSTEM_PROMPT_WITH_ELAB_TARGET_PROVIDED, source_text, given_target
        )
        generated = assistant.extract_response(raw_output)

        test_df.at[row_id, "pred_elaboration"] = generated
        test_df.at[row_id, "elaboration_target"] = given_target

    except Exception as err:
        # Report the failing row and blank both outputs so it can be retried.
        print(f"Error ID: {row_id}")
        print(f"Exception: {err}")
        for column in ("pred_elaboration", "elaboration_target"):
            test_df.at[row_id, column] = ""

 44%|██████████████████▍                       | 51/116 [00:28<00:34,  1.90it/s]

Error ID: 50
Exception: 'NoneType' object has no attribute 'end'


100%|█████████████████████████████████████████| 116/116 [01:04<00:00,  1.79it/s]


### Fill empty fields (retry failed rows)

In [59]:
import pandas as pd
from tqdm import tqdm

# Second pass: regenerate only the rows whose elaboration is still empty.
for row_id, record in tqdm(test_df.iterrows(), total=len(test_df)):
    if record["pred_elaboration"] != "":
        continue  # already generated

    source_text = record["source_text"]
    given_target = target_df.loc[row_id, "elaboration_target"]
    try:
        raw_output = assistant.generate_explanation(
            SYSTEM_PROMPT_WITH_ELAB_TARGET_PROVIDED, source_text, given_target
        )
        generated = assistant.extract_response(raw_output)

        test_df.at[row_id, "pred_elaboration"] = generated
        test_df.at[row_id, "elaboration_target"] = given_target

    except Exception as err:
        # Still failing: report it and keep both outputs blank.
        print(f"Error ID: {row_id}")
        print(f"Exception: {err}")
        for column in ("pred_elaboration", "elaboration_target"):
            test_df.at[row_id, column] = ""

100%|████████████████████████████████████████| 116/116 [00:00<00:00, 190.81it/s]

Error ID: 50
Exception: 'NoneType' object has no attribute 'end'





In [60]:
# Rows that still have no generated elaboration.
len(test_df.loc[test_df["pred_elaboration"].eq("")])

1

In [54]:
# Display the frame with the generated elaborations ("pred_elaboration")
# next to the targets they were conditioned on ("elaboration_target").
test_df

Unnamed: 0,doc_num,source_text,label_text,elaboration_sentence,contextual_specificity_rating,pred_elaboration,elaboration_target
0,6,New companies have come that need skilled work...,New companies have come that need skilled work...,Many do not have the money to get the training...,1,New Haven youth benefit from education and job...,New Haven youth
1,28,But the death toll could have been higher. For...,But the death toll could have been higher. For...,A gauge is a kind of measuring stick.,0,A gauge is a device that measures water levels...,a gauge
2,28,Forecasters said more people could have died i...,Forecasters said more people could have died i...,It sits in the water.,0,A gauge is essentially a water level sensor.,a gauge
3,34,Helicopters were banned from flying over the s...,Helicopters were banned from flying over the s...,They raced against the setting sun to search t...,1,Rescuers are the people who save those in need...,rescuers
4,67,Istanbul is an important city for trade. Turke...,Istanbul is an important city for trade. Turke...,Turkey is larger than the state of Texas.,0,Turkey is a transcontinental country located i...,Turkey
...,...,...,...,...,...,...,...
111,1140,"Instead of preparing for the Super Bowl, he ha...","Instead of preparing for the Super Bowl, he ha...","Like many mysteries, this one may not be solve...",2,Deflation of a football is when it loses air p...,the deflation of footballs
112,1147,Should kids play tackle football? Football is ...,Should kids play tackle football? Football is ...,Players get bounced around.,2,"Children's brains are still developing, making...",players
113,1163,He signed two of the biggest acts of last year...,He signed two of the biggest acts of last year...,Then the companies sell and promote the records.,0,Record companies are businesses that manage an...,record companies
114,1166,It worries about the spread of disease. It als...,It worries about the spread of disease. It als...,There are certainly good reasons to be worried.,2,These concerns highlight potential risks to th...,good reasons


In [61]:
# Persist the target-conditioned predictions (without the pandas index).
# NOTE(review): the dataset tag in the file name is hard-coded -- confirm it
# matches the split that was actually loaded into test_df for this run.
predictions_csv = os.path.join(
    "../data",
    "gen_predictions",
    "predictions_llama-instr-test_ds_c4osp-context_elab-provided.csv",
)
test_df.to_csv(predictions_csv, index=False)

## Context only

In [34]:
import pandas as pd
from tqdm import tqdm

# Output columns; an empty string marks a row that has not been generated.
test_df["pred_elaboration"] = ""
test_df["pred_elaboration_target"] = ""

# Context-only setting: the model chooses the target itself and returns it
# together with the explanatory sentence.
for row_id, record in tqdm(test_df.iterrows(), total=len(test_df)):
    source_text = record["source_text"]

    try:
        raw_output = assistant.generate_explanation(SYSTEM_PROMPT, source_text)
        generated, chosen_target = assistant.extract_response(raw_output)

        test_df.at[row_id, "pred_elaboration"] = generated
        test_df.at[row_id, "pred_elaboration_target"] = chosen_target

    except Exception as err:
        # Report the failing row and blank both outputs so it can be retried.
        print(f"Error ID: {row_id}")
        print(f"Exception: {err}")
        for column in ("pred_elaboration", "pred_elaboration_target"):
            test_df.at[row_id, column] = ""

100%|█████████████████████████████████████████| 116/116 [01:23<00:00,  1.39it/s]


In [35]:
# Rows without a generated elaboration.
len(test_df.loc[test_df["pred_elaboration"].eq("")])

0

In [36]:
# Rows without a predicted elaboration target.
len(test_df.loc[test_df["pred_elaboration_target"].eq("")])

0

In [37]:
# Average length, in characters, of the generated elaborations.
pred_lengths = test_df["pred_elaboration"].dropna().map(len)
print(pred_lengths.mean())

63.81896551724138


In [38]:
# Average length, in characters, of the "elaboration_sentence" column,
# for comparison with the generated elaborations.
gold_lengths = test_df["elaboration_sentence"].dropna().map(len)
print(gold_lengths.mean())

51.672413793103445


### Fill empty fields (retry failed rows)

In [23]:
import pandas as pd
from tqdm import tqdm

# Second pass: regenerate rows where the sentence or the target is empty.
for row_id, record in tqdm(test_df.iterrows(), total=len(test_df)):
    if record["pred_elaboration"] != "" and record["pred_elaboration_target"] != "":
        continue  # both outputs present

    source_text = record["source_text"]
    try:
        raw_output = assistant.generate_explanation(SYSTEM_PROMPT, source_text)
        generated, chosen_target = assistant.extract_response(raw_output)

        test_df.at[row_id, "pred_elaboration"] = generated
        test_df.at[row_id, "pred_elaboration_target"] = chosen_target

    except Exception as err:
        # Still failing: report it and keep both outputs blank.
        print(f"Error ID: {row_id}")
        print(f"Exception: {err}")
        for column in ("pred_elaboration", "pred_elaboration_target"):
            test_df.at[row_id, column] = ""

100%|█████████████████████████████████████████| 116/116 [00:01<00:00, 79.83it/s]


In [24]:
# Rows that still lack a predicted target after the retry pass.
len(test_df.loc[test_df["pred_elaboration_target"].eq("")])

0

In [25]:
# Rows that still lack a generated elaboration after the retry pass.
len(test_df.loc[test_df["pred_elaboration"].eq("")])

0

In [39]:
# Persist the context-only predictions (without the pandas index).
# NOTE(review): the dataset tag in the file name is hard-coded -- confirm it
# matches the split that was actually loaded into test_df for this run.
context_only_csv = os.path.join(
    "../data",
    "gen_predictions",
    "predictions_llama-instr-test_ds_c2os-context_elab-seperately.csv",
)
test_df.to_csv(context_only_csv, index=False)