# Load model and tokenizer

In [1]:
from model_utils import LlamaAssistant

# Single shared assistant instance; every generation cell below calls into it.
assistant = LlamaAssistant(
    model_name="meta-llama/Llama-3.2-3B-Instruct",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Load datasets

In [2]:
from datasets import load_dataset
import os

data_path = "../data/elaborations"

# Each split lives in a sub-folder named after the split itself, so the
# mapping is built from (split, file name) pairs.
data_files_c2sp = {
    split: os.path.join(data_path, split, file_name)
    for split, file_name in (
        ("train", "train_ds_c2sp.csv"),
        ("validation", "valid_ds_c2sp.csv"),
        ("test", "test_ds_c2sp.csv"),
    )
}
dataset = load_dataset("csv", data_files=data_files_c2sp)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 1049
    })
    validation: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 134
    })
    test: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 116
    })
})


In [3]:
import os
import pandas as pd

data_path = "../data/elaborations"

# File names of the available test-set variants.
test_ds_c2sp, test_ds_c2s = "test_ds_c2sp.csv", "test_ds_c2s.csv"
test_ds_c4s, test_ds_c4sp = "test_ds_c4s.csv", "test_ds_c4sp.csv"

# Only the c4sp variant is read into the working frame.
# NOTE(review): the DatasetDict loaded earlier uses the c2sp files - confirm
# that mixing the two variants is intended.
test_df = pd.read_csv(
    os.path.join(data_path, "test", test_ds_c4sp)
)

# Create Prompt 

In [4]:
# System prompt for the generation task: asks for exactly one short explanatory
# sentence about the given context plus the noun-phrase target it explains,
# returned in a {"sentence": ..., "target": ...} format.
SYSTEM_PROMPT = """You are a helpful assistant who generates exactly one short, simple explanatory sentence ( made up of around 10 words or fewer) in a plain English for a given context. 
Your task is to provide additional information related to a complex statement, term, action, or concept that is semantically missing from the context document.
You may do this by offering a definition, examples, background knowledge, general statements, a description of the flow of actions, or an explanation of the reason or result of the target action.
Also, specify the target of your explanation as found in the context text. The "explanation target" should be written as a simple, concise noun phrase, not as a complete sentence.
Return your answer in the following format:
{"sentence": "<explanatory sentence text>", "target": "<explanation target>"}
"""

In [5]:
# System prompt for the target-identification task: given a context and an
# explanation sentence, the model must name the phrase in the context that the
# sentence explains, returned in a {"sentence": ..., "target": ...} format.
# The "Guidelines" heading was previously misspelled in the text sent to the model.
SYSTEM_PROMPT_TARGET = """You are a helpful assistant whose task is to determine what the explanation sentence provided is explaining within the context text.
Guidelines: 
- The "explanation target" should be a specific phrase, term, action, or concept in the context text provided. 
- The "explanation target" should be written as a noun phrase, not as a complete sentence.
- If there are multiple possible targets, select the first target that you think is most appropriate.

Return your answer in the following format:
{"sentence":"<provided explanation sentence>", "target": "<explanation target>"}
"""

# Generate 

## Get the elaboration targets 

1) Providing the context and elaboration sentence separately.
2) Providing the context and elaboration sentence in one text, pointing out the elaboration sentence.
3) Identify which two sentences are linked by the elaboration sentence.
4) Identify the subject in each elaboration sentence.

In [6]:
# Smoke-test the target-identification prompt on one training example.
example = dataset["train"][8]
context = example["source_text"]
elab = example["elaboration_sentence"]
print("Context text: ", context, end="\n\n")
print("Elaboration sentence: ", elab, end="\n\n")

# Bind `response` up front: if the model call itself raises, the handler below
# would otherwise hit a NameError (or silently show a stale value from an
# earlier cell run) when it prints the raw reply.
response = None
try:
    response = assistant.find_explanation_target(SYSTEM_PROMPT_TARGET, context, elab)
    sentence, target = assistant.extract_response(response)
except Exception as exc:
    # `Exception` rather than a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate; also report what actually went wrong.
    print("Error: ", response)
    print("Exception: ", repr(exc))
else:
    print("Generated Sentence:", sentence)
    print("Explanation Target:", target)

Context text:  Two other girls were wounded. The Pakistani government sent Malala to England for treatment. One politician called Malala "a beacon of knowledge." Pakistan's President Asif Ali Zardari said it was "an attack on all girls in Pakistan, an attack on education, and on all civilized people."

Elaboration sentence:  Many politicians called the gunmen "beasts."

Generated Sentence: Many politicians called you the 'beast' in the media.
Explanation Target: beast


In [7]:
import pandas as pd
from tqdm import tqdm

# Every row starts with a blank target; a blank left behind marks a failed row.
test_df["elaboration_target"] = ""

for row_id, record in tqdm(test_df.iterrows(), total=len(test_df)):
    source_text = record["source_text"]
    gold_elaboration = record["elaboration_sentence"]
    try:
        reply = assistant.find_explanation_target(
            SYSTEM_PROMPT_TARGET, source_text, gold_elaboration
        )
        # Only the target half of the parsed reply is stored.
        echoed_sentence, found_target = assistant.extract_response(reply)
        test_df.at[row_id, "elaboration_target"] = found_target
    except Exception as err:
        # Best effort: log the failing row and keep its target blank.
        print(f"Error ID: {row_id}")
        print(f"Exception: {err}")
        test_df.at[row_id, "elaboration_target"] = ""

  8%|███▎                                       | 9/116 [00:07<01:29,  1.20it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|█████████████████████████████████████████| 116/116 [01:27<00:00,  1.32it/s]


In [8]:
# Number of rows whose elaboration target is still an empty string.
int(test_df["elaboration_target"].eq("").sum())

3

### Retry rows with an empty elaboration target

In [9]:
import pandas as pd
from tqdm import tqdm

for row_id, record in tqdm(test_df.iterrows(), total=len(test_df)):
    # Rows that already have a target are left untouched.
    if record["elaboration_target"] != "":
        continue

    source_text = record["source_text"]
    gold_elaboration = record["elaboration_sentence"]
    try:
        reply = assistant.find_explanation_target(
            SYSTEM_PROMPT_TARGET, source_text, gold_elaboration
        )
        echoed_sentence, found_target = assistant.extract_response(reply)
        test_df.at[row_id, "elaboration_target"] = found_target
    except Exception as err:
        # Still failing: log it and keep the blank so the cell can be re-run.
        print(f"Error ID: {row_id}")
        print(f"Exception: {err}")
        test_df.at[row_id, "elaboration_target"] = ""

100%|█████████████████████████████████████████| 116/116 [00:02<00:00, 55.82it/s]


In [10]:
# Rows whose elaboration target is still an empty string.
int(test_df["elaboration_target"].eq("").sum())

0

## Get the elaborations for given context

In [11]:
# Try the generation prompt on one training example and compare by eye with
# the reference elaboration printed alongside it.
example = dataset["train"][11]
context, elab = example["source_text"], example["elaboration_sentence"]
print("Context text: ", context, end="\n\n")
print("Elaboration sentence: ", elab, end="\n\n")

# Only the context is passed to the model here; `elab` is shown for reference.
response = assistant.generate_explanation(SYSTEM_PROMPT, context)
sentence, target = assistant.extract_response(response)

print("Generated Sentence:", sentence)
print("Explanation Target:", target)

Context text:  "I can only hope that it's not going to get worse," Boudreau said. Illinois has laws about animal abuse and neglect. County animal control departments can issue fines. Rescue groups and volunteers may handle some abuse and neglect calls.

Elaboration sentence:  The laws explain what state workers can do to deal with the cases.

Generated Sentence: Illinois animal abuse laws allow county control and rescue groups to intervene and fine offenders.
Explanation Target: animal abuse laws


In [12]:
import pandas as pd
from tqdm import tqdm

# Both prediction columns start blank; blanks left behind mark failed rows.
test_df["pred_elaboration"] = ""
test_df["pred_elaboration_target"] = ""

for row_id, record in tqdm(test_df.iterrows(), total=len(test_df)):
    source_text = record["source_text"]

    try:
        reply = assistant.generate_explanation(SYSTEM_PROMPT, source_text)
        generated_sentence, generated_target = assistant.extract_response(reply)

        test_df.at[row_id, "pred_elaboration"] = generated_sentence
        test_df.at[row_id, "pred_elaboration_target"] = generated_target
    except Exception as err:
        # Best effort: log the failing row and blank both outputs together.
        print(f"Error ID: {row_id}")
        print(f"Exception: {err}")
        test_df.at[row_id, "pred_elaboration"] = ""
        test_df.at[row_id, "pred_elaboration_target"] = ""

100%|█████████████████████████████████████████| 116/116 [01:38<00:00,  1.17it/s]


In [13]:
# Rows whose generated elaboration is an empty string.
int(test_df["pred_elaboration"].eq("").sum())

0

In [14]:
# Rows whose generated elaboration target is an empty string.
int(test_df["pred_elaboration_target"].eq("").sum())

0

In [15]:
# Mean length, in characters, of the generated elaborations.
print(test_df["pred_elaboration"].dropna().map(len).mean())

64.5603448275862


In [16]:
# Mean length, in characters, of the reference elaborations, for comparison.
print(test_df["elaboration_sentence"].dropna().map(len).mean())

51.672413793103445


### Retry rows with an empty prediction

In [32]:
import pandas as pd
from tqdm import tqdm

for row_id, record in tqdm(test_df.iterrows(), total=len(test_df)):
    # Skip rows where both the sentence and its target are already filled.
    if record["pred_elaboration"] != "" and record["pred_elaboration_target"] != "":
        continue

    source_text = record["source_text"]
    try:
        reply = assistant.generate_explanation(SYSTEM_PROMPT, source_text)
        generated_sentence, generated_target = assistant.extract_response(reply)

        test_df.at[row_id, "pred_elaboration"] = generated_sentence
        test_df.at[row_id, "pred_elaboration_target"] = generated_target
    except Exception as err:
        # Still failing: log it and blank both fields so a re-run retries the row.
        print(f"Error ID: {row_id}")
        print(f"Exception: {err}")
        test_df.at[row_id, "pred_elaboration"] = ""
        test_df.at[row_id, "pred_elaboration_target"] = ""

100%|████████████████████████████████████████| 116/116 [00:00<00:00, 130.07it/s]


In [33]:
# Rows whose generated elaboration target is still an empty string.
int(test_df["pred_elaboration_target"].eq("").sum())

0

In [34]:
# Rows whose generated elaboration is still an empty string.
int(test_df["pred_elaboration"].eq("").sum())

0

In [18]:
# Persist the frame with the gold columns plus the generated ones.
# Create the output folder first: `to_csv` does not create missing parent
# directories, so on a fresh checkout the export would fail only after the
# long generation loops have already run.
predictions_dir = os.path.join("../data", "gen_predictions")
os.makedirs(predictions_dir, exist_ok=True)

# The file name (including its "seperately" spelling) is kept unchanged so
# anything that already reads this path keeps working.
test_df.to_csv(
    os.path.join(
        predictions_dir,
        "predictions_llama-instr-test_ds_c4sp-context_elab-seperately.csv",
    ),
    index=False,
)