# Load model

In [2]:
import os
#os.environ["OPENAI_API_KEY"] = "my_key"

In [3]:
from openai import OpenAI 
import os

MODEL="gpt-4o-mini"
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Load data

In [4]:
from datasets import load_dataset
import os

data_path = "../data/elaborations"

data_files_c2sp = {
    'train': os.path.join(data_path,"train","train_ds_c2sp.csv"),      
    'validation': os.path.join(data_path,"validation","valid_ds_c2sp.csv"),  
    'test': os.path.join(data_path,"test","test_ds_c2sp.csv")         
}

dataset = load_dataset('csv', data_files=data_files_c2sp)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 1049
    })
    validation: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 134
    })
    test: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 116
    })
})


# Prompt and messages

In [6]:
SYSTEM_PROMPT_TARGET = """You are an expert in determining what the explanation sentence provided is explaining within the context text.
Guidelines: 
- The "explanation target" should be a specific phrase, term, action, or concept in the context text provided. 
- The "explanation target" should be written as a noun phrase, not as a complete sentence.
- If there are multiple possible targets, select the first target that you think is most appropriate.

Return your answer in the following format:
{"sentence":"<provided explanation sentence>", "target": "<explanation target>"}
"""

# Turn examples into ChatML format

In [62]:
def format_example(example):
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT_TARGET_INFO},
            {"role": "user", "content":  "Identify the target of the following explanation sentence: '{}' within the given text: '{}'".format(
    example["elaboration_sentence"], example["label_text"])},
        ]
    }

def format_target_sent_example(example):
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT_TARGET_INFO},
            {"role": "user", "content":  "Identify the target sentence that the following explanation: '{}' is clarifying within the given text: '{}'".format(
    example["elaboration_sentence"], example["label_text"])},
        ]
    }
formatted_train_dataset = dataset["train"].map(format_target_sent_example)
formatted_validation_dataset = dataset["validation"].map(format_target_sent_example)
formatted_test_dataset = dataset["test"].map(format_target_sent_example)

Map:   0%|          | 0/1049 [00:00<?, ? examples/s]

Map:   0%|          | 0/134 [00:00<?, ? examples/s]

Map:   0%|          | 0/116 [00:00<?, ? examples/s]

# Identify subjects of elaborations

Analyze the subjects to explore the types of elaboration sentences.

In [57]:
SYSTEM_PROMPT_SUBJECT = """You are an expert in identifying the subject of the provided explanation sentence based on the context text. 
If the subject of the explanation sentence is a **pronoun (e.g., "it," "they," "he," "she")**, determine what the pronoun is referring to within the context.
The subject should be written as a phrase, not as a complete sentence.
If there are multiple possible subjects, select the first one you think is most appropriate.
"""

SYSTEM_PROMPT_TARGET = """You are an expert in identifying the target of the provided explanation sentence based on the context text. 
Return the main phrase which explanation sentence is referring to.
The phrase must appear in the context text, but does not need to be present in the explanation sentence itself.
"""

SYSTEM_PROMPT_TARGET_INFO = """
You are an expert in identifying the sentence that the provided explanation sentence is clarifying. 
First, determine what question the explanation sentence is answering, and based on that, identify the sentence within the context text (sentences surrounding the explanation sentence) that the question could be asked about.
Return the sentence from the context text that the explanation sentence is clarifying.
"""

#Return the sentence from the context text that the explanation sentence is clarifying.

In [61]:
example_subject = "Identify the subject of the following explanation sentence: '{}' within the given text: '{}'".format(
    dataset["test"][100]["elaboration_sentence"], dataset["test"][100]["label_text"])
print(example, end="\n\n")

example = "Identify the target sentence that the following explanation: '{}' is clarifying within the given text: '{}'".format(
    dataset["test"][100]["elaboration_sentence"], dataset["test"][100]["label_text"])

completion = client.chat.completions.create(
  model=MODEL,
  messages=[
    {"role": "system", "content": SYSTEM_PROMPT_TARGET_INFO}, 
    {"role": "user", "content": example }  
  ]
)

print("Assistant: " + completion.choices[0].message.content)

Identify the subject of the following explanation sentence: 'They come in all shapes and sizes, but usually look like helicopters.' within the given text: 'HACKENSACK, N.J. " The hottest present this holiday season is a gadget both kids and adults can enjoy: a drone. Drones are small flying aircraft. They come in all shapes and sizes, but usually look like helicopters. They often have cameras attached. One popular drone this year is the Phantom 2 Vision.'

Assistant: The explanation sentence 'They come in all shapes and sizes, but usually look like helicopters.' is clarifying the sentence: 'Drones are small flying aircraft.'


In [54]:
import pandas as pd
from tqdm.notebook import tqdm

#df_results = pd.read_csv("../data/elaborations/test_ds_c2sp_subjects_targets.csv")
df_results = pd.DataFrame({
    "source_text" : dataset["train"]["source_text"],
    "label_text" : dataset["train"]["label_text"],
    "elaboration_sentence":dataset["train"]["elaboration_sentence"],
    "subject": "",
    "target":"",
    "elaboration_info":""
    
})

formatted_dataset = formatted_train_dataset

targets = []
for example in tqdm(formatted_test_dataset,total=len(formatted_dataset)):
    completion = client.chat.completions.create(model=MODEL,messages=example["messages"])
    targets.append(completion.choices[0].message.content)

  0%|          | 0/116 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [30]:
df_results["elaboration_info"] = targets 

In [20]:
idx= 100
print(df_results.loc[idx,"label_text"], end="\n\n")
print(df_results.loc[idx, "target"])

HACKENSACK, N.J. " The hottest present this holiday season is a gadget both kids and adults can enjoy: a drone. Drones are small flying aircraft. They come in all shapes and sizes, but usually look like helicopters. They often have cameras attached. One popular drone this year is the Phantom 2 Vision.

drones


In [31]:
df_results

Unnamed: 0,source_text,label_text,elaboration_sentence,subject,target,elaboration_info
0,New companies have come that need skilled work...,New companies have come that need skilled work...,Many do not have the money to get the training...,Many (youth),the training they need,The explanation sentence clarifies the reason ...
1,But the death toll could have been higher. For...,But the death toll could have been higher. For...,A gauge is a kind of measuring stick.,A gauge,gauge,The explanation sentence 'A gauge is a kind of...
2,Forecasters said more people could have died i...,Forecasters said more people could have died i...,It sits in the water.,a gauge,gauge,"The explanation sentence ""It sits in the water..."
3,Helicopters were banned from flying over the s...,Helicopters were banned from flying over the s...,They raced against the setting sun to search t...,Rescuers,the area,The explanation sentence 'They raced against t...
4,Istanbul is an important city for trade. Turke...,Istanbul is an important city for trade. Turke...,Turkey is larger than the state of Texas.,Turkey,Turkey,The explanation sentence 'Turkey is larger tha...
...,...,...,...,...,...,...
111,"Instead of preparing for the Super Bowl, he ha...","Instead of preparing for the Super Bowl, he ha...","Like many mysteries, this one may not be solve...",this one,the deflated footballs,"The explanation sentence 'Like many mysteries,..."
112,Should kids play tackle football? Football is ...,Should kids play tackle football? Football is ...,Players get bounced around.,Players,Players,"The explanation sentence ""Players get bounced ..."
113,He signed two of the biggest acts of last year...,He signed two of the biggest acts of last year...,Then the companies sell and promote the records.,the companies,the records,The explanation sentence 'Then the companies s...
114,It worries about the spread of disease. It als...,It worries about the spread of disease. It als...,There are certainly good reasons to be worried.,the spread of disease and the inability of new...,the spread of disease,"The explanation sentence ""There are certainly ..."


In [32]:
df_results.to_csv("../data/elaborations/test_ds_c2sp_subjects_targets.csv", index=False)