# Load model

In [2]:
import os
#os.environ["OPENAI_API_KEY"] = "my_key"

In [3]:
from openai import OpenAI 
import os

# Model name passed as `model=` to every chat-completion request in this notebook.
MODEL="gpt-4o"
# API client; the key is looked up in the OPENAI_API_KEY environment variable
# (the lookup yields None when the variable is not set).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Load data

In [80]:
from datasets import load_dataset
import os

data_path = "../data/elaborations"

# Split directory name -> prefix of the CSV file stored in that directory.
# The validation files are prefixed "valid", not "validation".
_split_prefixes = {"train": "train", "validation": "valid", "test": "test"}


def _split_files(variant):
    """Return a {split: csv_path} mapping for one dataset variant suffix."""
    return {
        split: os.path.join(data_path, split, f"{prefix}_ds_{variant}.csv")
        for split, prefix in _split_prefixes.items()
    }


data_files_c2s = _split_files("c2s")
data_files_c2sp = _split_files("c2sp")

# Only the c2s files are loaded in this cell.
dataset = load_dataset('csv', data_files=data_files_c2s)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 1046
    })
    validation: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 132
    })
    test: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 116
    })
})


# Prompt and messages

In [5]:
# System prompt for the subject-identification request (used by format_subject_example).
SYSTEM_PROMPT_SUBJECT = """You are an expert in identifying the subject of the provided explanation sentence based on the context text. 
If the subject of the explanation sentence is a **pronoun (e.g., "it," "they," "he," "she")**, determine what the pronoun is referring to within the context.
The subject should be written as a phrase, not as a complete sentence.
If there are multiple possible subjects, select the first one you think is most appropriate.
"""

# System prompt for the target-phrase request (used by format_target_example).
SYSTEM_PROMPT_TARGET = """You are an expert in identifying the target of the provided explanation sentence based on the context text. 
Return the main phrase which explanation sentence is referring to.
The phrase must appear in the context text, but does not need to be present in the explanation sentence itself.
"""

# System prompt for the target-sentence request (used by format_target_sent_example).
# Unlike the two prompts above, this string starts with a newline.
SYSTEM_PROMPT_TARGET_INFO = """
You are an expert in identifying the sentence that the provided explanation sentence is clarifying. 
First, determine what question the explanation sentence is answering, and based on that, identify the sentence within the context text (sentences surrounding the explanation sentence) that the question could be asked about.
Return the sentence from the context text that the explanation sentence is clarifying.
"""

#Return the sentence from the context text that the explanation sentence is clarifying.

# Turn examples into ChatML format

In [7]:
def format_subject_example(example):
    """Build the chat messages that ask for the subject of an elaboration sentence.

    Reads ``example["elaboration_sentence"]`` and ``example["label_text"]`` and
    returns a dict whose single ``"messages"`` key holds the system prompt
    followed by the user request.
    """
    sentence = example["elaboration_sentence"]
    context = example["label_text"]
    user_request = (
        "Identify the subject of the following explanation sentence: "
        f"'{sentence}' within the given text: '{context}'"
    )
    system_message = {"role": "system", "content": SYSTEM_PROMPT_SUBJECT}
    user_message = {"role": "user", "content": user_request}
    return {"messages": [system_message, user_message]}

def format_target_example(example):
    """Build the chat messages that ask for the target phrase of an elaboration sentence.

    Reads ``example["elaboration_sentence"]`` and ``example["label_text"]`` and
    returns a dict whose single ``"messages"`` key holds the system prompt
    followed by the user request.
    """
    sentence = example["elaboration_sentence"]
    context = example["label_text"]
    user_request = (
        "Identify the target of the following explanation sentence: "
        f"'{sentence}' within the given text: '{context}'"
    )
    system_message = {"role": "system", "content": SYSTEM_PROMPT_TARGET}
    user_message = {"role": "user", "content": user_request}
    return {"messages": [system_message, user_message]}

def format_target_sent_example(example):
    """Build the chat messages that ask which sentence an elaboration is clarifying.

    Reads ``example["elaboration_sentence"]`` and ``example["label_text"]`` and
    returns a dict whose single ``"messages"`` key holds the system prompt
    followed by the user request.
    """
    sentence = example["elaboration_sentence"]
    context = example["label_text"]
    user_request = (
        "Identify the target sentence that the following explanation: "
        f"'{sentence}' is clarifying within the given text: '{context}'"
    )
    system_message = {"role": "system", "content": SYSTEM_PROMPT_TARGET_INFO}
    user_message = {"role": "user", "content": user_request}
    return {"messages": [system_message, user_message]}
# Every split is mapped through the target-sentence formatter, in the order
# train, validation, test.
(
    formatted_train_dataset,
    formatted_validation_dataset,
    formatted_test_dataset,
) = [
    dataset[split_name].map(format_target_sent_example)
    for split_name in ("train", "validation", "test")
]

# Identify subjects of elaborations

Analyze the subjects to explore the types of elaboration sentences.

In [8]:
from pydantic import BaseModel

# Response schema passed as `response_format` to the structured `parse` requests.
class ExplanationTarget(BaseModel):
    #explanation_sentence: str
    # The only field of the schema: a single string.
    target_sentence: str

In [16]:
import re

def extract_target_sentence(response: str) -> str:
    """
    Extract the target_sentence from the model's response.

    The response is expected to contain the field as a quoted Python string,
    e.g. ``target_sentence='Drones are small flying aircraft.'``. Python's repr
    switches to double quotes when the text contains an apostrophe and
    backslash-escapes quotes when it contains both kinds, so both quoting
    styles are accepted and escape sequences are decoded.

    Args:
        response: Text containing ``target_sentence=<quoted string>``.

    Returns:
        The sentence without the surrounding quotes.

    Raises:
        ValueError: If no ``target_sentence=<quoted string>`` is found.
    """
    import ast  # local import: only this helper needs it

    # A complete quoted literal. Backslash escapes are consumed as pairs so an
    # escaped quote inside the sentence does not end the match early.
    single_quoted = r"'(?:[^'\\]|\\.)*'"
    double_quoted = r'"(?:[^"\\]|\\.)*"'
    match = re.search(
        rf"target_sentence=({single_quoted}|{double_quoted})", response, re.DOTALL
    )
    if match is None:
        raise ValueError("target_sentence not found.")

    literal = match.group(1)
    try:
        # Decodes escapes such as \' and \\ exactly as Python wrote them.
        return ast.literal_eval(literal)
    except (ValueError, SyntaxError):
        # Not a valid Python literal (e.g. a raw line break inside the quotes):
        # return the text between the quotes unchanged.
        return literal[1:-1]

In [22]:
# One fixed test row is used for this manual spot check.
_sample = dataset["test"][100]
_sentence, _context = _sample["elaboration_sentence"], _sample["label_text"]

example_subject = (
    "Identify the subject of the following explanation sentence: "
    f"'{_sentence}' within the given text: '{_context}'"
)

example = (
    "Identify the target sentence that the following explanation: "
    f"'{_sentence}' is clarifying within the given text: '{_context}'"
)

# First request: free-text answer.
completion = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT_TARGET_INFO},
        {"role": "user", "content": example},
    ],
)
print("Assistant: " + completion.choices[0].message.content, end="\n\n")

# Second request: same messages, answer parsed into the ExplanationTarget schema.
completion = client.beta.chat.completions.parse(
    model=MODEL,
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT_TARGET_INFO},
        {"role": "user", "content": example},
    ],
    response_format=ExplanationTarget,
)

response = completion.choices[0].message.parsed
print(response)
print(extract_target_sentence(str(response)))

Assistant: The explanation sentence 'They come in all shapes and sizes, but usually look like helicopters.' is clarifying the sentence: 'Drones are small flying aircraft.'

target_sentence='Drones are small flying aircraft.'
Drones are small flying aircraft.


In [31]:
import pandas as pd  # imported here because no earlier cell in the file imports pandas

# Annotation table for the validation split: the three text columns are copied
# from the dataset and the four annotation columns start out as empty strings.
_validation_split = dataset["validation"]
df_results = pd.DataFrame({
    "source_text": _validation_split["source_text"],
    "label_text": _validation_split["label_text"],
    "elaboration_sentence": _validation_split["elaboration_sentence"],
    "subject": "",
    "target": "",
    "elaboration_info": "",
    "target_sentence": "",
})

## Get subjects, targets and elaboration info

In [43]:
import pandas as pd
from tqdm.notebook import tqdm

# Start from the annotations saved so far for the validation split.
df_results = pd.read_csv("../data/elaborations/validation_ds_sp_subjects_targets.csv")

formatted_dataset = formatted_validation_dataset
column_name = "elaboration_info"
# Missing values become "" so that unanswered rows can be recognised below.
df_results[column_name] = df_results[column_name].fillna("")

progress = tqdm(enumerate(formatted_dataset), total=len(formatted_dataset))
for idx, example in progress:
    # Rows that already hold an answer are skipped; no request is sent for them.
    if df_results.at[idx, column_name] != "":
        continue
    completion = client.chat.completions.create(model=MODEL, messages=example["messages"])
    df_results.at[idx, column_name] = completion.choices[0].message.content

  0%|          | 0/134 [00:00<?, ?it/s]

## Get target sentences for elaborations

In [14]:
import pandas as pd
from tqdm.notebook import tqdm

df_results = pd.read_csv("../data/elaborations/train_ds_sp_subjects_targets.csv")

column_name = "target_sentence_4o"
# The column is set to "" on every run, so every row is requested again even if
# the CSV already contained answers.
df_results[column_name] = ""
#df_results[column_name] = df_results[column_name].fillna("")
formatted_dataset = formatted_train_dataset

progress = tqdm(enumerate(formatted_dataset), total=len(formatted_dataset))
for idx, example in progress:
    if df_results.at[idx, column_name] != "":
        continue
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        messages=example["messages"],
        response_format=ExplanationTarget,
    )
    # NOTE(review): the parsed ExplanationTarget object itself is stored here,
    # not its target_sentence string - confirm this is what later steps expect.
    df_results.at[idx, column_name] = completion.choices[0].message.parsed

  0%|          | 0/1049 [00:00<?, ?it/s]

# Save results

In [15]:
# Number of rows whose answer column still equals the empty string.
is_unanswered = df_results[column_name] == ""
len(df_results[is_unanswered])

0

In [20]:
idx = 100
# Print the context passage of one row, a blank line, then its stored target.
context_text = df_results.loc[idx, "label_text"]
target_phrase = df_results.loc[idx, "target"]
print(context_text, end="\n\n")
print(target_phrase)

HACKENSACK, N.J. " The hottest present this holiday season is a gadget both kids and adults can enjoy: a drone. Drones are small flying aircraft. They come in all shapes and sizes, but usually look like helicopters. They often have cameras attached. One popular drone this year is the Phantom 2 Vision.

drones


In [None]:
# Show the results table as this cell's output.
df_results

In [17]:
# Write df_results to the train annotations CSV (the same path the target-sentence
# cell reads from), leaving out the row index.
df_results.to_csv("../data/elaborations/train_ds_sp_subjects_targets.csv", index=False)

## Create additional DataFrame for prior-context-only datasets

In [74]:
import pandas as pd
from tqdm.notebook import tqdm

split = "test"

# Saved annotations for this split, keeping only the first row per elaboration sentence.
df_sp = pd.read_csv(
    f"../data/elaborations/{split}_ds_sp_subjects_targets.csv"
).drop_duplicates(subset="elaboration_sentence", keep="first")

# Text columns of the currently loaded dataset for the same split.
_split_rows = dataset[split]
df = pd.DataFrame(
    {
        column: _split_rows[column]
        for column in ("source_text", "label_text", "elaboration_sentence")
    }
)

In [75]:
# Row counts of the saved annotations and of the new frame, one per line.
print(len(df_sp), len(df), sep="\n")

116
116


In [None]:
# Left-join the annotation columns from df_sp onto df, matching rows by elaboration_sentence.
columns_to_add = ["subject", "target", "elaboration_info", "target_sentence_4o"]
annotation_columns = df_sp[["elaboration_sentence", *columns_to_add]]
df = pd.merge(df, annotation_columns, on="elaboration_sentence", how="left")
df

In [78]:
# Write the merged frame for the selected split.
# NOTE(review): unlike the df_results save earlier in this notebook, index=False is
# not passed, so the row index is written as an extra first column - confirm this is intended.
df.to_csv(f"../data/elaborations/{split}_ds_s_subjects_targets.csv")