# Load model

In [3]:
import os
#os.environ["OPENAI_API_KEY"] = "my_key"

In [2]:
from openai import OpenAI 
import os

# Chat model name passed to every completion request in this notebook.
MODEL = "gpt-4o"

# The key is taken from the OPENAI_API_KEY environment variable (None when unset).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Load data

In [46]:
from datasets import load_dataset
import os

# Root folder; each split lives in its own sub-folder.
data_path = "../data/elaborations"

# split name -> CSV path for the "c2s" files.
# The validation files use the shorter "valid" file-name prefix.
data_files_c2s = {
    split_name: os.path.join(data_path, split_name, f"{prefix}_ds_c2s.csv")
    for split_name, prefix in (("train", "train"), ("validation", "valid"), ("test", "test"))
}

# split name -> CSV path for the "c2sp" files (the set that is loaded below).
data_files_c2sp = {
    split_name: os.path.join(data_path, split_name, f"{prefix}_ds_c2sp.csv")
    for split_name, prefix in (("train", "train"), ("validation", "valid"), ("test", "test"))
}

dataset = load_dataset("csv", data_files=data_files_c2sp)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 1049
    })
    validation: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 134
    })
    test: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 116
    })
})


### Add columns

In [47]:
import pandas as pd
import os

# "_s" annotation tables: loaded here, but not attached to `dataset` in this cell.
df_train_s = pd.read_csv(os.path.join(data_path, "train_ds_s_subjects_targets.csv"))
df_valid_s = pd.read_csv(os.path.join(data_path, "validation_ds_s_subjects_targets.csv"))
df_test_s = pd.read_csv(os.path.join(data_path, "test_ds_s_subjects_targets.csv"))

# "_sp" annotation tables.
df_train_sp = pd.read_csv(os.path.join(data_path, "train_ds_sp_subjects_targets.csv"))
df_valid_sp = pd.read_csv(os.path.join(data_path, "validation_ds_sp_subjects_targets.csv"))
df_test_sp = pd.read_csv(os.path.join(data_path, "test_ds_sp_subjects_targets.csv"))

# The "_sp" tables are the ones used for the column that is attached below.
df_train, df_valid, df_test = df_train_sp, df_valid_sp, df_test_sp

# Column copied from the annotation tables onto the matching dataset split.
col_name = "target_sentence_4o"

# Rows are matched purely by position: add_column does not join on any key.
for split_name, annotations in (("train", df_train), ("validation", df_valid), ("test", df_test)):
    dataset[split_name] = dataset[split_name].add_column(col_name, annotations[col_name])

# Prompt and messages

In [48]:
# System prompts, one per extraction task. They are sent verbatim as the "system"
# message, so any edit to the text (including whitespace) changes what the model sees.

# Used by format_subject_example: asks for the subject of the explanation sentence,
# resolving pronouns against the context text.
SYSTEM_PROMPT_SUBJECT = """You are an expert in identifying the subject of the provided explanation sentence based on the context text. 
If the subject of the explanation sentence is a **pronoun (e.g., "it," "they," "he," "she")**, determine what the pronoun is referring to within the context.
The subject should be written as a phrase, not as a complete sentence.
If there are multiple possible subjects, select the first one you think is most appropriate.
"""

# Used by format_target_example: asks for the phrase in the context text that the
# explanation sentence refers to.
SYSTEM_PROMPT_TARGET = """You are an expert in identifying the target of the provided explanation sentence based on the context text. 
Return the main phrase which explanation sentence is referring to.
The phrase must appear in the context text, but does not need to be present in the explanation sentence itself.
"""

# Used by format_target_sent_example: asks for the context sentence that the
# explanation sentence clarifies.
SYSTEM_PROMPT_TARGET_INFO = """
You are an expert in identifying the sentence that the provided explanation sentence is clarifying. 
First, determine what question the explanation sentence is answering, and based on that, identify the sentence within the context text (sentences surrounding the explanation sentence) that the question could be asked about.
Return the sentence from the context text that the explanation sentence is clarifying.
"""

# Used by format_target_sent_target_example and the single-row spot check: asks for
# the phrase inside an already identified target sentence.
SYSTEM_PROMPT_TARGET_SENT_TARGET = """
You are an expert in identifying the target phrase in a given sentence that is being clarified by the accompanying explanation sentence.
"""

#Return the sentence from the context text that the explanation sentence is clarifying.

# Turn examples into ChatML format

In [49]:
def format_subject_example(example):
    """Build the chat messages that ask for the subject of an elaboration sentence.

    Args:
        example: Mapping with "elaboration_sentence" and "label_text" entries.

    Returns:
        A dict with a single "messages" key holding the SYSTEM_PROMPT_SUBJECT system
        turn followed by a user turn that quotes the sentence and its context text.
    """
    sentence = example["elaboration_sentence"]
    context = example["label_text"]
    user_prompt = (
        "Identify the subject of the following explanation sentence: "
        f"'{sentence}' within the given text: '{context}'"
    )
    system_turn = {"role": "system", "content": SYSTEM_PROMPT_SUBJECT}
    user_turn = {"role": "user", "content": user_prompt}
    return {"messages": [system_turn, user_turn]}

def format_target_example(example):
    """Build the chat messages that ask for the target of an elaboration sentence.

    Args:
        example: Mapping with "elaboration_sentence" and "label_text" entries.

    Returns:
        A dict with a single "messages" key holding the SYSTEM_PROMPT_TARGET system
        turn followed by a user turn that quotes the sentence and its context text.
    """
    template = (
        "Identify the target of the following explanation sentence: "
        "'{sentence}' within the given text: '{context}'"
    )
    user_prompt = template.format(
        sentence=example["elaboration_sentence"],
        context=example["label_text"],
    )
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT_TARGET},
            {"role": "user", "content": user_prompt},
        ]
    }

def format_target_sent_example(example):
    """Build the chat messages that ask which context sentence an elaboration clarifies.

    Args:
        example: Mapping with "elaboration_sentence" and "label_text" entries.

    Returns:
        A dict with a single "messages" key holding the SYSTEM_PROMPT_TARGET_INFO
        system turn followed by a user turn that quotes the explanation and the
        context text.
    """
    explanation = example["elaboration_sentence"]
    context = example["label_text"]
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT_TARGET_INFO},
        {
            "role": "user",
            "content": (
                f"Identify the target sentence that the following explanation: '{explanation}' "
                f"is clarifying within the given text: '{context}'"
            ),
        },
    ]
    return {"messages": messages}

def format_target_sent_target_example(example):
    """Build the chat messages that ask for the clarified phrase inside a target sentence.

    Args:
        example: Mapping with "target_sentence_4o" and "elaboration_sentence" entries.

    Returns:
        A dict with a single "messages" key holding the
        SYSTEM_PROMPT_TARGET_SENT_TARGET system turn followed by the user turn.
    """
    target_sentence = example["target_sentence_4o"]
    explanation = example["elaboration_sentence"]
    # The target sentence value is inserted as-is (no surrounding quotes are added);
    # only the explanation sentence is wrapped in single quotes.
    user_prompt = (
        f"Identify the target phrase in the {target_sentence} "
        f"that is being clarified by the explanation sentence: '{explanation}'"
    )
    system_turn = {"role": "system", "content": SYSTEM_PROMPT_TARGET_SENT_TARGET}
    return {"messages": [system_turn, {"role": "user", "content": user_prompt}]}
    
# Attach a "messages" column to every split. The formatter chosen here decides which
# task the request loops further down will run.
formatted_train_dataset = dataset["train"].map(function=format_target_sent_target_example)
formatted_validation_dataset = dataset["validation"].map(function=format_target_sent_target_example)
formatted_test_dataset = dataset["test"].map(function=format_target_sent_target_example)

# Identify subjects of elaborations

Analyze the subjects to explore the types of elaboration sentences.

In [50]:
from pydantic import BaseModel

# Structured-output schema passed as `response_format` to the
# `client.beta.chat.completions.parse` calls in this notebook.
# NOTE(review): pydantic appears to copy a class docstring into the generated JSON
# schema description, so the notes here are kept as `#` comments to avoid changing
# what is sent to the API - confirm before adding a docstring.
class ExplanationTarget(BaseModel):
    # Fields that are currently disabled:
    #explanation_sentence: str
    #target_sentence: str
    # The only active field: the phrase returned by the model.
    target_phrase: str

In [16]:
import re

def extract_target_sentence(response: str) -> str:
    """
    Extract the target_sentence value from the textual form of a model response.

    The stored responses look like ``target_sentence='...'``, but the value is
    written with double quotes when the sentence itself contains an apostrophe
    (``target_sentence="Turkey's ..."``), and quotes inside the value may be
    backslash-escaped. Both quoting styles and escaped quotes are handled.

    Args:
        response: Text containing a ``target_sentence=<quoted string>`` fragment.

    Returns:
        The sentence with the surrounding quotes removed and escapes decoded.

    Raises:
        ValueError: If no ``target_sentence=<quoted string>`` fragment is present.
    """
    import ast  # local import so the cell does not depend on another import cell

    # A single- or double-quoted literal; `\\.` lets an escaped quote stay inside it.
    pattern = r"""target_sentence=('(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*")"""
    match = re.search(pattern, response, re.DOTALL)
    if match is None:
        raise ValueError("target_sentence not found.")

    quoted = match.group(1)
    try:
        # Decode the literal the same way Python would (undoes \' and \" escapes).
        return ast.literal_eval(quoted)
    except (ValueError, SyntaxError):
        # Not a valid Python literal after all: fall back to the raw inner text.
        return quoted[1:-1]

In [15]:
# Spot-check on a single test row before running the full loops.
sample_row = dataset["test"][100]

# Subject-style prompt for the same row; it is built here but not sent below.
example_subject = (
    "Identify the subject of the following explanation sentence: "
    f"'{sample_row['elaboration_sentence']}' within the given text: '{sample_row['label_text']}'"
)

# Target-phrase prompt that is actually sent in both requests.
example = (
    "Identify the target phrase in the following sentence: "
    f"'{sample_row['target_sentence_4o']}' that is being clarified by the explanation sentence: "
    f"'{sample_row['elaboration_sentence']}'"
)

spot_check_messages = [
    {"role": "system", "content": SYSTEM_PROMPT_TARGET_SENT_TARGET},
    {"role": "user", "content": example},
]

# 1) Plain completion: the answer is free text in choices[0].message.content.
completion = client.chat.completions.create(model=MODEL, messages=spot_check_messages)
print("Assistant: " + completion.choices[0].message.content, end="\n\n")

# 2) Structured completion: the answer is parsed into an ExplanationTarget instance.
completion = client.beta.chat.completions.parse(
    model=MODEL,
    messages=spot_check_messages,
    response_format=ExplanationTarget,
)

response = completion.choices[0].message.parsed
print(response)

Assistant: The target phrase in the target sentence that is being clarified by the explanation sentence is "Drones."

target_phrase='Drones'


In [31]:
# Fresh results table for the validation split: three text columns copied from the
# dataset, plus four answer columns initialised to the empty string.
df_results = pd.DataFrame({
    **{
        name: dataset["validation"][name]
        for name in ("source_text", "label_text", "elaboration_sentence")
    },
    **dict.fromkeys(("subject", "target", "elaboration_info", "target_sentence"), ""),
})

## Get subjects, targets and elaboration info

In [43]:
import pandas as pd
from tqdm.notebook import tqdm

# Resume from the saved validation table so finished rows are not requested again.
df_results = pd.read_csv("../data/elaborations/validation_ds_sp_subjects_targets.csv")

# NOTE(review): formatted_validation_dataset is built with
# format_target_sent_target_example, while the answers are written to
# "elaboration_info" - confirm the formatter matches the column before re-running.
formatted_dataset = formatted_validation_dataset
column_name = "elaboration_info"

# Empty cells are read back as NaN; turn them into "" so the check below sees them.
df_results[column_name] = df_results[column_name].fillna("")

for idx, example in enumerate(tqdm(formatted_dataset, total=len(formatted_dataset))):
    # Rows that already hold an answer are left untouched.
    if df_results.at[idx, column_name] != "":
        continue
    completion = client.chat.completions.create(model=MODEL, messages=example["messages"])
    df_results.at[idx, column_name] = completion.choices[0].message.content

  0%|          | 0/134 [00:00<?, ?it/s]

## Get target phrases within the target sentences of elaborations

In [51]:
import pandas as pd
from tqdm.notebook import tqdm

# Load the saved test table and fill "target_sentence_target" with the structured
# answer for every row that does not have one yet.
df_results = pd.read_csv("../data/elaborations/test_ds_sp_subjects_targets.csv")
column_name = "target_sentence_target"
formatted_dataset = formatted_test_dataset

# Keep answers that are already in the file so a re-run only requests the missing
# rows; create the column only when it does not exist yet. Resetting it
# unconditionally would make the "already answered" check below always pass and
# repeat every API call.
if column_name in df_results.columns:
    # Empty cells are read back as NaN; normalise them to "" for the check below.
    df_results[column_name] = df_results[column_name].fillna("")
else:
    df_results[column_name] = ""

for idx, example in tqdm(enumerate(formatted_dataset), total=len(formatted_dataset)):
    if df_results.at[idx, column_name] != "":
        continue
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        messages=example["messages"],
        response_format=ExplanationTarget,
    )
    parsed = completion.choices[0].message.parsed
    # NOTE(review): per the OpenAI SDK docs `parsed` may be None (e.g. on a refusal);
    # such rows are left as "" so the pending-row count still reports them - confirm.
    if parsed is not None:
        # NOTE(review): this stores the ExplanationTarget object itself, so the saved
        # CSV holds its text form (target_phrase='...'), not the bare phrase. Kept
        # as-is to match the existing files; confirm what downstream readers expect.
        df_results.at[idx, column_name] = parsed

  0%|          | 0/116 [00:00<?, ?it/s]

# Save results

In [52]:
# Number of rows whose answer is still the empty string (0 means every row is filled).
int((df_results[column_name] == "").sum())

0

In [53]:
# Eyeball one row: the elaboration sentence, then the answer stored for it.
idx = 100
print(df_results.at[idx, "elaboration_sentence"], end="\n\n")
print(df_results.at[idx, column_name])

They come in all shapes and sizes, but usually look like helicopters.

target_phrase='Drones'


In [54]:
# Display the results table as the cell output (the rendered view is abbreviated).
df_results

Unnamed: 0,source_text,label_text,elaboration_sentence,subject,target,elaboration_info,target_sentence_4o,target_sentence_target
0,New companies have come that need skilled work...,New companies have come that need skilled work...,Many do not have the money to get the training...,Many (youth),the training they need,The explanation sentence clarifies the reason ...,target_sentence='New Haven youth want those jo...,target_phrase='do not have the education or th...
1,But the death toll could have been higher. For...,But the death toll could have been higher. For...,A gauge is a kind of measuring stick.,A gauge,gauge,The explanation sentence 'A gauge is a kind of...,target_sentence='Forecasters said more people ...,target_phrase='river gauges'
2,Forecasters said more people could have died i...,Forecasters said more people could have died i...,It sits in the water.,a gauge,gauge,"The explanation sentence ""It sits in the water...",target_sentence='A gauge is a kind of measurin...,target_phrase='gauge'
3,Helicopters were banned from flying over the s...,Helicopters were banned from flying over the s...,They raced against the setting sun to search t...,Rescuers,the area,The explanation sentence 'They raced against t...,target_sentence='Rescuers feared engine noise ...,target_phrase='Rescuers feared engine noise wo...
4,Istanbul is an important city for trade. Turke...,Istanbul is an important city for trade. Turke...,Turkey is larger than the state of Texas.,Turkey,Turkey,The explanation sentence 'Turkey is larger tha...,"target_sentence=""Turkey is also one of the Uni...",target_phrase='Turkey'
...,...,...,...,...,...,...,...,...
111,"Instead of preparing for the Super Bowl, he ha...","Instead of preparing for the Super Bowl, he ha...","Like many mysteries, this one may not be solve...",this one,the deflated footballs,"The explanation sentence 'Like many mysteries,...",target_sentence='The NFL is looking into the m...,target_phrase='the matter'
112,Should kids play tackle football? Football is ...,Should kids play tackle football? Football is ...,Players get bounced around.,Players,Players,"The explanation sentence ""Players get bounced ...",target_sentence='Football is a rough game.',target_phrase='rough game'
113,He signed two of the biggest acts of last year...,He signed two of the biggest acts of last year...,Then the companies sell and promote the records.,the companies,the records,The explanation sentence 'Then the companies s...,target_sentence='Record companies have to sign...,target_phrase='sign contracts with musicians a...
114,It worries about the spread of disease. It als...,It worries about the spread of disease. It als...,There are certainly good reasons to be worried.,the spread of disease and the inability of new...,the spread of disease,"The explanation sentence ""There are certainly ...",target_sentence='It also worries that newly re...,target_phrase='worries'


In [55]:
# Write the table back to the test "_sp" file without the index column.
df_results.to_csv(
    path_or_buf="../data/elaborations/test_ds_sp_subjects_targets.csv",
    index=False,
)

## Create an additional DataFrame for the prior-context-only datasets

In [56]:
import pandas as pd
from tqdm.notebook import tqdm

# Split whose "_s" table is extended with columns from the matching "_sp" table.
split = "test"

# Reload `dataset` from the "c2s" files. In this cell only the disabled snippet at
# the bottom reads it.
dataset = load_dataset('csv', data_files=data_files_c2s)

# Source of the extra columns: keep one row per elaboration sentence so the later
# merge on that column cannot multiply rows.
df_sp = pd.read_csv(f"../data/elaborations/{split}_ds_sp_subjects_targets.csv")
df_sp = df_sp.drop_duplicates(subset="elaboration_sentence", keep="first")

# Table that receives the extra columns.
df = pd.read_csv(f"../data/elaborations/{split}_ds_s_subjects_targets.csv")

# Disabled alternative that builds `df` from the dataset instead of the CSV. It is
# kept as comments, not as a bare string literal, so it is no longer evaluated and
# echoed as the cell output.
# df = pd.DataFrame({
#     "source_text" : dataset[split]["source_text"],
#     "label_text" : dataset[split]["label_text"],
#     "elaboration_sentence":dataset[split]["elaboration_sentence"],
# })

'df = pd.DataFrame({\n    "source_text" : dataset[split]["source_text"],\n    "label_text" : dataset[split]["label_text"],\n    "elaboration_sentence":dataset[split]["elaboration_sentence"],\n    \n}) '

In [57]:
# Row counts of the de-duplicated "_sp" table and of the "_s" table, one per line.
print(len(df_sp), len(df), sep="\n")

116
116


In [None]:
# merge the additional columns from df_sp based on elaboration_sentence
columns_to_add = ["target_sentence_target"]
#columns_to_add = ["subject", "target", "elaboration_info", "target_sentence_4o"]
df = pd.merge(df, df_sp[["elaboration_sentence"] + columns_to_add], on="elaboration_sentence", how="left")
df

In [60]:
# Write the merged table back to the "_s" file for this split. index=False matches
# how the "_sp" table is saved: writing the index would add one more "Unnamed: 0"
# column every time the file is read and saved again.
df.to_csv(f"../data/elaborations/{split}_ds_s_subjects_targets.csv", index=False)

In [59]:
# Eyeball one merged row: the stored target sentence, then the merged column.
# `column_name` is whatever an earlier cell last set it to.
idx = 100
print(df.at[idx, "target_sentence_4o"], end="\n\n")
print(df.at[idx, column_name])

target_sentence='Drones are small flying aircraft.'

target_phrase='Drones'
