# Load model

In [2]:
import os
#os.environ["OPENAI_API_KEY"] = "my_key"

In [30]:
from openai import OpenAI 
import os

# Chat model name passed as `model=` to the completion calls in the cells below.
MODEL = "gpt-4o-mini"

# The API key is taken from the environment so it is never written into the notebook.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Load data

In [4]:
from datasets import load_dataset
import os

# Root folder of the elaboration data; each split lives in its own sub-directory.
data_path = "../data/elaborations"

# Split name -> CSV path for the "c2spo" files.
# Note that the validation file uses the "valid_" prefix, not "validation_".
data_files_c2spo = dict(
    train=os.path.join(data_path, "train", "train_ds_c2spo.csv"),
    validation=os.path.join(data_path, "validation", "valid_ds_c2spo.csv"),
    test=os.path.join(data_path, "test", "test_ds_c2spo.csv"),
)

# Same layout for the "c4spo" files (defined here but not loaded in this cell).
data_files_c4spo = dict(
    train=os.path.join(data_path, "train", "train_ds_c4spo.csv"),
    validation=os.path.join(data_path, "validation", "valid_ds_c4spo.csv"),
    test=os.path.join(data_path, "test", "test_ds_c4spo.csv"),
)

# Load the three c2spo CSVs into one DatasetDict and show its structure.
dataset = load_dataset("csv", data_files=data_files_c2spo)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 1049
    })
    validation: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 134
    })
    test: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 116
    })
})


### Create dataframes

In [19]:
import pandas as pd
import os

# Columns copied from the loaded dataset into the per-split result tables.
spo_columns = ["source_text", "elaboration_sentence"]

df_train_spo = pd.DataFrame({column: dataset["train"][column] for column in spo_columns})
df_valid_spo = pd.DataFrame({column: dataset["validation"][column] for column in spo_columns})
df_test_spo = pd.DataFrame({column: dataset["test"][column] for column in spo_columns})

# Write the initial result tables only when they do not exist yet.
# The cells below read these same files back (e.g. the "target_sentence_4o"
# column) and later cells append model outputs to them, so overwriting an
# existing file here would silently discard previously generated results.
for spo_frame, spo_file_name in (
    (df_train_spo, "train_ds_c2spo_subjects_targets.csv"),
    (df_valid_spo, "validation_ds_c2spo_subjects_targets.csv"),
    (df_test_spo, "test_ds_c2spo_subjects_targets.csv"),
):
    spo_path = os.path.join(data_path, spo_file_name)
    if os.path.exists(spo_path):
        print(f"Keeping existing file (not overwritten): {spo_path}")
    else:
        spo_frame.to_csv(spo_path, index=False)

### Add columns

In [6]:
import pandas as pd
import os

# Previously saved result tables, three files per split ("s", "sp", "c2spo").
df_train_s = pd.read_csv(os.path.join(data_path, "train_ds_s_subjects_targets.csv"))
df_train_sp = pd.read_csv(os.path.join(data_path, "train_ds_sp_subjects_targets.csv"))
df_train_spo = pd.read_csv(os.path.join(data_path, "train_ds_c2spo_subjects_targets.csv"))

df_valid_s = pd.read_csv(os.path.join(data_path, "validation_ds_s_subjects_targets.csv"))
df_valid_sp = pd.read_csv(os.path.join(data_path, "validation_ds_sp_subjects_targets.csv"))
df_valid_spo = pd.read_csv(os.path.join(data_path, "validation_ds_c2spo_subjects_targets.csv"))

df_test_s = pd.read_csv(os.path.join(data_path, "test_ds_s_subjects_targets.csv"))
df_test_sp = pd.read_csv(os.path.join(data_path, "test_ds_sp_subjects_targets.csv"))
df_test_spo = pd.read_csv(os.path.join(data_path, "test_ds_c2spo_subjects_targets.csv"))

# The c2spo tables are the ones matching the dataset loaded above.
df_train = df_train_spo
df_valid = df_valid_spo
df_test = df_test_spo

# Column copied from the saved tables onto the in-memory dataset splits.
col_name = "target_sentence_4o"

for split_name, split_frame in (
    ("train", df_train),
    ("validation", df_valid),
    ("test", df_test),
):
    # Skip splits that already carry the column so the cell can be re-run
    # without add_column failing on a duplicate column name.
    if col_name in dataset[split_name].column_names:
        continue
    dataset[split_name] = dataset[split_name].add_column(col_name, split_frame[col_name])

# Prompt and messages

In [9]:
# System prompt for subject extraction; used by format_subject_example.
SYSTEM_PROMPT_SUBJECT = """You are an expert in identifying the subject of the provided explanation sentence based on the context text. 
If the subject of the explanation sentence is a **pronoun (e.g., "it," "they," "he," "she")**, determine what the pronoun is referring to within the context.
The subject should be written as a phrase, not as a complete sentence.
If there are multiple possible subjects, select the first one you think is most appropriate.
"""

# System prompt for target-phrase extraction; used by format_target_example.
SYSTEM_PROMPT_TARGET = """You are an expert in identifying the target of the provided explanation sentence based on the context text. 
Return the main phrase which explanation sentence is referring to.
The phrase must appear in the context text, but does not need to be present in the explanation sentence itself.
"""

# System prompt for locating the sentence an elaboration clarifies;
# used by format_target_sent_complex_example.
SYSTEM_PROMPT_TARGET_SENT = """
You are an expert in identifying the sentence that the provided explanation sentence is clarifying. 
First, determine what question the explanation sentence is answering, and based on that, identify the sentence within the context text (sentences surrounding the explanation sentence) that the question could be asked about.
Return the sentence from the context text that the explanation sentence is clarifying.
"""

# System prompt for picking the clarified phrase inside an already identified
# target sentence; used by format_target_sent_target_example and the ad-hoc
# test calls further down.
SYSTEM_PROMPT_TARGET_SENT_TARGET = """
You are an expert in identifying the target phrase in a given sentence that is being clarified by the accompanying explanation sentence.
"""

#Return the sentence from the context text that the explanation sentence is clarifying.

# Turn examples into ChatML format

In [31]:
def format_subject_example(example):
    """Build the chat messages that ask for the subject of an elaboration.

    Args:
        example: Mapping with the keys "elaboration_sentence" and "source_text".

    Returns:
        A dict with a single "messages" key holding the system message
        (SYSTEM_PROMPT_SUBJECT) followed by the user message.
    """
    system_message = {"role": "system", "content": SYSTEM_PROMPT_SUBJECT}
    # The full source text is used as context here (not example["label_text"]).
    user_prompt = "Identify the subject of the following explanation sentence: '{}' within the given text: '{}'".format(
        example["elaboration_sentence"], example["source_text"]
    )
    user_message = {"role": "user", "content": user_prompt}
    return {"messages": [system_message, user_message]}

def format_target_example(example):
    """Build the chat messages that ask for the target phrase of an elaboration.

    Args:
        example: Mapping with the keys "elaboration_sentence" and "label_text".

    Returns:
        A dict with a single "messages" key holding the system message
        (SYSTEM_PROMPT_TARGET) followed by the user message.
    """
    system_message = {"role": "system", "content": SYSTEM_PROMPT_TARGET}
    user_prompt = "Identify the target of the following explanation sentence: '{}' within the given text: '{}'".format(
        example["elaboration_sentence"], example["label_text"]
    )
    user_message = {"role": "user", "content": user_prompt}
    return {"messages": [system_message, user_message]}

def format_target_sent_example(example):
    """Build the chat messages that ask which sentence an elaboration clarifies.

    The context given to the model is example["label_text"]; see
    format_target_sent_complex_example for the variant that uses
    example["source_text"] instead.

    Args:
        example: Mapping with the keys "elaboration_sentence" and "label_text".

    Returns:
        A dict with a single "messages" key holding the system message
        (SYSTEM_PROMPT_TARGET_SENT) followed by the user message.
    """
    return {
        "messages": [
            # Uses the target-sentence system prompt; the name previously
            # referenced here was not defined anywhere in the notebook.
            {"role": "system", "content": SYSTEM_PROMPT_TARGET_SENT},
            {"role": "user", "content":  "Identify the target sentence that the following explanation: '{}' is clarifying within the given text: '{}'".format(
    example["elaboration_sentence"], example["label_text"])},
        ]
    }

def format_target_sent_target_example(example):
    """Build the chat messages that ask for the clarified phrase in a target sentence.

    Args:
        example: Mapping with the keys "target_sentence_4o" and
            "elaboration_sentence".

    Returns:
        A dict with a single "messages" key holding the system message
        (SYSTEM_PROMPT_TARGET_SENT_TARGET) followed by the user message.
    """
    system_message = {"role": "system", "content": SYSTEM_PROMPT_TARGET_SENT_TARGET}
    # The stored target-sentence value is inserted as-is (no added quotes).
    user_prompt = "Identify the target phrase in the {} that is being clarified by the explanation sentence: '{}'".format(
        example["target_sentence_4o"], example["elaboration_sentence"]
    )
    user_message = {"role": "user", "content": user_prompt}
    return {"messages": [system_message, user_message]}

# Variant for complex input text: the context is the full source text.
def format_target_sent_complex_example(example):
    """Build the chat messages that ask which source sentence an elaboration clarifies.

    Args:
        example: Mapping with the keys "elaboration_sentence" and "source_text".

    Returns:
        A dict with a single "messages" key holding the system message
        (SYSTEM_PROMPT_TARGET_SENT) followed by the user message.
    """
    system_message = {"role": "system", "content": SYSTEM_PROMPT_TARGET_SENT}
    user_prompt = "Identify the target sentence that the following explanation: '{}' is clarifying within the given text: '{}'".format(
        example["elaboration_sentence"], example["source_text"]
    )
    user_message = {"role": "user", "content": user_prompt}
    return {"messages": [system_message, user_message]}
    
# Attach a "messages" column (subject-extraction prompt) to every split.
formatted_train_dataset, formatted_validation_dataset, formatted_test_dataset = (
    dataset[split_name].map(format_subject_example)
    for split_name in ("train", "validation", "test")
)

Map:   0%|          | 0/1049 [00:00<?, ? examples/s]

Map:   0%|          | 0/134 [00:00<?, ? examples/s]

Map:   0%|          | 0/116 [00:00<?, ? examples/s]

In [32]:
# Spot check: show one formatted training row, including its "messages" field.
formatted_train_dataset[1]

{'doc_num': 3,
 'source_text': 'Gun control was one of the topics the president addressed on Thursday in a "fireside hangout." The question-and-answer forum on Google+ was a modern version of President Franklin D. Roosevelt\'s informal radio broadcasts known as "fireside chats." She asked if the president wants to ban handguns. Obama replied that he didn\'t want to ban them, but rather require stricter background checks to buy them.',
 'label_text': None,
 'elaboration_sentence': "Obama's fireside hangout included many questions from different kinds of people.",
 'contextual_specificity_rating': 2,
 'target_sentence_4o': 'target_sentence=\'The question-and-answer forum on Google+ was a modern version of President Franklin D. Roosevelt\\\'s informal radio broadcasts known as "fireside chats."\'',
 'messages': [{'content': 'You are an expert in identifying the subject of the provided explanation sentence based on the context text. \nIf the subject of the explanation sentence is a **prono

# Identify subjects of elaborations

Analyze the subjects to explore the types of elaboration sentences.

In [33]:
from pydantic import BaseModel

# Schema passed as `response_format` to the structured `parse` calls below.
# Only `subject` is active; the commented fields are alternatives that can be
# switched in for the other extraction tasks.
# NOTE(review): recorded outputs further down show `target_phrase=...`, so those
# cells appear to have been run with a different field enabled — confirm before re-running.
class ExplanationTarget(BaseModel):
    #explanation_sentence: str
    subject: str
    #target_sentence: str
    #target_phrase: str

In [16]:
import re

def extract_target_sentence(response: str) -> str:
    """
    Extract the target_sentence from the model's response.

    The response is expected to contain ``target_sentence=<quoted string>``,
    where the value may be wrapped in single or double quotes and may contain
    backslash-escaped quotes (e.g. ``target_sentence='Roosevelt\\'s chats'``).

    Args:
        response: Text containing a ``target_sentence=...`` field.

    Returns:
        The sentence with surrounding quotes removed and escapes decoded.

    Raises:
        ValueError: If no ``target_sentence=...`` field is present.
    """
    import ast  # local import: only needed to decode the quoted literal

    # Match either quote style and step over backslash escapes, so an escaped
    # apostrophe inside the sentence does not end the match early.
    match = re.search(
        r"""target_sentence=(?P<quote>['"])(?P<body>(?:\\.|(?!(?P=quote)).)*)(?P=quote)""",
        response,
        flags=re.DOTALL,
    )
    if match is None:
        raise ValueError("target_sentence not found.")

    quoted_literal = match.group(0)[len("target_sentence="):]
    try:
        # Decode escapes such as \' the same way Python would read the literal.
        return ast.literal_eval(quoted_literal)
    except (ValueError, SyntaxError):
        # Not a valid Python literal (e.g. a raw newline inside): return the raw text.
        return match.group("body")

In [15]:
# Manual spot check on row 100 of the test split.
test_row = dataset["test"][100]

# Subject-style prompt (built here but not sent in this cell).
example_subject = "Identify the subject of the following explanation sentence: '{}' within the given text: '{}'".format(
    test_row["elaboration_sentence"], test_row["label_text"]
)

# Target-phrase prompt based on the stored target sentence.
example = "Identify the target phrase in the following sentence: '{}' that is being clarified by the explanation sentence: '{}'".format(
    test_row["target_sentence_4o"], test_row["elaboration_sentence"]
)

chat_messages = [
    {"role": "system", "content": SYSTEM_PROMPT_TARGET_SENT_TARGET},
    {"role": "user", "content": example},
]
completion = client.chat.completions.create(model=MODEL, messages=chat_messages)

# Plain-text replies are read from completion.choices[0].message.content.
print("Assistant: " + completion.choices[0].message.content, end="\n\n")

Assistant: The target phrase in the target sentence that is being clarified by the explanation sentence is "Drones."

target_phrase='Drones'


In [18]:
# Same spot check as above, but against the full source text and with a
# structured response parsed into ExplanationTarget.
test_row = dataset["test"][100]
example = "Identify the target phrase in the following text: '{}' that is being clarified by the explanation sentence: '{}'".format(
    test_row["source_text"], test_row["elaboration_sentence"]
)

chat_messages = [
    {"role": "system", "content": SYSTEM_PROMPT_TARGET_SENT_TARGET},
    {"role": "user", "content": example},
]
completion = client.beta.chat.completions.parse(
    model=MODEL,
    messages=chat_messages,
    response_format=ExplanationTarget,
)

# `.parsed` holds the reply as an ExplanationTarget instance.
response = completion.choices[0].message.parsed
print(response)

target_phrase='drones'


In [31]:
# Result table for the validation split: three columns copied from the dataset,
# followed by four empty string columns to be filled with model outputs.
validation_split = dataset["validation"]
df_results = pd.DataFrame({
    "source_text": validation_split["source_text"],
    "label_text": validation_split["label_text"],
    "elaboration_sentence": validation_split["elaboration_sentence"],
}).assign(
    subject="",
    target="",
    elaboration_info="",
    target_sentence="",
)

## Get subjects, targets and elaboration info

In [20]:
import pandas as pd
from tqdm.notebook import tqdm

splits = ["train","validation","test"]
split = "train"  # split processed by this run

df_results = pd.read_csv(f"../data/elaborations/{split}_ds_c2spo_subjects_targets.csv")

formatted_dataset = formatted_train_dataset
column_name = "target-phrase"
# NOTE(review): the column is reset to "" on every run, so the emptiness check
# in the loop is always true and all rows are sent to the API again.
# (Alternative kept from the original: df_results[column_name].fillna(""))
df_results[column_name] = ""

for idx, example in tqdm(enumerate(formatted_dataset), total=len(formatted_dataset)):
    # Skip rows that already hold a response.
    if df_results.at[idx, column_name] != "":
        continue
    completion = client.chat.completions.create(model=MODEL, messages=example["messages"])
    df_results.at[idx, column_name] = completion.choices[0].message.content

  0%|          | 0/1049 [00:00<?, ?it/s]

In [24]:
# Rename the freshly filled column before saving.
df_results = df_results.rename(columns={"target-phrase": "elaboration-info"})

In [28]:
# Write the updated table back to the file it was read from (no index column).
df_results.to_csv(f"../data/elaborations/{split}_ds_c2spo_subjects_targets.csv", index=False)

## Get structured outputs (subject, target sentence, or target phrase) in a specified response format

In [59]:
import pandas as pd
from tqdm.notebook import tqdm

split = "test"  # split processed by this run
df_results = pd.read_csv(f"../data/elaborations/{split}_ds_c2spo_subjects_targets.csv")

# Column to fill; other columns used with this cell: "target_sentence_4o", "target_sentence_target".
column_name = "subject"
# NOTE(review): the column is reset to "" on every run, so every row is requested again.
# (Alternative kept from the original: df_results[column_name].fillna(""))
df_results[column_name] = ""

formatted_dataset = formatted_test_dataset

for idx, example in tqdm(enumerate(formatted_dataset), total=len(formatted_dataset)):
    # Skip rows that already hold a response.
    if df_results.at[idx, column_name] != "":
        continue
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        messages=example["messages"],
        response_format=ExplanationTarget,
    )
    # The parsed ExplanationTarget object itself is stored in the cell.
    df_results.at[idx, column_name] = completion.choices[0].message.parsed

  0%|          | 0/116 [00:00<?, ?it/s]

In [60]:
import random
# Spot check: print the "subject" value of one randomly chosen row.
idx = random.choice(df_results.index.tolist())
print(df_results.loc[idx,"subject"])

subject='measles'


In [62]:
# Write the updated table back to the file it was read from (no index column).
df_results.to_csv(f"../data/elaborations/{split}_ds_c2spo_subjects_targets.csv", index=False)

# Inspect and save results

In [52]:
# Number of rows whose `column_name` value is still the empty string.
len(df_results[df_results[column_name]==""])

0

In [53]:
# Spot check row 100: the elaboration sentence next to the stored model output.
idx= 100
print(df_results.loc[idx,"elaboration_sentence"], end="\n\n")
print(df_results.loc[idx, column_name])

They come in all shapes and sizes, but usually look like helicopters.

target_phrase='Drones'


In [23]:
# Display the result table.
df_results

Unnamed: 0,source_text,elaboration_sentence,target_sentence_4o,target_sentence_target
0,Instead of using what money it had to provide ...,But he kept making weapons in secret.,target_sentence='North Korea decided to spend ...,target_phrase='spend more on nuclear weapons'
1,California was able to keep open all of its 28...,Repairs and mowing cannot happen as often.,"target_sentence=""Even with the private help, v...",target_phrase='maintenance reductions'
2,National parks in 2011 generated $13 billion i...,The entrances were blocked by snow.,target_sentence='Local businesses from tourism...,target_phrase='snowy park entrances'
3,"BALTIMORE""Some of springtime's more notable he...",They did this for nine years.,"target_sentence=""Some of springtime's more not...",target_phrase='a new study'
4,"Currently, nearly half a million acres of land...",That is a larger area than Los Angeles and New...,"target_sentence='Currently, nearly half a mill...",target_phrase='nearly half a million acres of ...
...,...,...,...,...
129,California sea lions were exploited in the 19t...,The hunting nearly wiped them out.,target_sentence='California sea lions were exp...,target_phrase='California sea lions were explo...
130,"Left unchanged, the supply of skilled workers ...",It's a cruel situation.,"target_sentence='Left unchanged, the supply of...",target_phrase='the supply of skilled workers w...
131,Some of Mexico's largest export farms have act...,Workers also are being paid more quickly.,target_sentence='Some of Mexico\'s largest exp...,target_phrase='reforming pay methods'
132,One thing is clear: Wal-Mart and other U.S. re...,The farms are located all over Mexico.,"target_sentence=""It can be a difficult task, e...",target_phrase='scattered in remote locations'


In [55]:
# NOTE(review): this path is hard-coded to the test split and the "sp" file,
# unlike the other saves, which interpolate `split` and write the "c2spo" file —
# confirm that `df_results` holds the matching table before running.
df_results.to_csv("../data/elaborations/test_ds_sp_subjects_targets.csv", index=False)

## Create additional df for prior-context only datasets

In [56]:
import pandas as pd
from tqdm.notebook import tqdm

split = "test"
# NOTE(review): `data_files_c2s` is not defined in any cell visible in this
# notebook (only `data_files_c2spo` and `data_files_c4spo` are) — define it
# before running this cell on a fresh kernel.
dataset = load_dataset('csv', data_files=data_files_c2s)

# "sp" table that carries the extra columns to be copied over.
df_sp = pd.read_csv(f"../data/elaborations/{split}_ds_sp_subjects_targets.csv")
# Keep one row per elaboration sentence so the left-merge below cannot duplicate rows.
df_sp = df_sp.drop_duplicates(subset="elaboration_sentence", keep="first")

# "s" table that receives the extra columns.
df = pd.read_csv(f"../data/elaborations/{split}_ds_s_subjects_targets.csv")

# Alternative construction from the loaded dataset (disabled). Kept as `#`
# comments so it is not evaluated and echoed as the cell's output.
# df = pd.DataFrame({
#     "source_text" : dataset[split]["source_text"],
#     "label_text" : dataset[split]["label_text"],
#     "elaboration_sentence":dataset[split]["elaboration_sentence"],
# })

'df = pd.DataFrame({\n    "source_text" : dataset[split]["source_text"],\n    "label_text" : dataset[split]["label_text"],\n    "elaboration_sentence":dataset[split]["elaboration_sentence"],\n    \n}) '

In [57]:
# Compare row counts of the de-duplicated "sp" table and the "s" table.
print(len(df_sp))
print(len(df))

116
116


In [None]:
# merge the additional columns from df_sp based on elaboration_sentence
columns_to_add = ["target_sentence_target"]
#columns_to_add = ["subject", "target", "elaboration_info", "target_sentence_4o"]
df = pd.merge(df, df_sp[["elaboration_sentence"] + columns_to_add], on="elaboration_sentence", how="left")
df

In [60]:
# Save without the index, like every other save in this notebook. This file is
# read back with pd.read_csv above, so writing the index would add a new
# "Unnamed: 0" column on every round trip.
df.to_csv(f"../data/elaborations/{split}_ds_s_subjects_targets.csv", index=False)

In [59]:
# Spot check row 100: the stored target sentence next to the merged target phrase.
idx = 100
print(df.loc[idx, "target_sentence_4o"], end="\n\n")
# Name the merged column explicitly instead of relying on `column_name`, which
# is whatever an earlier cell last assigned.
print(df.loc[idx, "target_sentence_target"])

target_sentence='Drones are small flying aircraft.'

target_phrase='Drones'
