# Load model

In [2]:
import os
#os.environ["OPENAI_API_KEY"] = "my_key"

In [3]:
from openai import OpenAI 
import os

# Chat model used for every request issued from this notebook.
MODEL = "gpt-4o"

# The API key is read from the environment; the lookup yields None when the
# variable is not set.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Load data

In [36]:
from datasets import load_dataset
import os

# Root directory that holds the per-split CSV folders.
data_path = "../data/elaborations"

# Split name -> CSV path for the "c2spo" variant of the dataset.
data_files_c2spo = dict(
    train=os.path.join(data_path, "train", "train_ds_c2spo.csv"),
    validation=os.path.join(data_path, "validation", "valid_ds_c2spo.csv"),
    test=os.path.join(data_path, "test", "test_ds_c2spo.csv"),
)

# Split name -> CSV path for the "c2sp" variant of the dataset.
data_files_c2sp = dict(
    train=os.path.join(data_path, "train", "train_ds_c2sp.csv"),
    validation=os.path.join(data_path, "validation", "valid_ds_c2sp.csv"),
    test=os.path.join(data_path, "test", "test_ds_c2sp.csv"),
)

# Load the c2spo CSVs as one DatasetDict keyed by split, then show its summary.
dataset = load_dataset(
    "csv",
    data_files=data_files_c2spo,
)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 1049
    })
    validation: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 134
    })
    test: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 116
    })
})


### Create dataframes

In [19]:
import pandas as pd
import os

def _source_elaboration_frame(split_ds):
    """Return a two-column frame (source_text, elaboration_sentence) for one split."""
    return pd.DataFrame({
        "source_text": split_ds["source_text"],
        "elaboration_sentence": split_ds["elaboration_sentence"],
    })


def _write_csv_if_missing(frame, csv_path):
    """Write `frame` to `csv_path` (without the index) unless the file already exists."""
    if os.path.exists(csv_path):
        print(f"Keeping existing file (not overwritten): {csv_path}")
        return
    frame.to_csv(csv_path, index=False)


df_train_spo = _source_elaboration_frame(dataset["train"])
df_valid_spo = _source_elaboration_frame(dataset["validation"])
df_test_spo = _source_elaboration_frame(dataset["test"])

# Other cells of this notebook read these same "*_ds_c2spo_subjects_targets.csv"
# files, add model-output columns to them and save them back under the same
# name. Writing the bare two-column frames unconditionally would discard those
# columns, so an existing file is left untouched. Delete the file by hand to
# regenerate it from scratch.
_write_csv_if_missing(df_train_spo, os.path.join(data_path, "train_ds_c2spo_subjects_targets.csv"))
_write_csv_if_missing(df_valid_spo, os.path.join(data_path, "validation_ds_c2spo_subjects_targets.csv"))
_write_csv_if_missing(df_test_spo, os.path.join(data_path, "test_ds_c2spo_subjects_targets.csv"))

### Add columns

In [5]:
import pandas as pd
import os

# Per-split annotation tables for the "s", "sp" and "c2spo" dataset variants.
df_train_s = pd.read_csv(os.path.join(data_path, "train_ds_s_subjects_targets.csv"))
df_train_sp = pd.read_csv(os.path.join(data_path, "train_ds_sp_subjects_targets.csv"))
df_train_spo = pd.read_csv(os.path.join(data_path, "train_ds_c2spo_subjects_targets.csv"))

df_valid_s = pd.read_csv(os.path.join(data_path, "validation_ds_s_subjects_targets.csv"))
df_valid_sp = pd.read_csv(os.path.join(data_path, "validation_ds_sp_subjects_targets.csv"))
df_valid_spo = pd.read_csv(os.path.join(data_path, "validation_ds_c2spo_subjects_targets.csv"))

df_test_s = pd.read_csv(os.path.join(data_path, "test_ds_s_subjects_targets.csv"))
df_test_sp = pd.read_csv(os.path.join(data_path, "test_ds_sp_subjects_targets.csv"))
df_test_spo = pd.read_csv(os.path.join(data_path, "test_ds_c2spo_subjects_targets.csv"))

# Variant whose annotations are attached to the currently loaded `dataset`.
df_train = df_train_spo
df_valid = df_valid_spo
df_test = df_test_spo

col_name = "target_sentence_4o"

for _split_name, _annotations in (
    ("train", df_train),
    ("validation", df_valid),
    ("test", df_test),
):
    _split_ds = dataset[_split_name]
    # Make the cell re-runnable: adding a column whose name already exists
    # would produce a duplicated column name, so drop the stale copy first.
    if col_name in _split_ds.column_names:
        _split_ds = _split_ds.remove_columns(col_name)
    # Rows are matched purely by position, so the lengths must agree.
    if len(_annotations) != len(_split_ds):
        raise ValueError(
            f"'{_split_name}': annotation table has {len(_annotations)} rows "
            f"but the dataset split has {len(_split_ds)}"
        )
    dataset[_split_name] = _split_ds.add_column(col_name, _annotations[col_name])

# Prompt and messages

In [44]:
# System prompt for subject identification; used as the system turn by
# format_subject_example. The text is sent to the model verbatim, so any edit
# here changes what the model is asked.
SYSTEM_PROMPT_SUBJECT = """You are an expert in identifying the subject of the provided explanation sentence based on the context text. 
If the subject of the explanation sentence is a **pronoun (e.g., "it," "they," "he," "she")**, determine what the pronoun refers to within the context.
The subject MUST be written as a concise phrase (not as a complete sentence), and it MUST be found in the context text (it does not need to appear in the explanation sentence itself).
"""

# System prompt for target-phrase identification; used as the system turn by
# format_target_example. Sent to the model verbatim.
SYSTEM_PROMPT_TARGET = """You are an expert in identifying the target phrase that provided explanation sentence is clarifying. 
Return the main phrase which explanation sentence is referring to.
The phrase must appear in the context text, but does not need to be present in the explanation sentence itself.
"""

# System prompt for target-sentence identification; used as the system turn by
# format_target_sent_complex_example. Sent to the model verbatim (including the
# leading newline of the triple-quoted string).
SYSTEM_PROMPT_TARGET_SENT = """
You are an expert in identifying the sentence that the provided explanation sentence is clarifying. 
First, determine what question the explanation sentence is answering, and based on that, identify the sentence within the context text (sentences surrounding the explanation sentence) that the question could be asked about.
Return the sentence from the context text that the explanation sentence is clarifying.
"""

# System prompt for locating the clarified phrase inside an already identified
# target sentence; used by format_target_sent_target_example and by the
# "Target phrase" example cell. Sent to the model verbatim.
SYSTEM_PROMPT_TARGET_SENT_TARGET = """
You are an expert in identifying the target phrase in a given sentence that is being clarified by the accompanying explanation sentence.
"""

# System prompt for elaboration generation; used as the system turn by
# format_elab_example. It asks for four things: the target sentence, the target
# phrase, a clarification type from the listed categories, and one explanation
# sentence of at most 10 words. Sent to the model verbatim.
# NOTE(review): step 1 opens its bold span with three asterisks but closes it
# with two, and step 4 says "the above's" -- left untouched here because
# changing the wording changes the request; confirm before fixing.
SYSTEM_PROMPT_ELAB = """You are an expert in clarifying unclear or complex terms and concepts in a given text. Your task is as follows:  
1. Identify ***the target sentence** that requires clarification.  
2. Specify **the exact phrase** within that sentence that needs clarification. 
3. Determine the **type of clarification** that should be added. Choose from the following categories:  
- **Definition**: Provide a clear and concise definition of the term or concept.  
- **Example**: Offer one or more examples that illustrate the meaning of the term or concept.  
- **Background**: Provide relevant background knowledge or general statements to give context.  
- **Flow**: Describe the sequence or flow of actions to make the events clearer and easier to follow.  
- **Reason**: Explain the reason behind an action or occurrence.  
- **Result**: Clarify the outcome or result of an action or occurrence.  
- **Speculation**: Offer a possible outcome, assumption, or hypothesis, such as "what would happen if..." scenarios.  
- **Other**: If the clarification does not fit into the above categories, specify what kind of clarification you are providing. 
4. Based on the above's, generate **ONE concise explanation sentence** (made up of 10 words or fewer) that provides additional information related to the identified target sentence and phrase.  
The explanation sentence should be clear, plain, and simple in tone!"""

#Return the sentence from the context text that the explanation sentence is clarifying.

# Turn examples into ChatML format

In [37]:
def format_subject_example(example):
    """Build the chat request that asks for the subject of an elaboration.

    Reads ``example["elaboration_sentence"]`` and ``example["source_text"]``.
    Returns ``{"messages": [system_turn, user_turn]}`` where the system turn
    carries SYSTEM_PROMPT_SUBJECT.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT_SUBJECT}
    # The surrounding text comes from "source_text"; "label_text" is the
    # alternative context column that was tried here.
    user_prompt = "Identify the subject of the following explanation sentence: '{}' within the given text: '{}'".format(
        example["elaboration_sentence"],
        example["source_text"],
    )
    user_turn = {"role": "user", "content": user_prompt}
    return {"messages": [system_turn, user_turn]}

def format_target_example(example):
    """Build the chat request that asks for the phrase an elaboration clarifies.

    Reads ``example["elaboration_sentence"]`` and ``example["label_text"]``.
    Returns ``{"messages": [system_turn, user_turn]}`` where the system turn
    carries SYSTEM_PROMPT_TARGET.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT_TARGET}]
    request_text = "Identify the target of the following explanation sentence: '{}' within the given text: '{}'".format(
        example["elaboration_sentence"],
        example["label_text"],
    )
    messages.append({"role": "user", "content": request_text})
    return {"messages": messages}

def format_target_sent_example(example):
    """Build the chat request that asks which sentence an elaboration clarifies.

    The context given to the model is ``example["label_text"]``; the elaboration
    is ``example["elaboration_sentence"]``.

    Returns ``{"messages": [system_turn, user_turn]}``. The system turn uses
    SYSTEM_PROMPT_TARGET_SENT, the target-sentence prompt defined in this
    notebook (the previously referenced SYSTEM_PROMPT_TARGET_INFO is not
    defined anywhere, so calling this function raised NameError).
    """
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT_TARGET_SENT},
            {"role": "user", "content":  "Identify the target sentence that the following explanation: '{}' is clarifying within the given text: '{}'".format(
                example["elaboration_sentence"], example["label_text"])},
        ]
    }

def format_target_sent_target_example(example):
    """Build the chat request that asks for the clarified phrase in a target sentence.

    Reads ``example["target_sentence_4o"]`` (inserted unquoted into the request)
    and ``example["elaboration_sentence"]``. Returns
    ``{"messages": [system_turn, user_turn]}`` where the system turn carries
    SYSTEM_PROMPT_TARGET_SENT_TARGET.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT_TARGET_SENT_TARGET}
    template = "Identify the target phrase in the {} that is being clarified by the explanation sentence: '{}'"
    user_turn = {
        "role": "user",
        "content": template.format(example["target_sentence_4o"], example["elaboration_sentence"]),
    }
    return {"messages": [system_turn, user_turn]}

# Target-sentence request for the complex input text ("source_text" column).
def format_target_sent_complex_example(example):
    """Build the chat request that asks which sentence an elaboration clarifies.

    Same request wording as the label-text variant, but the context handed to
    the model is ``example["source_text"]``. Returns
    ``{"messages": [system_turn, user_turn]}`` where the system turn carries
    SYSTEM_PROMPT_TARGET_SENT.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT_TARGET_SENT}
    question = "Identify the target sentence that the following explanation: '{}' is clarifying within the given text: '{}'".format(
        example["elaboration_sentence"],
        example["source_text"],
    )
    return {"messages": [system_turn, {"role": "user", "content": question}]}

def format_elab_example(example):
    """Build the chat request that asks the model to generate an elaboration.

    Only ``example["source_text"]`` is read. Returns
    ``{"messages": [system_turn, user_turn]}`` where the system turn carries
    SYSTEM_PROMPT_ELAB.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT_ELAB}
    # Wording kept exactly as used for the recorded runs.
    instruction = "Identify the sentence and specific phrase in the given text: '{}' that need clarification. Then, specify the type of clarificatio needed and generate one concise explanation sentence.".format(
        example["source_text"]
    )
    user_turn = {"role": "user", "content": instruction}
    return {"messages": [system_turn, user_turn]}
    
# Attach a "messages" column (elaboration-generation request) to every split.
(
    formatted_train_dataset,
    formatted_validation_dataset,
    formatted_test_dataset,
) = (
    dataset[split_name].map(format_elab_example)
    for split_name in ("train", "validation", "test")
)

Map:   0%|          | 0/1049 [00:00<?, ? examples/s]

Map:   0%|          | 0/134 [00:00<?, ? examples/s]

Map:   0%|          | 0/116 [00:00<?, ? examples/s]

In [38]:
# Show the second chat message (the user turn) of the first formatted test example.
formatted_test_dataset[0]["messages"][1]

{'content': "Identify the sentence and specific phrase in the given text: 'The new companies in town require workers with a college degree or advanced training. But the turnaround will take more than just new companies moving to town; New Haven needs to invest in educating its youth so they will be qualified to do those high-skilled jobs when they become adults. The New Haven Promise is no one-way street. The goal is to create an expectation that all students can and will attend college.' that need clarification. Then, specify the type of clarificatio needed and generate one concise explanation sentence.",
 'role': 'user'}

# Identify subjects of elaborations

Analyze the subjects to explore the types of elaboration sentences.

In [16]:
import re

def extract_target_sentence(response: str) -> str:
    """
    Extract the target_sentence from the model's response.

    ``response`` is the text form of a structured reply, e.g.
    ``target_sentence='...' target_phrase='...'``. The value is written as a
    Python-style quoted string, so it may be wrapped in single OR double quotes
    (double quotes appear when the sentence itself contains an apostrophe) and
    may contain backslash-escaped quotes. All of these forms are handled.

    Returns:
        The sentence with the surrounding quotes removed and escapes decoded.

    Raises:
        ValueError: if no ``target_sentence=<quoted string>`` field is present.
    """
    import ast

    # group(1): the whole quoted literal; group(2): the quote character used.
    # Inside the literal accept either an escaped character or any character
    # that is neither a backslash nor the closing quote.
    match = re.search(
        r"target_sentence=((['\"])(?:\\.|(?!\2)[^\\])*\2)",
        response,
        flags=re.DOTALL,
    )
    if match is None:
        raise ValueError("target_sentence not found.")

    literal = match.group(1)
    try:
        # Decode the literal exactly as Python would (handles \' and \").
        return ast.literal_eval(literal)
    except (ValueError, SyntaxError):
        # Not a valid Python literal (e.g. a raw newline inside the quotes):
        # fall back to the text between the quotes.
        return literal[1:-1]

## Examples

### Subject

In [33]:
import random 
# Pick one formatted test example (unseeded, so it differs between runs) and
# show its user turn.
example = random.choice(formatted_test_dataset)
print(example["messages"][1], end="\n\n")

# Request a structured reply; ExplanationTarget is the response schema class
# defined in another cell of this notebook and must exist before this runs.
completion = client.beta.chat.completions.parse(
    messages=example["messages"],
    model=MODEL,
    response_format=ExplanationTarget,
)

response = completion.choices[0].message.parsed
print(response)

{'content': "Identify the subject of the following explanation sentence: 'It is not easy to relax, or to feel positive.' within the given text: 'In Philadelphia, there's a 50 percent higher prevalence of smoking among the poor than among the non-poor, according to Giridhar Mallya, director of policy and planning for the Philadelphia Department of Public Health. The morality of buying cigarettes when you're poor is complicated. The poor smoke to manage high levels of stress and depression, Mallya said, as much a part of poverty as empty pockets. It's also harder for the poor to get smoking-cessation counseling and nicotine patches than others who may receive help through insurance, experts said.'", 'role': 'user'}

subject='the poor'


### Target phrase

In [19]:
# Build a one-off target-phrase request from test row 100.
test_row = dataset["test"][100]
example = "Identify the target phrase in the following text: '{}' that is being clarified by the explanation sentence: '{}'".format(
    test_row["source_text"],
    test_row["elaboration_sentence"],
)

# ExplanationTarget is the response schema class defined in another cell of
# this notebook and must exist before this runs.
completion = client.beta.chat.completions.parse(
    response_format=ExplanationTarget,
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT_TARGET_SENT_TARGET},
        {"role": "user", "content": example},
    ],
    model=MODEL,
)

response = completion.choices[0].message.parsed
print(response)

subject='drones'


### Elaboration

In [19]:
import random

# Fixed-seed shuffle followed by taking the first `sample_size` rows gives a
# repeatable random subset of the formatted test split.
seed = 42
sample_size = 50
shuffled_test_dataset = formatted_test_dataset.shuffle(seed=seed)
random_sample_dataset = shuffled_test_dataset.select(range(sample_size))

In [18]:
# Display the summary (features and row count) of the sampled subset.
random_sample_dataset

Dataset({
    features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating', 'messages'],
    num_rows: 50
})

In [13]:
import random 
from pydantic import BaseModel

# Pick one example from the sampled subset (unseeded choice) and show its user turn.
example = random.choice(random_sample_dataset)
print(example["messages"][1], end="\n\n")


# Structured-output schema for elaboration generation. Field order is kept as
# is; no docstring is added so that the generated schema stays unchanged.
class ExplanationTarget(BaseModel):
    target_sentence: str
    target_phrase: str
    clarification_type: str
    explanation_sentence: str


completion = client.beta.chat.completions.parse(
    messages=example["messages"],
    model=MODEL,
    response_format=ExplanationTarget,
)

response = completion.choices[0].message.parsed
print(response)

{'content': "Identify the sentence and specific phrase in the given text: 'He said hate toward Roma people remains widespread there. His people still suffer unfair and insulting treatment, he said. They are also the first to be fired. He said that when poor Romani villagers come to Berlin, they find only overcrowded apartments.' that need clarification. Then, specify the type of clarificatio needed and generate one concise explanation sentence.", 'role': 'user'}

target_sentence='His people still suffer unfair and insulting treatment, he said.' target_phrase='unfair and insulting treatment' clarification_type='Example' explanation_sentence='Discrimination in housing and employment are common examples.'


In [40]:
# The template below is built from the test split only, so the output file name
# must name that split too. `split` used to be taken from whatever an earlier
# executed cell had left behind (undefined on a fresh kernel, possibly "train").
split = "test"
results_path = f"../data/elaborations/{split}_ds_c2spo_gpt4o_elab_preds_targets.csv"

# One row per test example; the model-output columns start out blank.
df_results = pd.DataFrame({
    "doc_num": formatted_test_dataset["doc_num"],
    "source_text" : formatted_test_dataset["source_text"],
    "label_text" : formatted_test_dataset["label_text"],
    "elaboration_sentence":formatted_test_dataset["elaboration_sentence"],
    "response":"",
    "pred_elaboration":"",
    "target_sentence_target": "",
    "target_sentence_4o":"",
    "clarification_category":"",
})

# A later cell reads this file, fills the blank "response" cells and saves it
# back. Re-writing the blank template over an existing file would discard the
# responses already collected, so only create the file when it is missing.
if os.path.exists(results_path):
    print(f"Keeping existing results file (not overwritten): {results_path}")
else:
    df_results.to_csv(results_path, index=False)

## Get subjects, targets and elaboration info

In [20]:
import pandas as pd
from tqdm.notebook import tqdm

splits = ["train", "validation", "test"]
split = "train"

results_csv = f"../data/elaborations/{split}_ds_c2spo_subjects_targets.csv"
df_results = pd.read_csv(results_csv)

formatted_dataset = formatted_train_dataset
column_name = "target-phrase"
# The column is blanked on every run, so each row gets a fresh request.
# (Alternative that keeps existing values: df_results[column_name].fillna(""))
df_results[column_name] = ""

n_examples = len(formatted_dataset)
for idx, example in tqdm(enumerate(formatted_dataset), total=n_examples):
    # Rows that already hold an answer are skipped.
    if df_results.at[idx, column_name] != "":
        continue
    completion = client.chat.completions.create(
        model=MODEL,
        messages=example["messages"],
    )
    df_results.at[idx, column_name] = completion.choices[0].message.content

  0%|          | 0/1049 [00:00<?, ?it/s]

In [24]:
# Give the freshly filled column its final name.
df_results = df_results.rename(columns={"target-phrase": "elaboration-info"})

In [28]:
# Write the results back to the file they were read from (index not written).
subjects_targets_csv = f"../data/elaborations/{split}_ds_c2spo_subjects_targets.csv"
df_results.to_csv(subjects_targets_csv, index=False)

## Get target sentences for elaborations in specified response format

In [23]:
from pydantic import BaseModel

# for elaboration targets identification
class ExplanationTarget(BaseModel):
    #explanation_sentence: str
    subject: str
    #target_sentence: str
    #target_phrase: str

# for elaboration generation
class ExplanationTarget(BaseModel):
    target_sentence: str
    target_phrase: str
    clarification_type: str
    explanation_sentence: str

In [29]:
# Display the summary (features and row count) of the formatted test split.
formatted_test_dataset

Dataset({
    features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating', 'messages'],
    num_rows: 116
})

In [41]:
import pandas as pd
from tqdm.notebook import tqdm

split = "test"
#df_results = pd.read_csv(f"../data/elaborations/{split}_ds_c2spo_subjects_targets.csv")
results_csv = f"../data/elaborations/{split}_ds_c2spo_gpt4o_elab_preds_targets.csv"
df_results = pd.read_csv(results_csv)

# Column to fill. Names used with this loop: "subject ", "target_sentence_4o",
# "target_sentence_target".
column_name = "response"
#df_results[column_name] = ""
# Blank cells come back from the CSV as NaN; turn them into "" so that rows
# already answered in a previous run can be told apart and skipped.
df_results[column_name] = df_results[column_name].fillna("")
formatted_dataset = formatted_test_dataset

n_examples = len(formatted_dataset)
for idx, example in tqdm(enumerate(formatted_dataset), total=n_examples):
    if df_results.at[idx, column_name] != "":
        continue
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        messages=example["messages"],
        response_format=ExplanationTarget,
    )
    df_results.at[idx, column_name] = completion.choices[0].message.parsed

  0%|          | 0/116 [00:00<?, ?it/s]

In [43]:
# Write the filled results back to the file they were read from (index not written).
elab_preds_csv = f"../data/elaborations/{split}_ds_c2spo_gpt4o_elab_preds_targets.csv"
df_results.to_csv(elab_preds_csv, index=False)

# Save results

In [52]:
# Number of rows whose `column_name` cell is still the empty string.
len(df_results[df_results[column_name]==""])

0

In [53]:
# Spot-check one row: print its elaboration sentence, then the value stored in
# the column currently named by `column_name`.
# NOTE(review): `column_name` is whatever the most recently executed cell
# assigned, so the second line depends on execution order -- confirm.
idx= 100
print(df_results.loc[idx,"elaboration_sentence"], end="\n\n")
print(df_results.loc[idx, column_name])

They come in all shapes and sizes, but usually look like helicopters.

target_phrase='Drones'


In [23]:
# Display the whole results frame.
df_results

Unnamed: 0,source_text,elaboration_sentence,target_sentence_4o,target_sentence_target
0,Instead of using what money it had to provide ...,But he kept making weapons in secret.,target_sentence='North Korea decided to spend ...,target_phrase='spend more on nuclear weapons'
1,California was able to keep open all of its 28...,Repairs and mowing cannot happen as often.,"target_sentence=""Even with the private help, v...",target_phrase='maintenance reductions'
2,National parks in 2011 generated $13 billion i...,The entrances were blocked by snow.,target_sentence='Local businesses from tourism...,target_phrase='snowy park entrances'
3,"BALTIMORE""Some of springtime's more notable he...",They did this for nine years.,"target_sentence=""Some of springtime's more not...",target_phrase='a new study'
4,"Currently, nearly half a million acres of land...",That is a larger area than Los Angeles and New...,"target_sentence='Currently, nearly half a mill...",target_phrase='nearly half a million acres of ...
...,...,...,...,...
129,California sea lions were exploited in the 19t...,The hunting nearly wiped them out.,target_sentence='California sea lions were exp...,target_phrase='California sea lions were explo...
130,"Left unchanged, the supply of skilled workers ...",It's a cruel situation.,"target_sentence='Left unchanged, the supply of...",target_phrase='the supply of skilled workers w...
131,Some of Mexico's largest export farms have act...,Workers also are being paid more quickly.,target_sentence='Some of Mexico\'s largest exp...,target_phrase='reforming pay methods'
132,One thing is clear: Wal-Mart and other U.S. re...,The farms are located all over Mexico.,"target_sentence=""It can be a difficult task, e...",target_phrase='scattered in remote locations'


In [55]:
# Write the current df_results to the "sp" test file; the path is hard-coded
# and does not follow the `split` variable.
df_results.to_csv("../data/elaborations/test_ds_sp_subjects_targets.csv", index=False)

## Create an additional DataFrame for the prior-context-only datasets

In [56]:
import pandas as pd
from tqdm.notebook import tqdm

# Load the "sp" annotation table (the source of the extra columns) and the "s"
# table (the one to be extended) for the chosen split.
split = "test"
# NOTE(review): `data_files_c2s` is not defined in any visible cell of this
# notebook (only data_files_c2spo and data_files_c2sp are), so this line raises
# NameError on a fresh kernel. The reloaded `dataset` is only referenced by the
# disabled snippet at the end of this cell -- confirm the intended file mapping.
dataset = load_dataset('csv', data_files=data_files_c2s)
df_sp = pd.read_csv(f"../data/elaborations/{split}_ds_sp_subjects_targets.csv")
# Keep a single row per elaboration sentence (the first one encountered).
df_sp = df_sp.drop_duplicates(subset="elaboration_sentence", keep="first")
df = pd.read_csv(f"../data/elaborations/{split}_ds_s_subjects_targets.csv")
# The string literal below is a disabled alternative that would build `df`
# straight from the dataset. As the last expression of the cell it is echoed as
# the cell output.
"""df = pd.DataFrame({
    "source_text" : dataset[split]["source_text"],
    "label_text" : dataset[split]["label_text"],
    "elaboration_sentence":dataset[split]["elaboration_sentence"],
    
}) """

'df = pd.DataFrame({\n    "source_text" : dataset[split]["source_text"],\n    "label_text" : dataset[split]["label_text"],\n    "elaboration_sentence":dataset[split]["elaboration_sentence"],\n    \n}) '

In [57]:
# Compare the row counts of the de-duplicated "sp" table and the "s" table.
print(len(df_sp))
print(len(df))

116
116


In [None]:
# merge the additional columns from df_sp based on elaboration_sentence
# Copy the additional columns from df_sp into df, matching rows on
# elaboration_sentence.
columns_to_add = ["target_sentence_target"]
#columns_to_add = ["subject", "target", "elaboration_info", "target_sentence_4o"]

# Columns that df already has are skipped: merging them again (for example when
# this cell is re-run, or when the file read into df was saved after an earlier
# merge) would yield "_x"/"_y" suffixed duplicates instead of the named column.
missing_columns = [name for name in columns_to_add if name not in df.columns]
if missing_columns:
    df = pd.merge(
        df,
        df_sp[["elaboration_sentence"] + missing_columns],
        on="elaboration_sentence",
        how="left",
        # Fail loudly if df_sp holds a sentence more than once: a left merge
        # would otherwise silently multiply the matching rows of df.
        validate="many_to_one",
    )
df

In [60]:
# Save without the row index. This file is read back with pd.read_csv by
# another cell, and writing the index would add an extra "Unnamed: 0" column on
# every save/load round trip. Every other to_csv call in this notebook already
# passes index=False.
df.to_csv(f"../data/elaborations/{split}_ds_s_subjects_targets.csv", index=False)

In [59]:
# Spot-check one merged row: print its "target_sentence_4o" value, then the
# value in the column currently named by `column_name`.
# NOTE(review): `column_name` is not assigned in this section; it keeps the
# value from whichever earlier cell ran last ("target-phrase" or "response" in
# the visible cells), so this line depends on execution order -- confirm.
idx= 100
print(df.loc[idx,"target_sentence_4o"], end="\n\n")
print(df.loc[idx, column_name])

target_sentence='Drones are small flying aircraft.'

target_phrase='Drones'
