# Load model

In [2]:
import os
#os.environ["OPENAI_API_KEY"] = ""

In [3]:
from openai import OpenAI 
import os

# Chat model name passed to every completion request below.
MODEL = "gpt-4o"
# The API key is read from the OPENAI_API_KEY environment variable (None when unset).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Load data

In [6]:
from dataset_utils import load_dataset_from_csv

# Dataset type and setting forwarded to the project loader; `setting` is also
# interpolated into output file names by later cells.
ds_type, setting = "c4op", "base"

dataset = load_dataset_from_csv(ds_type, setting)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['doc_num', 'source_text', 'target_sentence', 'target_sentence_bertscore', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 1049
    })
    validation: Dataset({
        features: ['doc_num', 'source_text', 'target_sentence', 'target_sentence_bertscore', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 134
    })
    test: Dataset({
        features: ['doc_num', 'source_text', 'target_sentence', 'target_sentence_bertscore', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 116
    })
})


### Create dataframes

In [19]:
import pandas as pd
import os

# Directory holding the per-split CSV files. It has to be defined here because this
# cell runs before any other cell that sets `data_path`; later cells read these files
# from "../data/elaborations/<split>_ds_c2spo_subjects_targets.csv".
data_path = "../data/elaborations"


def _build_spo_frame(split_name):
    """Return a DataFrame with the source text and elaboration sentence of one split."""
    split_ds = dataset[split_name]
    return pd.DataFrame({
        "source_text": split_ds["source_text"],
        "elaboration_sentence": split_ds["elaboration_sentence"],
    })


df_train_spo = _build_spo_frame("train")
df_valid_spo = _build_spo_frame("validation")
df_test_spo = _build_spo_frame("test")

# Persist one CSV per split (without the pandas index column).
for _split_name, _frame in (
    ("train", df_train_spo),
    ("validation", df_valid_spo),
    ("test", df_test_spo),
):
    _frame.to_csv(
        os.path.join(data_path, f"{_split_name}_ds_c2spo_subjects_targets.csv"),
        index=False,
    )

### Add columns

In [5]:
import pandas as pd
import os

def _read_targets(split_name, tag):
    """Read ``<split_name>_ds_<tag>_subjects_targets.csv`` from ``data_path``."""
    return pd.read_csv(os.path.join(data_path, f"{split_name}_ds_{tag}_subjects_targets.csv"))


# One frame per (split, setting) pair; the file name prefix is the split name.
df_train_s = _read_targets("train", "s")
df_train_sp = _read_targets("train", "sp")
df_train_spo = _read_targets("train", "c2spo")

df_valid_s = _read_targets("validation", "s")
df_valid_sp = _read_targets("validation", "sp")
df_valid_spo = _read_targets("validation", "c2spo")

df_test_s = _read_targets("test", "s")
df_test_sp = _read_targets("test", "sp")
df_test_spo = _read_targets("test", "c2spo")

# The c2spo frames are the ones whose column is copied into the dataset.
df_train, df_valid, df_test = df_train_spo, df_valid_spo, df_test_spo

col_name = "target_sentence_4o"

for _split_name, _frame in (("train", df_train), ("validation", df_valid), ("test", df_test)):
    dataset[_split_name] = dataset[_split_name].add_column(col_name, _frame[col_name])

# Prompt and messages

In [7]:
# System prompt for subject identification; used by format_subject_example.
SYSTEM_PROMPT_SUBJECT = """You are an expert in identifying the subject of the provided explanation sentence based on the context text. 
If the subject of the explanation sentence is a **pronoun (e.g., "it," "they," "he," "she")**, determine what the pronoun refers to within the context.
The subject MUST be written as a concise phrase (not as a complete sentence), and it MUST be found in the context text (it does not need to appear in the explanation sentence itself).
"""

# System prompt for target-phrase identification; used by format_target_example.
SYSTEM_PROMPT_TARGET = """You are an expert in identifying the target phrase in a given text that provided explanation sentence is clarifying or simplifying. 
Return the main phrase which explanation sentence is referring to.
Return the identified phrase from the CONTEXT TEXT, not the explanation sentence itself.
"""

# System prompt for target-sentence identification; used by format_target_sent_example
# and format_target_sent_complex_example.
SYSTEM_PROMPT_TARGET_SENT = """
You are an expert in identifying the sentence that the provided explanation sentence clarifies or refers to. 
Your task is to return the sentence from the CONTEXT TEXT, not the explanation sentence itself!
"""

# System prompt asking for both the target sentence and the phrase inside it; used by
# format_target_sent_and_target_phrase_example.
SYSTEM_PROMPT_TARGET_SENT_AND_TARGET_PHRASE = """
You are an expert in identifying unclear or complex terms and concepts in a given text.
Your task is to:
1. Identify the sentence from the CONTEXT TEXT that the provided explanation sentence clarifies or refers to.
2. Specify the exact phrase within that sentence that is being clarified.
Return the identified sentence and the phrase from the CONTEXT TEXT, not the explanation sentence itself.
"""

# System prompt for elaboration generation (target sentence, target phrase,
# clarification type and a short explanation sentence); used by format_elab_example.
SYSTEM_PROMPT_ELAB = """You are an expert in clarifying unclear or complex terms and concepts in a given text. Your task is as follows:  
1. Identify ***the target sentence** that requires clarification.  
2. Specify **the exact phrase** within that sentence that needs clarification. 
3. Determine the **type of clarification** that should be added. Choose from the following categories:  
- **Definition**: Provide a clear and concise definition of the term or concept.  
- **Example**: Offer one or more examples that illustrate the meaning of the term or concept.  
- **Background**: Provide relevant background knowledge or general statements to give context.  
- **Flow**: Describe the sequence or flow of actions to make the events clearer and easier to follow.  
- **Reason**: Explain the reason behind an action or occurrence.  
- **Result**: Clarify the outcome or result of an action or occurrence.  
- **Speculation**: Offer a possible outcome, assumption, or hypothesis, such as "what would happen if..." scenarios.  
- **Other**: If the clarification does not fit into the above categories, specify what kind of clarification you are providing. 
4. Based on the above's, generate **ONE concise explanation sentence** (made up of 10 words or fewer) that provides additional information related to the identified target sentence and phrase.  
The explanation sentence should be clear, plain, and simple in tone!"""

#Return the sentence from the context text that the explanation sentence is clarifying.

# Turn examples into ChatML format

In [8]:
def format_subject_example(example):
    """Build the chat messages that ask for the subject of an elaboration.

    Reads ``elaboration_sentence`` and ``source_text`` from ``example`` and
    returns ``{"messages": [system_message, user_message]}``.
    """
    elaboration = example["elaboration_sentence"]
    context = example["source_text"]
    user_prompt = (
        f"Identify the subject of the following explanation sentence: '{elaboration}' "
        f"within the given text: '{context}'"
    )
    system_message = {"role": "system", "content": SYSTEM_PROMPT_SUBJECT}
    user_message = {"role": "user", "content": user_prompt}
    return {"messages": [system_message, user_message]}

def format_target_example(example):
    """Build the chat messages that ask for the target phrase of an elaboration.

    Reads ``elaboration_sentence`` and ``target_sentence`` from ``example`` and
    returns ``{"messages": [system_message, user_message]}``.
    """
    template = (
        "Identify the target phrase of the following explanation sentence: '{elab}' "
        "within the given text: '{text}'"
    )
    user_prompt = template.format(
        elab=example["elaboration_sentence"],
        text=example["target_sentence"],
    )
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT_TARGET},
        {"role": "user", "content": user_prompt},
    ]
    return {"messages": messages}

def format_target_sent_example(example):
    """Build the chat messages that ask which context sentence an elaboration refers to.

    Reads ``elaboration_sentence`` and ``source_text`` from ``example`` and
    returns ``{"messages": [system_message, user_message]}``.
    """
    elaboration = example["elaboration_sentence"]
    context = example["source_text"]
    user_prompt = (
        f"Identify the target sentence that the following explanation: '{elaboration}' "
        f"clarifies or refers to within the given text: '{context}'"
    )
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT_TARGET_SENT},
            {"role": "user", "content": user_prompt},
        ]
    }

def format_target_sent_and_target_phrase_example(example):
    """Build the chat messages that ask for both the target sentence and target phrase.

    Reads ``source_text`` and ``elaboration_sentence`` from ``example`` (the
    context comes first in the user prompt) and returns
    ``{"messages": [system_message, user_message]}``.
    """
    context = example["source_text"]
    elaboration = example["elaboration_sentence"]
    user_prompt = (
        f"Identify the target sentence and target phrase in the context text: '{context}' "
        f"that is being clarified by the explanation sentence: '{elaboration}'"
    )
    system_message = {"role": "system", "content": SYSTEM_PROMPT_TARGET_SENT_AND_TARGET_PHRASE}
    return {"messages": [system_message, {"role": "user", "content": user_prompt}]}

# for complex input text
def format_target_sent_complex_example(example):
    """Variant of the target-sentence prompt worded "is clarifying".

    Reads ``elaboration_sentence`` and ``source_text`` from ``example`` and
    returns ``{"messages": [system_message, user_message]}``; it uses the same
    system prompt as ``format_target_sent_example``.
    """
    elaboration = example["elaboration_sentence"]
    context = example["source_text"]
    user_message = {
        "role": "user",
        "content": (
            f"Identify the target sentence that the following explanation: '{elaboration}' "
            f"is clarifying within the given text: '{context}'"
        ),
    }
    return {"messages": [{"role": "system", "content": SYSTEM_PROMPT_TARGET_SENT}, user_message]}

def format_elab_example(example):
    """Build the chat messages for the elaboration-generation task.

    Reads ``source_text`` from ``example`` and returns
    ``{"messages": [system_message, user_message]}``. The user prompt asks the
    model to pick the sentence and phrase that need clarification, name the
    clarification type and write one explanation sentence.
    """
    context = example["source_text"]
    # The prompt text previously misspelled "clarification" as "clarificatio".
    user_prompt = (
        f"Identify the sentence and specific phrase in the given text: '{context}' "
        "that need clarification. Then, specify the type of clarification needed "
        "and generate one concise explanation sentence."
    )
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT_ELAB},
            {"role": "user", "content": user_prompt},
        ]
    }
    
# Run the elaboration formatter over every split (train, validation, test in that
# order) and keep the mapped splits both in a dict and under individual names.
datasets = {
    split_name: dataset[split_name].map(format_elab_example)
    for split_name in ("train", "validation", "test")
}
formatted_train_dataset = datasets["train"]
formatted_validation_dataset = datasets["validation"]
formatted_test_dataset = datasets["test"]

Map:   0%|          | 0/1049 [00:00<?, ? examples/s]

Map:   0%|          | 0/134 [00:00<?, ? examples/s]

Map:   0%|          | 0/116 [00:00<?, ? examples/s]

In [9]:
# Display the user message (second entry of "messages") of the first formatted test example.
formatted_test_dataset[0]["messages"][1]

{'content': 'Identify the sentence and specific phrase in the given text: \'New Haven Promise is offering workshops for parents on how to make sense of the college application process, which can be confusing. New Haven has pledged to have all students reading proficiently by the end of third grade, putting them on track for college readiness by high school. The Promise program will be considered a success if students are able to do well enough in college to finish with a 2-year or 4-year degree. Educators are tracking students who enter college on a Promise scholarship and finding that many do not enroll for a second year after finishing the first. Many are unable to keep up the minimum grade average. The ability to not just start college, but stay with it is called "college persistence." Middle-class students on the Promise Scholarships tend to do better than their lower-income peers. That goes to show that there are big barriers to achieving success, even when there is money pointed 

# Identify subjects of elaborations

Analyze the subjects to explore the types of elaboration sentences.

In [16]:
import re

def extract_target_sentence(response: str) -> str:
    """
    Extract the target_sentence from the model's response.

    ``response`` is the string form of a parsed reply, e.g.
    ``target_sentence='...' target_phrase='...'``. The value is written as a
    Python string literal, which means it is wrapped in double quotes when the
    text contains an apostrophe and may contain backslash escapes. Both quote
    styles are accepted and the decoded text is returned.

    Raises:
        ValueError: if no ``target_sentence=<quoted value>`` field is present.
    """
    import ast  # local import: only needed to decode the quoted literal

    single_quoted = r"'(?:[^'\\]|\\.)*'"
    double_quoted = r'"(?:[^"\\]|\\.)*"'
    match = re.search(
        rf"target_sentence=({single_quoted}|{double_quoted})",
        response,
        flags=re.DOTALL,
    )
    if match is None:
        raise ValueError("target_sentence not found.")

    literal = match.group(1)
    try:
        return ast.literal_eval(literal)
    except (ValueError, SyntaxError):
        # Not a valid Python literal (e.g. a raw newline inside the quotes):
        # fall back to the text between the outer quotes.
        return literal[1:-1]

## Examples

### Subject

In [33]:
import random 
# Pick one formatted test example at random and echo the user message that will be sent.
example = random.choice(formatted_test_dataset)
user_message = example["messages"][1]
print(user_message, end="\n\n")

# Structured-output request; ExplanationTarget must already be defined in the kernel.
completion = client.beta.chat.completions.parse(
    model=MODEL,
    messages=example["messages"],
    response_format=ExplanationTarget,
)

response = completion.choices[0].message.parsed
print(response)

{'content': "Identify the subject of the following explanation sentence: 'It is not easy to relax, or to feel positive.' within the given text: 'In Philadelphia, there's a 50 percent higher prevalence of smoking among the poor than among the non-poor, according to Giridhar Mallya, director of policy and planning for the Philadelphia Department of Public Health. The morality of buying cigarettes when you're poor is complicated. The poor smoke to manage high levels of stress and depression, Mallya said, as much a part of poverty as empty pockets. It's also harder for the poor to get smoking-cessation counseling and nicotine patches than others who may receive help through insurance, experts said.'", 'role': 'user'}

subject='the poor'


### Target phrase

In [19]:
# Hand-built user prompt for one fixed test row (index 100).
test_row = dataset["test"][100]
example = "Identify the target phrase in the following text: '{}' that is being clarified by the explanation sentence: '{}'".format(
    test_row["source_text"], test_row["elaboration_sentence"])

completion = client.beta.chat.completions.parse(
    model=MODEL,
    messages=[
        # The original referenced SYSTEM_PROMPT_TARGET_SENT_TARGET, which is not defined
        # anywhere in this notebook (NameError on a fresh kernel).
        # NOTE(review): SYSTEM_PROMPT_TARGET_SENT_AND_TARGET_PHRASE is assumed to be the
        # renamed constant because its formatter uses the same user-prompt wording; if only
        # the phrase is wanted, SYSTEM_PROMPT_TARGET may be the better choice - confirm.
        {"role": "system", "content": SYSTEM_PROMPT_TARGET_SENT_AND_TARGET_PHRASE},
        {"role": "user", "content": example},
    ],
    response_format=ExplanationTarget,
)

response = completion.choices[0].message.parsed
print(response)

subject='drones'


### Target sentence

In [14]:
import random 
from pydantic import BaseModel

# Response schema for this demo: the clarified sentence plus the phrase inside it.
class ExplanationTarget(BaseModel):
    target_sentence: str
    target_phrase: str


# Pick one formatted test example at random and echo its user message.
example = random.choice(formatted_test_dataset)
user_message = example["messages"][1]
print(user_message, end="\n\n")

completion = client.beta.chat.completions.parse(
    model=MODEL,
    messages=example["messages"],
    response_format=ExplanationTarget,
)

response = completion.choices[0].message.parsed
print(response)

{'content': "Identify the target sentence and target phrase in the context text: 'He insisted that Iran's nuclear program is something it will not surrender. Analysts say Obama must be careful not to look too eager to make a deal.' that is being clarified by the explanation sentence: 'That would hurt his ability to bargain.'", 'role': 'user'}

target_sentence='Analysts say Obama must be careful not to look too eager to make a deal.' target_phrase='not to look too eager to make a deal'


### Elaboration

In [19]:
import random

# Fixed seed so that the same 50 test examples are selected on every run.
seed, sample_size = 42, 50
random_sample_dataset = (
    formatted_test_dataset
    .shuffle(seed=seed)
    .select(range(sample_size))
)

In [18]:
# Display the summary (features and row count) of the sampled subset.
random_sample_dataset

Dataset({
    features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating', 'messages'],
    num_rows: 50
})

In [13]:
import random 
from pydantic import BaseModel

# Response schema for the elaboration task: target sentence and phrase, the chosen
# clarification type, and the generated explanation sentence.
class ExplanationTarget(BaseModel):
    target_sentence: str
    target_phrase: str
    clarification_type: str
    explanation_sentence: str


# Pick one example from the sampled subset and echo its user message.
example = random.choice(random_sample_dataset)
user_message = example["messages"][1]
print(user_message, end="\n\n")

completion = client.beta.chat.completions.parse(
    model=MODEL,
    messages=example["messages"],
    response_format=ExplanationTarget,
)

response = completion.choices[0].message.parsed
print(response)

{'content': "Identify the sentence and specific phrase in the given text: 'He said hate toward Roma people remains widespread there. His people still suffer unfair and insulting treatment, he said. They are also the first to be fired. He said that when poor Romani villagers come to Berlin, they find only overcrowded apartments.' that need clarification. Then, specify the type of clarificatio needed and generate one concise explanation sentence.", 'role': 'user'}

target_sentence='His people still suffer unfair and insulting treatment, he said.' target_phrase='unfair and insulting treatment' clarification_type='Example' explanation_sentence='Discrimination in housing and employment are common examples.'


In [40]:
# Every column below comes from the *test* split, so pin the split name used in the
# output file name instead of relying on whatever value `split` holds from another cell.
split = "test"

# Empty result columns are created up front; later cells fill them in.
df_results = pd.DataFrame({
    "doc_num": formatted_test_dataset["doc_num"],
    "source_text": formatted_test_dataset["source_text"],
    "label_text": formatted_test_dataset["label_text"],
    "elaboration_sentence": formatted_test_dataset["elaboration_sentence"],
    "response": "",
    "pred_elaboration": "",
    "target_sentence_target": "",
    "target_sentence_4o": "",
    "clarification_category": "",
})

df_results.to_csv(f"../data/elaborations/{split}_ds_c2spo_gpt4o_elab_preds_targets.csv", index=False)

## Create elaboration targets df

In [10]:
import pandas as pd
splits = ["train", "validation", "test"]

# Write one "<split>_ds_<setting>_elab_targets.csv" per split. The
# "target_sentence_4o" column is seeded from the dataset's "target_sentence";
# "target_sentence_target" starts empty.
for split in splits:
    split_ds = datasets[split]
    _columns = {
        "doc_num": split_ds["doc_num"],
        "source_text": split_ds["source_text"],
        "elaboration_sentence": split_ds["elaboration_sentence"],
        "target_sentence_4o": split_ds["target_sentence"],
        "target_sentence_target": "",
    }
    df_results = pd.DataFrame(_columns)
    out_path = f"../data/elaborations/{split}_ds_{setting}_elab_targets.csv"
    df_results.to_csv(out_path, index=False)
print(setting)

c2o


## Get subjects, targets and elaboration info

In [20]:
import pandas as pd
from tqdm.notebook import tqdm

splits = ["train", "validation", "test"]
split = "train"

df_results = pd.read_csv(f"../data/elaborations/{split}_ds_c2spo_subjects_targets.csv")

formatted_dataset = formatted_train_dataset
column_name = "target-phrase"
# The column is reset to empty strings, so every row is queried below.
df_results[column_name] = ""  # df_results[column_name].fillna("")

progress = tqdm(enumerate(formatted_dataset), total=len(formatted_dataset))
for idx, example in progress:
    if df_results.at[idx, column_name] != "":
        continue
    # Plain (unstructured) chat completion; the raw text reply is stored.
    completion = client.chat.completions.create(
        model=MODEL,
        messages=example["messages"],
    )
    df_results.at[idx, column_name] = completion.choices[0].message.content

  0%|          | 0/1049 [00:00<?, ?it/s]

In [24]:
# Rename the freshly filled "target-phrase" column to "elaboration-info" (modifies df_results in place).
df_results.rename(columns={"target-phrase":"elaboration-info"},inplace=True)

In [28]:
# Write df_results to the c2spo subjects/targets CSV of the current `split`, without the index column.
df_results.to_csv(f"../data/elaborations/{split}_ds_c2spo_subjects_targets.csv", index=False)

## Get target sentences for elaborations in specified response format

In [23]:
from pydantic import BaseModel

# for elaboration targets identification
# NOTE(review): this first definition is replaced by the class of the same name
# defined right below it in this cell, so it has no effect once the cell has run.
class ExplanationTarget(BaseModel):
    #explanation_sentence: str
    subject: str
    #target_sentence: str
    #target_phrase: str

# for elaboration generation
# This second definition is the one bound to ExplanationTarget after the cell runs:
# a response schema with a target sentence and a target phrase.
class ExplanationTarget(BaseModel):
    target_sentence: str
    target_phrase: str
    #clarification_type: str
    #explanation_sentence: str

In [11]:
import pandas as pd
from tqdm.notebook import tqdm
from pydantic import BaseModel

# Response schema for this run: a single target_phrase string.
class ExplanationTarget(BaseModel):
    target_phrase: str

splits = ["train","validation","test"]

for split in splits:
    results_path = f"../data/elaborations/{split}_ds_{setting}_elab_targets.csv"
    df_results = pd.read_csv(results_path)
    #df_results = pd.read_csv(f"../data/elaborations/{split}_ds_c2spo_gpt4o_elab_preds_targets.csv")
    column_name = "target_sentence_target" # column_names : "subject " "target_sentence_4o", "target_sentence_target"
    # Empty CSV cells are loaded as NaN; normalise them to "" so the resume check below works.
    df_results[column_name] = df_results[column_name].fillna("")
    formatted_dataset = datasets[split]

    for idx, example in tqdm(enumerate(formatted_dataset), total=len(formatted_dataset)):
        # Resume support: rows that already hold a value are skipped.
        if df_results.at[idx, column_name] != "":
            continue
        try:
            completion = client.beta.chat.completions.parse(
                model=MODEL,
                messages=example["messages"],
                response_format=ExplanationTarget,
            )
            # Store the string form explicitly; it is what ends up in the CSV anyway.
            df_results.at[idx, column_name] = str(completion.choices[0].message.parsed)
        except Exception as e:
            print(f"{e} for index {idx} in {split}")
            df_results.at[idx, column_name] = "filtered"
        # Checkpoint after every processed row, including failures. Previously the
        # file was written only after a success, so "filtered" markers were lost
        # unless a later row happened to succeed.
        df_results.to_csv(results_path, index=False)

  0%|          | 0/1049 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/116 [00:00<?, ?it/s]

## Create additional df for prior-context only datasets

In [56]:
import pandas as pd
from tqdm.notebook import tqdm

split = "test"
# NOTE(review): `load_dataset` and `data_files_c2s` are not defined or imported in any
# visible cell of this notebook - confirm where they come from before re-running.
dataset = load_dataset('csv', data_files=data_files_c2s)
# Frame with the extra columns; duplicates of the same elaboration sentence are dropped
# (first occurrence kept).
df_sp = pd.read_csv(f"../data/elaborations/{split}_ds_sp_subjects_targets.csv")
df_sp = df_sp.drop_duplicates(subset="elaboration_sentence", keep="first")
# Frame that receives the extra columns.
df = pd.read_csv(f"../data/elaborations/{split}_ds_s_subjects_targets.csv")
# The string literal below is disabled code kept as the cell's last expression;
# it builds nothing and is only echoed as the cell output.
"""df = pd.DataFrame({
    "source_text" : dataset[split]["source_text"],
    "label_text" : dataset[split]["label_text"],
    "elaboration_sentence":dataset[split]["elaboration_sentence"],
    
}) """

'df = pd.DataFrame({\n    "source_text" : dataset[split]["source_text"],\n    "label_text" : dataset[split]["label_text"],\n    "elaboration_sentence":dataset[split]["elaboration_sentence"],\n    \n}) '

In [57]:
# Compare the row counts of the de-duplicated df_sp and of df.
print(len(df_sp))
print(len(df))

116
116


In [None]:
# Left-join the extra column(s) from df_sp onto df, matching rows on elaboration_sentence.
columns_to_add = ["target_sentence_target"]
#columns_to_add = ["subject", "target", "elaboration_info", "target_sentence_4o"]
extra_columns = df_sp[["elaboration_sentence", *columns_to_add]]
df = df.merge(extra_columns, on="elaboration_sentence", how="left")
df

In [60]:
# Save without the pandas index, like every other to_csv call in this notebook. This
# file is read back by an earlier cell, and writing the index would add a stray
# "Unnamed: 0" column on each round trip.
df.to_csv(f"../data/elaborations/{split}_ds_s_subjects_targets.csv", index=False)

In [59]:
# Spot-check one row of the merged frame.
idx= 100
print(df.loc[idx,"target_sentence_4o"], end="\n\n")
# NOTE(review): `column_name` is not set in this cell; it holds whatever value the
# most recently executed cell assigned - confirm it names the intended column.
print(df.loc[idx, column_name])

target_sentence='Drones are small flying aircraft.'

target_phrase='Drones'


# Check the targets

In [116]:
import pandas as pd 
import os

data_path = "../data/elaborations"

ds, split = "c2sp", "test"
idx = 577  # 150

# Path of the elaboration-targets CSV for each split; each file sits in a
# sub-directory named after its split.
dfs = {
    name: os.path.join(data_path, name, f"{name}_ds_{ds}_elab_targets.csv")
    for name in ("train", "validation", "test")
}

df_t = pd.read_csv(dfs[split])

In [101]:
# Print the source text, elaboration and stored targets of row `idx` of df_t.
print("ID:",idx)
print("Source text:",df_t.loc[idx,"source_text"], end="\n\n")
print("Elaboration:",df_t.loc[idx,"elaboration_sentence"])
# The "target_sentence_target" column holds the target phrase.
print("Target phrase:",df_t.loc[idx,"target_sentence_target"])
print("Target sentence:",df_t.loc[idx,"target_sentence_4o"])
# Template for a manual correction of a single cell (disabled):
#df_t.at[idx,"target_sentence_target"] = "'the government can't borrow the money'"

ID: 577
Source text: BETHESDA, Md. " That something is statistical analysis. Statistics is a type of math. It involves looking at information, often in the form of numbers, and finding out what the information means. It has become popular to use for advertisements, for computers and now for sports.

Elaboration: For today's champion golfer, there's something just as important as having the right club.
Target phrase: nan
Target sentence: nan


In [5]:
#df_t.to_csv(dfs[split],index=False)

### Extract responses

In [12]:
# Load the test-split prediction file and display the raw response stored for the first row.
df_t = pd.read_csv("../data/elaborations/test_ds_c2sp_gpt4o_elab_preds_targets.csv")
df_t.loc[0, "response"]

"target_sentence='It will make a difference by paying for college.' target_phrase='make a difference' clarification_type='Result' explanation_sentence='Increases access to education and job opportunities for youth.'"

In [14]:
import re

def extract_target(response: str) -> str:
    """
    Extract the clarification_type value from the model's response.

    ``response`` is the string form of a parsed reply, e.g.
    ``... clarification_type='Result' explanation_sentence='...'``. The value
    is written as a Python string literal, so it is wrapped in double quotes
    when the text contains an apostrophe; both quote styles are accepted and
    the decoded text is returned.

    Raises:
        ValueError: if no ``clarification_type=<quoted value>`` field is present.
    """
    import ast  # local import: only needed to decode the quoted literal

    single_quoted = r"'(?:[^'\\]|\\.)*'"
    double_quoted = r'"(?:[^"\\]|\\.)*"'
    match = re.search(
        rf"clarification_type=({single_quoted}|{double_quoted})",
        response,
        flags=re.DOTALL,
    )
    if match is None:
        raise ValueError("not found.")

    literal = match.group(1)
    try:
        return ast.literal_eval(literal)
    except (ValueError, SyntaxError):
        # Not a valid Python literal: fall back to the text between the outer quotes.
        return literal[1:-1]

# Derive the clarification category of every row from its stored response string.
df_t["clarification_category"] = df_t["response"].apply(extract_target)

In [16]:
# Number of rows per clarification category.
df_t["clarification_category"].value_counts()

clarification_category
Reason        49
Definition    27
Background    20
Example       19
Result         1
Name: count, dtype: int64

In [18]:
# Write df_t (now including "clarification_category") to the test-split prediction file, without the index.
df_t.to_csv("../data/elaborations/test_ds_c2sp_gpt4o_elab_preds_targets.csv", index=False)

In [26]:
# Show the label text and elaboration of the first ten rows categorised as "Reason".
reason_rows = df_t[df_t["clarification_category"] == "Reason"].head(10)
separator = "-" * 120
for idx, row in reason_rows.iterrows():
    print(row["label_text"], end="\n\n")
    print(row["elaboration_sentence"])
    print(separator)

Forecasters said more people could have died if there were no river gauges. A gauge is a kind of measuring stick. It sits in the water. The United States has 8,000 gauges to quickly track the rise of rivers. But that number may shrink.

It sits in the water.
------------------------------------------------------------------------------------------------------------------------
Helicopters were banned from flying over the school, CNN reported. Rescuers feared engine noise would make it hard to hear cries for help. They raced against the setting sun to search the area while they could still see. Briarwood Elementary School in Oklahoma City was also hit by the tornado. News reports said the twister shot cars through the front of the building.

They raced against the setting sun to search the area while they could still see.
------------------------------------------------------------------------------------------------------------------------
Istanbul is an important city for trade. Turke

In [4]:
import re

def extract_target_components(response):
    """Split a response string into its raw target-sentence and target-phrase parts.

    The text is cut at ``target_phrase=``; what follows is returned unchanged
    as the phrase, and whatever lies between ``target_sentence=`` and that cut
    is returned as the sentence. Surrounding quotes and whitespace are kept.

    Raises ``ValueError`` unless ``target_phrase=`` occurs exactly once, and
    ``IndexError`` when ``target_sentence=`` is missing before it.
    """
    before_phrase, target_phrase = response.split("target_phrase=")
    target_sentence = before_phrase.split("target_sentence=")[1]
    return target_sentence, target_phrase

# Example response whose values contain double quotes.
response = "target_sentence='He called the talk a \"fireside hangout.\"' target_phrase='\"fireside hangout\"'"
target_sentence, target_phrase = extract_target_components(response)
print(target_sentence, target_phrase, sep="\n")

'He called the talk a "fireside hangout."' 
'"fireside hangout"'


In [11]:
# Both target columns are cast to str before being overwritten row by row.
for _col in ("target_sentence_4o", "target_sentence_target"):
    df_t[_col] = df_t[_col].astype("str")

for idx, row in df_t.iterrows():
    response = row["response"]
    if not response:
        continue
    try:
        target_sentence, target_phrase = extract_target_components(response)
    except Exception:
        # Responses that cannot be split leave both target cells empty.
        df_t.at[idx, "target_sentence_4o"] = ""
        df_t.at[idx, "target_sentence_target"] = ""
    else:
        df_t.at[idx, "target_sentence_4o"] = target_sentence
        df_t.at[idx, "target_sentence_target"] = target_phrase

df_t.isnull().sum()

doc_num                     0
source_text                 0
label_text                134
elaboration_sentence        0
response                    0
target_sentence_4o          0
target_sentence_target      0
dtype: int64

## Check for duplicates 

### (target sentence vs elaboration sentence) or (target phrase vs elaboration sentence) 

In [160]:
from tqdm.notebook import tqdm
from bert_score import BERTScorer

# No explicit device: BERTScorer then uses CUDA when it is available and falls back
# to the CPU otherwise, so the cell also runs on machines without a GPU.
scorer = BERTScorer(model_type='bert-base-uncased')

# Score every (target sentence, target phrase) pair; the F1 value is stored in "targets_bsf1".
for index, row in tqdm(df_t.iterrows(), total=len(df_t)):
    phrase_text = row['target_sentence_target']
    sentence_text = row['target_sentence_4o']
    try:
        # BERTScore for this pair: the sentence is the candidate, the phrase the reference.
        P, R, F1 = scorer.score(
            cands=[sentence_text],
            refs=[phrase_text],
        )
        df_t.at[index, "targets_bsf1"] = F1.mean().item()
    except Exception as e:
        # Keep going, but report which row failed and why.
        print(f"{index}: {e}")

  0%|          | 0/116 [00:00<?, ?it/s]

In [162]:
# Review rows whose target sentence and target phrase score below 0.6.
low_similarity = df_t["targets_bsf1"] < 0.6
df_check = df_t[low_similarity]
separator = "-" * 120
for index, row in df_check.iterrows():
    score = round(row["targets_bsf1"], 3)
    print("ID:", index)
    print("Similarity score:", score)
    print(row["source_text"], end="\n\n")
    print("Elaboration:", row["elaboration_sentence"])
    print("Target sent:", row["target_sentence_4o"], end="\n\n")
    print("Target phrase:", row["target_sentence_target"], end="\n\n")
    print(separator)

ID: 1
Similarity score: 0.503
WASHINGTON – At least four people died in Midwest floods this Spring. But the death toll could have been higher. Forecasters said more people could have died if there were no river gauges. It sits in the water. The United States has 8,000 gauges to quickly track the rise of rivers. But that number may shrink. The reason is that lawmakers in Washington are in the middle of a fight.

Elaboration: A gauge is a kind of measuring stick.
Target sent: 'The United States has 8,000 gauges to quickly track the rise of rivers.' 

Target phrase: 'gauges'

------------------------------------------------------------------------------------------------------------------------
ID: 2
Similarity score: 0.559
WASHINGTON – At least four people died in Midwest floods this Spring. But the death toll could have been higher. Forecasters said more people could have died if there were no river gauges. A gauge is a kind of measuring stick. The United States has 8,000 gauges to quic

In [122]:
# Manual correction of the target phrase in row 728 of df_train; the value includes its own single quotes.
df_train.loc[728, "target_sentence_target"] = "'Healthy, Hunger-Free Kids Act'"

"'Healthy, Hunger-Free Kids Act'"

## Check if the target phrase is in the target sentence

In [117]:
import re
from tqdm.notebook import tqdm

def is_substring_in_sentence(target_phrase, target_sentence, elaboration_sentence=None):
    """
    Check if a normalized version of the target phrase is in the target sentence.

    Normalization lower-cases the text, removes every character that is neither
    a word character nor whitespace, and collapses runs of whitespace.

    Args:
        target_phrase: phrase to look for.
        target_sentence: sentence expected to contain the phrase.
        elaboration_sentence: optional; when given (and non-empty) the phrase
            must also occur in this sentence.

    Returns:
        True if the normalized phrase occurs in the normalized sentence (and in
        the normalized elaboration sentence when one is given). A phrase that
        is empty after normalization never matches; previously it was reported
        as contained in every sentence because "" is a substring of any string.
    """

    def normalize(text):
        # lower-case, strip punctuation, collapse whitespace
        without_punct = re.sub(r'[^\w\s]', '', text.lower())
        return " ".join(without_punct.split())

    normalized_phrase = normalize(target_phrase)
    normalized_sentence = normalize(target_sentence)
    if not normalized_phrase:
        return False

    in_target = normalized_phrase in normalized_sentence
    if elaboration_sentence:
        return in_target and normalized_phrase in normalize(elaboration_sentence)
    return in_target

# Flag, row by row, whether the target phrase occurs in the target sentence.
for index, row in tqdm(df_t.iterrows(), total=len(df_t)):
    target, target_sent, elab_sent = (
        row['target_sentence_target'],
        row['target_sentence_4o'],
        row['elaboration_sentence'],
    )
    try:
        contained = is_substring_in_sentence(target, target_sent)
        df_t.at[index, "target_isin"] = contained
    except Exception:
        # Rows that cannot be checked are only reported by index.
        print(index)

  0%|          | 0/116 [00:00<?, ?it/s]

In [84]:
# Review the rows where the target phrase was found inside the target sentence.
found_mask = df_t["target_isin"] == True
df_check = df_t[found_mask]
print(len(df_check))
separator = "-" * 120
for index, row in df_check.iterrows():
    print("ID:", index)
    #print("Similarity score:", round(row["targets_bsf1"],3))
    print(row["source_text"], end="\n\n")
    print("Elaboration:", row["elaboration_sentence"])
    print("Target sent:", row["target_sentence_4o"], end="\n\n")
    print("Target phrase:", row["target_sentence_target"], end="\n\n")
    print(separator)

54
ID: 1
He called the talk a "fireside hangout." President Franklin D. Roosevelt used to make informal radio broadcasts called "fireside chats."

Elaboration: Obama's fireside hangout included many questions from different kinds of people.
Target sent: 'He called the talk a "fireside hangout."' 

Target phrase: '"fireside hangout"'

------------------------------------------------------------------------------------------------------------------------
ID: 16
And a lot of criminals have gotten away, too. One of the biggest robberies happened in February 2005 in the Netherlands at an airport in Amsterdam.

Elaboration: The Netherlands borders Belgium.
Target sent: 'One of the biggest robberies happened in February 2005 in the Netherlands at an airport in Amsterdam.' 

Target phrase: 'the Netherlands'

------------------------------------------------------------------------------------------------------------------------
ID: 21
These animals are part of American history. They  are relate

## Correct missing target sentences

In [53]:
import pandas as pd
from tqdm.notebook import tqdm
from pydantic import BaseModel


# Load the test-split targets; missing target sentences (NaN after loading) become ""
# so that they can be detected with an equality check.
df_targets = pd.read_csv(f"../data/elaborations/test_ds_sp_subjects_targets.csv")
column_name = "target_sentence_4o" 
df_targets[column_name] = df_targets[column_name].fillna("")


# Response schema for this cell: a single target_sentence string.
# NOTE(review): this rebinds the name ExplanationTarget, which other cells define
# with different fields.
class ExplanationTarget(BaseModel):
    #subject: str
    target_sentence: str

# NOTE(review): this rebinds SYSTEM_PROMPT_TARGET_SENT with a different wording than
# the definition in the prompt cell near the top of the notebook.
SYSTEM_PROMPT_TARGET_SENT = """
You are an expert in identifying the sentence from the CONTEXT TEXT that the provided explanation sentence clarifies or refers to. 
Your task is to return the sentence from the CONTEXT TEXT, not the explanation sentence itself.
"""

def format_target_sent_example(example):
    """Build the message list asking which context sentence an elaboration refers to.

    Reads ``elaboration_sentence`` and ``label_text`` from ``example`` and
    returns the bare ``[system_message, user_message]`` list (not wrapped in a
    ``{"messages": ...}`` dict).
    """
    elaboration = example["elaboration_sentence"]
    context = example["label_text"]
    user_prompt = (
        f"Identify the target sentence that the following explanation: '{elaboration}' "
        f"clarifies or refers to within the given text: '{context}'"
    )
    system_message = {"role": "system", "content": SYSTEM_PROMPT_TARGET_SENT}
    user_message = {"role": "user", "content": user_prompt}
    return [system_message, user_message]

# One message list per row, built from the row's elaboration sentence and label text.
df_targets["messages"] = df_targets.apply(format_target_sent_example, axis=1)

# Query the model only for rows whose target sentence is still empty.
for idx, example in tqdm(df_targets.iterrows(), total=len(df_targets)):
    if df_targets.at[idx, column_name] != "":
        continue
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        messages=example["messages"],
        response_format=ExplanationTarget,
    )
    df_targets.at[idx, column_name] = completion.choices[0].message.parsed

  0%|          | 0/116 [00:00<?, ?it/s]

In [48]:
# Print the message list built for the row at position 42.
print(df_targets["messages"].iloc[42])

[{'role': 'system', 'content': '\nYou are an expert in identifying the sentence from the CONTEXT TEXT that the provided explanation sentence clarifies or refers to. \nYour task is to return the sentence from the CONTEXT TEXT, not the explanation sentence itself.\n'}, {'role': 'user', 'content': 'Identify the target sentence that the following explanation: \'Tim Cook is the head of Apple.\' clarifies or refers to within the given text: \'Apple also agreed to change the way it bills customers. It will let people know when someone buys something on a game or app. Tim Cook is the head of Apple. He said the company gave in to the FTC. The agreement does not ask Apple "to do anything we weren\'t already going to do."\''}]


In [55]:
# Inspect the target sentence stored for a single row. The original cell had no row
# label (`df_targets.loc[,"target_sentence_4o"]`), which is a SyntaxError.
# NOTE(review): the row that was originally inspected is unknown; 0 is a placeholder.
row_label = 0
df_targets.loc[row_label, "target_sentence_4o"]

ExplanationTarget(target_sentence='When the people who make the fantasy teams bet money that their players will score the most points, it is considered gambling.')

In [13]:
import pandas as pd

split, setting = "train", "c4sp"
# Other inputs that were used with this cell:
#   f"../data/elaborations/{split}_ds_{setting}_subjects_targets.csv"
#   f"../data/elaborations/{split}_ds_{setting}_elab_targets.csv"
filename = "../data/elaborations/train/" + f"{split}_ds_{setting}_elab_targets.csv"
df_t = pd.read_csv(filename)

In [14]:
# Rows without a target sentence. read_csv loads empty CSV cells as NaN by default,
# and NaN never equals "", so missing values are mapped to "" before comparing.
df_t[df_t["target_sentence_4o"].fillna("") == ""]

Unnamed: 0,doc_num,source_text,label_text,elaboration_sentence,response,target_sentence_4o,target_sentence_target
