# Set up the OpenAI client

In [4]:
import os
#os.environ["OPENAI_API_KEY"] = "my_key"

In [3]:
from openai import OpenAI 
import os

# Chat model name passed to every completion request in this notebook.
MODEL = "gpt-4o-mini"

# The API key is read from the environment so it is never stored in the notebook.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Load data

In [5]:
from datasets import load_dataset
import os

# Root folder holding one sub-folder per split.
data_path = "../data/elaborations"

# Map each split name to its CSV; every file lives in a folder named after its split.
data_files_c2sp = {
    split: os.path.join(data_path, split, csv_name)
    for split, csv_name in (
        ("train", "train_ds_c2sp.csv"),
        ("validation", "valid_ds_c2sp.csv"),
        ("test", "test_ds_c2sp.csv"),
    )
}

# Load all three splits into a single DatasetDict and show its summary.
dataset = load_dataset("csv", data_files=data_files_c2sp)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 1049
    })
    validation: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 134
    })
    test: Dataset({
        features: ['doc_num', 'source_text', 'label_text', 'elaboration_sentence', 'contextual_specificity_rating'],
        num_rows: 116
    })
})


# Prompt and messages

In [6]:
# System prompt asking the model to name the "explanation target" of a sentence
# within its context and to answer as a {"sentence": ..., "target": ...} object.
# NOTE(review): not referenced by any of the cells shown in this notebook.
SYSTEM_PROMPT_TARGET = """You are an expert in determining what the explanation sentence provided is explaining within the context text.
Guidelines: 
- The "explanation target" should be a specific phrase, term, action, or concept in the context text provided. 
- The "explanation target" should be written as a noun phrase, not as a complete sentence.
- If there are multiple possible targets, select the first target that you think is most appropriate.

Return your answer in the following format:
{"sentence":"<provided explanation sentence>", "target": "<explanation target>"}
"""

# Turn examples into ChatML format

In [22]:
# System prompt consumed by format_example. It is declared in this cell because
# the mapping at the bottom runs immediately: on a fresh kernel (Restart & Run All)
# the later cell that declares this prompt has not executed yet, so the lookup
# inside format_example would raise a NameError.
# Keep this text in sync with the declaration in the "Identify subjects" section.
SYSTEM_PROMPT_SUBJECT = """You are an expert in identifying the subject of the provided explanation sentence based on the context text. 
If the subject of the explanation sentence is a **pronoun (e.g., "it," "they," "he," "she")**, determine what the pronoun is referring to within the context.
The subject should be written as a phrase, not as a complete sentence.
If there are multiple possible subjects, select the first one you think is most appropriate.
"""


def format_example(example):
    """Wrap one dataset row in the chat ``messages`` structure sent to the model.

    Args:
        example: A row exposing the ``elaboration_sentence`` and ``label_text``
            fields.

    Returns:
        A dict with a single ``messages`` key holding the system prompt followed
        by a user turn that quotes the elaboration sentence and its context text.
    """
    user_content = "Identify the subject of the following explanation sentence: '{}' within the given text: '{}'".format(
        example["elaboration_sentence"], example["label_text"])
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT_SUBJECT},
            {"role": "user", "content": user_content},
        ]
    }


# Only the test split is formatted; the train/validation mappings stay disabled.
#formatted_train_dataset = dataset["train"].map(format_example)
#formatted_validation_dataset = dataset["validation"].map(format_example)
formatted_test_dataset = dataset["test"].map(format_example)

Map:   0%|          | 0/116 [00:00<?, ? examples/s]

# Identify subjects of elaborations

Ask the model for the subject of each elaboration sentence, then analyze those subjects to explore the types of elaboration sentences.

In [24]:
# Prompt variants for asking the model what an elaboration sentence is about.

# Free-form subject phrase; pronoun subjects are to be resolved against the context.
# This is the prompt that format_example places in the system message.
SYSTEM_PROMPT_SUBJECT = """You are an expert in identifying the subject of the provided explanation sentence based on the context text. 
If the subject of the explanation sentence is a **pronoun (e.g., "it," "they," "he," "she")**, determine what the pronoun is referring to within the context.
The subject should be written as a phrase, not as a complete sentence.
If there are multiple possible subjects, select the first one you think is most appropriate.
"""

# Stricter variant: the subject phrase must occur in the context text, otherwise 'NO'.
# NOTE(review): not referenced by any of the cells shown in this notebook.
SYSTEM_PROMPT_SUBJECT_PRESENT_IN_CONTEXT = """You are an expert in identifying the subject of the provided explanation sentence based on the context text. 
If the subject of the explanation sentence is a **pronoun (e.g., "it," "they," "he," "she")**, determine what the pronoun is referring to within the context.
The subject should be written as a phrase, not as a complete sentence.
The subject phrase should be present in the context text.
If the subject of explanation sentence is not present in the context text, respond with 'NO'.
"""

# Single-keyword variant: one word taken from the context text, otherwise 'NO'.
# Used by the single-example request in the next cell.
SYSTEM_PROMPT_KEYWORD = """You are an expert in identifying the subject of the provided explanation sentence based on the context text. 
Return the main keyword which explanation sentence is referring to within the context.
The keyword should be made up of only one word, and it should be found in the context text.
If the keyword is not present in the context text, respond with 'NO'.
"""

In [27]:
# Build the user prompt for the first test row and echo it for reference.
example = (
    "Identify the subject of the following explanation sentence: '{}' within the given text: '{}'"
).format(dataset["test"][0]["elaboration_sentence"], dataset["test"][0]["label_text"])
print(example, end="\n\n")

# Smoke-test the keyword prompt on that single row before running the full split.
completion = client.chat.completions.create(
    messages=[
        dict(role="system", content=SYSTEM_PROMPT_KEYWORD),
        dict(role="user", content=example),
    ],
    model=MODEL,
)

print("Assistant: " + completion.choices[0].message.content)

Identify the subject of the following explanation sentence: 'Many do not have the money to get the training they need.' within the given text: 'New companies have come that need skilled workers with more education. New Haven youth want those jobs, but they do not have the education or the skills. Many do not have the money to get the training they need. That is where New Haven Promise comes in. It will make a difference by paying for college.'

Assistant: money


In [40]:
import pandas as pd
from tqdm.notebook import tqdm

# Results table: the three text columns copied from the test split, plus an
# empty "subject" column that is filled in once the model answers are collected.
df_results = pd.DataFrame({
    **{
        column: dataset['test'][column]
        for column in ('source_text', 'label_text', 'elaboration_sentence')
    },
    'subject': "",
})

# One chat-completion request per formatted test example; answers are kept in
# dataset order so they line up with the rows of df_results.
subjects = []
for example in tqdm(formatted_test_dataset, total=len(formatted_test_dataset)):
    completion = client.chat.completions.create(
        messages=example["messages"],
        model=MODEL,
    )
    subjects.append(completion.choices[0].message.content)

  0%|          | 0/116 [00:00<?, ?it/s]

In [41]:
# Attach the collected model answers as the "subject" column.
df_results = df_results.assign(subject=subjects)

In [42]:
# Show the input texts next to the model-returned subject for a visual check.
df_results

Unnamed: 0,source_text,label_text,elaboration_sentence,subject
0,New companies have come that need skilled work...,New companies have come that need skilled work...,Many do not have the money to get the training...,Many (youth)
1,But the death toll could have been higher. For...,But the death toll could have been higher. For...,A gauge is a kind of measuring stick.,A gauge
2,Forecasters said more people could have died i...,Forecasters said more people could have died i...,It sits in the water.,a gauge
3,Helicopters were banned from flying over the s...,Helicopters were banned from flying over the s...,They raced against the setting sun to search t...,Rescuers
4,Istanbul is an important city for trade. Turke...,Istanbul is an important city for trade. Turke...,Turkey is larger than the state of Texas.,Turkey
...,...,...,...,...
111,"Instead of preparing for the Super Bowl, he ha...","Instead of preparing for the Super Bowl, he ha...","Like many mysteries, this one may not be solve...",this one
112,Should kids play tackle football? Football is ...,Should kids play tackle football? Football is ...,Players get bounced around.,Players
113,He signed two of the biggest acts of last year...,He signed two of the biggest acts of last year...,Then the companies sell and promote the records.,the companies
114,It worries about the spread of disease. It als...,It worries about the spread of disease. It als...,There are certainly good reasons to be worried.,the spread of disease and the inability of new...


In [45]:
# Spot-check one row: print its context text, then the subject the model returned.
idx = 114
print(df_results.at[idx, "label_text"], end="\n\n")
print(df_results.at[idx, "subject"])

It worries about the spread of disease. It also worries that newly released orcas will not be able to find food on their own. There are certainly good reasons to be worried. Other attempts to release captive orcas were unsuccessful. Some freed orcas died.

the spread of disease and the inability of newly released orcas to find food


In [46]:
# Persist the annotated test split; the row index is not written to the file.
df_results.to_csv(
    path_or_buf="../data/elaborations/test_ds_subjects.csv",
    index=False,
)