# GPT workflow

In [1]:
import os
from dotenv import load_dotenv
from openai import OpenAI

%load_ext autoreload
%autoreload 2

In [2]:
# Load variables through python-dotenv (presumably from a local .env file —
# confirm against the project setup) before reading the credential.
load_dotenv()

# Read the OpenAI credential from the environment and build the shared client
# that prompt_gpt() below uses for every request.
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)


def prompt_gpt(
    model,
    prompt: str,
    temperature: float = 0.0,
    max_tokens: int = 200,
):
    """Send one user prompt to the chat-completions endpoint and return the reply text.

    The request always carries the same fixed system message followed by
    ``prompt`` as the user message, and is issued through the module-level
    ``client``.

    Args:
        model: Model identifier, forwarded unchanged to the API.
        prompt: Text sent as the user message.
        temperature: Sampling temperature, forwarded unchanged.
        max_tokens: Completion token cap, forwarded unchanged.

    Returns:
        The ``content`` attribute of the first choice's message.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant designed to carefully analyze academic abstracts based on specific inclusion and exclusion criteria.",
    }
    user_message = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
        temperature=temperature,
        max_tokens=max_tokens,
    )

    # Only the first returned choice is used.
    first_choice = completion.choices[0]
    return first_choice.message.content

In [3]:
# Smoke test: send one trivial request through prompt_gpt() and show the reply.
model = "gpt-4o-mini"
prompt = "Say this is a test and nothing else."
response = prompt_gpt(model=model, prompt=prompt)
print(response)

This is a test.


In [4]:
import pandas as pd
from pathlib import Path


def get_data_path():
    """Return the parent of the current working directory as an absolute path.

    NOTE(review): despite the name, this is not a data directory itself; the
    caller appends the relative file location to the returned path.
    """
    working_dir = Path().resolve()
    return working_dir.parents[0]


# Spreadsheet location, relative to the directory returned by get_data_path().
file_path = "data/dataset-updated.xlsx"
data_path = get_data_path().joinpath(file_path)

# Read the workbook into a DataFrame and preview its first three rows.
df = pd.read_excel(io=data_path)
df.head(n=3)

Unnamed: 0,database,authors,title,year,abstract,author keywords,index keywords,low resource,morphology,languages,...,languages old,status,reviewer 1,reviewer 1 verdict,reviewer 1 reason,reviewer 1 notes,reviewer 2,reviewer 2 verdict,reviewer 2 reason,reviewer 2 notes
0,scopus,"rikters, matiss (57190290578); pinnis, marcis ...",advancing estonian machine translation,2018,"in this paper, we present tildes work on boost...",morphologically rich languages; multi-way mach...,computational linguistics; computer aided lang...,less resource,morpholog,estonian (3),...,estonian (3),False,Agustin,False,Not Low-Resource,"Estonian is level 3, not 2 or lower, and only ...",Ahmad,0.0,,
1,acl-anthology,"[baliber, renz iver, cheng, charibeth, adlaon,...",bridging philippine languages with multilingua...,2020,the philippines is home to more than 150 langu...,,,low-resource,morpholog,"cebuano (3), tagalog (3), english (5)",...,"cebuano (3), tagalog (3)",False,Agustin,False,Not Low-Resource,Both class 3,Ahmad,0.0,,
2,scopus,"trieu, hai-long (56051704500); nguyen, le-minh...",enhancing pivot translation using grammatical ...,2018,pivot translation can be one of the solutions ...,factored models; lemma forms; part-of-speech t...,computational linguistics; computer aided lang...,"low-resource, low resource",morpholog,"indonesian (3), malay (3), vietnamese (4)",...,,False,Agustin,False,Not Low-Resource,"None are class 2 or lower, but the authors des...",Ahmad,1.0,,


In [6]:
# Use the first abstract in the dataset as the worked example for this cell.
abstract = df["abstract"].iloc[0]

# Research questions interpolated verbatim into the prompt. workflow() reads
# this module-level name directly, so edit the text here to change what the
# model is asked.
research_questions = """\
1. What challenges do varying degrees of morphological complexity (e.g., isolating, fusional, agglutinative, and polysynthetic languages) pose to machine translation systems in a low-resource language context? 
2. What techniques (e.g., rule-based methods, statistical models, or neural architectures) have been proposed to address these challenges?
3. How do morphology-aware techniques (e.g. subword modeling, morphological analyzers) compare in effectiveness for low-resource machine translation? 
4. What are the specific findings, challenges, and proposed solutions and results for machine translation of languages in each different morphological typology (polysynthetic, agglutinative, fusional)?
"""


# Chain-of-Thought prompt
def chain_of_thought_prompt(abstract: str, research_questions: str):
    """Build the chain-of-thought prompt for one abstract.

    The template is dedented *before* the values are substituted, so the
    prompt sent to the model starts every template line at column 0 instead of
    carrying this function's source indentation, and a multi-line
    ``research_questions`` block stays aligned with the rest of the text.

    Args:
        abstract: Abstract text inserted under the "Abstract:" heading.
        research_questions: Question list inserted under the
            "Research Questions:" heading.

    Returns:
        The complete prompt string.
    """
    # Local import keeps the function usable without touching the import cell.
    import textwrap

    # Plain (non-f) template: substitution happens via str.format after the
    # dedent, and str.format does not re-parse braces inside the arguments.
    template = """\
    You are an expert researcher. Your task is to analyze the abstract below and use its content to answer a set of specific research questions.

    ---

    Abstract:
    {abstract}

    ---

    Research Questions:
    {research_questions}

    ---

    ### Follow these steps carefully:

    Step 1: Read and Understand the Abstract
    Carefully read the abstract to extract all relevant information, including stated challenges, techniques used, comparisons, findings, and specific language typologies.

    Step 2: Analyze the Abstract for Each Research Question
    For each question below, extract answers **only if the abstract provides evidence or insights**. If the abstract does not include the required information, state that explicitly.

    Step 3: Provide a Structured Answer with Reasoning
    For each research question, write:
    - Answer: A direct response based on the abstract.
    - Evidence: A brief explanation or citation of the part of the abstract that supports your answer.

    ---

    ### Return your response in the following format:

    ### Step-by-Step Analysis and Reasoning

    1. Challenges of morphological complexity
    Answer:  
    Evidence:  

    2. Proposed techniques
    Answer:  
    Evidence:  

    3. Morphology-aware techniques
    Answer:  
    Evidence:  

    4. Specific findings per morphological typology (e.g., polysynthetic, agglutinative, fusional)
    Answer:  
    Evidence:  

    ---

    ### Final Summary (REASONING):
    Briefly summarize what the abstract reveals overall in relation to the research questions. Note any gaps or limitations in the information provided.
    """
    return textwrap.dedent(template).format(
        abstract=abstract,
        research_questions=research_questions,
    )


# Run the chain-of-thought analysis for a single abstract
def workflow(
    model,
    abstract,
    temperature: float = 0.5,
    max_tokens: int = 250,
):
    """Build the chain-of-thought prompt for ``abstract`` and return the model's reply.

    The question list is taken from the module-level ``research_questions``.

    Args:
        model: Model identifier passed on to ``prompt_gpt``.
        abstract: Abstract text to analyse.
        temperature: Sampling temperature passed on to ``prompt_gpt``.
        max_tokens: Completion token cap passed on to ``prompt_gpt``.

    Returns:
        Whatever ``prompt_gpt`` returns for the generated prompt.
    """
    return prompt_gpt(
        model,
        chain_of_thought_prompt(abstract, research_questions),
        temperature,
        max_tokens,
    )


# Analyse the example abstract with the chain-of-thought workflow, then show
# the input abstract followed by the model's reply.
model = "gpt-4o-mini"
response = workflow(model=model, abstract=abstract, temperature=0, max_tokens=500)
print(f"Abstract:\n{abstract}\n", response, sep="\n")

Abstract:
in this paper, we present tildes work on boosting the output quality and availability of estonian machine translation systems, focusing mostly on the less resourced and morphologically complex language pairs between estonian and russian.we describe our efforts on collecting parallel and monolingual data for the development of better neural machine translation models, as well as experiments with various model architectures with the goal to find the best performing model for our data.we attain state of the art mt results by training a multi way transformer model that improves the quality by up to +3.27 bleu points over the baseline system.we also provide a publicly available translation service via a mobile phone application. (c) 2018 the authors and ios press.

### Step-by-Step Analysis and Reasoning

1. **Challenges of morphological complexity**  
   Answer: The abstract mentions that the focus is on "less resourced and morphologically complex language pairs" between Estonian

In [None]:
import re


def extract_output_and_reasoning(result: str) -> tuple:
    """Split a model reply into an ``(output, reasoning)`` pair.

    Two reply layouts are recognised, tried in this order:

    1. ``OUTPUT: ... REASONING: ...`` — the text between the two markers
       (surrounding ``*`` removed) is the output, the rest is the reasoning.
    2. A ``Final Summary (REASONING)`` heading, which is the layout requested
       by ``chain_of_thought_prompt`` — everything before the heading is the
       output (a trailing ``---`` rule is dropped) and everything after it is
       the reasoning.

    In both cases newlines are replaced by spaces and the ends are stripped.

    Args:
        result: Raw reply text. A non-string value (e.g. ``None``) is accepted
            and treated as "nothing to extract".

    Returns:
        ``(output_text, reasoning_text)``, or ``(None, None)`` when neither
        layout is found.
    """
    # Guard against a missing reply so a single bad response cannot abort a batch.
    if not isinstance(result, str):
        return None, None

    # Use re.DOTALL to make '.' match newline characters
    match = re.search(r"OUTPUT:\s*\**(.*?)\**\s*REASONING:\s*(.*)", result, re.DOTALL)
    if match:
        output_text, reasoning_text = match.group(1), match.group(2)
    else:
        # Fallback: locate the summary heading, tolerating leading '#', bold
        # markers and an optional colon on either side of the closing '**'.
        heading = re.search(
            r"^[ \t]*#*[ \t]*\**[ \t]*Final Summary[ \t]*\(REASONING\)[ \t]*:?[ \t]*\**[ \t]*:?",
            result,
            re.MULTILINE,
        )
        if heading is None:
            return None, None
        # Drop the horizontal rule that separates the analysis from the summary.
        output_text = re.sub(r"\s*-{3,}\s*\Z", "", result[: heading.start()])
        reasoning_text = result[heading.end():]

    return (
        output_text.replace("\n", " ").strip(),
        reasoning_text.replace("\n", " ").strip(),
    )

In [None]:
# Split the example reply into its two parts and print each on its own line.
output, reasoning = extract_output_and_reasoning(response)
print(output, reasoning, sep="\n")

In [None]:
# Start the results list with the record for the example abstract and view it as a table.
data = [{"abstract": abstract, "output": output, "reasoning": reasoning}]

df_output = pd.DataFrame(data)
df_output.head()

In [None]:
abstract_list = df["abstract"].tolist()
subset_abstracts = abstract_list[:10]

# Rebuild the results list from scratch. The subset already contains the
# example abstract, so appending to a list that holds it would duplicate that
# row, and re-running this cell would duplicate every row.
data = []

# process a subset of abstracts
# Loop-local names are used so the notebook-level `abstract`, `response`,
# `output` and `reasoning` from the earlier cells are not overwritten.
for position, abstract_text in enumerate(subset_abstracts, start=1):
    if isinstance(abstract_text, str) and abstract_text.strip():
        # The prompt asks for four answer/evidence pairs plus a final summary;
        # a 250-token cap is likely to cut the reply off before the summary
        # section the extractor needs, so allow a longer completion here.
        reply = workflow(model, abstract_text, temperature=0, max_tokens=1000)
        parsed_output, parsed_reasoning = extract_output_and_reasoning(reply)
    else:
        # Missing or empty abstract: keep the row but skip the API call.
        parsed_output, parsed_reasoning = None, None

    data.append(
        {
            "abstract": abstract_text,
            "output": parsed_output,
            "reasoning": parsed_reasoning,
        }
    )
    # 1-based counter, so the final message reads N/N.
    print(f"processed {position}/{len(subset_abstracts)} abstracts.")

In [None]:
# Turn the accumulated records in `data` into a table and preview up to ten rows.
df_output = pd.DataFrame(data)
df_output.head(10)

In [None]:
# Show the reasoning text stored in the row at position 2 (the third row).
df_output["reasoning"].iloc[2]