# GPT workflow

In [1]:
import os
from dotenv import load_dotenv
from openai import OpenAI

%load_ext autoreload
%autoreload 2

In [2]:
# load environment variables
load_dotenv()

# get API key
api_key = os.getenv("OPENAI_API_KEY")

# init OpenAI client
client = OpenAI(api_key=api_key)


def prompt_gpt(
    model,
    prompt: str,
    temperature: float = 0.0,
    max_tokens: int = 200,
):
    # query ChatGPT
    chat_completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                # "content": f"You are a helpful assistant designed to answer the user's prompt.",
                "content": "You are a helpful assistant designed to carefully analyze academic abstracts based on specific inclusion and exclusion criteria.",
            },
            {
                "role": "user",
                "content": prompt,
            },
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return chat_completion.choices[0].message.content

In [3]:
model = "gpt-4o-mini"
prompt = "Say this is a test and nothing else."
response = prompt_gpt(model, prompt)
print(response)

This is a test.


In [None]:
import pandas as pd
from pathlib import Path


def get_data_path():
    return Path().resolve().parents[0]


file_path = "data/20250401-dataset-updated.xlsx"
data_path = get_data_path() / file_path

df = pd.read_excel(data_path)
df.head(3)

Unnamed: 0,database,authors,title,publisher,type,citation,Unnamed: 6,year,abstract,author keywords,...,reason,additional reason,reviewer 1,reviewer 1 verdict,reviewer 1 reason,reviewer 1 notes,reviewer 2,reviewer 2 verdict,reviewer 2 reason,reviewer 2 notes
0,scopus,"lakew, surafel m. (57198899157); cettolo, maur...",a comparison of transformer and recurrent neur...,,,,,2018,"recently, neural machine translation (nmt) has...",,...,Not Morphology,Not Low-Resource,Agustin,False,Not Morphology,"lacks true references to morphology, & low-res...",Ahmad,False,,
1,scopus,"perez-martin, jesus (57216846545); gomez-roble...",cross-lingual search for e-commerce based on q...,,,,,2023,online stores in the us offer a unique scenari...,back-translation; cross-lingual information re...,...,Not Translation,Not Low-Resource,Agustin,False,Not Translation,,Ahmad,False,Not Low-Resource,
2,scopus,"rikters, matiss (57190290578); pinnis, marcis ...",advancing estonian machine translation,,,,,2018,"in this paper, we present tildes work on boost...",morphologically rich languages; multi-way mach...,...,Not Low-Resource,Not Morphology,Agustin,False,Not Low-Resource,"Estonian is level 3, not 2 or lower, and only ...",Ahmad,False,,


In [5]:
# get first abstract
abstract = df["abstract"].iloc[1]

research_questions = """\
1. What challenges do varying degrees of morphological complexity (e.g., isolating, fusional, agglutinative, and polysynthetic languages) pose to machine translation systems in a low-resource language context? 
2. What techniques (e.g., rule-based methods, statistical models, or neural architectures) have been proposed to address these challenges?
3. How do morphology-aware techniques (e.g. subword modeling, morphological analyzers) compare in effectiveness for low-resource machine translation? 
4. What are the specific findings, challenges, and proposed solutions and results for machine translation of languages in each different morphological typology (polysynthetic, agglutinative, fusional)?
"""


# Chain-of-Thought prompt
def chain_of_thought_prompt(abstract: str, research_questions: str):
    prompt = f"""\
    You are an expert researcher. Your task is to analyze the abstract below and use its content to answer a set of specific research questions.

    ---

    Abstract:
    {abstract}

    ---

    Research Questions:
    {research_questions}

    ---

    Follow these steps carefully:

    Step 1: Read and Understand the Abstract
    Carefully read the abstract to extract all relevant information, including stated challenges, techniques used, comparisons, findings, and specific language typologies.

    Step 2: Analyze the Abstract for Each Research Question
    For each question below, extract answers **only if the abstract provides evidence or insights**. If the abstract does not include the required information, state that explicitly.

    Step 3: Provide a Structured Answer with Reasoning
    For each research question, write:
    - Answer: A direct response based on the abstract.
    - Evidence: A brief explanation or citation of the part of the abstract that supports your answer.

    ---

    Return your response in the following format:

    OUTPUT:
    1. Challenges of morphological complexity
    Answer:  
    Evidence:  

    2. Proposed techniques
    Answer:  
    Evidence:  

    3. Morphology-aware techniques**  
    Answer:  
    Evidence:  

    4. Specific findings per morphological typology (e.g., polysynthetic, agglutinative, fusional)
    Answer:  
    Evidence:  

    ---

    REASONING: Briefly summarize what the abstract reveals overall in relation to the research questions. Note any gaps or limitations in the information provided.
    """
    return prompt


# query ChatGPT
def workflow(
    model,
    abstract,
    temperature: float = 0.5,
    max_tokens: int = 250,
):
    prompt = chain_of_thought_prompt(abstract, research_questions)
    response = prompt_gpt(model, prompt, temperature, max_tokens)
    return response


# response = prompt_gpt(model, abstract)
model = "gpt-4o-mini"
response = workflow(model, abstract, temperature=0, max_tokens=500)
print(response)

OUTPUT:
1. Challenges of morphological complexity  
Answer: The abstract does not explicitly mention challenges related to varying degrees of morphological complexity in low-resource language contexts.  
Evidence: The challenges discussed focus on polysemy, high latency, context scarcity, non-translatable entities, loanwords, ambiguous morphemes, and cross-language ambiguity, but do not specifically address morphological typologies.

2. Proposed techniques  
Answer: The abstract mentions the use of domain adaptation on a simplified architecture of neural machine translation (NMT) and a classifier to determine the appropriateness of automatic translation.  
Evidence: The abstract states, "we leverage domain adaptation on a simplified architecture of neural machine translation (nmt)" and "the problem of query translatability is addressed by a classifier on whether the translation should be automatic or explicitly requested."

3. Morphology-aware techniques  
Answer: The abstract does not

In [6]:
import re


def extract_output_and_reasoning(result: str) -> tuple:
    # Use re.DOTALL to make '.' match newline characters
    match = re.search(r"OUTPUT:\s*\**(.*?)\**\s*REASONING:\s*(.*)", result, re.DOTALL)
    if match:
        output_text = match.group(1).replace("\n", " ").strip().strip("*")
        reasoning_text = match.group(2).replace("\n", " ").strip()
        return output_text, reasoning_text
    else:
        return None, None

In [7]:
output, reasoning = extract_output_and_reasoning(response)
print(output)
print(reasoning)

1. Challenges of morphological complexity   Answer: The abstract does not explicitly mention challenges related to varying degrees of morphological complexity in low-resource language contexts.   Evidence: The challenges discussed focus on polysemy, high latency, context scarcity, non-translatable entities, loanwords, ambiguous morphemes, and cross-language ambiguity, but do not specifically address morphological typologies.  2. Proposed techniques   Answer: The abstract mentions the use of domain adaptation on a simplified architecture of neural machine translation (NMT) and a classifier to determine the appropriateness of automatic translation.   Evidence: The abstract states, "we leverage domain adaptation on a simplified architecture of neural machine translation (nmt)" and "the problem of query translatability is addressed by a classifier on whether the translation should be automatic or explicitly requested."  3. Morphology-aware techniques   Answer: The abstract does not provide

In [8]:
data = []
data.append({"abstract": abstract, "output": output, "reasoning": reasoning})

df_output = pd.DataFrame(data)
df_output.head()

Unnamed: 0,abstract,output,reasoning
0,online stores in the us offer a unique scenari...,1. Challenges of morphological complexity An...,The abstract primarily addresses the challenge...


In [9]:
abstract_list = df["abstract"].tolist()
subset_abstracts = abstract_list[:5]

# process a subset of abstracts
data = []
for idx, abstract in enumerate(subset_abstracts):
    response = workflow(model, abstract, temperature=0, max_tokens=500)
    output, reasoning = extract_output_and_reasoning(response)
    data.append({"abstract": abstract, "output": output, "reasoning": reasoning})
    if idx % 1 == 0:
        print(f"processed {idx}/{len(subset_abstracts)} abstracts.")

processed 0/5 abstracts.
processed 1/5 abstracts.
processed 2/5 abstracts.
processed 3/5 abstracts.
processed 4/5 abstracts.


In [10]:
df_output = pd.DataFrame(data)
df_output.head(5)

Unnamed: 0,abstract,output,reasoning
0,"recently, neural machine translation (nmt) has...",1. Challenges of morphological complexity An...,The abstract primarily focuses on the performa...
1,online stores in the us offer a unique scenari...,1. Challenges of morphological complexity An...,The abstract primarily focuses on the challeng...
2,"in this paper, we present tildes work on boost...",1. Challenges of morphological complexity An...,The abstract provides insights into the challe...
3,"unlike english, in highly inflected indian lan...",1. Challenges of morphological complexity An...,The abstract primarily addresses the challenge...
4,"in this work, we studied methods for automatic...",1. Challenges of morphological complexity An...,The abstract primarily discusses the developme...


In [11]:
df_output["output"].iloc[2]

'1. Challenges of morphological complexity   Answer: The abstract indicates that the challenges arise from working with less resourced and morphologically complex language pairs, specifically between Estonian and Russian.   Evidence: The abstract states, "focusing mostly on the less resourced and morphologically complex language pairs between estonian and russian."  2. Proposed techniques   Answer: The abstract mentions the collection of parallel and monolingual data and experimentation with various model architectures as techniques to improve machine translation systems.   Evidence: The abstract states, "we describe our efforts on collecting parallel and monolingual data for the development of better neural machine translation models, as well as experiments with various model architectures."  3. Morphology-aware techniques   Answer: The abstract does not provide specific information on the effectiveness of morphology-aware techniques such as subword modeling or morphological analyzers