# Categorization of abstracts

In [17]:
import os
from dotenv import load_dotenv
from openai import OpenAI

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# populate the process environment (python-dotenv)
load_dotenv()

# read the OpenAI key from the environment; None when the variable is unset
api_key = os.environ.get("OPENAI_API_KEY")

# client object used by prompt_gpt for every request in this notebook
client = OpenAI(api_key=api_key)


DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant designed to carefully analyze academic abstracts based on specific inclusion and exclusion criteria."


def prompt_gpt(
    model,
    prompt: str,
    temperature: float = 0.0,
    max_tokens: int = 200,
    system_prompt: str = DEFAULT_SYSTEM_PROMPT,
):
    """Send a single-turn chat request and return the reply text.

    Uses the module-level ``client`` to call ``chat.completions.create``
    with one system message and one user message.

    Args:
        model: Model identifier forwarded unchanged to the API.
        prompt: Content of the user message.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.
        system_prompt: Content of the system message. Defaults to the
            abstract-analysis instruction this notebook has always used.

    Returns:
        ``choices[0].message.content`` of the completion.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    # query ChatGPT
    chat_completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return chat_completion.choices[0].message.content

In [19]:
# smoke test: send a trivial prompt through prompt_gpt and show the reply
model = "gpt-4o-mini"
prompt = "Say this is a test and nothing else."
response = prompt_gpt(model=model, prompt=prompt)
print(response)

This is a test.


In [23]:
import pandas as pd
from gpt.config import get_data_path


# build the path to the generated-answers CSV under the project data directory
data_path = get_data_path()
file_name = "limit_n=10_text_answers_gpt-4o-mini.csv"
input_path = f"{data_path}/generated_answers/{file_name}"

# read the CSV and preview the first five rows
df = pd.read_csv(input_path)
df.head()

Unnamed: 0,title,generated_anwers,reasoning
0,atlaslang mts 1- arabic text language into ara...,1. Challenges of morphological complexity An...,The abstract reveals that the authors are awar...
1,"utilizing lexical similarity between related, ...",1. Challenges of morphological complexity An...,The abstract provides insights into the techni...
2,structural biases for improving transformers o...,1. Challenges of morphological complexity An...,The abstract reveals that the authors are addr...
3,comparative study of low resource digaru langu...,1. Challenges of morphological complexity An...,The abstract provides insights into the challe...
4,hybrid approaches for augmentation of translat...,1. Challenges of morphological complexity An...,The abstract lacks specific information regard...


In [None]:
# raw model outputs, one string per abstract
# NOTE: the CSV column really is spelled "generated_anwers" (sic); using the
# correctly spelled name raises KeyError on a freshly loaded frame.
answer_outputs = df["generated_anwers"].tolist()
answer_outputs[:5]

['1. Challenges of morphological complexity   Answer: Yes, the paper discusses challenges posed by the morphological complexity of the Arabic language.   Evidence: The abstract mentions that "Arabic is a Semitic language written from right to left. It is a derivational and flexional language, which is morphologically complex," indicating the inherent challenges in processing Arabic due to its morphological structure.  2. Proposed techniques   Answer: The paper proposes a combination of rule-based and example-based techniques to address the challenges of morphological complexity in machine translation.   Evidence: The abstract states that the system "is based on rule-based Interlingua and example-based approaches," highlighting the methods employed to tackle the complexities of Arabic text translation into Arabic Sign Language.  3. Morphology-aware techniques   Answer: Yes, the paper utilizes morphology-aware techniques, specifically a morphological analyzer for processing Arabic text. 

In [40]:
import re

# one regex per research question: each captures the text that sits between
# "Answer:" and the following "Evidence:" under that question's numbered heading
rq_patterns = dict(
    rq1=r"1\. Challenges of morphological complexity\s+Answer: (.*?)\s+Evidence:",
    rq2=r"2\. Proposed techniques\s+Answer: (.*?)\s+Evidence:",
    rq3=r"3\. Morphology-aware techniques\s+Answer: (.*?)\s+Evidence:",
    # rq4 allows arbitrary text between the heading and "Answer:"
    rq4=r"4\. Specific findings per morphological typology.*?Answer: (.*?)\s+Evidence:",
)


# function to extract answers based on patterns
def extract_answers(row, patterns):
    """Apply each regex in ``patterns`` to ``row`` and collect the first capture group.

    Args:
        row: Text to search. Non-text values (e.g. ``None`` or the float NaN
            that pandas uses for an empty CSV cell) are treated as "nothing
            found" instead of raising ``TypeError`` inside ``re.search``.
        patterns: Mapping of result key -> regex containing one capture group.

    Returns:
        Dict with the same keys as ``patterns``; each value is the captured
        text, or ``None`` when the pattern did not match.
    """
    # missing cells arrive as float NaN / None; re.search cannot handle those
    if not isinstance(row, (str, bytes)):
        return {key: None for key in patterns}

    answers = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, row)
        answers[key] = match.group(1) if match else None
    return answers


# run the regex extraction over every raw answer; each per-row dict becomes
# one row of answers_df, aligned to df by index
answers_df = pd.DataFrame(
    [extract_answers(text, rq_patterns) for text in df["generated_anwers"]],
    index=df.index,
)
# drop rq* columns left from a previous run of this cell, then append the fresh ones
df = pd.concat(
    [df.drop(columns=[col for col in df.columns if col.startswith("rq")]), answers_df],
    axis=1,
)
df.head(3)

Unnamed: 0,title,generated_anwers,reasoning,rq1,rq2,rq3,rq4
0,atlaslang mts 1- arabic text language into ara...,1. Challenges of morphological complexity An...,The abstract reveals that the authors are awar...,"Yes, the paper discusses challenges posed by t...",The paper proposes a combination of rule-based...,"Yes, the paper utilizes morphology-aware techn...",The paper does not provide specific findings o...
1,"utilizing lexical similarity between related, ...",1. Challenges of morphological complexity An...,The abstract provides insights into the techni...,The paper does not explicitly discuss the chal...,The paper proposes using subword-level pivot-b...,"Yes, the paper employs subword modeling techni...",The paper reports improvements in translation ...
2,structural biases for improving transformers o...,1. Challenges of morphological complexity An...,The abstract reveals that the authors are addr...,"Yes, the paper discusses challenges posed by m...",The paper proposes two techniques to address t...,"Yes, the paper uses morphology-aware technique...",The paper provides findings specific to agglut...


In [43]:
# pull each research question's extracted answers out as a plain list
rq1_answers, rq2_answers, rq3_answers, rq4_answers = (
    df[column].tolist() for column in ("rq1", "rq2", "rq3", "rq4")
)

# show the first extracted rq1 answer next to the raw output it was parsed from
print(rq1_answers[0])
print(answer_outputs[0])

Yes, the paper discusses challenges posed by the morphological complexity of the Arabic language.
1. Challenges of morphological complexity   Answer: Yes, the paper discusses challenges posed by the morphological complexity of the Arabic language.   Evidence: The abstract mentions that "Arabic is a Semitic language written from right to left. It is a derivational and flexional language, which is morphologically complex," indicating the inherent challenges in processing Arabic due to its morphological structure.  2. Proposed techniques   Answer: The paper proposes a combination of rule-based and example-based techniques to address the challenges of morphological complexity in machine translation.   Evidence: The abstract states that the system "is based on rule-based Interlingua and example-based approaches," highlighting the methods employed to tackle the complexities of Arabic text translation into Arabic Sign Language.  3. Morphology-aware techniques   Answer: Yes, the paper utilizes

In [52]:
def combine_answers(answers: list) -> str:
    """Join the answers into one ``", "``-separated string.

    Entries that are not strings are skipped. ``extract_answers`` stores
    ``None`` when a pattern does not match (and pandas may surface that as
    NaN), and ``str.join`` would raise ``TypeError`` on such values.

    Args:
        answers: List of answer strings, possibly containing ``None``/NaN.

    Returns:
        The string entries joined with ``", "``; ``""`` when there are none.
    """
    return ", ".join(answer for answer in answers if isinstance(answer, str))


# collapse each research question's answer list into one joined string
(
    rq1_answers_combined,
    rq2_answers_combined,
    rq3_answers_combined,
    rq4_answers_combined,
) = map(combine_answers, (rq1_answers, rq2_answers, rq3_answers, rq4_answers))
# preview the first 200 characters of the rq1 string
rq1_answers_combined[:200]

'Yes, the paper discusses challenges posed by the morphological complexity of the Arabic language., The paper does not explicitly discuss the challenges posed by the morphological complexity of the tar'

In [58]:
def get_prompt(answer: str) -> str:
    """Build the categorisation prompt around ``answer``.

    The text is placed between BEGIN/END ANALYSIS markers and followed by
    instructions to reply with five comma-separated categories after an
    ``OUTPUT:`` label.
    """
    return f"""
    Based on the following analysis, identify five **thematic categories** that summarize the key concepts, methods, or contributions mentioned.

    --- BEGIN ANALYSIS ---
    {answer}
    --- END ANALYSIS ---

    The categories should reflect actual *topics or themes* found in the answers, such as techniques, challenges, or typologies discussed (e.g., "Cross-lingual Transfer Learning", "Morphological Generation", "Polysynthetic Language Challenges").

    Only return the categories, comma-separated, in the following format:
    OUTPUT: Category 1, Category 2, Category 3, Category 4, Category 5
    """


def extract_output(result: str) -> str:
    """Return the text that follows the ``OUTPUT:`` label in a model reply.

    Everything after the first ``OUTPUT:`` is taken (across line breaks),
    newlines are replaced by spaces, and surrounding whitespace and markdown
    ``*`` characters are trimmed together, so a bolded label such as
    ``**OUTPUT:** A, B`` yields ``"A, B"`` without a stray leading space.

    Args:
        result: Raw reply text.

    Returns:
        The cleaned text after ``OUTPUT:``, or ``result`` unchanged when the
        label is absent.
    """
    # re.DOTALL lets '.' span newlines so multi-line category lists are captured whole
    match = re.search(r"OUTPUT:\s*(.*)", result, re.DOTALL)
    if not match:
        return result
    flattened = match.group(1).replace("\n", " ")
    # trim any mix of whitespace and '*' from both ends in one pass
    return re.sub(r"^[\s*]+|[\s*]+$", "", flattened)


# the four joined answer strings, in research-question order
combined_answers = [
    rq1_answers_combined,
    rq2_answers_combined,
    rq3_answers_combined,
    rq4_answers_combined,
]
# categories returned by the model, keyed "rq<N>_categories"
responses = {}
for i, answer in enumerate(combined_answers):
    prompt = get_prompt(answer)
    # ask the model, then keep only the text after "OUTPUT:"
    response = extract_output(prompt_gpt(model, prompt, max_tokens=500))
    responses[f"rq{i+1}_categories"] = response
    print(f"Response {i+1}/{len(combined_answers)}...")

Response 1/4...
Response 2/4...
Response 3/4...
Response 4/4...


In [59]:
# one "key: categories" line per research question
for key, value in responses.items():
    print(key, value, sep=": ")

rq1_categories: Morphological Complexity Challenges, Low-Resource Language Issues, Agglutinative Language Characteristics, Neural Machine Translation (NMT) Challenges, Language-Specific Translation Difficulties
rq2_categories: Morphological Complexity in Machine Translation, Techniques for Low-Resource Contexts, Subword Segmentation Methods, Evaluation of Machine Translation Approaches, Rule-Based and Example-Based Techniques
rq3_categories: Morphology-aware Techniques, Subword Modeling, Morphological Analysis, Comparative Effectiveness, Pre-annotation Techniques
rq4_categories: Agglutinative Language Challenges, Polysynthetic Language Findings, Morphological Complexity in Translation, Translation Performance Improvements, Language-Specific Morphological Issues
