# Tagging paper categories

In [1]:
import os
from dotenv import load_dotenv
from openai import OpenAI

%load_ext autoreload
%autoreload 2

In [2]:
# load environment variables (python-dotenv; presumably read from a local .env file — confirm location)
load_dotenv()

# get API key; os.getenv returns None when OPENAI_API_KEY is not set
api_key = os.getenv("OPENAI_API_KEY")

# init the module-level OpenAI client that prompt_gpt uses for every request
client = OpenAI(api_key=api_key)


def prompt_gpt(
    model,
    prompt: str,
    temperature: float = 0.0,
    max_tokens: int = 200,
):
    """Send a single-turn chat request and return the assistant's reply text.

    Args:
        model: Model identifier forwarded to the chat completions endpoint.
        prompt: Text sent as the user message.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token cap forwarded to the API.

    Returns:
        The ``content`` of the first choice's message.
    """
    # fixed system instruction shared by every request
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant designed to carefully analyze academic papers based on specific instructions.",
    }
    user_message = {"role": "user", "content": prompt}

    # the request goes through the module-level ``client``
    completion = client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [3]:
# smoke test of prompt_gpt with a trivial prompt (default temperature and max_tokens)
model = "gpt-4o-mini"
prompt = "Say 'this is a test' and nothing else."
response = prompt_gpt(model, prompt)
print(response)

This is a test.


In [4]:
import pandas as pd

data_path = "../data/full-text.csv"
# read the papers and drop the ".pdf" marker from the titles in one chained expression
full_df = pd.read_csv(data_path).assign(
    title=lambda papers: papers["title"].str.replace(".pdf", "", regex=False)
)
# preview the first rows, then the non-null count per column
display(full_df.head(), full_df.count())

Unnamed: 0,title,text,morphology,methodology
0,a comparative study of english to kannada base...,International Journal of Applied Engineering R...,agglutinat,data augmentation
1,"a comparison of transformer, recurrent neural ...",2020 20th International Conference on Advances...,agglutinat,"morphological analyzer, segmentation"
2,a morpheme-based weighting for chinese-mongoli...,"IEICE TRANS. INF. & SYST., VOL.E99–D, NO.11 NO...",,segmentation
3,a morphological analyser for maltese,ScienceDirect\nAvailable online at www.science...,fusional,morphological analyzer
4,a multi-agent solution for managing complexity...,"B. Hettige et al., Int. J. of Design & Nature ...",,morphological analyzer


title          177
text           177
morphology     125
methodology    150
dtype: int64

In [5]:
# titles of the subset of papers to process (a single "title" column)
murilo_titles_path = "../data/murilo_titles.csv"
murilo_df = pd.read_csv(murilo_titles_path)
display(murilo_df.head(), murilo_df.count())

Unnamed: 0,title
0,a morphological analyser for maltese
1,a novel morphological analysis based approach ...
2,a systematic analysis of subwords and cross-li...
3,an evaluation of subword segmentation strategi...
4,analysis of subword tokenization for transform...


title    47
dtype: int64

In [17]:
# keep only the papers whose title appears in murilo_df, renumbering the rows from 0
# NOTE(review): murilo_df holds 47 titles but only 40 rows match here — the remaining
# titles may be spelled differently, duplicated, or absent from full-text.csv; verify
title_in_subset = full_df["title"].isin(murilo_df["title"])
filtered_df = full_df[title_in_subset].reset_index(drop=True)
display(filtered_df.head(), filtered_df.count())

Unnamed: 0,title,text,morphology,methodology
0,a morphological analyser for maltese,ScienceDirect\nAvailable online at www.science...,fusional,morphological analyzer
1,a novel morphological analysis based approach ...,(IJACSA) International Journal of Advanced Com...,,
2,a systematic analysis of subwords and cross-li...,Findings of the Association for Computational ...,"analytic, agglutinat","subword modeling, segmentation"
3,analysis of subword tokenization for transform...,\n979-8-3503-5362-4/24/$31.00 ©2024 IEEE Ana...,agglutinat,"subword modeling, segmentation"
4,chinese-uyghur bilingual lexicon induction bas...,Chinese-Uyghur Bilingual Lexicon Induction Ba...,,"morphological analyzer, segmentation, data aug..."


title          40
text           40
morphology     32
methodology    35
dtype: int64

In [18]:
# research questions (RQs): this numbered block is interpolated verbatim into the paper prompts,
# and the model is asked to return one numbered answer per question
research_questions = """
    1. What challenges do varying degrees of morphological complexity (e.g., isolating, fusional, agglutinative, and polysynthetic languages) pose to machine translation systems in a low-resource language context?
    2. What techniques (e.g., rule-based methods, statistical models, or neural architectures) have been proposed to address these challenges?
    3. How do morphology-aware techniques (e.g. subword modeling, morphological analyzers) compare in effectiveness for low-resource machine translation?
    4. What are the specific findings, challenges, and proposed solutions and results for machine translation of languages in each different morphological typology (polysynthetic, agglutinative, fusional)?
"""

In [19]:
# get the first paper
text = filtered_df["text"].iloc[0]

# prototype prompt for a single paper: the research questions, then the full paper text,
# then the numbered one-answer-per-line format that the reply is later split on
prompt = f"""
Based on the following paper, answer the following research questions with as much detail as possible and using highly technical language.

RESEARCH QUESTIONS:
{research_questions}

PAPER:
{text}

Only return the answers to each research question in the following format:
RESEARCH QUESTIONS ANSWERS:
1. Research Question 1 Answer.
2. Research Question 2 Answer.
3. Research Question 3 Answer.
4. Research Question 4 Answer.
"""

In [20]:
# query the model for the single-paper prompt; max_tokens is raised above prompt_gpt's
# default of 200, presumably so the four detailed answers are not cut short
model = "gpt-4o-mini"
response = prompt_gpt(model, prompt, max_tokens=2000)
# bare expression so the notebook shows the raw reply string
response

"RESEARCH QUESTIONS ANSWERS:\n1. The varying degrees of morphological complexity in languages, such as isolating, fusional, agglutinative, and polysynthetic typologies, present significant challenges to machine translation systems, particularly in low-resource language contexts. Isolating languages, characterized by minimal inflection, may lead to a lack of morphological information, complicating the translation process. In contrast, fusional languages, which combine multiple grammatical categories into single morphemes, can result in ambiguity and difficulty in disambiguating forms. Agglutinative languages, with their extensive use of affixes, may overwhelm translation systems with numerous morphological variations, while polysynthetic languages, which can express complex ideas in single words through extensive morphological processes, pose challenges in capturing the full semantic content during translation. The limited availability of training data for low-resource languages exacerb

In [26]:
# extract the answers from the response
def extract_answers(response):
    """Split a model reply into its answer lines.

    The reply is expected to follow the prompt's format: a
    ``RESEARCH QUESTIONS ANSWERS:`` header followed by one answer per line.

    Each line is stripped *before* it is inspected, so an indented header is
    still recognised and dropped, and the returned answers carry no stray
    leading/trailing whitespace (including ``\\r`` from CRLF replies).

    Args:
        response: Raw reply text. ``None`` or an empty string is tolerated.

    Returns:
        list[str]: The non-blank, non-header lines in their original order.
        Empty when there is no reply text.
    """
    header = "RESEARCH QUESTIONS ANSWERS:"

    # a reply without text has no answers to extract
    if not response:
        return []

    stripped_lines = (line.strip() for line in response.split("\n"))
    return [
        line for line in stripped_lines if line and not line.startswith(header)
    ]


# parse the single-paper reply into a list of answer lines and display it
answers = extract_answers(response)
answers

['1. The varying degrees of morphological complexity in languages, such as isolating, fusional, agglutinative, and polysynthetic typologies, present significant challenges to machine translation systems, particularly in low-resource language contexts. Isolating languages, characterized by minimal inflection, may lead to a lack of morphological information, complicating the translation process. In contrast, fusional languages, which combine multiple grammatical categories into single morphemes, can result in ambiguity and difficulty in disambiguating forms. Agglutinative languages, with their extensive use of affixes, may overwhelm translation systems with numerous morphological variations, while polysynthetic languages, which can express complex ideas in single words through extensive morphological processes, pose challenges in capturing the full semantic content during translation. The limited availability of training data for low-resource languages exacerbates these issues, as machin

### Wrapping the workflow in functions

In [38]:
def get_prompt(research_questions: str, paper_text: str) -> str:
    """Build the question-answering prompt for one paper.

    The prompt is assembled line by line so that the indentation of this
    function body does not leak into the text sent to the model. The result
    has the same layout as the notebook's single-paper prototype prompt:
    every template line starts at column 0, and the answer-format header is
    unindented.

    Args:
        research_questions: Numbered research questions, inserted verbatim.
        paper_text: Full text of the paper, inserted verbatim.

    Returns:
        The prompt string, starting and ending with a newline.
    """
    lines = [
        "",  # the prompt opens with a newline
        "Based on the following paper, answer the following research questions with as much detail as possible and using highly technical language.",
        "",
        "RESEARCH QUESTIONS:",
        f"{research_questions}",
        "",
        "PAPER:",
        f"{paper_text}",
        "",
        "Only return the answers to each research question in the following format:",
        "RESEARCH QUESTIONS ANSWERS:",
        "1. Research Question 1 Answer.",
        "2. Research Question 2 Answer.",
        "3. Research Question 3 Answer.",
        "4. Research Question 4 Answer.",
        "",  # ...and closes with a newline
    ]
    return "\n".join(lines)


# build a one-row dataframe with the answers (writing it to disk is left to the caller)
def create_answers_df(answers: list, paper_title: str) -> pd.DataFrame:
    """Build a one-row dataframe holding a paper's title and its four answers.

    Answers are assigned positionally to ``rq1_answer`` .. ``rq4_answer``.
    A list shorter than four (e.g. a truncated reply) is padded with ``None``
    so the row is still produced, with missing cells, instead of raising
    ``IndexError``. Entries beyond the fourth are ignored.

    Args:
        answers: Answer strings in research-question order.
        paper_title: Title stored in the ``title`` column.

    Returns:
        A dataframe with a single row and the columns
        ``title, rq1_answer, rq2_answer, rq3_answer, rq4_answer``.
    """
    answer_columns = ["rq1_answer", "rq2_answer", "rq3_answer", "rq4_answer"]

    # keep at most one answer per column, then pad the tail with None
    padded = list(answers or [])[: len(answer_columns)]
    padded += [None] * (len(answer_columns) - len(padded))

    row = {"title": paper_title}
    row.update(zip(answer_columns, padded))
    return pd.DataFrame([row])


def generate_answers(
    df: pd.DataFrame,
    model: str,
    research_questions: str,
    output_dir: str,
    max_tokens: int = 2000,
    use_subset: bool = False,
):
    """Query the model for every paper in ``df`` and append the answers to a CSV.

    For each row, a prompt is built from the research questions and the
    paper text, the model is queried, and the parsed answers are appended as
    one row to ``<output_dir>/paper_answers.csv``. Rows are written as soon
    as they are produced, so finished papers survive a later failure.

    The file is opened in append mode and never truncated: calling this
    again with the same ``output_dir`` adds further rows to the existing file.

    Args:
        df: Papers to process; must have ``title`` and ``text`` columns.
        model: Model identifier passed to ``prompt_gpt``.
        research_questions: Research questions inserted into every prompt.
        output_dir: Directory for ``paper_answers.csv``; created if missing.
        max_tokens: Completion token cap passed to ``prompt_gpt``.
        use_subset: When true, only the first 5 rows of ``df`` are processed.
    """
    if use_subset:
        df = df.iloc[:5]
    num_papers = len(df)

    # make sure the target directory exists before the first write
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # init the CSV with a header row only when it does not exist yet
    file_name = "paper_answers.csv"
    output_path = os.path.join(output_dir, file_name)
    if not os.path.exists(output_path):
        pd.DataFrame(
            columns=["title", "rq1_answer", "rq2_answer", "rq3_answer", "rq4_answer"]
        ).to_csv(output_path, index=False)

    # count by position rather than by index label, so the progress message is
    # right even when ``df`` does not carry a 0-based RangeIndex
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        print(f"Processing paper {position}/{num_papers}")
        paper_text = row["text"]
        paper_title = row["title"]

        prompt = get_prompt(research_questions, paper_text)
        response = prompt_gpt(model, prompt, max_tokens=max_tokens)
        answers = extract_answers(response)

        answers_df = create_answers_df(answers, paper_title)
        # append to the CSV file (the header was written above)
        answers_df.to_csv(output_path, mode="a", header=False, index=False)

In [40]:
# generate answers for the first 5 papers (use_subset=True limits the run to df.iloc[:5])
# NOTE: rows are appended to paper_answers.csv inside output_dir, so re-running this cell
# adds the same papers again rather than replacing them
output_dir = "../data/generated_answers"
generate_answers(
    filtered_df,
    model,
    research_questions,
    output_dir,
    max_tokens=2000,
    use_subset=True,
)

Processing paper 1/5
Processing paper 2/5
Processing paper 3/5
Processing paper 4/5
Processing paper 5/5
