In [1]:
import json

def load_jsonl(file_path):
    """
    Loads data from a .jsonl file (one JSON value per line).

    Blank or whitespace-only lines are skipped silently. A line that is not
    valid JSON is reported on stdout and skipped, so a single bad record does
    not abort the whole load.

    Args:
        file_path (str): The path to the .jsonl file.

    Returns:
        list: The decoded JSON value of every valid line, in file order.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Empty separator / trailing lines are not records; json.loads("")
                # would raise and produce a misleading error message.
                continue
            try:
                data.append(json.loads(stripped))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line: {stripped} - {e}")
    return data

In [2]:
# Load the training documents, then display how many were read and the first record.
training_data = load_jsonl("scholarly/train/documents.jsonl")
len(training_data), training_data[0]

(40,
 {'id': '36_0',
  'title': 'Temporal Qualifiers in Linguistics: Outdated, Obsolete, and Archaic Forms',
  'text': 'In linguistics, certain forms are used to indicate that a word or expression is no longer in current use. These forms are classified as temporal qualifiers, which provide context about the timing or period when a particular linguistic expression was used. Specifically, the outdated form, obsolete form, and archaic form are all types of temporal qualifiers. The outdated form refers to a word or expression that was once commonly used but has since fallen out of favor. Similarly, the obsolete form is a word or expression that has completely gone out of use. The archaic form, on the other hand, is a word or expression that is no longer used in everyday language but may still be found in historical or literary contexts. All these forms serve as temporal qualifiers, helping to situate the language in a particular time or context.'})

In [3]:
def load_txt_file_content(filepath):
    """
    Loads a text file as a list of its lines.

    Each entry is returned without its trailing newline, so the values can be
    compared or substring-matched directly (e.g. a file with one term per line).

    Args:
        filepath (str): The path to the text file.

    Returns:
        list[str] | None: The lines of the file in order, or None if the file
        is missing or cannot be read (a message is printed in that case).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            # Iterating the file keeps the "\n" on every line; strip only that
            # terminator so any other whitespace in a line is preserved.
            return [line.rstrip('\n') for line in file]
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
        return None
    except Exception as e:
        # Best-effort loader: report any other read/decoding failure and signal it with None.
        print(f"An error occurred while reading the file: {e}")
        return None

In [4]:
# Read scholarly/train/types.txt as a list of lines (None if the file could not be read).
# NOTE(review): presumably one type name per line — confirm against the data file.
terms_text_list = load_txt_file_content("scholarly/train/types.txt")

In [5]:
# final_term_to_doc_mapping = {}
# for term in terms_text_list:
#     lst_1 = []
#     for data_1 in training_data:
#         if term in data_1['title'] or term in data_1['text']:
#             lst_1.append(data_1['id'])
#     final_term_to_doc_mapping[term]= lst_1

In [6]:
# Load the test documents; the extraction loop further down sends each of them to the model.
test_data = load_jsonl('scholarly/test/text2onto_scholarly_test_documents.jsonl')

In [7]:
test_data

[{'id': '35_0',
  'title': 'Types of Modification in Linguistic Information',
  'text': 'In the realm of linguistic information, modification types are crucial for understanding how words or phrases modify each other. There are several types of modification, including pre-modifiers, post-modifiers, and indifferent modifiers. A pre-modifier is a type of modification that occurs before the word it modifies. On the other hand, a post-modifier is a modification type that occurs after the word it modifies. Additionally, there is a modification type known as indifferent, which likely refers to a modifier whose position relative to the word it modifies is not fixed or is indifferent to the word order.'},
 {'id': '33_0',
  'title': 'Types of MultiWordExpressions in Linguistics',
  'text': 'In the realm of linguistic information, several types of phrases are classified as MultiWordExpressions. Specifically, preposition phrases, verb phrases, noun phrases, and adjective phrases all fall under th

In [8]:
# Keywords whose presence suggests that a document describes a type/category hierarchy.
# ('categorization' previously carried a trailing space, so it could never match
# when followed by punctuation or the end of the text.)
terms_rel = ['type', 'types', 'subtypes', 'subtype', 'Types', 'categorized', 'category', 'categorization']

# Keep every test document whose title or text contains at least one keyword
# (case-sensitive substring match), preserving the original document order.
relevant_docs = []
for doc in test_data:
    searchable_fields = (doc['title'], doc['text'])
    # any() stops at the first hit instead of testing every remaining keyword
    mentions_keyword = any(term in field for term in terms_rel for field in searchable_fields)
    # The membership check drops a document that is identical to one already kept
    if mentions_keyword and doc not in relevant_docs:
        relevant_docs.append(doc)


In [9]:
len(training_data)

40

In [10]:
len(relevant_docs)

10

In [11]:
len(test_data)

10

In [None]:
import os

from google import genai

# Gemini model used for every extraction request in this notebook.
model = "gemini-2.5-pro"

# Read the API key from the environment so that it is never stored in (or committed
# with) the notebook. Both variable names recognised by the Gen AI SDK are accepted.
_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable before running this cell."
    )
client = genai.Client(api_key=_api_key)

In [38]:
from tqdm import tqdm
# Collects the raw JSON text of each model response; the extraction loop appends to it
# and later cells parse the entries with json.loads. Re-running this cell empties it.
final_ans = []

In [39]:
import json

# One-shot example shown to the model: the first training document (id '36_0').
example_doc_for_prompt = {
    'id': '36_0',
    'title': 'Temporal Qualifiers in Linguistics: Outdated, Obsolete, and Archaic Forms',
    'text': 'In linguistics, certain forms are used to indicate that a word or expression is no longer in current use. These forms are classified as temporal qualifiers, which provide context about the timing or period when a particular linguistic expression was used. Specifically, the outdated form, obsolete form, and archaic form are all types of temporal qualifiers. The outdated form refers to a word or expression that was once commonly used but has since fallen out of favor. Similarly, the obsolete form is a word or expression that has completely gone out of use. The archaic form, on the other hand, is a word or expression that is no longer used in everyday language but may still be found in historical or literary contexts. All these forms serve as temporal qualifiers, helping to situate the language in a particular time or context.'
}

# Expected extraction for the example document: one {child term: parent type} pair per entry.
example_output_for_prompt = [
    {"outdated form": "temporal qualifiers"},
    {"obsolete form": "temporal qualifiers"},
    {"archaic form": "temporal qualifiers"}
]

# Request a JSON response body. The config is identical for every request,
# so it is built once instead of on every iteration.
generation_config = {"response_mime_type": "application/json"}

# Send one request per test document and keep the raw JSON text of each answer.
# tqdm wraps the list itself (not the enumerate iterator) so it knows the total
# and can show a real progress bar with an ETA.
for i, doc in enumerate(tqdm(test_data)):

    prompt = f"""
    You are an expert AI specializing in information science and ontology. Your task is to carefully read a document's title and text, identify specific conceptual terms, and classify them according to the hierarchical relationships provided within the text.

    ## Domain Description: Scholarly Ontology
    The domain is **Scholarly Ontology**, which involves the formal structuring of knowledge within academic and research fields. It defines concepts, terms, and the relationships between them to create a clear, unambiguous framework for data and information. This includes classifying entities like linguistic qualifiers, publication types, research methods, and data formats to ensure consistency and facilitate automated reasoning across scholarly works.

    ## Instructions
    1.  Read the provided title and text to understand the main concepts and their relationships.
    2.  Identify specific, named examples or instances of a broader category mentioned in the document.
    3.  For each identified term, find its corresponding parent category or type as explicitly defined in the document.
    4.  Provide the output as a JSON list of dictionaries. Each dictionary must contain a single key-value pair, where the key is the specific term (child) and the value is its assigned parent type.
    5.  Only extract terms and types explicitly mentioned in the text. Do not invent terms or categories.

    ## Example
    **Input Document:**
    {json.dumps(example_doc_for_prompt, indent=4)}

    **Correct Output:**
    {json.dumps(example_output_for_prompt, indent=4)}

    ## Your Task
    Process the following document and provide the output in the specified JSON format:

    **Input Document:**
    {json.dumps(doc, indent=4)}"""

    response = client.models.generate_content(
        contents=prompt,
        config=generation_config,
        model=model,
    )
    # Store the unparsed response text; it is decoded with json.loads in a later cell.
    final_ans.append(response.text)
    # Print every fifth response as a spot check of the output format.
    if i % 5 == 0:
        print(response.text)

1it [00:34, 34.59s/it]

[
    {
        "pre-modifier": "modification"
    },
    {
        "post-modifier": "modification type"
    },
    {
        "indifferent": "modification type"
    }
]


6it [01:18, 10.85s/it]

[
    {
        "prefix": "affix"
    },
    {
        "suffix": "affix"
    },
    {
        "infix": "affix"
    },
    {
        "transfix": "affix"
    },
    {
        "simulfix": "affix"
    },
    {
        "zero morph": "affix"
    }
]


10it [01:48, 10.84s/it]


In [40]:
final_ans

['[\n    {\n        "pre-modifier": "modification"\n    },\n    {\n        "post-modifier": "modification type"\n    },\n    {\n        "indifferent": "modification type"\n    }\n]',
 '[\n    {\n        "preposition phrases": "MultiWordExpressions"\n    },\n    {\n        "verb phrases": "MultiWordExpressions"\n    },\n    {\n        "noun phrases": "MultiWordExpressions"\n    },\n    {\n        "adjective phrases": "MultiWordExpressions"\n    }\n]',
 '[\n    {\n        "infinitive clause": "clausal argument"\n    },\n    {\n        "prepositional interrogative clause": "clausal argument"\n    },\n    {\n        "possessive infinitive clause": "clausal argument"\n    },\n    {\n        "interrogative clause": "clausal argument"\n    },\n    {\n        "subjunctive clause": "clausal argument"\n    },\n    {\n        "sentential clause": "clausal argument"\n    },\n    {\n        "prepositional gerund clause": "clausal argument"\n    },\n    {\n        "interrogative infinitive clause": 

In [None]:
# for i, doc in tqdm(enumerate(relevant_docs[285:])):
        
#         prompt=f"""
#         You are an expert AI specializing in ecology and information extraction. Your task is to carefully read an article's title and text, identify specific ecological terms, and classify them according to the categories provided within the text.

#         ## Domain Description: Ecology
#         Ecology is the scientific study of the intricate relationships between living organisms and their physical environment. It examines how organisms interact with each other and with abiotic factors like climate, soil, and water. The domain covers vast topics including biodiversity, population dynamics, ecosystem functioning, and nutrient cycles. This field is critical for understanding and addressing major environmental challenges, from conservation to the impacts of climate change.

#         ## Instructions
#         1. Read the provided title and text to understand the main concepts and their relationships.
#         2. Identify specific, named examples or instances of a broader category mentioned in the text.
#         3. For each identified term, find its corresponding parent category or type as defined in the article.
#         4. Provide the output as a JSON list of dictionaries. Each dictionary must contain a single key-value pair, where the key is the specific term extracted, and the value is its assigned type.
#         5. Only extract terms and types explicitly mentioned in the text. Do not invent terms or categories.

#         ## Example
#         **Input Document:**
#         {json.dumps({
#             "id": "1183_0",
#             "title": "Types of Environmental Material Temperatures",
#             "text": "The temperature of various environmental materials is a crucial aspect of environmental monitoring. Specifically, temperatures of air, soil, and water are categorized under the broader classification of temperature of environmental material. The temperature of air is one such type, playing a significant role in weather patterns and climate conditions. Similarly, the temperature of soil is another important category, influencing plant growth and microbial activity. Lastly, the temperature of water, whether in rivers, lakes, or oceans, affects aquatic life and is a vital component in the Earth's climate system. Understanding these different temperatures is essential for assessing and mitigating the impact of environmental changes."
#         })}

#         **Correct Output:**
        
#         [
#             {{"temperature of air": "Temperature of Environmental Material"}},
#             {{"temperature of soil": "Temperature of Environmental Material"}},
#             {{"temperature of water": "Temperature of Environmental Material"}}
#         ]

#         ## Your Task
#         Process the following document and provide the output in the specified JSON format:

#         **Input Document:**
#         {doc}
#         """

#         generation_config = {"response_mime_type": "application/json"}

#         response = client.models.generate_content(
#                 contents= prompt,
#                 config=generation_config,
#                 model=model
#             )
#         final_ans.append(response.text)
#         if i%5==0:
#             print(response.text)
        
        

1it [00:04,  4.72s/it]

[
  {
    "Atmospheric wind speed": "speed"
  },
  {
    "speed of a water current": "speed"
  },
  {
    "speed at which water infiltrates the soil": "speed"
  },
  {
    "speed at which soil drains water": "speed"
  },
  {
    "increased speed": "speed"
  },
  {
    "decreased speed": "speed"
  },
  {
    "sound speed": "speed"
  },
  {
    "normal speed": "speed"
  }
]


6it [00:22,  4.11s/it]

[
  {
    "asterids": "Pentapetalae"
  }
]


11it [00:34,  2.62s/it]

[
  {
    "Arecales": "commelinids"
  },
  {
    "commelinids": "monocotyledons"
  }
]


15it [00:40,  2.73s/it]


In [41]:
final_ans

['[\n    {\n        "pre-modifier": "modification"\n    },\n    {\n        "post-modifier": "modification type"\n    },\n    {\n        "indifferent": "modification type"\n    }\n]',
 '[\n    {\n        "preposition phrases": "MultiWordExpressions"\n    },\n    {\n        "verb phrases": "MultiWordExpressions"\n    },\n    {\n        "noun phrases": "MultiWordExpressions"\n    },\n    {\n        "adjective phrases": "MultiWordExpressions"\n    }\n]',
 '[\n    {\n        "infinitive clause": "clausal argument"\n    },\n    {\n        "prepositional interrogative clause": "clausal argument"\n    },\n    {\n        "possessive infinitive clause": "clausal argument"\n    },\n    {\n        "interrogative clause": "clausal argument"\n    },\n    {\n        "subjunctive clause": "clausal argument"\n    },\n    {\n        "sentential clause": "clausal argument"\n    },\n    {\n        "prepositional gerund clause": "clausal argument"\n    },\n    {\n        "interrogative infinitive clause": 

In [42]:
# Parse every raw model response and collect, for each extracted term, the list of
# parent types assigned to it (in response order; repeated assignments are kept).
term2docs_mapping_test = {}
term2types_mapping_test = {}
for ans in final_ans:
    # Each response is expected to be a JSON list of {term: parent type} objects,
    # as requested by the prompt.
    extracted_pairs = json.loads(ans)
    for term_type in extracted_pairs:
        # Named `parent_type`, not `type`: a cell-level loop variable called `type`
        # would rebind the builtin for the rest of the kernel session.
        for term, parent_type in term_type.items():
            term2types_mapping_test.setdefault(term, []).append(parent_type)
term2types_mapping_test


{'pre-modifier': ['modification'],
 'post-modifier': ['modification type'],
 'indifferent': ['modification type'],
 'preposition phrases': ['MultiWordExpressions'],
 'verb phrases': ['MultiWordExpressions'],
 'noun phrases': ['MultiWordExpressions'],
 'adjective phrases': ['MultiWordExpressions'],
 'infinitive clause': ['clausal argument'],
 'prepositional interrogative clause': ['clausal argument'],
 'possessive infinitive clause': ['clausal argument'],
 'interrogative clause': ['clausal argument'],
 'subjunctive clause': ['clausal argument'],
 'sentential clause': ['clausal argument'],
 'prepositional gerund clause': ['clausal argument'],
 'interrogative infinitive clause': ['clausal argument'],
 'declarative clause': ['clausal argument'],
 'gerund clause': ['clausal argument'],
 'yes': ['negatives'],
 'no': ['negatives'],
 'commonly used': ['frequency'],
 'infrequently used': ['frequency'],
 'rarely used': ['frequency'],
 'prefix': ['affix'],
 'suffix': ['affix'],
 'infix': ['affix'

In [43]:
import json

# Persist the term -> parent-types mapping as pretty-printed UTF-8 JSON.
types_output_path = "text_to_ontol_keyword_LLM_scholarly.json"
with open(types_output_path, "w", encoding="utf-8") as f:
    json.dump(term2types_mapping_test, f, indent=2, ensure_ascii=False)

# Report the file that was actually written (the old message named a different file).
print(f"Term-to-types mapping saved to {types_output_path} ✅")

Clusters saved to final_clusters.json ✅


In [44]:
len(test_data)

10

In [45]:
# For every term extracted by the model, list the ids of the test documents whose
# title or text contains that term.
# term2types_mapping_test is intentionally left untouched here: re-initialising it
# would wipe the term -> types mapping built earlier in the notebook.
# NOTE(review): this is a case-sensitive substring match, so short terms over-match
# (e.g. 'no' also hits every document containing "not" or "noun") — consider
# word-boundary matching if precise document attribution matters.
term2docs_mapping_test = {}
for ans in final_ans:
    for term_type in json.loads(ans):
        # Only the keys (terms) are needed; the parent type is not used in this cell.
        for term in term_type:
            term2docs_mapping_test[term] = [
                doc['id']
                for doc in test_data
                if term in doc['title'] or term in doc['text']
            ]


In [46]:
term2docs_mapping_test

{'pre-modifier': ['35_0'],
 'post-modifier': ['35_0'],
 'indifferent': ['35_0'],
 'preposition phrases': ['33_0'],
 'verb phrases': ['33_0'],
 'noun phrases': ['33_0'],
 'adjective phrases': ['33_0'],
 'infinitive clause': ['22_0'],
 'prepositional interrogative clause': ['22_0'],
 'possessive infinitive clause': ['22_0'],
 'interrogative clause': ['22_0'],
 'subjunctive clause': ['22_0'],
 'sentential clause': ['22_0'],
 'prepositional gerund clause': ['22_0'],
 'interrogative infinitive clause': ['22_0'],
 'declarative clause': ['22_0'],
 'gerund clause': ['22_0'],
 'yes': ['9_0'],
 'no': ['35_0', '33_0', '22_0', '9_0', '25_0', '32_0', '20_0', '17_0'],
 'commonly used': ['19_0'],
 'infrequently used': ['19_0'],
 'rarely used': ['19_0'],
 'prefix': ['25_0', '17_0'],
 'suffix': ['25_0', '17_0'],
 'infix': ['25_0', '17_0'],
 'transfix': ['25_0'],
 'simulfix': ['25_0'],
 'zero morph': ['25_0'],
 'masculine': ['32_0'],
 'feminine': ['32_0'],
 'neuter': ['32_0'],
 'common gender': ['32_0']

In [47]:
import json

# Persist the term -> document-ids mapping as pretty-printed UTF-8 JSON.
docs_output_path = "text_to_ontol_keyword_LLM_scholarly_term_extraction.json"
with open(docs_output_path, "w", encoding="utf-8") as f:
    json.dump(term2docs_mapping_test, f, indent=2, ensure_ascii=False)

# Report the file that was actually written (the old message named a different file).
print(f"Term-to-documents mapping saved to {docs_output_path} ✅")

Clusters saved to final_clusters.json ✅


In [48]:
import json
def load_json(data_path):
    """
    Loads and decodes a JSON file.

    The file is read as UTF-8, matching how this notebook writes its JSON
    outputs (UTF-8 with ensure_ascii=False), so non-ASCII text round-trips
    regardless of the platform's default encoding.

    Args:
        data_path (str): The path to the JSON file.

    Returns:
        The decoded JSON value (e.g. a dict or a list).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(data_path, "r", encoding="utf-8") as file_1:
        pred_data = json.load(file_1)
    return pred_data

# Reload the term -> types mapping from the JSON file saved earlier in the notebook.
pred_data = load_json("text_to_ontol_keyword_LLM_scholarly.json")

In [49]:
import json

# pred_data maps each term to the list of types predicted for it.
# Terms file: one term per line, in mapping order.
with open("scholarly/test/1st_method_pro_gemini_terms.txt", "w", encoding="utf-8") as term_file:
    term_file.writelines(f"{term}\n" for term in pred_data)

# Types file: one line per term (same order as the terms file), holding that
# term's type list encoded as a JSON array.
with open("scholarly/test/1st_method_pro_flash_types.txt", "w", encoding="utf-8") as type_file:
    type_file.writelines(json.dumps(type_list) + "\n" for type_list in pred_data.values())


In [51]:
# Terms file: every key of pred_data on its own line.
with open("scholarly/test/1st_method_pro_gemini_terms.txt", "w", encoding="utf-8") as term_file:
    term_file.write("".join(f"{key}\n" for key in pred_data))

# Types file: each distinct type exactly once, in order of first appearance.
written_types = set()

with open("scholarly/test/1st_method_pro_types.txt", "w", encoding="utf-8") as type_file:
    # Flatten the per-term type lists into a single stream of type names
    every_type = (val for value_list in pred_data.values() for val in value_list)
    for val in every_type:
        if val in written_types:
            continue
        type_file.write(f"{val}\n")
        written_types.add(val)
