In [1]:
import json

def load_jsonl(file_path):
    """
    Loads data from a .jsonl file.

    Args:
        file_path (str): The path to the .jsonl file.

    Returns:
        list: A list of dictionaries, where each dictionary represents a JSON object from a line.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data.append(json.loads(line.strip()))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line: {line.strip()} - {e}")
    return data

In [2]:
training_data = load_jsonl("ecology/train/documents.jsonl")
len(training_data), training_data[0]

(2000,
 {'id': '942_0',
  'title': 'Drumlin Fields: A Type of Hill Range Shaped by Glaciers',
  'text': 'A drumlin field is categorized as a type of hill range, typically formed through the movement of glaciers. These landscapes are characteristic of regions where there has been significant glacial activity, shaping the terrain into elongated hill ranges. Drumlin fields are not only fascinating geological formations but also play a crucial role in the local ecosystem and can influence local climate conditions.'})

In [3]:
def load_txt_file_content(filepath):
    """
    Loads the entire content of a text file into a single string.

    Args:
        filepath (str): The path to the text file.

    Returns:
        str: The content of the file, or None if an error occurs.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.readlines()
        return content
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
        return None
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        return None

In [4]:
terms_text_list = load_txt_file_content("ecology/train/types.txt")

In [None]:
# final_term_to_doc_mapping = {}
# for term in terms_text_list:
#     lst_1 = []
#     for data_1 in training_data:
#         if term in data_1['title'] or term in data_1['text']:
#             lst_1.append(data_1['id'])
#     final_term_to_doc_mapping[term]= lst_1

In [12]:
test_data = load_jsonl('ecology/test/text2onto_ecology_test_documents.jsonl')

Error decoding JSON on line:  - Expecting value: line 1 column 1 (char 0)


In [13]:
test_data

[{'id': '1012_0',
  'title': 'Understanding the State of Being Broken',
  'text': 'When something is broken into two pieces, it is considered to be in a state of being broken. In other words, being broken into two pieces is a specific way in which something can be broken, indicating a more severe or distinct form of damage.'},
 {'id': '2012_0',
  'title': 'Classification of Carbon Concentration in Water',
  'text': 'The concentration of carbon atoms in water is categorized as a specific measure of the amount of carbon atoms present in water. This classification is significant in environmental studies as it helps in understanding the levels and impact of carbon in aquatic systems.'},
 {'id': '314_0',
  'title': 'Understanding Permeability: The Role of Permeable and Impermeable Materials in Environmental Science',
  'text': 'In the context of environmental science, the permeability of materials plays a crucial role in understanding various ecological and geological processes. Permeabilit

In [14]:
relevant_docs = []
terms_rel = ['type', 'types', 'subtypes', 'subtype', 'Types']
for doc in test_data:
    title= doc['title']
    text= doc['text']
    for term in terms_rel:
        if term in title or term in text:
            if doc not in relevant_docs:
                relevant_docs.append(doc)


In [15]:
len(training_data)

2000

In [16]:
len(relevant_docs)

300

In [1]:
len(test_data)

NameError: name 'test_data' is not defined

In [None]:
from google import genai
model = "gemini-2.5-flash"
client= genai.Client(api_key="")

In [25]:
from tqdm import tqdm
final_ans = []

In [27]:
for i, doc in tqdm(enumerate(relevant_docs[285:])):
        
        prompt=f"""
        You are an expert AI specializing in ecology and information extraction. Your task is to carefully read an article's title and text, identify specific ecological terms, and classify them according to the categories provided within the text.

        ## Domain Description: Ecology
        Ecology is the scientific study of the intricate relationships between living organisms and their physical environment. It examines how organisms interact with each other and with abiotic factors like climate, soil, and water. The domain covers vast topics including biodiversity, population dynamics, ecosystem functioning, and nutrient cycles. This field is critical for understanding and addressing major environmental challenges, from conservation to the impacts of climate change.

        ## Instructions
        1. Read the provided title and text to understand the main concepts and their relationships.
        2. Identify specific, named examples or instances of a broader category mentioned in the text.
        3. For each identified term, find its corresponding parent category or type as defined in the article.
        4. Provide the output as a JSON list of dictionaries. Each dictionary must contain a single key-value pair, where the key is the specific term extracted, and the value is its assigned type.
        5. Only extract terms and types explicitly mentioned in the text. Do not invent terms or categories.

        ## Example
        **Input Document:**
        {json.dumps({
            "id": "1183_0",
            "title": "Types of Environmental Material Temperatures",
            "text": "The temperature of various environmental materials is a crucial aspect of environmental monitoring. Specifically, temperatures of air, soil, and water are categorized under the broader classification of temperature of environmental material. The temperature of air is one such type, playing a significant role in weather patterns and climate conditions. Similarly, the temperature of soil is another important category, influencing plant growth and microbial activity. Lastly, the temperature of water, whether in rivers, lakes, or oceans, affects aquatic life and is a vital component in the Earth's climate system. Understanding these different temperatures is essential for assessing and mitigating the impact of environmental changes."
        })}

        **Correct Output:**
        
        [
            {{"temperature of air": "Temperature of Environmental Material"}},
            {{"temperature of soil": "Temperature of Environmental Material"}},
            {{"temperature of water": "Temperature of Environmental Material"}}
        ]

        ## Your Task
        Process the following document and provide the output in the specified JSON format:

        **Input Document:**
        {doc}
        """

        generation_config = {"response_mime_type": "application/json"}

        response = client.models.generate_content(
                contents= prompt,
                config=generation_config,
                model=model
            )
        final_ans.append(response.text)
        if i%5==0:
            print(response.text)
        
        

1it [00:04,  4.72s/it]

[
  {
    "Atmospheric wind speed": "speed"
  },
  {
    "speed of a water current": "speed"
  },
  {
    "speed at which water infiltrates the soil": "speed"
  },
  {
    "speed at which soil drains water": "speed"
  },
  {
    "increased speed": "speed"
  },
  {
    "decreased speed": "speed"
  },
  {
    "sound speed": "speed"
  },
  {
    "normal speed": "speed"
  }
]


6it [00:22,  4.11s/it]

[
  {
    "asterids": "Pentapetalae"
  }
]


11it [00:34,  2.62s/it]

[
  {
    "Arecales": "commelinids"
  },
  {
    "commelinids": "monocotyledons"
  }
]


15it [00:40,  2.73s/it]


In [28]:
final_ans

['[\n  {\n    "certain types of rock": "permeable materials"\n  },\n  {\n    "soil": "permeable materials"\n  },\n  {\n    "clay": "impermeable materials"\n  },\n  {\n    "solid rock": "impermeable materials"\n  }\n]',
 '[\n  {\n    "carbon dioxide pooling disposition": "carbon pooling disposition"\n  }\n]',
 '[\n  {\n    "respiratory electron transport chain": "electron transport chain"\n  }\n]',
 '[\n  {\n    "amplitude of temperature of air": "amplitude"\n  }\n]',
 '[\n  {\n    "Toxin": "metabolite"\n  },\n  {\n    "eukaryotic metabolites": "metabolite"\n  },\n  {\n    "animal metabolites": "eukaryotic metabolites"\n  },\n  {\n    "fungal metabolites": "eukaryotic metabolites"\n  },\n  {\n    "prokaryotic metabolites": "metabolite"\n  }\n]',
 '[\n  {\n    "flat": "landform"\n  }\n]',
 '[\n  {\n    "the cooling of solids": "material cooling process"\n  },\n  {\n    "the cooling of fluids": "material cooling process"\n  }\n]',
 '[\n  {\n    "Inorganic macronutrients that are dissolved

In [32]:
term2docs_mapping_test = {}
term2types_mapping_test = {}
for ans in final_ans:
    ans_dict = json.loads(ans)
    for term_type in ans_dict:
        for term, type in term_type.items():
            if term not in term2types_mapping_test.keys():
                term2types_mapping_test[term]=[type]
            else:
                term2types_mapping_test[term].append(type)
term2types_mapping_test


{'certain types of rock': ['permeable materials'],
 'soil': ['permeable materials'],
 'clay': ['impermeable materials'],
 'solid rock': ['impermeable materials'],
 'carbon dioxide pooling disposition': ['carbon pooling disposition'],
 'respiratory electron transport chain': ['electron transport chain'],
 'amplitude of temperature of air': ['amplitude'],
 'Toxin': ['metabolite'],
 'eukaryotic metabolites': ['metabolite'],
 'animal metabolites': ['eukaryotic metabolites'],
 'fungal metabolites': ['eukaryotic metabolites'],
 'prokaryotic metabolites': ['metabolite'],
 'flat': ['landform'],
 'the cooling of solids': ['material cooling process'],
 'the cooling of fluids': ['material cooling process'],
 'Inorganic macronutrients that are dissolved in ocean water': ['macronutrient'],
 'concentration of carbon atoms in seawater': ['carbon atom concentrations found in water',
  'carbon atom concentrations found in water'],
 'protic solvent': ['polar solvent', 'polar solvent'],
 'megathrust eart

In [33]:
import json

# Save final_clusters_list to JSON file
with open("text_to_ontol_keyword_LLM_ecology.json", "w", encoding="utf-8") as f:
    json.dump(term2types_mapping_test, f, indent=2, ensure_ascii=False)

print("Clusters saved to final_clusters.json ✅")

Clusters saved to final_clusters.json ✅


In [34]:
len(test_data)

483

In [35]:
term2docs_mapping_test = {}
term2types_mapping_test = {}
for ans in final_ans:
    ans_dict = json.loads(ans)
    for term_type in ans_dict:
        for term, type in term_type.items():
            term2docs_mapping_test[term]=[]
            for doc in test_data:
                if term in doc['title'] or term in doc['text']:
                    term2docs_mapping_test[term].append(doc['id'])


In [36]:
term2docs_mapping_test

{'certain types of rock': ['314_0'],
 'soil': ['314_0',
  '707_0',
  '2010_0',
  '263_0',
  '1028_0',
  '194_0',
  '2005_0',
  '1207_0',
  '951_0',
  '486_0',
  '2002_0',
  '487_0',
  '432_0',
  '769_0',
  '849_0',
  '952_0',
  '2227_0',
  '2014_0',
  '2008_0',
  '883_0',
  '448_0',
  '515_0',
  '746_0',
  '431_0',
  '2017_0',
  '1035_0',
  '1959_0',
  '1183_0',
  '1120_0',
  '847_0',
  '1996_0',
  '1924_0',
  '313_0',
  '2353_0',
  '616_0',
  '66_0',
  '1921_0',
  '2355_0',
  '1922_0',
  '1995_0',
  '122_0',
  '1923_0'],
 'clay': ['314_0', '486_0'],
 'solid rock': ['314_0'],
 'carbon dioxide pooling disposition': [],
 'respiratory electron transport chain': ['2189_0', '584_0'],
 'amplitude of temperature of air': ['93_0'],
 'Toxin': ['1298_0'],
 'eukaryotic metabolites': ['1298_0'],
 'animal metabolites': ['1298_0'],
 'fungal metabolites': ['1298_0', '1069_0'],
 'prokaryotic metabolites': ['1298_0', '1096_0'],
 'flat': ['2357_0', '270_0', '639_0'],
 'the cooling of solids': ['976_0'],

In [38]:
import json

# Save final_clusters_list to JSON file
with open("text_to_ontol_keyword_LLM_ecology_term_extraction.json", "w", encoding="utf-8") as f:
    json.dump(term2docs_mapping_test, f, indent=2, ensure_ascii=False)

print("Clusters saved to final_clusters.json ✅")

Clusters saved to final_clusters.json ✅


In [4]:
import json
def load_json(data_path):
    with open(data_path, "r") as file_1:
        pred_data= json.load(file_1)
    return pred_data

pred_data = load_json("text_to_ontol_keyword_LLM_ecology.json")

In [10]:
import json

# Assuming pred_data is your dictionary of {term: [type1, type2, ...]}
# Here's how to write it properly:
with open("ecology/test/terms.txt", "w", encoding="utf-8") as term_file:
    for key in pred_data.keys():
        term_file.write(f"{key}\n")

with open("ecology/test/types.txt", "w", encoding="utf-8") as type_file:
    for value in pred_data.values():
        # If multiple types are present, join them into one line as JSON list
        type_file.write(json.dumps(value) + "\n")


In [12]:
# Write keys (terms) to terms.txt
with open("ecology/test/terms.txt", "w", encoding="utf-8") as term_file:
    for key in pred_data.keys():
        term_file.write(f"{key}\n")

# Track and write only unique types (no duplicates)
written_types = set()

with open("ecology/test/types.txt", "w", encoding="utf-8") as type_file:
    for value_list in pred_data.values():
        for val in value_list:
            if val not in written_types:
                type_file.write(f"{val}\n")
                written_types.add(val)
