In [2]:
import json

def load_jsonl(file_path):
    """
    Loads data from a .jsonl file.

    Args:
        file_path (str): The path to the .jsonl file.

    Returns:
        list: A list of dictionaries, where each dictionary represents a JSON object from a line.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data.append(json.loads(line.strip()))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line: {line.strip()} - {e}")
    return data


In [3]:
training_data = load_jsonl("engineering/train/documents.jsonl")
len(training_data), training_data[0]

(83,
 {'id': '0_2',
  'title': 'The Versatility of Prefixed Units in Measurement',
  'text': "The realm of measurement encompasses a vast array of units, many of which are modified by prefixes to denote scale. In the domain of physics and engineering, prefixed units are ubiquitous. For instance, units of length such as the centimetre, picometre, and terametre are used to measure distances ranging from the minute to the astronomical. Similarly, forces are measured in femtonewtons and teranewtons, reflecting the vast scales involved in scientific and engineering applications.\n\nEnergy measurements utilize units like the hectojoule, while power is quantified in petawatts and deciwatts. The digital realm relies on units such as the gigabit and gigabyte to measure data. Other physical quantities, including magnetic field strength in yoctoteslas and zeptoteslas, and radioactivity in picobecquerels and petabecquerels, demonstrate the diversity of prefixed units.\n\nFurthermore, the measureme

In [4]:
def load_txt_file_content(filepath):
    """
    Loads the entire content of a text file into a single string.

    Args:
        filepath (str): The path to the text file.

    Returns:
        str: The content of the file, or None if an error occurs.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.readlines()
        return content
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
        return None
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        return None

In [5]:
def load_json(data_path):
    with open(data_path, "r") as file_1:
        data = json.load(file_1)
    return data

In [6]:
terms_text_list = load_txt_file_content("engineering/train/types.txt")

In [7]:
doc_term_types_mapping = []
train_data_term_types = load_json("engineering/train/terms2types.json")
train_data_term_types

[{'term': 'exagram', 'types': ['prefixed unit']},
 {'term': 'millinewton', 'types': ['prefixed unit']},
 {'term': 'petahenry', 'types': ['prefixed unit']},
 {'term': 'petalitre', 'types': ['prefixed unit']},
 {'term': 'petaweber', 'types': ['prefixed unit']},
 {'term': '3 to 5 on the Kelvin scale', 'types': ['fixed point']},
 {'term': 'hectotesla', 'types': ['prefixed unit']},
 {'term': 'statohm', 'types': ['singular unit', 'unit']},
 {'term': 'weber', 'types': ['singular unit', 'unit']},
 {'term': 'wine glass', 'types': ['unit']},
 {'term': 'kilohm', 'types': ['prefixed unit']},
 {'term': 'yottajoule', 'types': ['prefixed unit']},
 {'term': 'piece', 'types': ['unit']},
 {'term': '961.78 on the Celsius scale', 'types': ['fixed point']},
 {'term': 'centimetre of mercury', 'types': ['prefixed unit']},
 {'term': '54.3584 on the Kelvin scale', 'types': ['fixed point']},
 {'term': 'zeptopascal', 'types': ['prefixed unit']},
 {'term': '24.5561 on the Kelvin scale', 'types': ['fixed point']},

In [8]:
doc_term_types_mapping = []

In [9]:
for doc in training_data:
    title=doc['title']
    id= doc['id']
    text =doc['text']
    term_types_pairs = []
    for term_type in train_data_term_types:
        if term_type['term'] in title or term_type['term'] in text:
            term_types_pairs.append(term_type)
    doc_term_types_mapping.append({"title":title, "id":id,"text":text, "term_type_pairs":term_types_pairs})
    

In [10]:
doc_term_types_mapping

[{'title': 'The Versatility of Prefixed Units in Measurement',
  'id': '0_2',
  'text': "The realm of measurement encompasses a vast array of units, many of which are modified by prefixes to denote scale. In the domain of physics and engineering, prefixed units are ubiquitous. For instance, units of length such as the centimetre, picometre, and terametre are used to measure distances ranging from the minute to the astronomical. Similarly, forces are measured in femtonewtons and teranewtons, reflecting the vast scales involved in scientific and engineering applications.\n\nEnergy measurements utilize units like the hectojoule, while power is quantified in petawatts and deciwatts. The digital realm relies on units such as the gigabit and gigabyte to measure data. Other physical quantities, including magnetic field strength in yoctoteslas and zeptoteslas, and radioactivity in picobecquerels and petabecquerels, demonstrate the diversity of prefixed units.\n\nFurthermore, the measurement of

In [11]:
test_data_all= load_jsonl('engineering/test/text2onto_engineering_test_documents.jsonl')
test_data_all

[{'id': '3_7',
  'title': 'Units of Molar Concentration per Unit Length',
  'text': 'Units of measurement for molar concentration per unit length are derived from the base unit "mole per metre". This base unit can be modified in two ways: by prefixing the "mole" to indicate different orders of magnitude, or by prefixing the "metre" to denote different units of length. \n\nWhen the "mole" is prefixed, it results in units such as nanomole per metre, femtomole per metre, and others, down to yoctomole per metre and up to yottamole per metre. These units are all categorized as "prefixed mole per metre". Examples include nanomole per metre, kilomole per metre, and gigamole per metre, among others.\n\nOn the other hand, when the "metre" is prefixed, it gives rise to units like mole per centimetre, mole per kilometre, and mole per nanometre. These are classified as "mole per prefixed metre". Other examples include mole per decimetre, mole per megametre, and mole per picometre.\n\nBoth "prefixe

In [12]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def find_best_document_by_title(query: str, docs: list[dict]) -> tuple[dict | None, float]:
    """
    Finds the best matching document in a list based on its title.

    Args:
        query (str): The text to search for.
        docs (list[dict]): A list of documents, where each doc is a dictionary
                           with at least a 'title' key.

    Returns:
        tuple[dict | None, float]: A tuple containing the entire best-matching
                                   document dictionary and its similarity score.
                                   Returns (None, 0.0) if the docs list is empty.
    """
    if not docs:
        return None, 0.0

    # 1. Extract all titles from the documents
    titles = [f"{doc['title']}+ {doc['text']} " for doc in docs]

    # 2. Create a corpus by prepending the query to the list of titles
    corpus = [query] + titles
    
    # 3. Initialize and fit the TF-IDF Vectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # 4. Calculate cosine similarity between the query (first vector) and all titles
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # 5. Find the index of the highest similarity score
    best_match_index = np.argmax(cosine_similarities)
    best_match_score = cosine_similarities[best_match_index]

    # 6. Retrieve the entire document object using the index
    best_matching_document = docs[best_match_index]

    return best_matching_document, float(best_match_score)


In [13]:
final_ans = []
from tqdm import tqdm
for test_data in tqdm(test_data_all):
    title= test_data['title']
    text = test_data['text']
    best_match_doc, score = find_best_document_by_title(query=title+text, docs=doc_term_types_mapping)
    final_ans.append({"query_doc":test_data, "relevant_doc":best_match_doc})


100%|██████████| 21/21 [00:00<00:00, 59.09it/s]


In [14]:
final_ans[0]

{'query_doc': {'id': '3_7',
  'title': 'Units of Molar Concentration per Unit Length',
  'text': 'Units of measurement for molar concentration per unit length are derived from the base unit "mole per metre". This base unit can be modified in two ways: by prefixing the "mole" to indicate different orders of magnitude, or by prefixing the "metre" to denote different units of length. \n\nWhen the "mole" is prefixed, it results in units such as nanomole per metre, femtomole per metre, and others, down to yoctomole per metre and up to yottamole per metre. These units are all categorized as "prefixed mole per metre". Examples include nanomole per metre, kilomole per metre, and gigamole per metre, among others.\n\nOn the other hand, when the "metre" is prefixed, it gives rise to units like mole per centimetre, mole per kilometre, and mole per nanometre. These are classified as "mole per prefixed metre". Other examples include mole per decimetre, mole per megametre, and mole per picometre.\n\n

In [None]:
from google import genai
model = "gemini-2.5-flash"
client= genai.Client(api_key="")

In [16]:
final_prep_ans = []
i=0

In [17]:
for data in tqdm(final_ans):
    prompt =f"""You are an expert model specializing in scientific and engineering terminology. Your task is to analyze the provided document (title and text) and extract all measurement units mentioned. For each unit, you must classify it based on its structure and composition.

    **Domain: Engineering**
    The domain is **Engineering**, a broad field applying scientific and mathematical principles to design, build, and analyze structures, machines, and systems. It encompasses various disciplines like mechanical, electrical, chemical, and civil engineering. A core tenet of this domain is the reliance on precise, standardized units of measurement to ensure safety, interoperability, and accuracy in all designs and calculations.

    **Instructions:**
    1.  Read the title and text of the document carefully.
    2.  Identify every distinct measurement unit. These can be single words (e.g., "metre") or complex multi-word phrases (e.g., "mole per centimetre").
    3.  For each unit, assign one or more types from the following categories:
        - `singular unit`: A base unit (e.g., "mole", "metre").
        - `prefixed unit`: A base unit with a standard prefix (e.g., "nanomole", "kilometre").
        - `unit multiplication`: A unit formed by multiplying two or more units (e.g., "newton metre").
        - `unit division`: A unit formed by dividing one unit by another (e.g., "mole per metre").
        - Other specific classifications mentioned in the text (e.g., "prefixed mole per metre").
    4.  The output must be a single, valid JSON object with one key: `"term_type_pairs"`. The value should be a list of objects, where each object contains a `"term"` and a `"types"` key.

    ---
    **EXAMPLE**

    **Document:**
    **Title:** {data['relevant_doc']['title']}
    **Text:** {data['relevant_doc']['text']}

    **Correct Output:**
    {json.dumps({"term_type_pairs": data['relevant_doc']['term_type_pairs']}, indent=2)}

    TASK
    query Document:
    Title: {data['query_doc']['title']}
    Text: {data['query_doc']['text']} """

    print(len(prompt))

    generation_config = {"response_mime_type": "application/json"}
    response = client.models.generate_content(
                contents= prompt,
                config=generation_config,
                model=model
            )
    final_prep_ans.append(response.text)
    if i%5==0:
        print(response.text)
    i+=1
    



  0%|          | 0/21 [00:00<?, ?it/s]

9701


  5%|▍         | 1/21 [00:16<05:37, 16.87s/it]

{
  "term_type_pairs": [
    {
      "term": "centimetre",
      "types": [
        "prefixed unit"
      ]
    },
    {
      "term": "decimetre",
      "types": [
        "prefixed unit"
      ]
    },
    {
      "term": "femtomole",
      "types": [
        "prefixed unit"
      ]
    },
    {
      "term": "femtomole per metre",
      "types": [
        "prefixed mole per metre",
        "unit division"
      ]
    },
    {
      "term": "gigamole",
      "types": [
        "prefixed unit"
      ]
    },
    {
      "term": "gigamole per metre",
      "types": [
        "prefixed mole per metre",
        "unit division"
      ]
    },
    {
      "term": "kilomole",
      "types": [
        "prefixed unit"
      ]
    },
    {
      "term": "kilomole per metre",
      "types": [
        "prefixed mole per metre",
        "unit division"
      ]
    },
    {
      "term": "kilometre",
      "types": [
        "prefixed unit"
      ]
    },
    {
      "term": "megametre",
      "ty

 10%|▉         | 2/21 [00:34<05:30, 17.40s/it]

9318


 14%|█▍        | 3/21 [01:09<07:33, 25.17s/it]

9989


 19%|█▉        | 4/21 [01:32<06:53, 24.33s/it]

10021


 24%|██▍       | 5/21 [02:07<07:35, 28.48s/it]

3718


 29%|██▊       | 6/21 [02:16<05:28, 21.87s/it]

{
  "term_type_pairs": [
    {
      "term": "interval scale",
      "types": [
        "scale"
      ]
    },
    {
      "term": "ratio scale",
      "types": [
        "scale"
      ]
    },
    {
      "term": "thermodynamic temperature scale",
      "types": [
        "temperature scale",
        "scale"
      ]
    },
    {
      "term": "Celsius temperature scale",
      "types": [
        "temperature scale",
        "scale"
      ]
    },
    {
      "term": "Fahrenheit temperature scale",
      "types": [
        "temperature scale",
        "scale"
      ]
    },
    {
      "term": "Rankine temperature scale",
      "types": [
        "temperature scale",
        "scale"
      ]
    },
    {
      "term": "Réaumur temperature scale",
      "types": [
        "temperature scale",
        "scale"
      ]
    },
    {
      "term": "Temperature_scale",
      "types": [
        "temperature scale",
        "scale"
      ]
    }
  ]
}
11876


 33%|███▎      | 7/21 [02:41<05:19, 22.85s/it]

11415


 38%|███▊      | 8/21 [03:04<04:55, 22.73s/it]

8153


 43%|████▎     | 9/21 [03:09<03:25, 17.14s/it]

12035


 48%|████▊     | 10/21 [03:33<03:31, 19.23s/it]

3576


 52%|█████▏    | 11/21 [03:58<03:29, 20.98s/it]

{
  "term_type_pairs": [
    {
      "term": "Temperature",
      "types": [
        "fundamental physical quantity"
      ]
    },
    {
      "term": "Celsius",
      "types": [
        "singular unit"
      ]
    },
    {
      "term": "Réaumur",
      "types": [
        "singular unit"
      ]
    },
    {
      "term": "Fahrenheit",
      "types": [
        "singular unit"
      ]
    }
  ]
}
7317


 57%|█████▋    | 12/21 [04:05<02:33, 17.01s/it]

8334


 62%|██████▏   | 13/21 [04:23<02:16, 17.07s/it]

3364


 67%|██████▋   | 14/21 [04:38<01:55, 16.54s/it]

8624


 71%|███████▏  | 15/21 [04:50<01:31, 15.23s/it]

2573


 76%|███████▌  | 16/21 [04:53<00:57, 11.60s/it]

{
  "term_type_pairs": []
}
8823


 81%|████████  | 17/21 [05:20<01:04, 16.08s/it]

10274


 86%|████████▌ | 18/21 [05:35<00:47, 15.79s/it]

9977


 90%|█████████ | 19/21 [06:13<00:45, 22.56s/it]

9036


 95%|█████████▌| 20/21 [06:23<00:18, 18.68s/it]

8534


100%|██████████| 21/21 [06:26<00:00, 18.40s/it]

{
  "term_type_pairs": []
}





In [18]:
final_prep_ans

['{\n  "term_type_pairs": [\n    {\n      "term": "centimetre",\n      "types": [\n        "prefixed unit"\n      ]\n    },\n    {\n      "term": "decimetre",\n      "types": [\n        "prefixed unit"\n      ]\n    },\n    {\n      "term": "femtomole",\n      "types": [\n        "prefixed unit"\n      ]\n    },\n    {\n      "term": "femtomole per metre",\n      "types": [\n        "prefixed mole per metre",\n        "unit division"\n      ]\n    },\n    {\n      "term": "gigamole",\n      "types": [\n        "prefixed unit"\n      ]\n    },\n    {\n      "term": "gigamole per metre",\n      "types": [\n        "prefixed mole per metre",\n        "unit division"\n      ]\n    },\n    {\n      "term": "kilomole",\n      "types": [\n        "prefixed unit"\n      ]\n    },\n    {\n      "term": "kilomole per metre",\n      "types": [\n        "prefixed mole per metre",\n        "unit division"\n      ]\n    },\n    {\n      "term": "kilometre",\n      "types": [\n        "prefixed unit"

In [21]:
final_prep_ans[0]

'{\n  "term_type_pairs": [\n    {\n      "term": "centimetre",\n      "types": [\n        "prefixed unit"\n      ]\n    },\n    {\n      "term": "decimetre",\n      "types": [\n        "prefixed unit"\n      ]\n    },\n    {\n      "term": "femtomole",\n      "types": [\n        "prefixed unit"\n      ]\n    },\n    {\n      "term": "femtomole per metre",\n      "types": [\n        "prefixed mole per metre",\n        "unit division"\n      ]\n    },\n    {\n      "term": "gigamole",\n      "types": [\n        "prefixed unit"\n      ]\n    },\n    {\n      "term": "gigamole per metre",\n      "types": [\n        "prefixed mole per metre",\n        "unit division"\n      ]\n    },\n    {\n      "term": "kilomole",\n      "types": [\n        "prefixed unit"\n      ]\n    },\n    {\n      "term": "kilomole per metre",\n      "types": [\n        "prefixed mole per metre",\n        "unit division"\n      ]\n    },\n    {\n      "term": "kilometre",\n      "types": [\n        "prefixed unit"\

In [None]:
unique_types = set()

In [27]:
import json
import os

# Ensure output directory exists
os.makedirs("engineering/test", exist_ok=True)

all_terms = []
unique_types = set()

# Loop over all JSON strings
for data_1 in final_prep_ans:
    data = json.loads(data_1)
    pairs = data["term_type_pairs"]

    for item in pairs:
        all_terms.append(item["term"])
        for typ in item["types"]:
            unique_types.add(typ)

# Save all terms to one file
with open("engineering/test/2nd_method_terms.txt", "w", encoding="utf-8") as term_file:
    for term in all_terms:
        term_file.write(f"{term}\n")

# Save all unique types to one file
with open("engineering/test/2nd_method_types.txt", "w", encoding="utf-8") as type_file:
    for typ in sorted(unique_types):  # sorted is optional
        type_file.write(f"{typ}\n")
