# **Clustering the Training data Term**

In [6]:
import json
from collections import defaultdict

def cluster_terms_by_shared_word(data):
    word_to_terms = defaultdict(set)
    for idx, entry in enumerate(data):
        words = entry['term'].lower().split()
        for w in words:
            word_to_terms[w].add(idx)

    visited = set()
    clusters = []

    def dfs(term_idx, cluster):
        visited.add(term_idx)
        cluster.add(term_idx)
        for w in data[term_idx]['term'].lower().split():
            for neighbor_idx in word_to_terms[w]:
                if neighbor_idx not in visited:
                    dfs(neighbor_idx, cluster)

    for i in range(len(data)):
        if i not in visited:
            cluster = set()
            dfs(i, cluster)
            clusters.append(cluster)

    clustered_output = []
    for cluster in clusters:
        cluster_terms = []
        cluster_types = set()
        for idx in cluster:
            cluster_terms.append(data[idx]['term'])
            cluster_types.update(data[idx]['types'])
        clustered_output.append({
            'terms': cluster_terms,
            'types': list(cluster_types)
        })
    return clustered_output
import json
import re

def clean_json_string(s):
    """Clean JSON string by replacing problematic Unicode escape sequences."""
    # Replace \u00c2 with middle dot (·)
    s = re.sub(r'\\u00c2', '·', s)
    # Replace \u00b7 with middle dot (·)
    s = re.sub(r'\\u00b7', '·', s)
    # Handle incomplete or malformed sequences like \u00b
    s = re.sub(r'\\u00b(?!\w{2})', '·', s)
    # Convert superscript notation (e.g., s-2 to s⁻²)
    s = re.sub(r's-2', 's⁻²', s)
    s = re.sub(r's-3', 's⁻³', s)
    s = re.sub(r's-4', 's⁻⁴', s)
    s = re.sub(r'm-2', 'm⁻²', s)
    s = re.sub(r'm-4', 'm⁻⁴', s)
    s = re.sub(r'kg-1', 'kg⁻¹', s)
    s = re.sub(r'A-1', 'A⁻¹', s)
    s = re.sub(r'A-2', 'A⁻²', s)
    return s

try:
    # Read the JSON file as raw text to clean it
    with open('MatOnto/train/term_typing_train_data.json', 'r', encoding='utf-8') as f:
        raw_data = f.read()
    
    # Clean the raw JSON string
    cleaned_data = clean_json_string(raw_data)
    
    # Parse the cleaned JSON string
    data = json.loads(cleaned_data)
    
    # Print all terms to verify correct loading
    print("Successfully loaded JSON data. Terms:")
    # for entry in data:
    #     print(f"ID: {entry['id']}, Term: {entry['term']}, Type: {entry['types'][0]}")

except UnicodeDecodeError as e:
    print(f"UnicodeDecodeError: {e}")
    print("Try a different encoding (e.g., 'latin-1' or 'iso-8859-1') or clean the file.")
except json.JSONDecodeError as e:
    print(f"JSONDecodeError: {e}")
    print("The JSON file may be malformed. Check for invalid characters or syntax.")
except Exception as e:
    print(f"Unexpected error: {e}")

clusters = cluster_terms_by_shared_word(data)
print(json.dumps(clusters, indent=2, ensure_ascii=False))


Successfully loaded JSON data. Terms:
[
  {
    "terms": [
      "kilogram",
      "pascal root meter",
      "per meter",
      "radian per second squared",
      "kilogram per kilogram",
      "joule per kelvin",
      "root meter",
      "per squared meter",
      "pascal per second",
      "kilogram per cubic meter",
      "candela per square meter",
      "kilogram per meter per second squared",
      "second cubed",
      "farad per meter",
      "meter per second squared",
      "ampere squared",
      "cubic meter",
      "meter kilogram per second squared",
      "joule per cubic meter",
      "steradian",
      "meter per second",
      "watt per steradian",
      "second ampere",
      "per volt",
      "cubic angstrom",
      "ampere per meter",
      "henry",
      "per second squared",
      "candela",
      "joule per kilogram",
      "ampere",
      "per ampere squared",
      "second",
      "per mole",
      "kilogram squared meter per second squared",
      "per seco

In [None]:
import json
from collections import defaultdict

def cluster_terms_by_shared_word(data):
    word_to_terms = defaultdict(set)
    for idx, entry in enumerate(data):
        words = entry['term'].lower().split()
        for w in words:
            word_to_terms[w].add(idx)

    visited = set()
    clusters = []

    def dfs(term_idx, cluster):
        visited.add(term_idx)
        cluster.add(term_idx)
        for w in data[term_idx]['term'].lower().split():
            for neighbor_idx in word_to_terms[w]:
                if neighbor_idx not in visited:
                    dfs(neighbor_idx, cluster)

    for i in range(len(data)):
        if i not in visited:
            cluster = set()
            dfs(i, cluster)
            clusters.append(cluster)

    clustered_output = []
    for cluster in clusters:
        cluster_terms = []
        for idx in cluster:
            cluster_terms.append(data[idx]['term'])
        clustered_output.append({
            'term_indices': cluster,
            'terms': cluster_terms
        })
    return clustered_output

def get_word_set(term):
    return set(term.lower().split())

def find_best_training_match(new_term, training_data):
    new_words = get_word_set(new_term)
    best_match = None
    best_score = -1
    for train_entry in training_data:
        train_words = get_word_set(train_entry['term'])
        shared_words = new_words.intersection(train_words)
        score = len(shared_words)
        if score > best_score:
            best_score = score
            best_match = train_entry
    return best_match, best_score

# Load your training data (with types)
with open('MatOnto/train/term_typing_train_data.json', 'r', encoding='utf-8') as f:
    training_data = json.load(f)

# Load your new data (without types)
with open("MatOnto/test/matonto_term_typing_test_data.json", 'r', encoding='utf-8' ) as f:
    new_data = json.load(f)

# Step 1: Cluster new terms by shared words
clusters = cluster_terms_by_shared_word(new_data)

# Step 2: For each cluster, find types from training data based on best matches per term
final_clusters = []
for cluster in clusters:
    cluster_types = set()
    for idx in cluster['term_indices']:
        new_term = new_data[idx]['term']
        best_match, score = find_best_training_match(new_term, training_data)
        if best_match and 'types' in best_match:
            cluster_types.update(best_match['types'])
    final_clusters.append({
        'terms': cluster['terms'],
        'types': list(cluster_types) if cluster_types else ['unknown']
    })

print(json.dumps(final_clusters, indent=2))


[
  {
    "terms": [
      "newton",
      "newton meter",
      "newton per meter",
      "coulomb per square meter",
      "mole per cubic meter",
      "per ampere",
      "second power 4",
      "per kilogram",
      "joule per mole",
      "coulomb per kilogram",
      "ampere per square meter",
      "joule",
      "meter squared",
      "joule per kilogram per kelvin",
      "per steradian",
      "second per mole",
      "coulomb per cubic meter",
      "joule per mole per kelvin",
      "watt per meter per kelvin",
      "reciprocal second",
      "watt",
      "gray per second"
    ],
    "types": [
      "angular acceleration unit",
      "radiant intensity unit",
      "unit",
      "magnetic field strength",
      "volume density unit",
      "entropy unit",
      "luminance unit",
      "mass unit",
      "radiance unit"
    ]
  },
  {
    "terms": [
      "m2\u00b7m-2"
    ],
    "types": [
      "mass unit"
    ]
  },
  {
    "terms": [
      "farad"
    ],
    "types":

# "assigning test term to the nearest cluster"

In [7]:
import json
from collections import defaultdict

def cluster_terms_by_shared_word(data):
    word_to_terms = defaultdict(set)
    for idx, entry in enumerate(data):
        words = entry['term'].lower().split()
        for w in words:
            word_to_terms[w].add(idx)

    visited = set()
    clusters = []

    def dfs(term_idx, cluster):
        visited.add(term_idx)
        cluster.add(term_idx)
        for w in data[term_idx]['term'].lower().split():
            for neighbor_idx in word_to_terms[w]:
                if neighbor_idx not in visited:
                    dfs(neighbor_idx, cluster)

    for i in range(len(data)):
        if i not in visited:
            cluster = set()
            dfs(i, cluster)
            clusters.append(cluster)

    clustered_output = []
    for cluster in clusters:
        cluster_terms = []
        for idx in cluster:
            cluster_terms.append(data[idx]['term'])
        clustered_output.append({
            'term_indices': cluster,
            'terms': cluster_terms
        })
    return clustered_output

def get_word_set(term):
    return set(term.lower().split())

# Load your training data (with types)
with open('MatOnto/train/term_typing_train_data.json', 'r', encoding='utf-8') as f:
    training_data = json.load(f)

# Load your new data (without types)
with open("MatOnto/test/matonto_term_typing_test_data.json", 'r', encoding='utf-8') as f:
    new_data = json.load(f)

# Step 1: Cluster training data
training_clusters = cluster_terms_by_shared_word(training_data)

# Step 2: Cluster new/test data
test_clusters = cluster_terms_by_shared_word(new_data)

# Precompute training cluster terms sets (list of word sets per term)
training_cluster_wordsets = []
for cluster in training_clusters:
    cluster_wordsets = []
    for idx in cluster['term_indices']:
        cluster_wordsets.append(get_word_set(training_data[idx]['term']))
    training_cluster_wordsets.append(cluster_wordsets)

# Step 3: For each test term, find best matching training cluster
final_clusters = []

for test_cluster in test_clusters:
    cluster_types = set()
    for test_idx in test_cluster['term_indices']:
        test_term = new_data[test_idx]['term']
        test_words = get_word_set(test_term)

        best_cluster_idx = None
        best_score = 0

        # Compare with each training cluster
        for i, cluster_wordsets in enumerate(training_cluster_wordsets):
            # For each training term in cluster, calculate shared words with test term
            cluster_score = 0
            for train_words in cluster_wordsets:
                shared = test_words.intersection(train_words)
                cluster_score = max(cluster_score, len(shared))
            if cluster_score > best_score:
                best_score = cluster_score
                best_cluster_idx = i

        if best_cluster_idx is not None and best_score > 0:
            # Collect types from that training cluster terms
            # Here collect all types from all terms in that cluster
            for train_idx in training_clusters[best_cluster_idx]['term_indices']:
                if 'types' in training_data[train_idx]:
                    cluster_types.update(training_data[train_idx]['types'])
        else:
            # No good match found for this term
            cluster_types.add('unknown')

    final_clusters.append({
        'terms': test_cluster['terms'],
        'types': list(cluster_types) if cluster_types else ['unknown']
    })

print(json.dumps(final_clusters, indent=2))


[
  {
    "terms": [
      "newton",
      "newton meter",
      "newton per meter",
      "coulomb per square meter",
      "mole per cubic meter",
      "per ampere",
      "second power 4",
      "per kilogram",
      "joule per mole",
      "coulomb per kilogram",
      "ampere per square meter",
      "joule",
      "meter squared",
      "joule per kilogram per kelvin",
      "per steradian",
      "second per mole",
      "coulomb per cubic meter",
      "joule per mole per kelvin",
      "watt per meter per kelvin",
      "reciprocal second",
      "watt",
      "gray per second"
    ],
    "types": [
      "amount of substance unit",
      "permittivity unit",
      "length unit",
      "energy density unit",
      "angular velocity unit",
      "electric current unit",
      "inductance unit",
      "volume density unit",
      "mass unit",
      "angular acceleration unit",
      "velocity unit",
      "volume unit",
      "pressure unit",
      "unit",
      "magnetic field

In [11]:
data

[{'id': 'TT_778fb090', 'term': 'kilogram', 'types': ['mass unit']},
 {'id': 'TT_30a06554', 'term': 'pascal root meter', 'types': ['unit']},
 {'id': 'TT_0f529528', 'term': 'per meter', 'types': ['unit']},
 {'id': 'TT_59cd3f71', 'term': 'kJ/mol', 'types': ['molar energy unit']},
 {'id': 'TT_57dd933e', 'term': 'unit_lux', 'types': ['illuminance unit']},
 {'id': 'TT_45bde25c',
  'term': 'radian per second squared',
  'types': ['angular acceleration unit']},
 {'id': 'TT_0720443e', 'term': 'kilogram per kilogram', 'types': ['unit']},
 {'id': 'TT_2a5fd547', 'term': 'weber', 'types': ['magnetic flux unit']},
 {'id': 'TT_6a031e6e', 'term': "'degree", 'types': ['plane angle unit']},
 {'id': 'TT_11273acb', 'term': 'joule per kelvin', 'types': ['entropy unit']},
 {'id': 'TT_e4e7b7f2',
  'term': 'unit_ohm',
  'types': ['electric resistance unit']},
 {'id': 'TT_e45431ca', 'term': 'root meter', 'types': ['unit']},
 {'id': 'TT_7942e2ca', 'term': 'per squared meter', 'types': ['unit']},
 {'id': 'TT_2ef

In [12]:
all_unique_types = []
for data_1 in data:
    if data_1['types'][0] not in all_unique_types:
        all_unique_types.append(data_1['types'][0])

In [13]:
all_unique_types

['mass unit',
 'unit',
 'molar energy unit',
 'illuminance unit',
 'angular acceleration unit',
 'magnetic flux unit',
 'plane angle unit',
 'entropy unit',
 'electric resistance unit',
 'dynamic viscosity unit',
 'volume density unit',
 'luminance unit',
 'pressure unit',
 'permittivity unit',
 'absorbed dose unit',
 'acceleration unit',
 'volume unit',
 'force unit',
 'electric charge unit',
 'inductance unit',
 'energy density unit',
 'solid angle unit',
 'velocity unit',
 'electric conduction unit',
 'dose equivalent unit',
 'radiant intensity unit',
 'magnetic field strength',
 'luminous intensity unit',
 'specific energy unit',
 'electric current unit',
 'time unit',
 'magnetic flux density unit',
 'energy unit',
 'length unit',
 'radiance unit',
 'angular velocity unit',
 'capacitance unit',
 'catalytic concentration unit',
 'temperature unit',
 'irradiance unit',
 'specific volume unit',
 'permeability unit',
 'catalytic activity unit',
 'potential difference unit',
 'informati

In [16]:
for test_idx in test_cluster['term_indices']:
    test_term = new_data[test_idx]['term']
    test_words = get_word_set(test_term)  # split test term by spaces to words

    best_cluster_idx = None
    best_score = 0

    # Iterate through each training cluster
    for i, cluster_wordsets in enumerate(training_cluster_wordsets):
        cluster_score = 0
        # Check against each training term in this cluster
        for train_words in cluster_wordsets:
            # train_words is word set of training term
            shared = test_words.intersection(train_words)  # word-based intersection
            cluster_score = max(cluster_score, len(shared))  # take max overlap in this cluster
        if cluster_score > best_score:
            best_score = cluster_score
            best_cluster_idx = i

    # If match found, collect types from best matching training cluster
    if best_cluster_idx is not None and best_score > 0:
        for train_idx in training_clusters[best_cluster_idx]['term_indices']:
            if 'types' in training_data[train_idx]:
                cluster_types.update(training_data[train_idx]['types'])
    else:
        # No match found
        cluster_types.update(all_unique_types)



In [17]:
cluster_types

{'absorbed dose unit',
 'acceleration unit',
 'amount of substance unit',
 'angular acceleration unit',
 'angular velocity unit',
 'capacitance unit',
 'catalytic activity unit',
 'catalytic concentration unit',
 'dose equivalent unit',
 'dynamic viscosity unit',
 'electric charge unit',
 'electric conduction unit',
 'electric current unit',
 'electric field strength unit',
 'electric resistance unit',
 'energy density unit',
 'energy unit',
 'entropy unit',
 'force unit',
 'illuminance unit',
 'inductance unit',
 'information unit',
 'irradiance unit',
 'length unit',
 'luminance unit',
 'luminous flux unit',
 'luminous intensity unit',
 'magnetic field strength',
 'magnetic flux density unit',
 'magnetic flux unit',
 'mass unit',
 'molar energy unit',
 'permeability unit',
 'permittivity unit',
 'plane angle unit',
 'potential difference unit',
 'power unit',
 'pressure unit',
 'radiance unit',
 'radiant intensity unit',
 'solid angle unit',
 'specific energy unit',
 'specific volume

In [18]:
import json
from collections import defaultdict

def cluster_terms_by_shared_word(data):
    word_to_terms = defaultdict(set)
    for idx, entry in enumerate(data):
        words = entry['term'].lower().split()
        for w in words:
            word_to_terms[w].add(idx)

    visited = set()
    clusters = []

    def dfs(term_idx, cluster):
        visited.add(term_idx)
        cluster.add(term_idx)
        for w in data[term_idx]['term'].lower().split():
            for neighbor_idx in word_to_terms[w]:
                if neighbor_idx not in visited:
                    dfs(neighbor_idx, cluster)

    for i in range(len(data)):
        if i not in visited:
            cluster = set()
            dfs(i, cluster)
            clusters.append(cluster)

    clustered_output = []
    for cluster in clusters:
        cluster_terms = []
        for idx in cluster:
            cluster_terms.append(data[idx]['term'])
        clustered_output.append({
            'term_indices': cluster,
            'terms': cluster_terms
        })
    return clustered_output

def get_word_set(term):
    return set(term.lower().split())

# Load your training data (with types)
with open('MatOnto/train/term_typing_train_data.json', 'r', encoding='utf-8') as f:
    training_data = json.load(f)

# Load your new data (without types)
with open("MatOnto/test/matonto_term_typing_test_data.json", 'r', encoding='utf-8') as f:
    new_data = json.load(f)

# Precompute all unique types in training data
all_unique_types = set()
for entry in training_data:
    if 'types' in entry:
        all_unique_types.update(entry['types'])

# Step 1: Cluster training data
training_clusters = cluster_terms_by_shared_word(training_data)

# Step 2: Cluster new/test data
test_clusters = cluster_terms_by_shared_word(new_data)

# Precompute training cluster terms sets (list of word sets per term)
training_cluster_wordsets = []
for cluster in training_clusters:
    cluster_wordsets = []
    for idx in cluster['term_indices']:
        cluster_wordsets.append(get_word_set(training_data[idx]['term']))
    training_cluster_wordsets.append(cluster_wordsets)

# Step 3: For each test term, find best matching training cluster
final_clusters = []

for test_cluster in test_clusters:
    cluster_types = set()
    for test_idx in test_cluster['term_indices']:
        test_term = new_data[test_idx]['term']
        test_words = get_word_set(test_term)

        best_cluster_idx = None
        best_score = 0

        # Compare with each training cluster
        for i, cluster_wordsets in enumerate(training_cluster_wordsets):
            cluster_score = 0
            for train_words in cluster_wordsets:
                shared = test_words.intersection(train_words)
                cluster_score = max(cluster_score, len(shared))
            if cluster_score > best_score:
                best_score = cluster_score
                best_cluster_idx = i

        if best_cluster_idx is not None and best_score > 0:
            # Collect types from that training cluster terms
            for train_idx in training_clusters[best_cluster_idx]['term_indices']:
                if 'types' in training_data[train_idx]:
                    cluster_types.update(training_data[train_idx]['types'])
        else:
            # No good match found: assign all unique training types
            cluster_types.update(all_unique_types)

    final_clusters.append({
        'terms': test_cluster['terms'],
        'types': list(cluster_types) if cluster_types else ['unknown']
    })

print(json.dumps(final_clusters, indent=2))


[
  {
    "terms": [
      "newton",
      "newton meter",
      "newton per meter",
      "coulomb per square meter",
      "mole per cubic meter",
      "per ampere",
      "second power 4",
      "per kilogram",
      "joule per mole",
      "coulomb per kilogram",
      "ampere per square meter",
      "joule",
      "meter squared",
      "joule per kilogram per kelvin",
      "per steradian",
      "second per mole",
      "coulomb per cubic meter",
      "joule per mole per kelvin",
      "watt per meter per kelvin",
      "reciprocal second",
      "watt",
      "gray per second"
    ],
    "types": [
      "amount of substance unit",
      "permittivity unit",
      "length unit",
      "energy density unit",
      "angular velocity unit",
      "electric current unit",
      "inductance unit",
      "volume density unit",
      "mass unit",
      "angular acceleration unit",
      "illuminance unit",
      "electric resistance unit",
      "velocity unit",
      "volume unit"

In [None]:
from google import genai
model = "gemini-2.5-pro"
client= genai.Client(api_key="")

In [20]:
from tqdm import tqdm

In [21]:
final_ans = []

In [23]:
final_clusters[0].keys()

dict_keys(['terms', 'types'])

In [25]:
for i, terms_types in tqdm(enumerate(final_clusters)):
        list_of_terms = terms_types['terms']
        types_of_terms = terms_types['types']

        prompt = f"""You are a material science expert with deep and comprehensive knowledge of metals, ceramics, polymers, composites, and semiconductors.  
            You understand the key material properties, including mechanical (strength, hardness), thermal (conductivity, expansion), electrical (resistivity, conductivity), optical, magnetic, and chemical properties.  
            You are familiar with measurement units used in material science such as mole, meter, pascal, newton, and others.  
            You have expertise in the structure-property relationships and the practical applications of materials in engineering, electronics, energy, and aerospace.  
            You are capable of accurately classifying and categorizing material-related terms using your scientific reasoning and up-to-date knowledge of the field.  
            You explain your classifications clearly and precisely.

            Your task is to classify a given list of material-related terms into one or more appropriate types from a provided list of possible types.

            Please respond strictly with a JSON object where:  
            - Each key is a term (string) from the input list.  
            - Each value is a list of one or more types (strings) assigned to that term based on your expert knowledge.

            List of Possible Types: {types_of_terms}

            Examples and also output format:

            {json.dumps({
            "mole": ["amount of substance unit"],
            "meter": ["length unit"],
            "kilogram": ["mass unit"],
            "celsius": ["temperature unit"],
            "steel": ["material"],
            "density": ["property"],
            "velocity": ["Unknown"]
            })}

            Now classify the following terms:
            {list_of_terms}"""
        generation_config = {"response_mime_type": "application/json"}

        response = client.models.generate_content(
                contents= prompt,
                config=generation_config,
                model=model
            )
        final_ans.append(response.text)
        if i%20==0:
            print(response.text)

1it [00:49, 49.38s/it]

{
    "newton": [
        "force unit"
    ],
    "newton meter": [
        "energy unit"
    ],
    "newton per meter": [
        "unit"
    ],
    "coulomb per square meter": [
        "unit"
    ],
    "mole per cubic meter": [
        "unit"
    ],
    "per ampere": [
        "unit"
    ],
    "second power 4": [
        "unit"
    ],
    "per kilogram": [
        "unit"
    ],
    "joule per mole": [
        "molar energy unit"
    ],
    "coulomb per kilogram": [
        "unit"
    ],
    "ampere per square meter": [
        "unit"
    ],
    "joule": [
        "energy unit"
    ],
    "meter squared": [
        "unit"
    ],
    "joule per kilogram per kelvin": [
        "unit"
    ],
    "per steradian": [
        "unit"
    ],
    "second per mole": [
        "unit"
    ],
    "coulomb per cubic meter": [
        "unit"
    ],
    "joule per mole per kelvin": [
        "entropy unit"
    ],
    "watt per meter per kelvin": [
        "unit"
    ],
    "reciprocal second": [
   

16it [06:19, 23.70s/it]


In [26]:
final_ans

['{\n    "newton": [\n        "force unit"\n    ],\n    "newton meter": [\n        "energy unit"\n    ],\n    "newton per meter": [\n        "unit"\n    ],\n    "coulomb per square meter": [\n        "unit"\n    ],\n    "mole per cubic meter": [\n        "unit"\n    ],\n    "per ampere": [\n        "unit"\n    ],\n    "second power 4": [\n        "unit"\n    ],\n    "per kilogram": [\n        "unit"\n    ],\n    "joule per mole": [\n        "molar energy unit"\n    ],\n    "coulomb per kilogram": [\n        "unit"\n    ],\n    "ampere per square meter": [\n        "unit"\n    ],\n    "joule": [\n        "energy unit"\n    ],\n    "meter squared": [\n        "unit"\n    ],\n    "joule per kilogram per kelvin": [\n        "unit"\n    ],\n    "per steradian": [\n        "unit"\n    ],\n    "second per mole": [\n        "unit"\n    ],\n    "coulomb per cubic meter": [\n        "unit"\n    ],\n    "joule per mole per kelvin": [\n        "entropy unit"\n    ],\n    "watt per meter per kelvin

In [28]:
new_data

[{'id': 'TT_5a5763f5', 'term': 'newton'},
 {'id': 'TT_c4f339b8', 'term': 'newton meter'},
 {'id': 'TT_ce5417fc', 'term': 'newton per meter'},
 {'id': 'TT_25014bd4', 'term': 'coulomb per square meter'},
 {'id': 'TT_46dad9e9', 'term': 'm2·m-2'},
 {'id': 'TT_49ab10c7', 'term': 'mole per cubic meter'},
 {'id': 'TT_ab5a8ad6', 'term': 'per ampere'},
 {'id': 'TT_08f9b767', 'term': 'farad'},
 {'id': 'TT_4cff4fec', 'term': 'dalton'},
 {'id': 'TT_ce2b537c', 'term': 'second power 4'},
 {'id': 'TT_dae2a7a6', 'term': 'per kilogram'},
 {'id': 'TT_f8e74193', 'term': 'radian'},
 {'id': 'TT_8a920749', 'term': 'joule per mole'},
 {'id': 'TT_a0402f84', 'term': 'becquerel'},
 {'id': 'TT_14fbac11', 'term': 'byte'},
 {'id': 'TT_f1361175', 'term': 'coulomb per kilogram'},
 {'id': 'TT_981df168', 'term': 'ampere per square meter'},
 {'id': 'TT_4c87c381', 'term': 'unit_kelvin_-1'},
 {'id': 'TT_f15ef7cd', 'term': 'joule'},
 {'id': 'TT_40a02db8', 'term': 'meter squared'},
 {'id': 'TT_d714834e', 'term': 'unit_elec

In [29]:
term_to_id_mapping = {}
for data_n in new_data:
    term_to_id_mapping[data_n['term']]= data_n['id']

In [30]:
term_to_id_mapping

{'newton': 'TT_5a5763f5',
 'newton meter': 'TT_c4f339b8',
 'newton per meter': 'TT_ce5417fc',
 'coulomb per square meter': 'TT_25014bd4',
 'm2·m-2': 'TT_46dad9e9',
 'mole per cubic meter': 'TT_49ab10c7',
 'per ampere': 'TT_ab5a8ad6',
 'farad': 'TT_08f9b767',
 'dalton': 'TT_4cff4fec',
 'second power 4': 'TT_ce2b537c',
 'per kilogram': 'TT_dae2a7a6',
 'radian': 'TT_f8e74193',
 'joule per mole': 'TT_8a920749',
 'becquerel': 'TT_a0402f84',
 'byte': 'TT_14fbac11',
 'coulomb per kilogram': 'TT_f1361175',
 'ampere per square meter': 'TT_981df168',
 'unit_kelvin_-1': 'TT_4c87c381',
 'joule': 'TT_f15ef7cd',
 'meter squared': 'TT_40a02db8',
 'unit_electron_volt': 'TT_d714834e',
 'joule per kilogram per kelvin': 'TT_a9f30802',
 'per steradian': 'TT_8182f530',
 'second per mole': 'TT_ab85a470',
 'coulomb per cubic meter': 'TT_b8e7ebae',
 'joule per mole per kelvin': 'TT_9aec366c',
 'lumen': 'TT_b9a7fd17',
 'hertz': 'TT_5ba78869',
 'watt per meter per kelvin': 'TT_9e7440e4',
 'micrometer': 'TT_177b

In [32]:
result = []
for ans_1 in final_ans:
    ans_dict = json.loads(ans_1)
    for term, type in ans_dict.items():
        result.append({"id":term_to_id_mapping[term], "term":term, "types":type})

In [33]:
result

[{'id': 'TT_5a5763f5', 'term': 'newton', 'types': ['force unit']},
 {'id': 'TT_c4f339b8', 'term': 'newton meter', 'types': ['energy unit']},
 {'id': 'TT_ce5417fc', 'term': 'newton per meter', 'types': ['unit']},
 {'id': 'TT_25014bd4', 'term': 'coulomb per square meter', 'types': ['unit']},
 {'id': 'TT_49ab10c7', 'term': 'mole per cubic meter', 'types': ['unit']},
 {'id': 'TT_ab5a8ad6', 'term': 'per ampere', 'types': ['unit']},
 {'id': 'TT_ce2b537c', 'term': 'second power 4', 'types': ['unit']},
 {'id': 'TT_dae2a7a6', 'term': 'per kilogram', 'types': ['unit']},
 {'id': 'TT_8a920749',
  'term': 'joule per mole',
  'types': ['molar energy unit']},
 {'id': 'TT_f1361175', 'term': 'coulomb per kilogram', 'types': ['unit']},
 {'id': 'TT_981df168', 'term': 'ampere per square meter', 'types': ['unit']},
 {'id': 'TT_f15ef7cd', 'term': 'joule', 'types': ['energy unit']},
 {'id': 'TT_40a02db8', 'term': 'meter squared', 'types': ['unit']},
 {'id': 'TT_a9f30802',
  'term': 'joule per kilogram per ke

In [34]:
import json

with open("Task_B_term_types_matonto.json", "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)
