# **Clustering the Training data Term**

In [2]:
import json
from collections import defaultdict

def cluster_terms_by_shared_word(data):
    word_to_terms = defaultdict(set)
    for idx, entry in enumerate(data):
        words = entry['term'].lower().split()
        for w in words:
            word_to_terms[w].add(idx)

    visited = set()
    clusters = []

    def dfs(term_idx, cluster):
        visited.add(term_idx)
        cluster.add(term_idx)
        for w in data[term_idx]['term'].lower().split():
            for neighbor_idx in word_to_terms[w]:
                if neighbor_idx not in visited:
                    dfs(neighbor_idx, cluster)

    for i in range(len(data)):
        if i not in visited:
            cluster = set()
            dfs(i, cluster)
            clusters.append(cluster)

    clustered_output = []
    for cluster in clusters:
        cluster_terms = []
        cluster_types = set()
        for idx in cluster:
            cluster_terms.append(data[idx]['term'])
            cluster_types.update(data[idx]['types'])
        clustered_output.append({
            'terms': cluster_terms,
            'types': list(cluster_types)
        })
    return clustered_output
import json
import re

def clean_json_string(s):
    """Clean JSON string by replacing problematic Unicode escape sequences."""
    # Replace \u00c2 with middle dot (·)
    s = re.sub(r'\\u00c2', '·', s)
    # Replace \u00b7 with middle dot (·)
    s = re.sub(r'\\u00b7', '·', s)
    # Handle incomplete or malformed sequences like \u00b
    s = re.sub(r'\\u00b(?!\w{2})', '·', s)
    # Convert superscript notation (e.g., s-2 to s⁻²)
    s = re.sub(r's-2', 's⁻²', s)
    s = re.sub(r's-3', 's⁻³', s)
    s = re.sub(r's-4', 's⁻⁴', s)
    s = re.sub(r'm-2', 'm⁻²', s)
    s = re.sub(r'm-4', 'm⁻⁴', s)
    s = re.sub(r'kg-1', 'kg⁻¹', s)
    s = re.sub(r'A-1', 'A⁻¹', s)
    s = re.sub(r'A-2', 'A⁻²', s)
    return s

try:
    # Read the JSON file as raw text to clean it
    with open('OBI/train/term_typing_train_data.json', 'r', encoding='utf-8') as f:
        raw_data = f.read()
    
    # Clean the raw JSON string
    cleaned_data = clean_json_string(raw_data)
    
    # Parse the cleaned JSON string
    data = json.loads(cleaned_data)
    
    # Print all terms to verify correct loading
    print("Successfully loaded JSON data. Terms:")
    # for entry in data:
    #     print(f"ID: {entry['id']}, Term: {entry['term']}, Type: {entry['types'][0]}")

except UnicodeDecodeError as e:
    print(f"UnicodeDecodeError: {e}")
    print("Try a different encoding (e.g., 'latin-1' or 'iso-8859-1') or clean the file.")
except json.JSONDecodeError as e:
    print(f"JSONDecodeError: {e}")
    print("The JSON file may be malformed. Check for invalid characters or syntax.")
except Exception as e:
    print(f"Unexpected error: {e}")

clusters = cluster_terms_by_shared_word(data)
print(json.dumps(clusters, indent=2, ensure_ascii=False))


Successfully loaded JSON data. Terms:
[
  {
    "terms": [
      "pT3 (kidney)",
      "pN1 (colon)",
      "pT4 (lung)",
      "pT1 (ovary)",
      "pTis (colon)",
      "pT3b (ovary)",
      "pT2 (kidney)",
      "pN0 (lung)",
      "cM0 (ovary)",
      "pT3a (ovary)",
      "cM0 (lung)",
      "pT1b (ovary)",
      "cM1 (lung)",
      "pT1a (kidney)",
      "pTis (lung)",
      "pN1a (colon)",
      "pN2a (colon)",
      "pN0 (kidney)",
      "pT2 (colon)",
      "cM1a (lung)",
      "pT0 (ovary)",
      "pT2 (lung)",
      "pT2a (lung)",
      "pT2c (ovary)",
      "pM1 (ovary)",
      "pT1 (colon)",
      "pT0 (colon)",
      "pN2 (lung)",
      "pT3c (kidney)",
      "pT2b (kidney)",
      "cM0 (colon)",
      "pT1b (lung)",
      "pT3 (colon)",
      "pN0 (ovary)",
      "pT4a (colon)",
      "pT3b (kidney)",
      "pT3c (ovary)",
      "pM1a (colon)",
      "pT4b (colon)",
      "pN3 (lung)",
      "pM1a (lung)",
      "pM1 (lung)",
      "pT0 (kidney)",
      "pN1 (ovary)",
  

In [4]:
import json
from collections import defaultdict

def cluster_terms_by_shared_word(data):
    word_to_terms = defaultdict(set)
    for idx, entry in enumerate(data):
        words = entry['term'].lower().split()
        for w in words:
            word_to_terms[w].add(idx)

    visited = set()
    clusters = []

    def dfs(term_idx, cluster):
        visited.add(term_idx)
        cluster.add(term_idx)
        for w in data[term_idx]['term'].lower().split():
            for neighbor_idx in word_to_terms[w]:
                if neighbor_idx not in visited:
                    dfs(neighbor_idx, cluster)

    for i in range(len(data)):
        if i not in visited:
            cluster = set()
            dfs(i, cluster)
            clusters.append(cluster)

    clustered_output = []
    for cluster in clusters:
        cluster_terms = []
        for idx in cluster:
            cluster_terms.append(data[idx]['term'])
        clustered_output.append({
            'term_indices': cluster,
            'terms': cluster_terms
        })
    return clustered_output

def get_word_set(term):
    return set(term.lower().split())

def find_best_training_match(new_term, training_data):
    new_words = get_word_set(new_term)
    best_match = None
    best_score = -1
    for train_entry in training_data:
        train_words = get_word_set(train_entry['term'])
        shared_words = new_words.intersection(train_words)
        score = len(shared_words)
        if score > best_score:
            best_score = score
            best_match = train_entry
    return best_match, best_score

# Load your training data (with types)
with open('OBI/train/term_typing_train_data.json', 'r', encoding='utf-8') as f:
    training_data = json.load(f)

# Load your new data (without types)
with open("OBI/test/obi_term_typing_test_data.json", 'r', encoding='utf-8' ) as f:
    new_data = json.load(f)

# Step 1: Cluster new terms by shared words
clusters = cluster_terms_by_shared_word(new_data)

# Step 2: For each cluster, find types from training data based on best matches per term
final_clusters = []
for cluster in clusters:
    cluster_types = set()
    for idx in cluster['term_indices']:
        new_term = new_data[idx]['term']
        best_match, score = find_best_training_match(new_term, training_data)
        if best_match and 'types' in best_match:
            cluster_types.update(best_match['types'])
    final_clusters.append({
        'terms': cluster['terms'],
        'types': list(cluster_types) if cluster_types else ['unknown']
    })

print(json.dumps(final_clusters, indent=2))


[
  {
    "terms": [
      "eBioscience"
    ],
    "types": [
      "pathologic primary tumor stage for kidney according to AJCC 7th edition"
    ]
  },
  {
    "terms": [
      "pT3 (lung)",
      "pT2a (ovary)",
      "pT1b (kidney)",
      "pM1b (lung)",
      "pT1c (ovary)",
      "cM1 (kidney)",
      "pN1 (kidney)",
      "pT3a (kidney)",
      "pT1 (kidney)",
      "pN2b (colon)",
      "pT2 (ovary)",
      "pT2b (lung)",
      "cM1a (colon)",
      "cM1 (colon)",
      "pT1 (lung)",
      "pN1c (colon)",
      "cM1 (ovary)",
      "pM1 (colon)",
      "pN1b (colon)",
      "pN1 (lung)",
      "pT3 (ovary)",
      "pN2 (colon)",
      "pM1b (colon)"
    ],
    "types": [
      "pathologic distant metastases stage for lung according to AJCC 7th edition",
      "pathologic lymph node stage for colon and rectum according to AJCC 7th edition",
      "pathologic primary tumor stage for ovary according to AJCC 7th edition",
      "pathologic primary tumor stage for kidney according t

# "assigning test term to the nearest cluster"

In [5]:
import json
from collections import defaultdict

def cluster_terms_by_shared_word(data):
    word_to_terms = defaultdict(set)
    for idx, entry in enumerate(data):
        words = entry['term'].lower().split()
        for w in words:
            word_to_terms[w].add(idx)

    visited = set()
    clusters = []

    def dfs(term_idx, cluster):
        visited.add(term_idx)
        cluster.add(term_idx)
        for w in data[term_idx]['term'].lower().split():
            for neighbor_idx in word_to_terms[w]:
                if neighbor_idx not in visited:
                    dfs(neighbor_idx, cluster)

    for i in range(len(data)):
        if i not in visited:
            cluster = set()
            dfs(i, cluster)
            clusters.append(cluster)

    clustered_output = []
    for cluster in clusters:
        cluster_terms = []
        for idx in cluster:
            cluster_terms.append(data[idx]['term'])
        clustered_output.append({
            'term_indices': cluster,
            'terms': cluster_terms
        })
    return clustered_output

def get_word_set(term):
    return set(term.lower().split())

# Load your training data (with types)
with open('OBI/train/term_typing_train_data.json', 'r', encoding='utf-8') as f:
    training_data = json.load(f)

# Load your new data (without types)
with open("OBI/test/obi_term_typing_test_data.json", 'r', encoding='utf-8') as f:
    new_data = json.load(f)

# Step 1: Cluster training data
training_clusters = cluster_terms_by_shared_word(training_data)

# Step 2: Cluster new/test data
test_clusters = cluster_terms_by_shared_word(new_data)

# Precompute training cluster terms sets (list of word sets per term)
training_cluster_wordsets = []
for cluster in training_clusters:
    cluster_wordsets = []
    for idx in cluster['term_indices']:
        cluster_wordsets.append(get_word_set(training_data[idx]['term']))
    training_cluster_wordsets.append(cluster_wordsets)

# Step 3: For each test term, find best matching training cluster
final_clusters = []

for test_cluster in test_clusters:
    cluster_types = set()
    for test_idx in test_cluster['term_indices']:
        test_term = new_data[test_idx]['term']
        test_words = get_word_set(test_term)

        best_cluster_idx = None
        best_score = 0

        # Compare with each training cluster
        for i, cluster_wordsets in enumerate(training_cluster_wordsets):
            # For each training term in cluster, calculate shared words with test term
            cluster_score = 0
            for train_words in cluster_wordsets:
                shared = test_words.intersection(train_words)
                cluster_score = max(cluster_score, len(shared))
            if cluster_score > best_score:
                best_score = cluster_score
                best_cluster_idx = i

        if best_cluster_idx is not None and best_score > 0:
            # Collect types from that training cluster terms
            # Here collect all types from all terms in that cluster
            for train_idx in training_clusters[best_cluster_idx]['term_indices']:
                if 'types' in training_data[train_idx]:
                    cluster_types.update(training_data[train_idx]['types'])
        else:
            # No good match found for this term
            cluster_types.add('unknown')

    final_clusters.append({
        'terms': test_cluster['terms'],
        'types': list(cluster_types) if cluster_types else ['unknown']
    })

print(json.dumps(final_clusters, indent=2))


[
  {
    "terms": [
      "eBioscience"
    ],
    "types": [
      "unknown"
    ]
  },
  {
    "terms": [
      "pT3 (lung)",
      "pT2a (ovary)",
      "pT1b (kidney)",
      "pM1b (lung)",
      "pT1c (ovary)",
      "cM1 (kidney)",
      "pN1 (kidney)",
      "pT3a (kidney)",
      "pT1 (kidney)",
      "pN2b (colon)",
      "pT2 (ovary)",
      "pT2b (lung)",
      "cM1a (colon)",
      "cM1 (colon)",
      "pT1 (lung)",
      "pN1c (colon)",
      "cM1 (ovary)",
      "pM1 (colon)",
      "pN1b (colon)",
      "pN1 (lung)",
      "pT3 (ovary)",
      "pN2 (colon)",
      "pM1b (colon)"
    ],
    "types": [
      "pathologic distant metastases stage for lung according to AJCC 7th edition",
      "pathologic lymph node stage for kidney according to AJCC 7th edition",
      "pathologic distant metastases stage for colon according to AJCC 7th edition",
      "pathologic distant metastases stage for kidney according to AJCC 7th edition",
      "pathologic lymph node stage for colo

In [7]:
data

[{'id': 'TT_9bd33766',
  'term': 'pT3 (kidney)',
  'types': ['pathologic primary tumor stage for kidney according to AJCC 7th edition']},
 {'id': 'TT_4c6560da',
  'term': 'GenePattern module KMeansClustering',
  'types': ['GenePattern software']},
 {'id': 'TT_b6596f1b',
  'term': 'Stage IIIA (FIGO)',
  'types': ['International Federation of Gynecology and Obstetrics cervical cancer stage value specification']},
 {'id': 'TT_e6e9e758', 'term': 'angstrom', 'types': ['length unit']},
 {'id': 'TT_94550005',
  'term': 'pT1 (ovary)',
  'types': ['pathologic primary tumor stage for ovary according to AJCC 7th edition']},
 {'id': 'TT_d0023073',
  'term': 'Stage 1 (FIGO)',
  'types': ['International Federation of Gynecology and Obstetrics ovarian cancer stage value specification']},
 {'id': 'TT_5b5aa1c3', 'term': 'hertz', 'types': ['frequency unit']},
 {'id': 'TT_0e0deac9', 'term': 'Waters', 'types': ['manufacturer']},
 {'id': 'TT_c58ed932',
  'term': 'pT3b (ovary)',
  'types': ['pathologic prim

In [8]:
all_unique_types = []
for data_1 in data:
    if data_1['types'][0] not in all_unique_types:
        all_unique_types.append(data_1['types'][0])

In [9]:
all_unique_types

['pathologic primary tumor stage for kidney according to AJCC 7th edition',
 'GenePattern software',
 'International Federation of Gynecology and Obstetrics cervical cancer stage value specification',
 'length unit',
 'pathologic primary tumor stage for ovary according to AJCC 7th edition',
 'International Federation of Gynecology and Obstetrics ovarian cancer stage value specification',
 'frequency unit',
 'manufacturer',
 'pathologic distant metastases stage for ovary according to AJCC 7th edition',
 'pathologic distant metastases stage for lung according to AJCC 7th edition',
 'data format specification',
 'mass unit',
 'categorical label',
 'pathologic lymph node stage for colon and rectum according to AJCC 7th edition',
 'curation status specification',
 'obsolescence reason specification',
 'concentration unit',
 'pathologic primary tumor stage for colon and rectum according to AJCC 7th edition',
 'CART',
 'denotator type',
 'material supplier',
 'pathologic primary tumor stage f

In [10]:
for test_idx in test_cluster['term_indices']:
    test_term = new_data[test_idx]['term']
    test_words = get_word_set(test_term)  # split test term by spaces to words

    best_cluster_idx = None
    best_score = 0

    # Iterate through each training cluster
    for i, cluster_wordsets in enumerate(training_cluster_wordsets):
        cluster_score = 0
        # Check against each training term in this cluster
        for train_words in cluster_wordsets:
            # train_words is word set of training term
            shared = test_words.intersection(train_words)  # word-based intersection
            cluster_score = max(cluster_score, len(shared))  # take max overlap in this cluster
        if cluster_score > best_score:
            best_score = cluster_score
            best_cluster_idx = i

    # If match found, collect types from best matching training cluster
    if best_cluster_idx is not None and best_score > 0:
        for train_idx in training_clusters[best_cluster_idx]['term_indices']:
            if 'types' in training_data[train_idx]:
                cluster_types.update(training_data[train_idx]['types'])
    else:
        # No match found
        cluster_types.update(all_unique_types)



In [11]:
cluster_types

{'histologic grade according to the Fuhrman Nuclear Grading System',
 'histologic grade for ovarian tumor according to the World Health Organization'}

In [12]:
import json
from collections import defaultdict

def cluster_terms_by_shared_word(data):
    word_to_terms = defaultdict(set)
    for idx, entry in enumerate(data):
        words = entry['term'].lower().split()
        for w in words:
            word_to_terms[w].add(idx)

    visited = set()
    clusters = []

    def dfs(term_idx, cluster):
        visited.add(term_idx)
        cluster.add(term_idx)
        for w in data[term_idx]['term'].lower().split():
            for neighbor_idx in word_to_terms[w]:
                if neighbor_idx not in visited:
                    dfs(neighbor_idx, cluster)

    for i in range(len(data)):
        if i not in visited:
            cluster = set()
            dfs(i, cluster)
            clusters.append(cluster)

    clustered_output = []
    for cluster in clusters:
        cluster_terms = []
        for idx in cluster:
            cluster_terms.append(data[idx]['term'])
        clustered_output.append({
            'term_indices': cluster,
            'terms': cluster_terms
        })
    return clustered_output

def get_word_set(term):
    return set(term.lower().split())

# Load your training data (with types)
with open('OBI/train/term_typing_train_data.json', 'r', encoding='utf-8') as f:
    training_data = json.load(f)

# Load your new data (without types)
with open("OBI/test/obi_term_typing_test_data.json", 'r', encoding='utf-8') as f:
    new_data = json.load(f)

# Precompute all unique types in training data
all_unique_types = set()
for entry in training_data:
    if 'types' in entry:
        all_unique_types.update(entry['types'])

# Step 1: Cluster training data
training_clusters = cluster_terms_by_shared_word(training_data)

# Step 2: Cluster new/test data
test_clusters = cluster_terms_by_shared_word(new_data)

# Precompute training cluster terms sets (list of word sets per term)
training_cluster_wordsets = []
for cluster in training_clusters:
    cluster_wordsets = []
    for idx in cluster['term_indices']:
        cluster_wordsets.append(get_word_set(training_data[idx]['term']))
    training_cluster_wordsets.append(cluster_wordsets)

# Step 3: For each test term, find best matching training cluster
final_clusters = []

for test_cluster in test_clusters:
    cluster_types = set()
    for test_idx in test_cluster['term_indices']:
        test_term = new_data[test_idx]['term']
        test_words = get_word_set(test_term)

        best_cluster_idx = None
        best_score = 0

        # Compare with each training cluster
        for i, cluster_wordsets in enumerate(training_cluster_wordsets):
            cluster_score = 0
            for train_words in cluster_wordsets:
                shared = test_words.intersection(train_words)
                cluster_score = max(cluster_score, len(shared))
            if cluster_score > best_score:
                best_score = cluster_score
                best_cluster_idx = i

        if best_cluster_idx is not None and best_score > 0:
            # Collect types from that training cluster terms
            for train_idx in training_clusters[best_cluster_idx]['term_indices']:
                if 'types' in training_data[train_idx]:
                    cluster_types.update(training_data[train_idx]['types'])
        else:
            # No good match found: assign all unique training types
            cluster_types.update(all_unique_types)

    final_clusters.append({
        'terms': test_cluster['terms'],
        'types': list(cluster_types) if cluster_types else ['unknown']
    })

print(json.dumps(final_clusters, indent=2))


[
  {
    "terms": [
      "eBioscience"
    ],
    "types": [
      "pathologic distant metastases stage for lung according to AJCC 7th edition",
      "data format specification",
      "histologic grade for ovarian tumor according to the World Health Organization",
      "pathologic distant metastases stage for colon according to AJCC 7th edition",
      "material supplier",
      "denotator type",
      "curation status specification",
      "principal components analysis dimensionality reduction",
      "pathologic distant metastases stage for ovary according to AJCC 7th edition",
      "leave one out cross validation method",
      "hierarchical clustering",
      "obsolescence reason specification",
      "GenePattern software",
      "histologic grade according to the Fuhrman Nuclear Grading System",
      "clinical tumor stage group according to AJCC 7th edition",
      "pathologic lymph node stage for lung according to AJCC 7th edition",
      "organization",
      "pathologi

In [None]:
from google import genai
model = "gemini-2.5-flash"
client= genai.Client(api_key="")

In [14]:
from tqdm import tqdm

In [15]:
final_ans = []

In [16]:
final_clusters[0].keys()

dict_keys(['terms', 'types'])

In [17]:
len(final_clusters)

40

In [19]:
for i, terms_types in tqdm(enumerate(final_clusters)):
        list_of_terms = terms_types['terms']
        types_of_terms = terms_types['types']

        prompt = prompt = f"""You are an expert in the Ontology for Biomedical Investigations (OBI) with comprehensive knowledge of biomedical research concepts, including assays, experimental protocols, instruments, reagents, data analysis methods, and biological or chemical entities.  
            You understand the key properties and roles of these entities, such as their function in experiments (e.g., measurement, detection, or analysis), their material properties (e.g., chemical composition), and their relationships in biomedical workflows.  
            You are familiar with measurement units used in biomedical research, such as mole, meter, pascal, and others, as well as OBI-specific terms like 'planned process', 'material entity', 'data transformation', and 'assay'.  
            You have expertise in the structure and classification of terms within the OBI ontology and their practical applications in biomedical experiments, clinical research, and data analysis.  
            You are capable of accurately classifying and categorizing OBI-related terms using your scientific reasoning and up-to-date knowledge of the OBI ontology.  
            You explain your classifications clearly and precisely.

            Your task is to classify a given list of OBI-related terms into one or more appropriate types from a provided list of possible types, ensuring alignment with the OBI ontology structure.

            Please respond strictly with a JSON object where:  
            - Each key is a term (string) from the input list.  
            - Each value is a list of one or more types (strings) assigned to that term based on your expert knowledge of the OBI ontology.

            List of Possible Types: {types_of_terms}

            Examples and also output format:

            {json.dumps({
            "millimolar": ["concentration unit"],
            "Stage 3C (FIGO)": ["International Federation of Gynecology and Obstetrics ovarian cancer stage value specification"],
            "centimeter": ["length unit"],
            "Miltenyi Biotec": ["manufacturer"],
            "Bentley Instruments": ["manufacturer"],
            "pM1 (ovary)": ["pathologic distant metastases stage for ovary according to AJCC 7th edition"],
            "pT1 (colon)": ["pathologic primary tumor stage for colon and rectum according to AJCC 7th edition"],
            "pT0 (colon)": ["pathologic primary tumor stage for colon and rectum according to AJCC 7th edition"],
            "Unknown": ["Unknown"]
            }, indent=4)}

            Now classify the following terms:
            {list_of_terms}"""

        generation_config = {"response_mime_type": "application/json"}

        response = client.models.generate_content(
                contents= prompt,
                config=generation_config,
                model=model
            )
        final_ans.append(response.text)
        if i%20==0:
            print(response.text)

1it [00:02,  2.72s/it]

{
    "eBioscience": [
        "manufacturer",
        "material supplier"
    ]
}


21it [02:28,  7.10s/it]

{
    "nanoliter": [
        "volume unit"
    ]
}


40it [03:22,  5.05s/it]


In [20]:
final_ans

['{\n    "eBioscience": [\n        "manufacturer",\n        "material supplier"\n    ]\n}',
 '{\n  "pT3 (lung)": [\n    "pathologic primary tumor stage for lung according to AJCC 7th edition"\n  ],\n  "pT2a (ovary)": [\n    "pathologic primary tumor stage for ovary according to AJCC 7th edition"\n  ],\n  "pT1b (kidney)": [\n    "pathologic primary tumor stage for kidney according to AJCC 7th edition"\n  ],\n  "pM1b (lung)": [\n    "pathologic distant metastases stage for lung according to AJCC 7th edition"\n  ],\n  "pT1c (ovary)": [\n    "pathologic primary tumor stage for ovary according to AJCC 7th edition"\n  ],\n  "cM1 (kidney)": [\n    "Unknown"\n  ],\n  "pN1 (kidney)": [\n    "pathologic lymph node stage for kidney according to AJCC 7th edition"\n  ],\n  "pT3a (kidney)": [\n    "pathologic primary tumor stage for kidney according to AJCC 7th edition"\n  ],\n  "pT1 (kidney)": [\n    "pathologic primary tumor stage for kidney according to AJCC 7th edition"\n  ],\n  "pN2b (colon)": 

In [28]:
new_data

[{'id': 'TT_5a5763f5', 'term': 'newton'},
 {'id': 'TT_c4f339b8', 'term': 'newton meter'},
 {'id': 'TT_ce5417fc', 'term': 'newton per meter'},
 {'id': 'TT_25014bd4', 'term': 'coulomb per square meter'},
 {'id': 'TT_46dad9e9', 'term': 'm2·m-2'},
 {'id': 'TT_49ab10c7', 'term': 'mole per cubic meter'},
 {'id': 'TT_ab5a8ad6', 'term': 'per ampere'},
 {'id': 'TT_08f9b767', 'term': 'farad'},
 {'id': 'TT_4cff4fec', 'term': 'dalton'},
 {'id': 'TT_ce2b537c', 'term': 'second power 4'},
 {'id': 'TT_dae2a7a6', 'term': 'per kilogram'},
 {'id': 'TT_f8e74193', 'term': 'radian'},
 {'id': 'TT_8a920749', 'term': 'joule per mole'},
 {'id': 'TT_a0402f84', 'term': 'becquerel'},
 {'id': 'TT_14fbac11', 'term': 'byte'},
 {'id': 'TT_f1361175', 'term': 'coulomb per kilogram'},
 {'id': 'TT_981df168', 'term': 'ampere per square meter'},
 {'id': 'TT_4c87c381', 'term': 'unit_kelvin_-1'},
 {'id': 'TT_f15ef7cd', 'term': 'joule'},
 {'id': 'TT_40a02db8', 'term': 'meter squared'},
 {'id': 'TT_d714834e', 'term': 'unit_elec

In [21]:
term_to_id_mapping = {}
for data_n in new_data:
    term_to_id_mapping[data_n['term']]= data_n['id']

In [22]:
term_to_id_mapping

{'eBioscience': 'TT_d539e3e1',
 'pT3 (lung)': 'TT_65da10e1',
 '3: symptomatic in bed more than 50% of the day but not bed ridden': 'TT_a3dc97d0',
 'Antigenix': 'TT_05e33e44',
 'metadata complete': 'TT_1ea4dbdc',
 '100: asymptomatic': 'TT_a9116815',
 'Cytopeia': 'TT_0f6a960e',
 'Edingburgh handedness inventory': 'TT_ea06d5c3',
 'BioGents': 'TT_47e2f629',
 'year': 'TT_c69005f8',
 'Stage IA (FIGO)': 'TT_da557411',
 'tar': 'TT_65393c3a',
 'pT2a (ovary)': 'TT_163f5980',
 'Stage 1A (FIGO)': 'TT_ccb5234f',
 'Advanced Instruments Inc. (AI Companies)': 'TT_90fdf1cb',
 'Stage IIIB (FIGO)': 'TT_93c33c2e',
 'nanomole': 'TT_2fd89cce',
 'Stage IIB (FIGO)': 'TT_e67be5a3',
 'pT1b (kidney)': 'TT_9b926ad9',
 'Bruker Corporation': 'TT_b9588c69',
 'pM1b (lung)': 'TT_ad01c533',
 'G3: Poorly differentiated': 'TT_9ccf6d5f',
 'pT1c (ovary)': 'TT_0bc56139',
 'Stage Unknown (FIGO)': 'TT_ed5377fb',
 'Luminex': 'TT_2afe465c',
 'cM1 (kidney)': 'TT_cdedd71b',
 'Occult Carcinoma (AJCC 7th)': 'TT_cd693ff4',
 'Stage I

In [23]:
result = []
for ans_1 in final_ans:
    ans_dict = json.loads(ans_1)
    for term, type in ans_dict.items():
        result.append({"id":term_to_id_mapping[term], "term":term, "types":type})

In [24]:
result

[{'id': 'TT_d539e3e1',
  'term': 'eBioscience',
  'types': ['manufacturer', 'material supplier']},
 {'id': 'TT_65da10e1',
  'term': 'pT3 (lung)',
  'types': ['pathologic primary tumor stage for lung according to AJCC 7th edition']},
 {'id': 'TT_163f5980',
  'term': 'pT2a (ovary)',
  'types': ['pathologic primary tumor stage for ovary according to AJCC 7th edition']},
 {'id': 'TT_9b926ad9',
  'term': 'pT1b (kidney)',
  'types': ['pathologic primary tumor stage for kidney according to AJCC 7th edition']},
 {'id': 'TT_ad01c533',
  'term': 'pM1b (lung)',
  'types': ['pathologic distant metastases stage for lung according to AJCC 7th edition']},
 {'id': 'TT_0bc56139',
  'term': 'pT1c (ovary)',
  'types': ['pathologic primary tumor stage for ovary according to AJCC 7th edition']},
 {'id': 'TT_cdedd71b', 'term': 'cM1 (kidney)', 'types': ['Unknown']},
 {'id': 'TT_cdbdc786',
  'term': 'pN1 (kidney)',
  'types': ['pathologic lymph node stage for kidney according to AJCC 7th edition']},
 {'id': '

In [25]:
import json

with open("Task_B_term_types_OBI.json", "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)
