# **Clustering the Training data Term**

# "assigning test term to the nearest cluster"

In [None]:
# for test_idx in test_cluster['term_indices']:
#     test_term = new_data[test_idx]['term']
#     test_words = get_word_set(test_term)  # split test term by spaces to words

#     best_cluster_idx = None
#     best_score = 0

#     # Iterate through each training cluster
#     for i, cluster_wordsets in enumerate(training_cluster_wordsets):
#         cluster_score = 0
#         # Check against each training term in this cluster
#         for train_words in cluster_wordsets:
#             # train_words is word set of training term
#             shared = test_words.intersection(train_words)  # word-based intersection
#             cluster_score = max(cluster_score, len(shared))  # take max overlap in this cluster
#         if cluster_score > best_score:
#             best_score = cluster_score
#             best_cluster_idx = i

#     # If match found, collect types from best matching training cluster
#     if best_cluster_idx is not None and best_score > 0:
#         for train_idx in training_clusters[best_cluster_idx]['term_indices']:
#             if 'types' in training_data[train_idx]:
#                 cluster_types.update(training_data[train_idx]['types'])
#     else:
#         # No match found
#         cluster_types.update(all_unique_types)



In [2]:
import json
from collections import defaultdict

def cluster_terms_by_shared_word(data):
    word_to_terms = defaultdict(set)
    for idx, entry in enumerate(data):
        words = entry['term'].lower().split()
        for w in words:
            word_to_terms[w].add(idx)

    visited = set()
    clusters = []

    def dfs(term_idx, cluster):
        visited.add(term_idx)
        cluster.add(term_idx)
        for w in data[term_idx]['term'].lower().split():
            for neighbor_idx in word_to_terms[w]:
                if neighbor_idx not in visited:
                    dfs(neighbor_idx, cluster)

    for i in range(len(data)):
        if i not in visited:
            cluster = set()
            dfs(i, cluster)
            clusters.append(cluster)

    clustered_output = []
    for cluster in clusters:
        cluster_terms = []
        for idx in cluster:
            cluster_terms.append(data[idx]['term'])
        clustered_output.append({
            'term_indices': cluster,
            'terms': cluster_terms
        })
    return clustered_output

def get_word_set(term):
    return set(term.lower().split())

# Load your training data (with types)
with open('SWEET/train/term_typing_train_data.json', 'r', encoding='utf-8') as f:
    training_data = json.load(f)

# Load your new data (without types)
with open("SWEET/test/sweet_term_typing_test_data.json", 'r', encoding='utf-8') as f:
    new_data = json.load(f)

# Precompute all unique types in training data
all_unique_types = set()
for entry in training_data:
    if 'types' in entry:
        all_unique_types.update(entry['types'])

# Step 1: Cluster training data
training_clusters = cluster_terms_by_shared_word(training_data)

# Step 2: Cluster new/test data
test_clusters = cluster_terms_by_shared_word(new_data)

# Precompute training cluster terms sets (list of word sets per term)
training_cluster_wordsets = []
for cluster in training_clusters:
    cluster_wordsets = []
    for idx in cluster['term_indices']:
        cluster_wordsets.append(get_word_set(training_data[idx]['term']))
    training_cluster_wordsets.append(cluster_wordsets)

# Step 3: For each test term, find best matching training cluster
final_clusters = []

for test_cluster in test_clusters:
    cluster_types = set()
    for test_idx in test_cluster['term_indices']:
        test_term = new_data[test_idx]['term']
        test_words = get_word_set(test_term)

        best_cluster_idx = None
        best_score = 0

        # Compare with each training cluster
        for i, cluster_wordsets in enumerate(training_cluster_wordsets):
            cluster_score = 0
            for train_words in cluster_wordsets:
                shared = test_words.intersection(train_words)
                cluster_score = max(cluster_score, len(shared))
            if cluster_score > best_score:
                best_score = cluster_score
                best_cluster_idx = i

        if best_cluster_idx is not None and best_score > 0:
            # Collect types from that training cluster terms
            for train_idx in training_clusters[best_cluster_idx]['term_indices']:
                if 'types' in training_data[train_idx]:
                    cluster_types.update(training_data[train_idx]['types'])
        else:
            # No good match found: assign all unique training types
            cluster_types.update(all_unique_types)

    final_clusters.append({
        'terms': test_cluster['terms'],
        'types': list(cluster_types) if cluster_types else ['unknown']
    })

print(json.dumps(final_clusters, indent=2))


[
  {
    "terms": [
      "electronegativity"
    ],
    "types": [
      "dimensionless ratio"
    ]
  },
  {
    "terms": [
      "sigma",
      "hybrid sigma",
      "hybrid height"
    ],
    "types": [
      "vertical coordinate"
    ]
  },
  {
    "terms": [
      "b8 class",
      "x49 class",
      "x16 class",
      "x2 class",
      "a8 class",
      "x13 class",
      "x37 class",
      "x31 class",
      "x24 class",
      "x8 class",
      "x47 class",
      "x30 class",
      "x25 class",
      "a7 class",
      "b7 class",
      "b3 class",
      "x42 class",
      "m2 class",
      "x45 class",
      "x4 class",
      "x21 class",
      "c7 class",
      "x28 class",
      "x39 class"
    ],
    "types": [
      "x class",
      "m class",
      "b class",
      "a class",
      "c class",
      "allotrope"
    ]
  },
  {
    "terms": [
      "ferroelectric"
    ],
    "types": [
      "consistence property",
      "metalloid",
      "b class",
      "inorganic compoun

In [3]:
all_unique_types = []
for data_1 in training_data:
    if data_1['types'][0] not in all_unique_types:
        all_unique_types.append(data_1['types'][0])

In [4]:
len(all_unique_types)

177

In [None]:
from google import genai
model = "gemini-1.5-flash"
client= genai.Client(api_key="")

In [13]:
from tqdm import tqdm

In [14]:
final_ans = []

In [15]:
final_clusters[0].keys()

dict_keys(['terms', 'types'])

In [16]:
len(final_clusters)

513

In [17]:
# from tqdm import tqdm
# import json

# final_ans = []

# for i, terms_types in tqdm(enumerate(final_clusters)):
#     list_of_terms = terms_types['terms']
#     types_of_terms = terms_types['types']  # Ensure this is a list of strings

#     prompt = f"""
#     You are an expert in the Semantic Web for Earth and Environmental Terminology (SWEET) ontology with comprehensive knowledge of Earth and environmental science concepts, including phenomena (e.g., climate, weather), processes (e.g., erosion, evaporation), substances (e.g., water, air, soil), and properties (e.g., temperature, salinity).  
#     You understand the key properties and roles of these entities, such as physical (e.g., density, viscosity), chemical (e.g., pH, composition), and ecological (e.g., biodiversity, ecosystem function) properties.  
#     You are familiar with measurement units used in environmental science, such as meter, kilogram, kelvin, and others, as well as SWEET-specific terms like 'EarthRealm', 'PhysicalProcess', 'Substance', and 'Property'.  
#     You have expertise in the structure and classification of terms within the SWEET ontology and their applications in environmental monitoring, climate modeling, and Earth system research.  
#     You are capable of accurately classifying and categorizing SWEET-related terms using your scientific reasoning and up-to-date knowledge of the SWEET ontology.  
#     You explain your classifications clearly and precisely.

#     Your task is to classify a given list of SWEET-related terms into one or more appropriate types from a provided list of possible types, ensuring alignment with the SWEET ontology structure.

#     Please respond strictly with a JSON object where:  
#     - Each key is a term (string) from the input list.  
#     - Each value is a list of one or more types (strings) assigned to that term based on your expert knowledge of the SWEET ontology.

#     List of Possible Types: {json.dumps(types_of_terms)}

#     Now classify the following terms:
#     {json.dumps(list_of_terms)}
#     """
    
#     generation_config = {"response_mime_type": "application/json"}

#     try:
#         response = client.models.generate_content(
#             contents=prompt,
#             config=generation_config,
#             model=model
#         )
#         response_json = json.loads(response.text)
#         final_ans.append(response_json)
#         # 
#         if i % 20 == 0:
#             print(response.text)

#     except Exception as e:
#         print(f"Error during response generation: {e}")


In [18]:
from tqdm import tqdm
import json
import time

final_ans = []
failed_iterations = [] # New list to store indices of failed iterations

for i, terms_types in tqdm(enumerate(final_clusters)):
    list_of_terms = terms_types['terms']
    types_of_terms = terms_types['types']  # Ensure this is a list of strings

    prompt = f"""
    You are an expert in the Semantic Web for Earth and Environmental Terminology (SWEET) ontology with comprehensive knowledge of Earth and environmental science concepts, including phenomena (e.g., climate, weather), processes (e.g., erosion, evaporation), substances (e.g., water, air, soil), and properties (e.g., temperature, salinity).  
    You understand the key properties and roles of these entities, such as physical (e.g., density, viscosity), chemical (e.g., pH, composition), and ecological (e.g., biodiversity, ecosystem function) properties.  
    You are familiar with measurement units used in environmental science, such as meter, kilogram, kelvin, and others, as well as SWEET-specific terms like 'EarthRealm', 'PhysicalProcess', 'Substance', and 'Property'.  
    You have expertise in the structure and classification of terms within the SWEET ontology and their applications in environmental monitoring, climate modeling, and Earth system research.  
    You are capable of accurately classifying and categorizing SWEET-related terms using your scientific reasoning and up-to-date knowledge of the SWEET ontology.  
    You explain your classifications clearly and precisely.

    Your task is to classify a given list of SWEET-related terms into one or more appropriate types from a provided list of possible types, ensuring alignment with the SWEET ontology structure.

    Please respond strictly with a JSON object where:  
    - Each key is a term (string) from the input list.  
    - Each value is a list of one or more types (strings) assigned to that term based on your expert knowledge of the SWEET ontology.

    List of Possible Types: {json.dumps(types_of_terms)}

    Now classify the following terms:
    {json.dumps(list_of_terms)}
    """
    
    generation_config = {"response_mime_type": "application/json"}

    max_retries = 3
    retries = 0
    success = False

    while retries < max_retries and not success:
        try:
            response = client.models.generate_content(
                contents=prompt,
                config=generation_config,
                model=model
            )
            response_json = json.loads(response.text)
            final_ans.append(response_json)
            success = True
            
            if i % 20 == 0:
                print(response.text)

        except Exception as e:
            print(f"Error during response generation for iteration {i}: {e}")
            retries += 1
            if retries < max_retries:
                print(f"Retrying in 5 seconds (Attempt {retries}/{max_retries})....")
                time.sleep(5)  # Sleep for 5 seconds before retrying
            else:
                print(f"Max retries reached for iteration {i}. Skipping this item.")
                failed_iterations.append(i) # Store the index of the failed iteration

# After the loop, you can access the list of failed iterations:
# print("Iterations that failed after max retries:", failed_iterations)

1it [00:01,  1.29s/it]

{"electronegativity": ["dimensionless ratio"]}


21it [00:17,  1.23it/s]

{"cfc": ["cfc"]}


41it [00:30,  1.57it/s]

{"extraordinary": ["qualifier"]}


61it [00:41,  1.76it/s]

{"hour": ["base unit"]}


81it [00:54,  1.74it/s]

{"ch4": ["alkane"]}


101it [01:06,  1.72it/s]

{"acenaphthylene": ["alkene", "organic compound", "substance"]}


121it [01:18,  1.59it/s]

{"analyzed": ["qualifier"]}


141it [01:30,  1.61it/s]

{"hydrobromous acid": ["inorganic acid"], "methanoic acid": ["organic compound", "inorganic acid"]}


161it [01:42,  1.69it/s]

{"halosteric": ["fluid property"]}


181it [01:56,  1.13it/s]

{"albers conical equal area": ["spatial reference system", "horizontal coordinate system"], "pollutant standards index": ["environmental standard", "unit", "dimensionless ratio"], "leaf area index": ["dimensionless ratio", "unit"], "refractive index": ["dimensionless ratio", "physical property"]}


201it [02:07,  1.82it/s]

{"equatorial": ["latitude band", "angular direction", "horizontal direction", "direction"]}


221it [02:19,  1.68it/s]

{"sulfate": ["inorganic compound", "chemical role", "substance form"]}


241it [02:31,  1.85it/s]

{"flattend": ["shape"]}


261it [02:44,  1.75it/s]

{"arctic ocean": ["earth ocean"]}


279it [02:54,  1.83it/s]

Error during response generation for iteration 279: Expecting ',' delimiter: line 1 column 17 (char 16)
Retrying in 5 seconds (Attempt 1/3)....


281it [03:01,  1.79s/it]

{"photodissociate": ["PhysicalProcess"]}


301it [03:15,  1.42it/s]

{"published": ["provenance role"]}


321it [03:26,  1.70it/s]

{"aluminum": ["element", "transition metal", "substance"]}


341it [03:38,  1.84it/s]

{"carnian": ["epoch"]}


361it [03:54,  1.42it/s]

{"rigid": ["physical state", "substance form"]}


381it [04:08,  1.67it/s]

{"slow": ["speed state"]}


401it [04:22,  1.72it/s]

{"undefendable": []}


421it [04:36,  1.48it/s]

{"subsonic": ["speed state"]}


441it [04:47,  1.44it/s]

{"solute": ["substance", "component"]}


461it [04:59,  1.36it/s]

{"k41": ["spectral line"]}


481it [05:10,  1.76it/s]

{"crystal structure": ["substance form", "physical property"], "crystal": ["substance form", "physical state"]}


501it [05:21,  1.91it/s]

{"nitrogen monoxide": ["inorganic compound"]}


513it [05:28,  1.56it/s]


In [29]:
failed_iterations

[]

In [19]:
final_ans

[{'electronegativity': ['dimensionless ratio']},
 {'sigma': ['vertical coordinate'],
  'hybrid sigma': ['vertical coordinate'],
  'hybrid height': ['vertical coordinate']},
 {'b8 class': ['x class'],
  'x49 class': ['x class'],
  'x16 class': ['x class'],
  'x2 class': ['x class'],
  'a8 class': ['a class'],
  'x13 class': ['x class'],
  'x37 class': ['x class'],
  'x31 class': ['x class'],
  'x24 class': ['x class'],
  'x8 class': ['x class'],
  'x47 class': ['x class'],
  'x30 class': ['x class'],
  'x25 class': ['x class'],
  'a7 class': ['a class'],
  'b7 class': ['x class'],
  'b3 class': ['x class'],
  'x42 class': ['x class'],
  'm2 class': ['m class'],
  'x45 class': ['x class'],
  'x4 class': ['x class'],
  'x21 class': ['x class'],
  'c7 class': ['c class'],
  'x28 class': ['x class'],
  'x39 class': ['x class']},
 {'ferroelectric': ['physical property']},
 {'category2': ['classifier']},
 {'per second squared': ['unit'],
  'joule': ['unit'],
  'watt per meter squared per ster

In [20]:
new_data

[{'id': 'TT_465e8904', 'term': 'electronegativity'},
 {'id': 'TT_01c7707e', 'term': 'sigma'},
 {'id': 'TT_b20cb478', 'term': 'b8 class'},
 {'id': 'TT_136ee6bc', 'term': 'ferroelectric'},
 {'id': 'TT_e2be01a5', 'term': 'category2'},
 {'id': 'TT_d67b8385', 'term': 'watt per meter squared per steradian'},
 {'id': 'TT_3a87c4f0', 'term': 'inhibitor'},
 {'id': 'TT_23f84dcf', 'term': 'eutrophic'},
 {'id': 'TT_db0ed17b', 'term': 'micronutrient'},
 {'id': 'TT_343e2868', 'term': 'signer'},
 {'id': 'TT_8b290782', 'term': '311.7mya'},
 {'id': 'TT_17b968f2', 'term': 'conductor'},
 {'id': 'TT_044d65d3', 'term': 'non newtonian'},
 {'id': 'TT_f367c52e', 'term': 'tonian'},
 {'id': 'TT_e61af733', 'term': 'enzyme'},
 {'id': 'TT_fca5f9f9', 'term': 'acidify'},
 {'id': 'TT_3390cdcb', 'term': 'paleogene'},
 {'id': 'TT_ffe53db9', 'term': 'x2 class'},
 {'id': 'TT_152f3a8d', 'term': 'beryllium'},
 {'id': 'TT_03db7b11', 'term': 'oxidizer'},
 {'id': 'TT_ecb9fcc7', 'term': 'landward'},
 {'id': 'TT_93eb6d00', 'term

In [21]:
term_to_id_mapping = {}
for data_n in new_data:
    term_to_id_mapping[data_n['term']]= data_n['id']

In [28]:
type(final_ans[0])

dict

In [22]:
term_to_id_mapping

{'electronegativity': 'TT_465e8904',
 'sigma': 'TT_01c7707e',
 'b8 class': 'TT_b20cb478',
 'ferroelectric': 'TT_136ee6bc',
 'category2': 'TT_e2be01a5',
 'watt per meter squared per steradian': 'TT_d67b8385',
 'inhibitor': 'TT_3a87c4f0',
 'eutrophic': 'TT_23f84dcf',
 'micronutrient': 'TT_db0ed17b',
 'signer': 'TT_343e2868',
 '311.7mya': 'TT_8b290782',
 'conductor': 'TT_17b968f2',
 'non newtonian': 'TT_044d65d3',
 'tonian': 'TT_f367c52e',
 'enzyme': 'TT_e61af733',
 'acidify': 'TT_fca5f9f9',
 'paleogene': 'TT_3390cdcb',
 'x2 class': 'TT_ffe53db9',
 'beryllium': 'TT_152f3a8d',
 'oxidizer': 'TT_03db7b11',
 'landward': 'TT_ecb9fcc7',
 'cfc': 'TT_93eb6d00',
 'serializes': 'TT_13327caf',
 '93.6mya': 'TT_3edea2ca',
 'suspended': 'TT_0ccb2235',
 'silicate': 'TT_7a45e8bb',
 'contributor': 'TT_432d65d3',
 'binder': 'TT_a0dc5361',
 '0.126mya': 'TT_bee9eb1d',
 'uva': 'TT_e8d6b32f',
 'dendritic': 'TT_b5b09234',
 '23.03mya': 'TT_0d8981ec',
 'dim': 'TT_7ce655e9',
 'polycrystalline': 'TT_c926cc33',
 'fc

In [31]:
result = []
for ans_1 in final_ans:
    if isinstance(ans_1, str):
        ans_dict = json.loads(ans_1)
        for term, type in ans_dict.items():
            result.append({"id":term_to_id_mapping[term], "term":term, "types":type})
    else:
        for term, type in ans_1.items():
            result.append({"id":term_to_id_mapping[term], "term":term, "types":type})


In [32]:
result

[{'id': 'TT_465e8904',
  'term': 'electronegativity',
  'types': ['dimensionless ratio']},
 {'id': 'TT_01c7707e', 'term': 'sigma', 'types': ['vertical coordinate']},
 {'id': 'TT_4a2916b8',
  'term': 'hybrid sigma',
  'types': ['vertical coordinate']},
 {'id': 'TT_664dd05f',
  'term': 'hybrid height',
  'types': ['vertical coordinate']},
 {'id': 'TT_b20cb478', 'term': 'b8 class', 'types': ['x class']},
 {'id': 'TT_78dca7c3', 'term': 'x49 class', 'types': ['x class']},
 {'id': 'TT_82c035d5', 'term': 'x16 class', 'types': ['x class']},
 {'id': 'TT_ffe53db9', 'term': 'x2 class', 'types': ['x class']},
 {'id': 'TT_401ab4fb', 'term': 'a8 class', 'types': ['a class']},
 {'id': 'TT_30dfa770', 'term': 'x13 class', 'types': ['x class']},
 {'id': 'TT_ea721ede', 'term': 'x37 class', 'types': ['x class']},
 {'id': 'TT_96809d66', 'term': 'x31 class', 'types': ['x class']},
 {'id': 'TT_eddfe9d3', 'term': 'x24 class', 'types': ['x class']},
 {'id': 'TT_babaeba5', 'term': 'x8 class', 'types': ['x class

In [33]:
import json

with open("Task_B_term_types_SWEET.json", "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)
