In [1]:
import json
def load_json(data_path):
    """Read a JSON document from disk and return the parsed Python object.

    Args:
        data_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (dict, list, str, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding (matches the other open() calls here).
    with open(data_path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)


In [2]:
# Collect the candidate terms, one per line of the input file.
# Only newline characters are trimmed; any other whitespace is kept as-is.
test_data = []
with open("C10-Blind-Types.txt", "r", encoding="utf-8") as terms_file:
    test_data.extend(raw_line.strip('\n') for raw_line in terms_file)

print(test_data)

['semi colon', 'part of speech', 'intransitive interrogative frame', 'ditransitive frame_ to', 'question mark', 'adjective scale frame', 'negative particle', 'nominal complement frame', 'interrogative frame', 'subjectless frame', 'complement', 'diminutive noun', 'noun phrase', 'adjective phrase', 'prepositional adverb', 'definite article', 'weak personal pronoun', 'possessive infinitive clause', 'indefinite pronoun', 'participle adjective', 'gerund clause', 'reflexive transitive pp frame', 'intransitive pp declarative frame', 'adverbial pronoun', 'particle', 'arbitrary control', 'AbbreviatedForm', 'demonstrative determiner', 'conjunction', 'determiner', 'coordinating conjunction', 'coordination particle', 'present participle adjective', 'preposition frame', 'noun predicate frame', 'ditransitive frame', 'light verb', 'DativePostPositiveArg', 'interrogative pronoun', 'conditional particle', 'Affix', 'existential pronoun', 'SyntacticFrame', 'LexicalEntry', 'generic numeral', 'numeral frac

In [3]:
# Number of entries read from the terms file (one per line).
len(test_data)

266

In [None]:
import os

from google import genai

# Model name passed to every generate_content call in this notebook.
model = "gemini-2.5-pro"

# Read the key from the environment instead of pasting it into the notebook,
# so it can never be committed or shared together with the outputs.
api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "No API key found: set the GEMINI_API_KEY (or GOOGLE_API_KEY) "
        "environment variable before running this notebook."
    )
client = genai.Client(api_key=api_key)

In [6]:
# Prompt asking the model to name and describe the knowledge domain of the terms.
# `test_data` is interpolated with its Python list repr at the moment this cell
# runs, so the cell must be re-run whenever `test_data` changes.
prompt_domain_extraction = f"""You are an expert AI assistant specializing in knowledge representation, ontology, and domain analysis. Your task is to analyze a given list of terms to identify the specific knowledge domain they belong to.

**Instructions:**

1.  **Analyze Input:** You will be given a single list of terms. Carefully examine these terms to find the common theme or scientific field that connects them. Consider what subject matter these terms represent (e.g., biology, geography, technology, food production).
2.  **Generate Domain Name:** Based on your analysis, create a concise and descriptive name for the domain. For example, "Food Science and Production" or "Earth and Environmental Science."
3.  **Generate Domain Description:** Write a 4-5 line description of this domain. This description should explain:
    *   The general area of study or industry it covers.
    *   The types of concepts the terms represent within this domain.
    *   How concepts within this domain are typically related (e.g., one can be a part of another, one can be transformed into another, or one can be an ingredient of another).
    *   The overall purpose of structuring knowledge in this domain (e.g., for scientific analysis, traceability, data interoperability).
4.  **Format the Output:** Return a Python dictionary containing two keys: `domain_name` and `domain_description`.

**Task:**
Analyze the following list of terms and generate a domain name and description.

-   **List of Terms:** {test_data}

Return the result as a Python dictionary."""

In [9]:
# Ask for a JSON body so the reply can be parsed with json.loads directly.
generation_config = {"response_mime_type": "application/json"}

response = client.models.generate_content(
    model=model,
    contents=prompt_domain_extraction,
    config=generation_config,
)

# Fail here with a clear message instead of a TypeError inside json.loads
# when the model returned no text at all.
if not response.text:
    raise ValueError("Domain extraction returned an empty response.")

domain = json.loads(response.text)

# Later cells index domain['domain_name'] and domain['domain_description'],
# so verify the shape now rather than failing inside an f-string later.
required_keys = {"domain_name", "domain_description"}
if not isinstance(domain, dict) or not required_keys <= domain.keys():
    raise ValueError(
        f"Domain extraction must return a JSON object with keys {sorted(required_keys)}; "
        f"got: {domain!r}"
    )

In [10]:
# Display the parsed domain name/description returned by the model.
domain

{'domain_name': 'Linguistics and Computational Grammar',
 'domain_description': "This domain encompasses the formal description of language structure, central to linguistics and computational grammar. The terms represent a detailed hierarchy of linguistic units, from morphological components like affixes and roots to lexical items like words and syntactic structures such as phrases, clauses, and complex grammatical patterns known as 'syntactic frames.' Relationships are primarily compositional and functional, where smaller units build larger ones and elements take on specific roles (e.g., subject, object) within a sentence. The purpose of this structured knowledge is to enable the computational analysis, parsing, and understanding of human language, forming the foundation for Natural Language Processing (NLP)."}

In [11]:
# Prompt asking the model to group the terms into clusters connected by
# 'is-a' / 'part-of' links. It interpolates domain['domain_name'],
# domain['domain_description'] and the repr of `test_data` when this cell runs.
prompt_cluster = f"""You are an expert AI assistant with deep expertise in knowledge graphs and ontology reasoning. You will be provided with a domain name and a description to set the context for your task.

Your primary task is to group a list of terms into clusters by identifying their **parent-child relationships**. In this context, a **parent-child relationship** is a fundamental connection where one term is a direct 'parent' (a broader category) or a 'child' (a more specific instance or part) of another. This includes:
*   **'is-a' relationships (Type/Subtype):** Where the child term is a specific type of the parent term (e.g., 'River' is-a 'Body of Water').
*   **'part-of' relationships (Whole/Part):** Where the child term is a component or part of the parent term (e.g., 'Sea' is-part-of 'Ocean').

You will cluster terms that are linked directly or indirectly by these fundamental parent-child connections.

**Context (Provided as Input):**
-   **Domain Name:** {domain['domain_name']}
-   **Domain Description:** {domain['domain_description']}

**Instructions:**

1.  **Build the Graph:**
    *   Treat each term from the input list as a point (node) in a graph.
    *   Using your knowledge of the **identified domain** (contextualized by the provided name and description), check every pair of terms to see if a **parent-child relationship** ('is-a' or 'part-of') exists between them.
    *   If such a relationship exists, create a two-way (undirected) link between the parent and child terms for clustering purposes.
2.  **Form Clusters:**
    *   Group terms into clusters where terms are connected either directly or indirectly through the identified **parent-child** links.
    *   Terms with no identifiable parent-child connections to other terms in the list will form their own single-term clusters.
    *   Ensure every term from the input list appears in the output.
3.  **Handle Special Cases:**
    *   If the terms list is empty, return an empty list of clusters: [].
4.  **Format the Output:**
    *   Return a Python list of lists, where each inner list contains the terms in a cluster.
    *   Do not include the relationships in the output, only the grouped terms.
    *   Sort the clusters alphabetically by the first term in each cluster.
    *   Within each cluster, sort the terms alphabetically.

**Task:**
Group the following terms into clusters by identifying all direct and indirect **parent-child ('is-a' and 'part-of') relationships** between them, guided by the provided domain context.

-   **List of Terms:** {test_data}

Return the result as a Python list of lists, where each inner list represents a cluster of terms."""

In [12]:
# Ask for a JSON body so the reply can be parsed with json.loads directly.
generation_config = {"response_mime_type": "application/json"}

response = client.models.generate_content(
    model=model,
    contents=prompt_cluster,
    config=generation_config,
)

# Fail here with a clear message instead of a TypeError inside json.loads
# when the model returned no text at all.
if not response.text:
    raise ValueError("Clustering returned an empty response.")

clusters_list = json.loads(response.text)
if not isinstance(clusters_list, list):
    raise ValueError(f"Clustering must return a JSON list of lists; got: {clusters_list!r}")

# The prompt asks for every input term to appear in the output, but the model
# is free to ignore that. Verify coverage instead of trusting it.
clustered_terms = {term for cluster in clusters_list for term in cluster}
missing_terms = [term for term in dict.fromkeys(test_data) if term not in clustered_terms]
unknown_terms = sorted(clustered_terms - set(test_data))

if missing_terms:
    # Restore dropped terms as single-term clusters, which is what the prompt
    # specifies for terms without any parent-child connection.
    print(f"Warning: {len(missing_terms)} input term(s) missing from the clusters; "
          f"re-added as single-term clusters: {missing_terms}")
    clusters_list.extend([term] for term in missing_terms)

if unknown_terms:
    print(f"Warning: {len(unknown_terms)} clustered term(s) are not in the input list: {unknown_terms}")

In [13]:
# Display the clusters parsed from the model response.
clusters_list

[['AbbreviatedForm',
  'AccusativePostPositiveArg',
  'Affix',
  'affirmative particle',
  'adjective',
  'adjective accusative post positive frame',
  'adjective attributive frame',
  'adjective comparative frame',
  'adjective dative post positive frame',
  'adjective frame',
  'adjective genitive post positive frame',
  'adjective impersonal frame',
  'adjective post positive frame',
  'adjective pp frame',
  'adjective predicate frame',
  'adjective predicative frame',
  'adjective scale frame',
  'adjective superlative frame',
  'adjective phrase',
  'adjective-i',
  'adjective-na',
  'adjectival complement frame',
  'adjunct',
  'adposition',
  'adpositional object',
  'adverb',
  'adverbial complement',
  'adverbial complement frame',
  'adverbial pronoun',
  'affixed personal pronoun',
  'allusive pronoun',
  'arbitrary control',
  'article',
  'attributive arg',
  'auxiliary',
  'cardinal numeral',
  'circumposition',
  'clausal arg',
  'collective pronoun',
  'common noun',
 

In [15]:
# Report how many terms each cluster contains, one size per line.
for cluster_terms in clusters_list:
    print(len(cluster_terms))

247
15
1


In [26]:
# NOTE(review): `model` is already assigned this same value in the client-setup
# cell; this reassignment looks redundant — confirm before removing.
model= "gemini-2.5-pro"

In [None]:
from tqdm import tqdm

# Raw JSON text returned by the model, one entry per processed cluster.
# Single-term clusters and empty replies are skipped, so positions in this
# list do NOT line up one-to-one with positions in `clusters_list`.
response_final = []

# Relies on `clusters_list`, `domain`, `client` and `model` from earlier cells.
# The request configuration never changes, so build it once outside the loop.
generation_config = {"response_mime_type": "application/json"}

# Wrap the list itself (not the enumerate iterator) so tqdm can see its length
# and show a real progress bar / ETA instead of an open-ended "0it" counter.
for i, term_cluster in enumerate(tqdm(clusters_list)):
    # A cluster with fewer than two terms has no pair to relate; skip the request.
    if len(term_cluster) <= 1:
        continue

    # This prompt asks the model to discover the relationship itself
    # instead of just checking against a predefined list.
    prompt = f"""You are an expert AI assistant with deep expertise in knowledge graphs and ontology reasoning. Your task will be guided by a specific domain context provided to you.

    **Context (Provided as Input):**
    -   **Domain Name:** {domain['domain_name']}
    -   **Domain Description:** {domain['domain_description']}

    Your task is to take a single cluster of terms and discover all possible **parent-child relationship triples** (head, relation, tail) *between terms within that cluster*. A parent-child relationship can be either:
    *   **'is-a' (Type/Subtype):** Where the child term is a specific type of the parent term.
    *   **'part-of' (Whole/Part):** Where the child term is a component or part of the parent term.

    **Instructions:**

    1.  **Process the Cluster:**
        *   Examine all possible pairs of terms within the given input cluster.
        *   Using your knowledge of the **identified domain**, determine if a direct **parent-child ('is-a' or 'part-of') relationship** exists between them.
        *   If a relationship is found, create a triple `(head, relation, tail)`. The `relation` in the triple should be a string that describes the discovered relationship (e.g., 'is-a', 'part-of').
        *   Only include triples where both the `head` and `tail` are terms from the input cluster.

    2.  **Handle Special Cases:**
        *   If the cluster contains one or zero terms, return an empty list.
        *   If no parent-child relationships can be inferred for any pair in the cluster, return an empty list.

    3.  **Format the Output:**
        *   Return a Python list containing the generated triples, where each triple is a tuple.
        *   Each triple must be in the format `(head, relation, tail)`.
        *   Sort the final list of triples alphabetically by head, then by relation, and then by tail.

    **Task:**
    For the given cluster of terms, and guided by the provided domain context, discover and generate all possible parent-child relationship triples.

    -   **Cluster of Terms:** {term_cluster}

    Return the result as a Python list of triples.
    """
    # Prompt size and model name, useful when a request is slow or rejected.
    print(len(prompt), model)

    response = client.models.generate_content(
        model=model,
        contents=prompt,
        config=generation_config,
    )

    # An empty reply would break the json.loads step in the parsing cell;
    # report it and move on instead of storing an unusable entry.
    if not response.text:
        print(f"Warning: empty response for cluster {i}; skipping it.")
        continue

    response_final.append(response.text)

    # Print progress every 10 iterations
    if i % 10 == 0:
        print(f"--- Response for cluster {i} ---")
        print(response.text)

# After the loop, response_final contains a list of JSON strings, each being
# the list of discovered triples for one processed multi-term cluster.

0it [00:00, ?it/s]

8736 gemini-2.5-pro


1it [02:40, 160.46s/it]

--- Response for cluster 0 ---
[
  [
    "AbbreviatedForm",
    "is-a",
    "Word"
  ],
  [
    "AccusativePostPositiveArg",
    "is-a",
    "post positive arg"
  ],
  [
    "AccusativePostPositiveArg",
    "part-of",
    "adjective accusative post positive frame"
  ],
  [
    "Affix",
    "is-a",
    "zero morph"
  ],
  [
    "Affix",
    "part-of",
    "Word"
  ],
  [
    "DativePostPositiveArg",
    "is-a",
    "post positive arg"
  ],
  [
    "DativePostPositiveArg",
    "part-of",
    "adjective dative post positive frame"
  ],
  [
    "GenitivePostPositiveArg",
    "is-a",
    "post positive arg"
  ],
  [
    "GenitivePostPositiveArg",
    "part-of",
    "adjective genitive post positive frame"
  ],
  [
    "LexicalEntry",
    "is-a",
    "MultiWordExpression"
  ],
  [
    "LexicalEntry",
    "is-a",
    "Word"
  ],
  [
    "MultiWordExpression",
    "is-a",
    "fused preposition determiner"
  ],
  [
    "MultiWordExpression",
    "is-a",
    "fused preposition pronoun"
  ],
  [

In [19]:
# Number of raw model responses collected by the triple-extraction loop.
len(response_final)

2

In [22]:
# Flatten every response into child/parent records: the first element of each
# triple becomes the child and the last element the parent (relation is dropped).
final_ans = [
    {"child": triple[0], "parent": triple[-1]}
    for raw_response in response_final
    for triple in json.loads(raw_response)
]

In [23]:
# Total number of child/parent pairs extracted from all responses.
len(final_ans)

411

In [24]:
# Display the full list of child/parent pairs.
final_ans

[{'child': 'AbbreviatedForm', 'parent': 'LexicalEntry'},
 {'child': 'AccusativePostPositiveArg', 'parent': 'post positive arg'},
 {'child': 'AccusativePostPositiveArg',
  'parent': 'adjective accusative post positive frame'},
 {'child': 'Affix', 'parent': 'Word'},
 {'child': 'adjectival complement', 'parent': 'complement'},
 {'child': 'adjectival complement frame', 'parent': 'SyntacticFrame'},
 {'child': 'adjective', 'parent': 'part of speech'},
 {'child': 'adjective', 'parent': 'adjective phrase'},
 {'child': 'adjective accusative post positive frame',
  'parent': 'adjective frame'},
 {'child': 'adjective accusative post positive frame',
  'parent': 'adjective post positive frame'},
 {'child': 'adjective attributive frame', 'parent': 'adjective frame'},
 {'child': 'adjective comparative frame', 'parent': 'adjective frame'},
 {'child': 'adjective dative post positive frame',
  'parent': 'adjective frame'},
 {'child': 'adjective dative post positive frame',
  'parent': 'adjective post p

In [25]:
# Save the child/parent pairs as pretty-printed JSON, keeping non-ASCII
# characters readable instead of escaping them.
with open("predictions_flash_model_clustering_pairs.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(final_ans, indent=2, ensure_ascii=False))
