<a href="https://colab.research.google.com/github/meriembenchaaben/IFT6289_Project/blob/main/script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U accelerate
!pip install -q -U optimum
!pip install -q -U transformers
!pip install -q -U auto-gptq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Mount Google Drive at /content/gdrive; the ontology JSON opened in a later
# cell lives under this mount point. This import only exists on Google Colab.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import json
# Load the JSON description of the modeling formalisms into `ontology`.
# NOTE(review): the path is relative while the Drive is mounted at
# /content/gdrive, so this only resolves when the working directory is
# /content -- confirm before running elsewhere.
with open('gdrive/My Drive/IFT6289/IFT6289_Project/data/DSLs.json', 'r') as file:
    ontology = json.load(file)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub repository of the model used for the completion step.
model_name_or_path = "TheBloke/CapybaraHermes-2.5-Mistral-7B-GPTQ"

# `model` and `tokenizer` are the notebook-level objects that
# generate_for_completion() reads as globals.
# device_map="auto" delegates device placement to the library;
# trust_remote_code=False refuses to run custom code shipped with the repo.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                              device_map="auto",
                                              trust_remote_code=False,
                                              revision="main")
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def extract_metamodel_elements(diagram_name, data):
    """Look up the metamodel elements of a named formalism in the ontology.

    :param diagram_name: Value compared against each design's 'Name' entry.
    :param data: Ontology mapping; data['DesignFormalisms'] maps each category
        to an iterable of design dicts carrying 'Name' and 'MetamodelElements'.
    :return: The 'MetamodelElements' value of the first design whose 'Name'
        equals diagram_name, or the string "Diagram not found." when no
        design matches.
    """
    matching_elements = (
        entry['MetamodelElements']
        for group in data['DesignFormalisms'].values()
        for entry in group
        if entry['Name'] == diagram_name
    )
    # The first match wins; the sentinel string is returned when nothing matches.
    return next(matching_elements, "Diagram not found.")

# Generate the Missing Elements

In [None]:
def generate_prompt_for_formalism(formalism_name, current_model, metamodel_elements, user_elements=None, desired_functionality=None, specific_constraints=None):
    """
    Build the completion prompt sent to the LLM for a given formalism.

    :param formalism_name: The name of the formalism to generate the prompt for.
    :param current_model: A string representing the current state of the model under construction.
    :param metamodel_elements: The metamodel elements of the formalism. When falsy, no prompt is
        built and a "not found in the ontology" message is returned instead.
    :param user_elements: (Optional) The metamodel elements the user wants suggestions for. When
        truthy, it is interpolated into the prompt in place of metamodel_elements.
    :param desired_functionality: (Optional) A string describing the desired functionality to be added.
    :param specific_constraints: (Optional) A string describing any specific constraints that must be adhered to.
    :return: A prompt string for the LLM, or the "not found" message described above.
    """
    # Guard clause: without metamodel elements there is nothing to ask for.
    if not metamodel_elements:
        return f"Formalism '{formalism_name}' not found in the ontology."

    # A user-provided selection takes precedence over the full element list.
    target_elements = user_elements if user_elements else metamodel_elements

    sentences = [
        f"I am developing a {formalism_name} which currently includes: {current_model}.",
        f"Suggest new elements for each of the metamodel element in {target_elements},",
        "that can complete the current model.",
    ]

    # Optional sentences, appended only when the caller supplied them.
    if desired_functionality:
        sentences.append(f"The model needs to support: {desired_functionality}.")
    if specific_constraints:
        sentences.append(f"This should adhere to the following constraints: {specific_constraints}.")

    # Output-format instructions for the LLM always close the prompt.
    sentences.append("Elements should be organized in a JSON format where key value is the name of the metamodel element.")
    sentences.append("Provide only the JSON file. Do not provide extra explanations.")

    return " ".join(sentences)



In [None]:
def generate_text(tokenizer, model, prompt, max_new_tokens=512):
    """Sample a continuation of `prompt` and return the decoded sequence.

    The decoded text is also printed. It is the decoding of the whole
    generated sequence (output[0]), so callers must strip the prompt part
    themselves.

    :param tokenizer: Tokenizer called as tokenizer(prompt, return_tensors='pt');
        must also provide decode().
    :param model: Model exposing `device` and `generate()`.
    :param prompt: Fully formatted prompt string.
    :param max_new_tokens: Upper bound on the number of generated tokens.
    :return: The decoded text of the first generated sequence.
    """
    encoded = tokenizer(prompt, return_tensors='pt')

    # Send the inputs to wherever the model lives instead of assuming CUDA.
    device = model.device
    input_ids = encoded.input_ids.to(device)

    # Pass the attention mask explicitly when the tokenizer provides one;
    # omitting it makes generate() warn that results may be unreliable.
    extra_inputs = {}
    attention_mask = getattr(encoded, "attention_mask", None)
    if attention_mask is not None:
        extra_inputs["attention_mask"] = attention_mask.to(device)

    output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=max_new_tokens, **extra_inputs)

    # Decode once and reuse the result for both the echo and the return value.
    decoded = tokenizer.decode(output[0])
    print(decoded)
    return decoded


In [None]:
def generate_for_completion(prompt, formalism_name):
    """Wrap `prompt` in the <|im_start|>/<|im_end|> chat template and generate.

    Uses the notebook-level `tokenizer` and `model` objects through
    generate_text().

    :param prompt: User prompt placed in the "user" turn.
    :param formalism_name: Formalism named in the system message.
    :return: Whatever generate_text() returns for the assembled template.
    """
    system_message = f"You are a model completion expert. You are specialized in {formalism_name}."

    # Every template line after the first starts with four spaces, and the
    # template ends with a newline followed by four spaces; joining on
    # "\n    " (with a trailing empty piece) reproduces that exact layout.
    chat_lines = (
        "<|im_start|>system",
        f"{system_message}<|im_end|>",
        "<|im_start|>user",
        f"{prompt}<|im_end|>",
        "<|im_start|>assistant",
        "",
    )
    prompt_template = "\n    ".join(chat_lines)

    return generate_text(tokenizer, model, prompt_template)

# Extract New Elements – Rule-Based Extraction / Manual Parsing

In [None]:
def parse_response(generated):
    """Extract the assistant turn from a decoded chat transcript.

    The text between the first assistant marker and the next assistant
    marker (if any) is taken, then cut at the first '<|im_end|>'.

    :param generated: Decoded model output containing the chat markers.
    :return: The assistant's reply, without the surrounding markers.
    :raises IndexError: If no assistant marker is present in `generated`.
    """
    import re  # local import keeps this cell runnable on its own

    # The decoded output shows the marker as "<|im_start|> assistant" (with a
    # space) while the prompt template writes it without one; accept both
    # spellings rather than depending on how the tokenizer renders the token.
    segments = re.split(r"<\|im_start\|>[ \t]*assistant\n", generated)
    if len(segments) < 2:
        raise IndexError("no '<|im_start|>assistant' marker found in the generated text")

    # Keep only what precedes the end-of-turn token, when there is one.
    return segments[1].split('<|im_end|>')[0]

In [None]:
def parse_json(parsed):
    """Strip Markdown code fences from around a JSON payload.

    Handles a '```json' tagged fence anywhere in the text, an untagged '```'
    fence opening the text, and a stray closing fence after the payload.
    Text without fences is returned unchanged.

    :param parsed: Assistant reply that may wrap its JSON in a code fence.
    :return: The text between the fences (not yet decoded as JSON).
    """
    fence = "```"
    if "```json" in parsed:
        parsed = parsed.split("```json")[1]
    elif parsed.lstrip().startswith(fence):
        # Untagged opening fence: drop it, otherwise cutting at the first
        # fence below would leave an empty string.
        parsed = parsed.lstrip()[len(fence):]
    if fence in parsed:
        # Whatever follows the (closing) fence is not part of the payload.
        parsed = parsed.split(fence)[0]
    return parsed

In [None]:
def get_elements_by_keys(data, keys):
    """Pick the entries of an LLM-generated JSON object that match `keys`.

    Matching ignores case; the returned dict is keyed by the names exactly as
    given in `keys`.

    :param data: JSON text generated by the LLM; it must decode to an object.
    :param keys: Names to look up (the metamodel elements).
    :return: Dict mapping each requested name to the matching value, or to the
        string "Key not found in data" when the JSON has no such key.
    """
    missing = "Key not found in data"

    # Index the decoded object by lower-cased name for case-insensitive lookup.
    by_lower_name = {name.lower(): content for name, content in json.loads(data).items()}

    return {key: by_lower_name.get(key.lower(), missing) for key in keys}

# Validate, Element by Element, Whether Each Suggestion Is a Good Fit

In [None]:
def extract_responses(feedback):
    """Extract the validator's answer lines from a decoded transcript.

    The answer is everything after the first line containing "Sure" (the
    model's introductory sentence), cut at the first '</s>'. When no such
    line exists, the text after the last '[/INST]' marker is used instead so
    that a reply without the introductory sentence is not silently dropped.

    :param feedback: Decoded output of the validation model, prompt included.
    :return: The answer text, or "" when neither a "Sure" line nor an
        '[/INST]' marker is present.
    """
    lines = feedback.split("\n")
    for position, line in enumerate(lines):
        if "Sure" in line:
            # Skip the introductory line itself and keep the rest.
            return "\n".join(lines[position + 1:]).split('</s>')[0]

    # Fallback: the reply has no "Sure" line, so take what follows the prompt.
    marker = "[/INST]"
    if marker not in feedback:
        return ""
    return feedback.rsplit(marker, 1)[1].split('</s>')[0].strip()

In [None]:
def validation_loop(elements, current_model, formalism_name, metamodel_elements):
    """Ask the validation model, one metamodel element at a time, whether the
    suggested additions fit the current model.

    Uses the notebook-level `validation_tokenizer` and `validation_model`
    objects, plus generate_text() and extract_responses().

    :param elements: Dict mapping a metamodel element name to its suggested values.
    :param current_model: Description of the model under construction.
    :param formalism_name: Name of the formalism being completed.
    :param metamodel_elements: Accepted but not used by this function.
    :return: The extracted answers, one per entry of `elements`, separated by
        blank lines.
    """
    # The system message does not depend on the element being validated.
    system_message = f"You are a model completion expert. You are specialized in {formalism_name}."

    answers = []
    for key, value in elements.items():
        prompt = " ".join((
            f'I am developing a {formalism_name} which currently includes: {current_model}.',
            f'An expert suggests adding {value} as {key} to the {formalism_name}.',
            'For each suggested elements, check if it is a good fit.',
            'Answer with only a simple Yes or No, following the element and :',
            'No additional explanations should be provided.',
            'Generate only the answer without any other text.',
        ))

        # [INST]/<<SYS>> template: three lines joined by single newlines.
        prompt_template = "\n".join((
            "[INST] <<SYS>>",
            f"{system_message}<</SYS>>",
            f"{prompt}[/INST]",
        ))

        print('Output:\n')
        generated = generate_text(validation_tokenizer, validation_model, prompt_template, 200)
        answers.append(extract_responses(generated))

    return "\n\n".join(answers)


# Demo

In the following, we give a running example of Class Diagram completion.

In [None]:
# Demo inputs: the formalism to complete and the partial model written so far.
# NOTE: extract_metamodel_elements() returns the string "Diagram not found."
# when formalism_name does not match any 'Name' in the ontology.
formalism_name = "Class Diagrams (UML)"
current_model = "Classes: Hospital, Doctor; Attributes: Hospital(name, address)" # Users can customize current model
metamodel_description = extract_metamodel_elements(formalism_name, ontology) # Extract metamodel elements from the JSON file containing modeling formalisms
metamodel_description

['Classes', 'Attributes', 'Static Relationships']

In [None]:
# No user_elements are passed, so the prompt asks for suggestions for every
# entry of metamodel_description.
prompt = generate_prompt_for_formalism(formalism_name,current_model,metamodel_description) # Prompt used for completion
prompt

"I am developing a Class Diagrams (UML) which currently includes: Classes: Hospital, Doctor; Attributes: Hospital(name, address). Suggest new elements for each of the metamodel element in ['Classes', 'Attributes', 'Static Relationships'], that can complete the current model. Elements should be organized in a JSON format where key value is the name of the metamodel element. Provide only the JSON file. Do not provide extra explanations."

In [None]:
# `generated` holds the full decoded transcript (prompt included);
# generate_text() also prints it.
generated = generate_for_completion(prompt, formalism_name) # Model completion

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


<s><|im_start|> system
    You are a model completion expert. You are specialized in Class Diagrams (UML).<|im_end|> 
    <|im_start|> user
    I am developing a Class Diagrams (UML) which currently includes: Classes: Hospital, Doctor; Attributes: Hospital(name, address). Suggest new elements for each of the metamodel element in ['Classes', 'Attributes', 'Static Relationships'], that can complete the current model. Elements should be organized in a JSON format where key value is the name of the metamodel element. Provide only the JSON file. Do not provide extra explanations.<|im_end|> 
    <|im_start|> assistant
     {
        "Classes": [
            "Patient",
            "Appointment",
            "MedicalRecord"
        ],
        "Attributes": [
            {
                "Hospital": "phoneNumber"
            },
            {
                "Doctor": "specialty"
            },
            {
                "Patient": "insurance"
            },
            {
                "Ap

In [None]:
# Isolate the assistant turn, strip any code fence, then decode the JSON and
# keep one entry per metamodel element (matched case-insensitively).
parsed = parse_json(parse_response(generated)) # rule-based output extraction
elements = get_elements_by_keys(parsed, metamodel_description) # rule-based element extraction
elements

{'Classes': ['Patient', 'Appointment', 'MedicalRecord'],
 'Attributes': [{'Hospital': 'phoneNumber'},
  {'Doctor': 'specialty'},
  {'Patient': 'insurance'},
  {'Appointment': 'date'},
  {'MedicalRecord': 'diagnosis'}],
 'Static Relationships': [{'Hospital': 'has',
   'Doctors': ['Hospital', 'Doctor']},
  {'Doctor': 'treats', 'Patients': ['Doctor', 'Patient']},
  {'Patient': 'has', 'Appointments': ['Patient', 'Appointment']},
  {'Appointment': 'contains',
   'MedicalRecords': ['Appointment', 'MedicalRecord']}]}

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub repository of the model used for the validation step.
# NOTE: this rebinds model_name_or_path, which the completion-model cell
# also assigns.
model_name_or_path = "TheBloke/Llama-2-7B-Chat-GPTQ"
# To use a different branch, change revision
# For example: revision="gptq-4bit-32g-actorder_True"
# `validation_model` and `validation_tokenizer` are read as globals by
# validation_loop(), so this cell must run before that function is called.
validation_model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                              device_map="auto",
                                              trust_remote_code=False,
                                              revision="main")
validation_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)





The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class


In [None]:
# One validation prompt is sent per entry of `elements`; each transcript is
# printed as it is generated.
feedback = validation_loop(elements, current_model, formalism_name, metamodel_description) # validate metamodel elements respectively

Output:





<s> [INST] <<SYS>>
You are a model completion expert. You are specialized in Class Diagrams (UML).<</SYS>>
I am developing a Class Diagrams (UML) which currently includes: Classes: Hospital, Doctor; Attributes: Hospital(name, address). An expert suggests adding ['Patient', 'Appointment', 'MedicalRecord'] as Classes to the Class Diagrams (UML). For each suggested elements, check if it is a good fit. Answer with only a simple Yes or No, following the element and : No additional explanations should be provided. Generate only the answer without any other text.[/INST]  Sure, I'd be happy to help you with that! Here are my answers to your suggested elements:
* 'Patient': Yes
* 'Appointment': Yes
* 'MedicalRecord': Yes</s>
Output:

<s> [INST] <<SYS>>
You are a model completion expert. You are specialized in Class Diagrams (UML).<</SYS>>
I am developing a Class Diagrams (UML) which currently includes: Classes: Hospital, Doctor; Attributes: Hospital(name, address). An expert suggests adding [{'

In [None]:
# The answers for the individual metamodel elements are separated by blank lines.
print(feedback) # final extracted elements for completion

* 'Patient': Yes
* 'Appointment': Yes
* 'MedicalRecord': Yes

1. Hospital : 'phoneNumber' - Yes
2. Doctor : 'specialty' - Yes
3. Patient : 'insurance' - No
4. Appointment : 'date' - Yes
5. MedicalRecord : 'diagnosis' - Yes

1. Hospital has 'Hospital': Yes
2. Doctor treats 'Doctors': Yes
3. Patient has 'Appointments': Yes
4. Appointment contains 'MedicalRecords': Yes
