## Convert Stringified Lists into DB Arrays

In [4]:
import json

# Function to convert the language string to a list of strings
def convert_language_to_list(language_str):
    """Turn a stringified list of languages into a real list of strings.

    The input is expected to look like a bracketed, comma-separated list,
    with each item optionally wrapped in single or double quotes.

    Args:
        language_str: The string form of the list.

    Returns:
        A list with one cleaned string per non-empty item. An empty list
        literal (or a blank string) yields an empty list.
    """
    # Tolerate padding around the whole value before dropping the brackets
    inner = language_str.strip().strip("[]")
    languages = []
    # Split on the bare comma so that items written without a following
    # space are still separated; per-item whitespace is trimmed below
    for item in inner.split(","):
        # Remove surrounding whitespace, then either kind of quote
        cleaned = item.strip().strip("\"'").strip()
        # Skip empty fragments so an empty list does not become ['']
        if cleaned:
            languages.append(cleaned)
    return languages

# The file is rewritten in place: input and output paths are the same
input_file_path = 'converted_datasets.json'
output_file_path = 'converted_datasets.json'

# Read every record from the JSON file
with open(input_file_path, 'r') as file:
    models_data = json.load(file)

# Replace string-valued language fields with real lists; records where the
# field is missing or is not a string are left untouched
language_key = "Language (array(str))"
for model in models_data:
    raw_value = model.get(language_key)
    if not isinstance(raw_value, str):
        continue
    model[language_key] = convert_language_to_list(raw_value)

# Write the updated records back out
with open(output_file_path, 'w') as json_file:
    json.dump(models_data, json_file, indent=4)

print(f"Updated JSON data has been saved to {output_file_path}")


Updated JSON data has been saved to converted_datasets.json


## Make Train Relationships

In [8]:
import json

# Read the exported Dataset and LargeLanguageModel collections
with open('final database/ChatIMPACT.Dataset.json', 'r') as f:
    datasets = json.load(f)

with open('final database/ChatIMPACT.LargeLanguageModel.json', 'r') as f:
    models = json.load(f)

# Name -> id lookup for datasets (a repeated name keeps the last id seen)
dataset_dict = {}
for dataset in datasets:
    dataset_dict[dataset['Name']] = dataset['_id']['$oid']

# Name -> list of ids for models, because several models may share a name
model_dict = {}
for model in models:
    model_dict.setdefault(model['Name'], []).append(model['_id']['$oid'])

# Hand-written model -> dataset links. "Dataset" may be a single name,
# a list of names, or None when no dataset is recorded.
relationships = [
    {"Model": "SaulLM7B", "Dataset": ["MultiLegalPile", "Europarl"]},
    {"Model": "StarCoder", "Dataset": "The Stack v2"},
    {"Model": "MEDITRON", "Dataset": ["GUIDELINES", "S2ORC"]},
    {"Model": "Vicuna", "Dataset": None},
]

# Emit one record per matching model id; dataset names that do not resolve
# to an id are dropped, and unknown model names produce no record at all
output_relationships = []
for relationship in relationships:
    for model_id in model_dict.get(relationship["Model"], []):
        wanted = relationship["Dataset"]
        if not wanted:
            names = []
        elif isinstance(wanted, list):
            names = wanted
        else:
            names = [wanted]
        dataset_ids = [ds_id for ds_id in map(dataset_dict.get, names) if ds_id]
        record = {
            "LargeLanguageModel": {"$oid": model_id},
            "Dataset": {"$oid": dataset_ids},
        }
        output_relationships.append(record)

# Write the relationship records to disk
output_file = 'train.json'
with open(output_file, 'w') as f:
    json.dump(output_relationships, f, indent=4)

print(f"Relationships saved to {output_file}")

Relationships saved to train.json


## Make SuitedFor Relationships

In [20]:
import json

# Load the exported DownstreamTask and LargeLanguageModel collections
with open('final database/ChatIMPACT.DownstreamTask.json', 'r') as f:
    tasks = json.load(f)

with open('final database/ChatIMPACT.LargeLanguageModel.json', 'r') as f:
    models = json.load(f)

# Name -> id lookup for tasks (a repeated name keeps the last id seen)
task_dict = {task['Name']: task['_id']['$oid'] for task in tasks}

# Name -> list of ids for models, because several models may share a name
model_dict = {}
for model in models:
    model_dict.setdefault(model['Name'], []).append(model['_id']['$oid'])

# Hand-written model -> task links. "DownstreamTask" is either one task name
# or a list of task names; each name is looked up verbatim in task_dict, so
# several tasks must be given as a list, never as one comma-joined string.
relationships = [
    {"Model": "LLaMA", "DownstreamTask": "Question Answering"},
    {"Model": "Code LLaMA", "DownstreamTask": "Code Generation"},
    {"Model": "roberta-base-openai-detector", "DownstreamTask": "Text Classification"},
    {"Model": "Mistral-Finetuned-DialogSumm", "DownstreamTask": "Summarization"},
    {"Model": "RM-Mistral-7B", "DownstreamTask": "Text Classification"},
    {"Model": "SaulLM7B", "DownstreamTask": ["Question Answering", "Summarization"]},
    {"Model": "MEDITRON", "DownstreamTask": "Question Answering"},
    {"Model": "StarCoder", "DownstreamTask": "Code Generation"}
]


# Build one record per matching model id; task names that do not resolve
# to an id are dropped, and unknown model names produce no record at all
output_relationships = []
for relationship in relationships:
    model_ids = model_dict.get(relationship["Model"], [])
    for model_id in model_ids:
        if relationship["DownstreamTask"]:
            if isinstance(relationship["DownstreamTask"], list):
                task_ids = [task_dict.get(name) for name in relationship["DownstreamTask"]]
            else:
                task_ids = [task_dict.get(relationship["DownstreamTask"])]
            task_ids = [task_id for task_id in task_ids if task_id]
        else:
            task_ids = []
        output_relationships.append({
            "LargeLanguageModel": {"$oid": model_id},
            "DownstreamTask": {"$oid": task_ids}
        })

# Save the relationship records to a new JSON file
output_file = 'suitedFor.json'
with open(output_file, 'w') as f:
    json.dump(output_relationships, f, indent=4)

print(f"Relationships saved to {output_file}")

Relationships saved to suitedFor.json


## Make Evaluated Relationships

In [28]:
import json

# Read the exported metric and LargeLanguageModel collections.
# NOTE(review): the Assess cell opens 'ChatIMPACT.Metric.json' (capital M);
# on a case-sensitive filesystem only one spelling can exist — confirm which.
with open('final database/ChatIMPACT.metric.json', 'r') as f:
    metrics = json.load(f)

with open('final database/ChatIMPACT.LargeLanguageModel.json', 'r') as f:
    models = json.load(f)

# Name -> id lookup for metrics (a repeated name keeps the last id seen)
metric_dict = {}
for metric in metrics:
    metric_dict[metric['Name']] = metric['_id']['$oid']

# Name -> list of ids for models, because several models may share a name
model_dict = {}
for model in models:
    model_dict.setdefault(model['Name'], []).append(model['_id']['$oid'])

# Hand-written model -> metric links. "metric" may be a single name,
# a list of names, or None when nothing has been recorded yet.
relationships = [
    {"Model": "SaulLM7B", "metric": None}  # TODO: fill in the metrics
]

# Emit one record per matching model id; metric names that do not resolve
# to an id are dropped, and unknown model names produce no record at all
output_relationships = []
for relationship in relationships:
    for model_id in model_dict.get(relationship["Model"], []):
        wanted = relationship["metric"]
        if not wanted:
            names = []
        elif isinstance(wanted, list):
            names = wanted
        else:
            names = [wanted]
        metric_ids = [m_id for m_id in map(metric_dict.get, names) if m_id]
        record = {
            "LargeLanguageModel": {"$oid": model_id},
            "Metric": {"$oid": metric_ids},
        }
        output_relationships.append(record)

# Write the relationship records to disk
output_file = 'evaluated.json'
with open(output_file, 'w') as f:
    json.dump(output_relationships, f, indent=4)

print(f"Modified relationships saved to {output_file}")

Modified relationships saved to evaluated.json


## Make Enable Relationships

In [17]:
import json

# Load the exported Dataset and DownstreamTask collections
with open('final database/ChatIMPACT.Dataset.json', 'r') as f:
    datasets = json.load(f)

with open('final database/ChatIMPACT.DownstreamTask.json', 'r') as f:
    tasks = json.load(f)

# Name -> id lookup for datasets (a repeated name keeps the last id seen)
dataset_dict = {dataset['Name']: dataset['_id']['$oid'] for dataset in datasets}

# Name -> id lookup for tasks: one id per name, a repeated name keeps the
# last id seen. The loop variable is `task` so it does not shadow `tasks`.
task_dict = {task['Name']: task['_id']['$oid'] for task in tasks}

# Hand-written dataset -> task links. "DownstreamTask" is either one task
# name or a list of task names.
relationships = [
    {"Dataset": "MultiLegalPile", "DownstreamTask": ["Question Answering", "Summarization"]},
    {"Dataset": "Europarl", "DownstreamTask": ["Question Answering", "Summarization"]},
    {"Dataset": "The Stack v2", "DownstreamTask": "Code Generation"},
    {"Dataset": "GUIDELINES", "DownstreamTask": "Question Answering"},
    {"Dataset": "S2ORC", "DownstreamTask": "Question Answering"}
]

# Build one record per dataset that resolves to an id; task names that do
# not resolve are dropped, and unknown dataset names produce no record
output_relationships = []
for relationship in relationships:
    dataset_id = dataset_dict.get(relationship["Dataset"])
    if dataset_id:
        if relationship["DownstreamTask"]:
            if isinstance(relationship["DownstreamTask"], list):
                task_ids = [task_dict.get(name) for name in relationship["DownstreamTask"]]
            else:
                task_ids = [task_dict.get(relationship["DownstreamTask"])]
            task_ids = [task_id for task_id in task_ids if task_id]
        else:
            # Use an empty list, as the model-based relationship cells do,
            # so the "$oid" value is always a list
            task_ids = []
        output_relationships.append({
            "Dataset": {"$oid": dataset_id},
            "DownstreamTask": {"$oid": task_ids},
        })

# Save the relationship records to a new JSON file
output_file = 'enable.json'
with open(output_file, 'w') as f:
    json.dump(output_relationships, f, indent=4)

print(f"Modified relationships saved to {output_file}")

Modified relationships saved to enable.json


## Make Assess Relationships

In [27]:
import json

# Load the exported Metric and DownstreamTask collections
with open('final database/ChatIMPACT.Metric.json', 'r') as f:
    metrics = json.load(f)

with open('final database/ChatIMPACT.DownstreamTask.json', 'r') as f:
    tasks = json.load(f)

# Name -> id lookup for metrics (a repeated name keeps the last id seen)
metric_dict = {metric['Name']: metric['_id']['$oid'] for metric in metrics}

# Name -> id lookup for tasks: one id per name, a repeated name keeps the
# last id seen. The loop variable is `task` so it does not shadow `tasks`.
task_dict = {task['Name']: task['_id']['$oid'] for task in tasks}

# Hand-written metric -> task links. "DownstreamTask" is either one task
# name or a list of task names.
relationships = [
    {"metric": "chrF", "DownstreamTask": "Machine Translation"},
    {"metric": "BLEU", "DownstreamTask": "Machine Translation"},
    {"metric": "ROUGE", "DownstreamTask": "Summarization"},
    {"metric": "METEOR", "DownstreamTask": "Machine Translation"},
    {"metric": "Perplexity", "DownstreamTask": "Language Modeling"},
    {"metric": "BERTScore", "DownstreamTask": "Text Generation"}
]

# Build one record per metric that resolves to an id; task names that do
# not resolve are dropped, and unknown metric names produce no record
output_relationships = []
for relationship in relationships:
    metric_id = metric_dict.get(relationship["metric"])
    if metric_id:
        if relationship["DownstreamTask"]:
            if isinstance(relationship["DownstreamTask"], list):
                task_ids = [task_dict.get(name) for name in relationship["DownstreamTask"]]
            else:
                task_ids = [task_dict.get(relationship["DownstreamTask"])]
            task_ids = [task_id for task_id in task_ids if task_id]
        else:
            # Use an empty list, as the model-based relationship cells do,
            # so the "$oid" value is always a list
            task_ids = []
        output_relationships.append({
            "Metric": {"$oid": metric_id},
            "DownstreamTask": {"$oid": task_ids},
        })

# Save the relationship records to a new JSON file.
# NOTE(review): 'asses.json' looks like a typo for 'assess.json'; the name is
# kept because other tooling may already read this file — confirm before renaming.
output_file = 'asses.json'
with open(output_file, 'w') as f:
    json.dump(output_relationships, f, indent=4)

print(f"Modified relationships saved to {output_file}")

Modified relationships saved to asses.json
