# Experiments with OntoUML Catalog

In [None]:
import os
import glob
import json
import pandas as pd
import numpy as np

In [None]:
import requests

In [None]:
import textwrap
from pandas.api.types import CategoricalDtype

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

In [None]:
# Resolve the absolute paths of all working folders by walking through them
# with chdir; each relative path is interpreted from the previous folder, so
# the order of the statements matters.
dir_implement = os.getcwd()  # folder the notebook was started from
os.chdir("Images")
dir_images = os.getcwd()  # figures and the Excel export are written here
os.chdir("../Abstractions")
dir_abstractions = os.getcwd()  # abstractions of valid models are stored here
os.chdir("../Errors")
dir_errors = os.getcwd()  # abstractions of models with validation errors
os.chdir("../../../GitHub/ontouml-models2/models")
dir_models = os.getcwd()  # folder with the models; remains the cwd afterwards

In [None]:
sns.set_theme(style="whitegrid", palette="pastel")
sns.despine(offset=5, trim=True)

In [None]:
colors={
    'super small': 'magenta', 
    'small': 'green', 
    'medium': 'blue', 
    'big': 'orange', 
    'super big': 'indigo'
}

----
## Preparing subsets of models

### List of potential models

Setting a directory with models as a working directory...

In [None]:
os.chdir(dir_models)
os.getcwd()

In [None]:
json_problems = [
    'digitaldoctor2022/ontology.json',
    'goncalves2011ecg/ontology.json',
    'tourbo2021/ontology.json',
    'plato-ontology2019/ontology.json',
    'buridan-ontology2021/ontology.json',
    'aristotle-ontology2019/ontology.json',
    'public-expense-ontology2020/ontology.json',
    'tender2013/ontology.json',
    'scientific-publication2013/ontology.json'
]

Scan every model folder for its `ontology.json` file, skipping the files listed above.

In [None]:
# Every "<model folder>/ontology.json" except the files listed in json_problems.
all_file_names = [
    path for path in glob.glob("*/ontology.json") if path not in json_problems
]

print(f"We have {len(all_file_names)} files with ontologies.")

To select only the models that contain exclusively the 16 stereotypes for which the algorithm was developed, we
1. analyse all models
2. filter those of interest to us

In [None]:
def normalize(stereotype: str) -> str:
    """Return the stereotype in lower case with all spaces removed.

    Falsy input (``None`` or an empty string) is returned unchanged.
    """
    if not stereotype:
        return stereotype
    return stereotype.replace(" ", "").lower()

In [None]:
def get_all_stereotypes(contents, all_content) -> dict:
    """Count the class stereotypes found in a (possibly nested) element list.

    Args:
        contents: list of element dicts, each with a 'type' key, or None.
            Elements of type 'Package' are descended into via their
            'contents'; only elements of type 'Class' are counted.
        all_content: dict mapping stereotype -> number of classes. It is
            updated in place.

    Returns:
        The same ``all_content`` dict that was passed in.

    A class may carry either a single 'stereotype' or a list under
    'stereotypes'. Both forms go through ``normalize`` so that the same
    stereotype written with different case or spacing is counted once.
    A class whose 'stereotype' is None is counted under the key None.
    """
    for content in contents or []:
        element_type = content['type']
        if element_type == 'Package':
            # .get: a package without a 'contents' key is treated as empty.
            get_all_stereotypes(content.get('contents'), all_content)
        elif element_type == 'Class':
            if 'stereotype' in content:
                found = [content['stereotype']]
            else:
                found = content.get('stereotypes') or []
            for stereotype in found:
                stereotype = normalize(stereotype)
                all_content[stereotype] = all_content.get(stereotype, 0) + 1
    return all_content

In [None]:
# One row of stereotype counts per model. The rows are collected in a list and
# converted to a frame once: DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every call before that.
stereotype_rows = []

for file_name in all_file_names:
    # The context manager closes the file even when the JSON cannot be parsed.
    with open(file_name, encoding="ISO-8859-1", mode="r") as file:
        data = json.loads(file.read())
    if 'model' in data:
        contents = data['model']['contents']
        model_stereotypes = get_all_stereotypes(contents, {})
        model_stereotypes['Name'] = file_name.split('/')[0]
        stereotype_rows.append(model_stereotypes)
    else:
        print(f"ERROR: Model not found in {file_name}.")

# Keep the 'Name' column available even if no model could be read.
if stereotype_rows:
    df_stereotypes = pd.DataFrame(stereotype_rows)
else:
    df_stereotypes = pd.DataFrame(columns=['Name'])

# A stereotype that does not occur in a model is counted as 0.
df_stereotypes = df_stereotypes.fillna(0)
df_stereotypes = df_stereotypes.set_index('Name')
df_stereotypes = df_stereotypes.astype(int)

print(f"We have stereotypes for {len(df_stereotypes)} ontologies.")

In [None]:
df_stereotypes.head()

Out of curiosity, what are the most popular stereotypes?

In [None]:
df_stereotypes.sum().sort_values(ascending=False)[0:10]

Keeping only the models that can be processed by the algorithm

In [None]:
algorithm_stereotypes = [
    'subkind', 'kind', 'role', 'relator', 'category', 
    'event', 'rolemixin', 'mode', 'phase', 'collective',  
    'datatype', 'quality', 'mixin', 'quantity', 
    'enumeration', 'phasemixin'
]

In [None]:
# Columns of df_stereotypes that are not in the list of supported stereotypes.
subset = df_stereotypes.columns.difference(algorithm_stereotypes)
# A model is unsupported as soon as the counts in those columns sum to > 0.
not_supported_models = df_stereotypes[df_stereotypes[subset].sum(axis=1) > 0].index
print(f"Number of models that contains not supported class stereotypes: {len(not_supported_models)}")

# Keep the remaining models and drop the unsupported stereotype columns.
df_models = df_stereotypes.loc[~df_stereotypes.index.isin(not_supported_models), 
                               ~df_stereotypes.columns.isin(subset)]

In [None]:
# Model files whose folder name is still present in df_models.
potential_file_names = [
    path
    for path in all_file_names
    if path.split('/')[0] in df_models.index
]
print(f"Number of models that can be processed: {len(potential_file_names)}")

In [None]:
def get_content(contents, all_content) -> dict:
    """Count classes, relations, generalizations and part-of relations.

    Args:
        contents: list of element dicts, each with a 'type' key, or None.
            Elements of type 'Package' are descended into via their
            'contents'.
        all_content: dict with the integer counters 'Classes', 'Relations',
            'Generalizations' and 'PartOf'. It is updated in place.

    Returns:
        The same ``all_content`` dict that was passed in.

    Every 'Relation' and every 'Generalization' increases 'Relations'.
    A relation additionally increases 'PartOf' when one of its first two
    properties has aggregationKind 'COMPOSITE'.
    """
    for content in contents or []:
        element_type = content['type']
        if element_type == 'Package':
            # .get: a package without a 'contents' key is treated as empty.
            get_content(content.get('contents'), all_content)
        elif element_type == 'Class':
            all_content['Classes'] += 1
        elif element_type == 'Relation':
            # Only the first two properties are inspected. A relation with
            # missing or incomplete properties is still counted as a
            # relation, just not as a part-of relation.
            ends = (content.get('properties') or [])[:2]
            if any((end or {}).get('aggregationKind') == 'COMPOSITE' for end in ends):
                all_content['PartOf'] += 1
            all_content['Relations'] += 1
        elif element_type == 'Generalization':
            all_content['Generalizations'] += 1
            all_content['Relations'] += 1
    return all_content

In [None]:
# One row of size statistics per potential model. The rows are collected in a
# list and converted to a frame once: DataFrame.append was removed in
# pandas 2.0.
potential_rows = []

for file_name in potential_file_names:
    # The context manager closes the file even when the JSON cannot be parsed.
    with open(file_name, encoding="ISO-8859-1", mode="r") as file:
        data = json.loads(file.read())

    # The element list is either at the top level or nested under 'model'.
    contents = None
    if 'contents' in data:
        contents = data['contents']
    elif 'model' in data:
        contents = data['model']['contents']
    else:
        print(f"ERROR: Neither model nor contents found in {file_name}.")

    # With contents=None the counters stay at zero, so the model still gets
    # a row.
    all_content = get_content(contents,
                              {
                                  'Classes': 0,
                                  'Relations': 0,
                                  'PartOf': 0,
                                  'Generalizations': 0
                              })
    all_content['Name'] = file_name.split('/')[0]
    potential_rows.append(all_content)

df_potential = pd.DataFrame(
    potential_rows,
    columns=['Name', 'Classes', 'Relations', 'Generalizations', 'PartOf'],
)
df_potential = df_potential.fillna(0)
df_potential = df_potential.set_index('Name')
df_potential = df_potential.astype(int)

print(f"We have statistics for {len(df_potential)} models.")

In [None]:
df_potential.describe()

In [None]:
# Total size of a model = number of classes + number of relations.
df_potential['TotalSize'] = df_potential[['Classes', 'Relations']].sum(axis=1)
# Show the ten largest models.
print(df_potential['TotalSize'].sort_values(ascending=False).head(10))

In [None]:
# Assign every model to a size class based on its total size.
total_size = df_potential['TotalSize']
conditions = [
    (total_size >= 1000),
    (total_size < 1000) & (total_size >= 200),
    (total_size < 200) & (total_size >= 75),
    (total_size < 75) & (total_size >= 35),
    (total_size < 35)
]
values = ['super big', 'big', 'medium', 'small', 'super small']
# An explicit string default is required: the implicit default of 0 has no
# common dtype with the string choices, which newer NumPy releases reject
# with a TypeError. The conditions cover every integer, so the default is
# never actually used.
df_potential['Model size'] = np.select(conditions, values, default='')
df_potential.head()

In [None]:
os.chdir(dir_images)
os.getcwd()

In [None]:
# Scatter plot of all potential models: one series per size class, the marker
# area reflects the total size of the model.
fig, ax = plt.subplots()
fig.set_figwidth(15)
fig.set_figheight(7)

for (t, c) in colors.items():
    sel_df = df_potential[df_potential['Model size'] == t]
    # No cmap here: it is ignored (with a warning) when c is a single named
    # color rather than numeric data.
    scatter = ax.scatter(sel_df['Classes'], sel_df['Relations'], s=sel_df['TotalSize'],
                         alpha=0.5, c=c, label=t)

#plt.title("Size of conceptual models", fontsize=16)
plt.xlabel("Number of classes", fontsize=16)
plt.ylabel("Number of relations", fontsize=16)

lgnd = plt.legend(markerscale=1, scatterpoints=1, fontsize=14)

# Give all legend markers the same size, independent of the data sizes.
# Legend.legendHandles was renamed to legend_handles in matplotlib 3.7 and the
# old name was removed later, so use whichever attribute exists.
if hasattr(lgnd, 'legend_handles'):
    legend_markers = lgnd.legend_handles
else:
    legend_markers = lgnd.legendHandles
for legend_marker in legend_markers:
    legend_marker.set_sizes([50])
#plt.show()
plt.savefig('all_models.png')

### List of valid models

Send each potential model to `api.ontouml.org` and check it for validity.

In [None]:
headers = {
    'Accept': "application/json",
    'Connection': "keep-alive"
}

In [None]:
url_verify = "http://api.ontouml.org/v1/verify"

List of fixed models:
1. In bernasconi2023fair-principles ontology 1 error was found
2. In goncalves2011ecg ontology 1 error was found
3. In gomes2022digital-technology ontology 1 error was found
4. In eu-rent-refactored2022 ontology 2 errors were found
5. In health-organizations ontology 5 errors were found
6. In srro-ontology ontology 2 errors were found
7. In aguiar2019ooco ontology 3 errors were found
8. In nardi2015ufo-s ontology 1 error was found

In [None]:
os.chdir(dir_models)
os.getcwd()

In [None]:
# Send every potential model to the verification service and keep the ones
# for which it reports no errors.
valid_file_names = []

for file_name in potential_file_names:
    # The context manager closes the file even when the JSON cannot be parsed.
    with open(file_name, encoding="ISO-8859-1", mode="r") as file:
        data = json.loads(file.read())

    body = {'project': data}
    response = requests.post(url_verify, headers=headers, json=body)
    verification = json.loads(response.text)
    model_name = file_name.split('/')[0]

    if 'result' not in verification:
        # The service answered with something other than a list of issues;
        # report it and do not treat the model as valid.
        print(f"{model_name}: {verification.get('message', response.text)}")
        continue

    responseResults = verification['result']
    if len(responseResults) == 0:
        valid_file_names.append(file_name)
    else:
        print(f"In {model_name} ontology {len(responseResults)} errors were found")

print(f"Number of valid ontologies is {len(valid_file_names)}")

In [None]:
print(f"Number of valid ontologies is {len(valid_file_names)}")   
print(f"Number of potential ontologies is {len(potential_file_names)}")   

In [None]:
# Size statistics restricted to the models that passed the validity check.
valid_model_names = [path.split('/')[0] for path in valid_file_names]
df_valid = df_potential.loc[df_potential.index.isin(valid_model_names), :]
df_valid.head()

In [None]:
os.chdir(dir_images)
os.getcwd()

In [None]:
# Scatter plot of the valid models: one series per size class plus the
# highlighted library model; the marker area reflects the total size.
fig, ax = plt.subplots()
fig.set_figwidth(15)
fig.set_figheight(7)

library = df_valid.loc['romanenko2023what']
for (t, c) in colors.items():
    sel_df = df_valid[df_valid['Model size'] == t]
    # No cmap here: it is ignored (with a warning) when c is a single named
    # color rather than numeric data.
    scatter = ax.scatter(sel_df['Classes'], sel_df['Relations'], s=sel_df['TotalSize'],
                         alpha=0.5, c=c, label=t)

ax.scatter(library['Classes'], library['Relations'], s=library['TotalSize'],
           alpha=1.0, c='red', marker='s', label='library model')

#plt.title("Size of conceptual models", fontsize=16)
plt.xlabel("Number of classes", fontsize=16)
plt.ylabel("Number of relations", fontsize=16)

lgnd = plt.legend(markerscale=1, scatterpoints=1, fontsize=14)

# Give all legend markers the same size, independent of the data sizes.
# Legend.legendHandles was renamed to legend_handles in matplotlib 3.7 and the
# old name was removed later, so use whichever attribute exists.
if hasattr(lgnd, 'legend_handles'):
    legend_markers = lgnd.legend_handles
else:
    legend_markers = lgnd.legendHandles
for legend_marker in legend_markers:
    legend_marker.set_sizes([50])

plt.savefig('valid_models.png')

----
## Running abstractions on different sets

### Checking valid models

In [None]:
os.chdir(dir_models)
os.getcwd()

In [None]:
url_abstract = "https://expose.eng.unibz.it/abstract"

In [None]:
atypes = {
    "h": ['hierarchy'], 
    "a": ['aspects'], 
    "p": ['parthood'],
    "ha": ['hierarchy', 'aspects'],
    "ap": ['parthood', 'aspects'],
    "hp": ['parthood', 'hierarchy'],
    "full": ['parthood', 'hierarchy', 'aspects']
}

In [None]:
%%time
for file_name in valid_file_names:
    model_name = file_name.split(os.path.sep)[0]
    file = open(file_name, encoding="ISO-8859-1", mode="r")
    data = json.loads(file.read())
    
    for abstr_name, abstr_params in atypes.items():
        response = requests.post(url_abstract, headers=headers,
                                 json={
                                     'abs_type': abstr_params,
                                     'long_names': True,
                                     'mult_relations': False,
                                     'keep_relators': True,
                                     'in_format': 'json',
                                     'out_format': 'json',
                                     'height': 1000,
                                     'width': 1000,
                                     'origin': data
                                     #'origin': json.load(open(file_name))
                                 })

        if response.ok:
            new_file_name = f"{dir_abstractions}{os.path.sep}{model_name}_{abstr_name}.json"
            with open(new_file_name, 'w') as f:
                json.dump(response.json(), f)

print(f"All valid models were processed.")

In [None]:
os.chdir(dir_abstractions)

In [None]:
# Every JSON file in the current folder is one stored abstraction.
abstraction_file_names = glob.glob("*.json")

print(f"We have {len(abstraction_file_names)} files with abstractions.")

__Validation check of abstracted models__

In [None]:
%%time
df_abstract = pd.DataFrame(columns=['Name', 'Classes', 'Relations', 'Generalizations', 'PartOf'])

for file_name in abstraction_file_names:
    file = open(file_name, encoding="ISO-8859-1", mode="r")
    data = json.loads(file.read())
    contents = None
    if 'contents' in data.keys():
        contents = data['contents']
    elif 'model' in data.keys():
        contents = data['model']['contents']
    else:
        print(f"ERROR: Neither model nor contents found in {file_name}.")
    file.close()
    
    all_content = get_content(contents, 
                              {
                                  'Classes': 0, 
                                  'Relations': 0,
                                  'PartOf': 0,
                                  'Generalizations': 0
                              })
    all_content['Name'] = file_name.split('/')[0][:-5]
    
    response = requests.post(url_verify, headers=headers, json={'project': data})
    responseResults = json.loads(response.text)
    if 'result' not in responseResults:
        print(all_content['Name'] + ": " + responseResults['message'])
    df_abstract = df_abstract.append(all_content, ignore_index = True)
    
df_abstract = df_abstract.fillna(0)

print(f"We have statistics for {len(df_abstract)} abstractions.")

In [None]:
anames = {
    "h": 'hierarchy', 
    "a": 'aspects', 
    "p": 'parthood',
    "ha": 'aspects and hierarchy',
    "ap": 'parthood and aspects',
    "hp": 'parthood and hierarchy',
    "full": 'full abstraction'
}

In [None]:
df_abstract["TotalSize"] = df_abstract["Classes"] + df_abstract["Relations"]
# Placeholder that fixes the position of the column; it is filled in below.
df_abstract['Model size'] = ""
# Names look like "<model>_<abstraction key>": split once, at the last
# underscore. n is passed by keyword because positional arguments other than
# the pattern are deprecated for the .str split methods.
name_parts = df_abstract["Name"].str.rsplit('_', n=1)
df_abstract["Type of abstraction"] = name_parts.str[1].map(anames)
df_abstract["Name"] = name_parts.str[0]
# An abstraction inherits the size class of the model it was derived from.
size_dict = df_valid['Model size'].to_dict()
df_abstract['Model size'] = df_abstract['Name'].map(size_dict)

In [None]:
df_abstract.head()

In [None]:
original_models = df_valid.copy(deep=True).reset_index()
original_models["Type of abstraction"] = 'original model'
original_models.head()

In [None]:
df_abstract = pd.concat([df_abstract,original_models], ignore_index=True)

In [None]:
df_abstract[df_abstract['Name']=='romanenko2023what']

In [None]:
# Order in which the abstraction types are sorted; the same list is used for
# the x tick labels of the line plots.
abs_types = ['original model', 'aspects', 'parthood', 
             'parthood and aspects', 'hierarchy', 
             'aspects and hierarchy', 'parthood and hierarchy',
             'full abstraction']
# An ordered categorical makes sort_values follow abs_types instead of the
# alphabetical order of the labels.
abstraction_type = CategoricalDtype(abs_types, ordered=True)
df_abstract['Type of abstraction'] = df_abstract['Type of abstraction'].astype(abstraction_type)
df_abstract.sort_values(by='Type of abstraction', inplace=True)

In [None]:
df_abstract.head(10)

In [None]:
os.chdir(dir_images)
os.getcwd()

In [None]:
df_abstract.to_excel("output.xlsx", sheet_name='abstractions')

In [None]:
def wrap_labels(ax, width, break_long_words=False):
    """Replace the x tick labels of *ax* with line-wrapped versions.

    Each label text is wrapped with ``textwrap.fill`` to lines of at most
    *width* characters; words longer than *width* are only split when
    *break_long_words* is true. The labels are set without rotation.
    """
    wrapped = [
        textwrap.fill(tick.get_text(), width=width,
                      break_long_words=break_long_words)
        for tick in ax.get_xticklabels()
    ]
    ax.set_xticklabels(wrapped, rotation=0)

In [None]:
# Number of classes per abstraction type, one line per model size class.
fig, ax = plt.subplots(figsize=(15, 7))
# The dict itself is passed as palette so that every size class keeps the
# color assigned to it in `colors`; a bare sequence of colors is handed out
# in the order the classes are encountered, which need not match the dict.
sns.lineplot(x="Type of abstraction", y="Classes",
             hue="Model size", palette=colors,
             data=df_abstract, ax=ax, sort=False)
ax.set_xticklabels(abs_types)
wrap_labels(ax, 18)
sns.despine(offset=10, trim=True)
plt.xlabel('Type of abstraction', fontsize=16)
plt.ylabel('Number of classes', fontsize=16)
#plt.show()
plt.savefig('classes_compression.png')

In [None]:
# Number of relations per abstraction type, one line per model size class.
fig, ax = plt.subplots(figsize=(15, 7))
# The dict itself is passed as palette so that every size class keeps the
# color assigned to it in `colors`; a bare sequence of colors is handed out
# in the order the classes are encountered, which need not match the dict.
sns.lineplot(x="Type of abstraction", y="Relations",
             hue="Model size", palette=colors,
             data=df_abstract, ax=ax, sort=False)
ax.set_xticklabels(abs_types)
wrap_labels(ax, 18)
sns.despine(offset=10, trim=True)
plt.xlabel('Type of abstraction', fontsize=16)
plt.ylabel('Number of relations', fontsize=16)
#plt.show()
plt.savefig('relations_compression.png')

### Checking potential models

In [None]:
os.chdir(dir_models)
os.getcwd()

In [None]:
# Potential models that did not pass validation. The order of
# potential_file_names is kept so that repeated runs process (and report) the
# models in the same order; a plain set difference has no defined order.
valid_lookup = set(valid_file_names)
error_file_names = [name for name in potential_file_names if name not in valid_lookup]
len(error_file_names)

In [None]:
%%time
for file_name in error_file_names:
    model_name = file_name.split(os.path.sep)[0]
    
    for abstr_name, abstr_params in atypes.items():
        file = open(file_name, encoding="ISO-8859-1", mode="r")
        data = json.loads(file.read())
        response = requests.post(url_abstract, headers=headers,
                                 json={
                                     'abs_type': abstr_params,
                                     'long_names': True,
                                     'mult_relations': False,
                                     'keep_relators': True,
                                     'in_format': 'json',
                                     'out_format': 'json',
                                     'height': 1000,
                                     'width': 1000,
                                     'origin': data
                                 })

        if response.ok:
            new_file_name = f"{dir_errors}{os.path.sep}{model_name}_{abstr_name}.json"
            with open(new_file_name, 'w') as f:
                json.dump(response.json(), f)

print(f"All models with errors were processed.")

In [None]:
df_error = pd.DataFrame(columns=['Name'] + abs_types)

In [None]:
# Number of validation errors of every original (not abstracted) model. The
# rows are collected in a list and converted to a frame once: DataFrame.append
# was removed in pandas 2.0.
error_rows = []
for file_name in error_file_names:
    # The context manager closes the file even when the JSON cannot be parsed.
    with open(file_name, encoding="ISO-8859-1", mode="r") as file:
        data = json.loads(file.read())

    body = {'project': data}
    response = requests.post(url_verify, headers=headers, json=body)
    responseResults = json.loads(response.text)['result']
    error_rows.append({'Name': file_name.split('/')[0],
                       'original model': len(responseResults)})

# The columns for the abstraction types stay empty here.
df_error = pd.DataFrame(error_rows, columns=['Name'] + abs_types)
df_error = df_error.set_index('Name')

In [None]:
os.chdir(dir_errors)

In [None]:
# Every JSON file in the current folder is one stored abstraction.
abstraction_error_file_names = glob.glob("*.json")

print(f"We have {len(abstraction_error_file_names)} files with abstractions.")

In [None]:
%%time
for file_name in abstraction_error_file_names:
    file = open(file_name, encoding="ISO-8859-1", mode="r")
    data = json.loads(file.read())
    response = requests.post(url_verify, headers=headers, json={'project': data})
    responseResults = json.loads(response.text)
    if 'result' not in responseResults:
        print(file_name + ": " + responseResults['message'])
    else:
        name, abstraction = file_name.rsplit('_', 1)
        df_error.loc[name, anames[abstraction[:-5]]] = len(responseResults['result'])
df_error = df_error.fillna(0)
df_error = df_error.astype(int)

In [None]:
df_error.describe()

### Complete check of models

In [None]:
os.chdir(dir_models)
os.getcwd()

In [None]:
%%time
all_models = 0
for idx, file_name in enumerate(valid_file_names):
    model_name = file_name.split(os.path.sep)[0]
    print(f"({idx}) {model_name}")
    
    file = open(file_name, encoding="ISO-8859-1", mode="r")
    data = json.loads(file.read())
    rule = "start"
    applied_rules = []
    
    while rule:
        response = requests.post(url_abstract, headers=headers,
                                 json={
                                     'abs_type': [],
                                     'long_names': True,
                                     'mult_relations': False,
                                     'keep_relators': True,
                                     'in_format': 'json',
                                     'out_format': 'expo',
                                     'height': 1000,
                                     'width': 1000,
                                     'origin': data
                                 })
        if response.ok:
            all_models += 1
            abstraction = json.loads(response.text)
            rule = abstraction["rule"]
            applied_rules.append(rule)
            data = abstraction["origin"]
            
            response = requests.post(url_verify, headers=headers, json={'project': data})
            responseResults = json.loads(response.text)
            if 'result' not in responseResults:
                print(f"ERROR: Not valid abstraction of {model_name} at step {len(applied_rules)}.")
                print(responseResults['message'])    
                # rule = ""
                # break
            
        else:
            print(f"ERROR: Cannot abstract model {model_name} at step {len(applied_rules)}.")
            rule = ""
            break
    
    print(", ".join(applied_rules)[:-2])  
print(f"Total number of all models is {all_models}")