In [None]:
import json
import pandas as pd

# Projects whose exported analysis results are evaluated in this notebook.
# Commented-out entries are skipped.
projects = [
    'DigiBatMat', 'DIGITRUBBER', 'DiProMag',
    # 'DiStAl',
    'GlasDigital',
    # 'iBain',
    'KNOW-NOW', 'KupferDigital', 'LeBeDigital',
    'ODE_AM', 'PMDao_MO', 'PMDao_TTO',
    'SensoTwin', 'SmaDi', 'StahlDigital',
]

# Maps each project name to the parsed content of '<project>/<project>.json'.
data = {}
for ont in projects:
    with open(f'{ont}/{ont}.json', mode='r', encoding='utf-8') as fh:
        data[ont] = json.load(fh)

## Used Top-Level-Ontologies
For each of the provided ontologies the use of TLOs was analyzed. This was achieved by counting rdfs:subClassOf and rdfs:subPropertyOf chains for which the subject belongs to the project's namespace and the object belongs to the TLO's namespace. For example, the SPARQL query for the usage of the PMD Core Ontology (v2.0.x) in the SensoTwin project reads:
```sparql
SELECT (COUNT(*) as ?subcount)
WHERE {
    ?ao rdfs:subClassOf+|rdfs:subPropertyOf+ ?tlo .
    FILTER( STRSTARTS( STR(?tlo), "https://w3id.org/pmd/co" ) ) .
    FILTER( STRSTARTS( STR(?ao), "http://w3id.org/sensotwin/applicationontology" ) ) .
}
```

In [None]:
# One row per project, one column per TLO; each cell holds the
# 'subclassesproperties' value stored for that TLO in the reasoned results.
subclass_usage = {}
for ont, item in data.items():
    reasoned = item['tlos']['reasoned']
    subclass_usage[ont] = {tlo: stats['subclassesproperties'] for tlo, stats in reasoned.items()}
pd.DataFrame(subclass_usage).T

```sparql
SELECT (COUNT(*) as ?count)
WHERE {
    ?ao ?p ?o .
    FILTER( STRSTARTS( STR(?p), "https://w3id.org/pmd/co" ) ) .
    FILTER( STRSTARTS( STR(?ao), "http://w3id.org/sensotwin/applicationontology" ) ) .
}
```

```sparql
SELECT (COUNT(*) as ?count)
WHERE {
    ?ao ?p ?o .
    FILTER( STRSTARTS( STR(?o), "https://w3id.org/pmd/co" ) ) .
    FILTER( STRSTARTS( STR(?ao), "http://w3id.org/sensotwin/applicationontology" ) ) .
}
```

In [None]:
# One row per project, one column per TLO; each cell is the sum of the
# 'objects' and 'predicates' values stored for that TLO in the reasoned results.
triple_usage = {}
for ont, item in data.items():
    per_tlo = {}
    for tlo, stats in item['tlos']['reasoned'].items():
        per_tlo[tlo] = stats['objects'] + stats['predicates']
    triple_usage[ont] = per_tlo
pd.DataFrame(triple_usage).T

## Overall defined concepts
The overall number of introduced concepts was analysed. For that, the project's ontology as well as the applicable pmdco were loaded into Protégé and a reasoner was run. On the resulting graph, the following query was executed (shown here for `owl:Class`es in SensoTwin):

```sparql
SELECT (COUNT(*) as ?classcount)
WHERE {
    ?class a owl:Class .
    FILTER STRSTARTS( ?class, "http://w3id.org/sensotwin/applicationontology" ) .
}
```

The table below shows the respective numbers of found definitions.

In [None]:
# Per project: the stored definition counts for the three OWL kinds, their
# sum, and a '<reasoner>-<version>' label.
concept_kinds = ('owl:Class', 'owl:ObjectProperty', 'owl:DatatypeProperty')

concepts = {}
for ont, item in data.items():
    counts = item['definitioncounts']
    reasoner = item['reasoner']
    row = {kind: counts[kind] for kind in concept_kinds}
    row['Total'] = counts['owl:Class'] + counts['owl:ObjectProperty'] + counts['owl:DatatypeProperty']
    row['Reasoner'] = f"{reasoner['reasoner']}-{reasoner['version']}"
    concepts[ont] = row
pd.DataFrame(concepts).T

## Number of ProcessingNodes, ValueObjects, Processes, Objects (pmdco-2.0.x) and ProcessNodes (pmdco-v0.1-beta)
To get an overview of the usage of the PMD Core Ontology, the number of subclasses of ProcessingNode, ValueObject, Process and Object was determined. For that, the project's ontology as well as the applicable pmdco were loaded into Protégé and a reasoner was run. On the resulting graph, the following query was executed (shown here for sub-classes of ProcessingNode in SensoTwin):

```sparql
SELECT ?classname
WHERE {
    ?x rdfs:subClassOf+ <https://w3id.org/pmd/co/ProcessingNode> .
    BIND(STR(?x) AS ?classname) .
    FILTER STRSTARTS( ?classname, "http://w3id.org/sensotwin/applicationontology" ) .
}
```

The table below shows the respective numbers of found definitions.

In [None]:
# Per project: the stored subclass counts for the pmdco-2.0.7 and
# pmdco-v0.1-beta entries, their totals, and a '<reasoner>-<version>' label.
pmdusage = {}
for ont, item in data.items():
    nodes_v2 = item['processingnodes']['pmdco-2.0.7']['count']
    values_v2 = item['valueobjects']['pmdco-2.0.7']['count']
    processes_v2 = item['processes']['pmdco-2.0.7']['count']
    objects_v2 = item['objects']['pmdco-2.0.7']['count']
    nodes_beta = item['processingnodes']['pmdco-v0.1-beta']['count']
    total_v2 = nodes_v2 + values_v2 + processes_v2 + objects_v2
    reasoner = item['reasoner']

    pmdusage[ont] = {
        'ProcessingNode (2.0.x)': nodes_v2,
        'ValueObject (2.0.x)': values_v2,
        'Process (2.0.x)': processes_v2,
        'Object (2.0.x)': objects_v2,
        'Total (2.0.x)': total_v2,
        'ProcessNode (v0.1-beta)': nodes_beta,
        # Only one concept is counted for v0.1-beta, so its total equals that count.
        'Total (v0.1-beta)': nodes_beta,
        'Total (both)': total_v2 + nodes_beta,
        'Reasoner': f"{reasoner['reasoner']}-{reasoner['version']}",
    }
pd.DataFrame(pmdusage).T

## Used Licenses
The following table summarizes the referenced licenses. The SPARQL used for finding this information reads:
```sparql
SELECT ?lic
WHERE {
    ?x <http://purl.org/dc/terms/license>|<http://purl.org/dc/elements/1.1/license> ?lic .
}
```

In [None]:
def license_cleanup(license):
    """Map a raw license reference to a short label.

    All ``<`` and ``>`` characters are removed first. If the remaining text
    starts with one of the known Creative Commons URL prefixes, the matching
    label is returned (the "unspecified" URL maps to an empty string).
    Otherwise the bracket-free text itself is returned.
    """
    # Insertion order is the lookup order; the first matching prefix wins.
    known_prefixes = {
        'https://creativecommons.org/licenses/by/4.0': 'CC BY 4.0',
        'http://creativecommons.org/licenses/by/4.0': 'CC BY 4.0',
        'https://creativecommons.org/licenses/by-sa/4.0/': 'CC BY-SA 4.0',
        'https://creativecommons.org/licenses/unspecified': '',
    }
    bare = license.translate(str.maketrans('', '', '<>'))
    return next(
        (label for prefix, label in known_prefixes.items() if bare.startswith(prefix)),
        bare,
    )

# Clean up first and de-duplicate afterwards: several raw references can map
# to the same label (e.g. the http and https forms of the CC BY 4.0 URL), and
# de-duplicating the raw values would list that label twice.
# Empty labels are dropped so they do not leave stray ', ' separators, and the
# labels are sorted so the table is identical on every run (set iteration order
# for strings is not stable between interpreter runs).
licenses = {
    ont: {
        'used_licenses': ', '.join(sorted(
            {label for label in map(license_cleanup, item['license']['items']) if label}
        ))
    }
    for ont, item in data.items()
}
pd.DataFrame(licenses).T

## Contributors

In [None]:
import re
import rdflib
from IPython.display import display, HTML

def pp(df):
    """Display ``df`` as an HTML table with line breaks inside cells.

    Every backslash-n escape sequence (the two characters ``\\`` and ``n``)
    in the HTML produced by ``df.to_html()`` is replaced by ``<br>`` before
    the markup is handed to IPython's ``display``. Returns whatever
    ``display`` returns.
    """
    markup = df.to_html()
    markup = markup.replace('\\n', '<br>')
    return display(HTML(markup))

def orcid_resolve(string):
    """Annotate an ORCID reference with the name found in its ORCID record.

    Args:
        string: A creator/contributor entry. If it starts with an
            ``https://orcid.org/`` URI (optionally preceded by ``<``), it is
            treated as an ORCID reference.

    Returns:
        ``'<orcid> (<uri|literal>) -> <given names> <family names>'`` for ORCID
        references, where the kind is ``uri`` only if ``string`` is exactly the
        ORCID URI wrapped in angle brackets. Any other string is returned
        unchanged.

    Raises:
        Whatever ``rdflib.Graph.parse`` raises when the ORCID URI cannot be
        fetched or parsed.
    """
    # The dot is escaped so that only the literal host 'orcid.org' matches.
    m = re.match(r"<?(https://orcid\.org/(\d{4}-\d{4}-\d{4}-\d{3}[\dX]))>?", string)
    if not m:
        return string

    orcid = m.group(1)
    stype = 'uri' if f'<{orcid}>' == string else 'literal'

    g = rdflib.Graph()
    g.parse(orcid)  # fetches the record from the ORCID URI

    # Given names first, then family names, as in the displayed result.
    names = []
    for predicate in ('givenName', 'familyName'):
        rows = g.query(
            f"""
                SELECT ?value WHERE {{
                    <{orcid}> <http://xmlns.com/foaf/0.1/{predicate}> ?value .
                }}
            """
        )
        names.extend(str(row[0]) for row in rows)

    name = ' '.join(names)
    return f'{orcid} ({stype}) -> {name}'

# De-duplicate the raw entries and sort them so that the rendered cell is
# identical on every run (set iteration order for strings is not stable between
# interpreter runs). Entries are joined with '\n', which pp() renders as <br>.
contributors = {
    ont: {
        'creators_contributors': '\n'.join(
            map(orcid_resolve, sorted(set(item['creators_contributors']['items'])))
        )
    }
    for ont, item in data.items()
}
df = pd.DataFrame(contributors).T
pp(df)

## Namespaces
To analyze which namespaces were used in the projects' T-Boxes, the ontology files were parsed for all occurrences of semantically valid URIs (`'<(https?:\/\/([0-9A-z-_~\.]+[\/|#])+)'`). The list of URIs was stored (`requests_raw.xlsx`) and manually curated with applicable namespace identifiers (`requests.xlsx`). This approach was necessary, as in some of the ontology files wrong or ambiguous identifiers were used (e.g. `http://material-digital.de/pmdco/` instead of `https://material-digital.de/pmdco/`). For all URIs it was tested whether they are dereferenceable. If so, it was checked whether they allow for content negotiation and deliver some OWL serialization.

In [None]:
import requests
from ipywidgets import IntProgress
from IPython.display import display

# RDF serializations requested via content negotiation.
mime_types = ['text/turtle','application/rdf+xml','application/ld+json','application/n-triples']
# Seconds to wait for a server; without a timeout a single unresponsive host
# can block the notebook indefinitely.
REQUEST_TIMEOUT = 30

# 'accept':   responded with one of the requested media types
# 'noaccept': responded with some other media type
# 'error':    the request failed or the response carried no content-type header
res = {
    'accept': {},
    'noaccept': {},
    'error': {}
}

# Sorted so that the probing order (and thus the row order of `responses`) is
# the same on every run.
all_namespaces = sorted(set(x for ds in data.values() for x in ds['namespaces']['items']))

progress = IntProgress(min=0, max=len(all_namespaces))
display(progress)

for uri in all_namespaces:
    progress.value += 1
    try:
        req = requests.head(
            uri,
            headers={'Accept': ','.join(mime_types)},
            allow_redirects=True,
            timeout=REQUEST_TIMEOUT,
        )
        # A missing header raises KeyError and is recorded as an error below.
        content_type = req.headers['content-type']
        # Compare only the media type: the header may carry parameters such as
        # '; charset=utf-8', and media types are case-insensitive.
        media_type = content_type.split(';', 1)[0].strip().lower()
        bucket = 'accept' if media_type in mime_types else 'noaccept'
        # The raw header value is stored unchanged.
        res[bucket][uri] = {'status_code': req.status_code, 'content_type': content_type}
    except Exception as e:
        # Best effort: keep probing the remaining namespaces and record the failure.
        res['error'][uri] = {'error': e}

# Failed requests (res['error']) are intentionally not part of `responses`.
responses = pd.concat((
    pd.DataFrame(res['accept']).T,
    pd.DataFrame(res['noaccept']).T,
))

import json
# Load the stored per-URI records, bring them in line with the fresh probe
# results, and write them back.
with open('requests.json', 'r', encoding='utf8') as jf:
    requests_data = json.load(jf)

# Probed field -> wording used in the progress messages.
field_labels = {'status_code': 'status code', 'content_type': 'content type'}

for uri, probe in responses.iterrows():
    if uri not in requests_data:
        # Unknown URI: store the probe result; the namespace assignments
        # ('countas_tlo' / 'countas_ao') start out empty.
        requests_data[uri] = {
            'status_code': probe['status_code'],
            'content_type': probe['content_type'],
            'countas_tlo': None,
            'countas_ao': None,
        }
        print(f'added {uri} to requests_data (with empty information)')
        continue

    # Known URI: refresh only the fields whose probed value differs.
    record = requests_data[uri]
    for field, label in field_labels.items():
        if probe[field] != record[field]:
            record[field] = probe[field]
            print(f'updated {label} for {uri}')

with open('requests.json', 'w', encoding='utf8') as jf:
    json.dump(requests_data, jf, indent=4)

## Namespace identifiers
The following uri(-stubs) were collected into the respective namespace identifiers. 

In [None]:
with open('requests.json', 'r', encoding='utf8') as jf:
    requests_data = json.load(jf)

# Collect the URIs per TLO identifier; records without a 'countas_tlo'
# value are skipped.
tlo_uris = {}
for uri, record in requests_data.items():
    label = record['countas_tlo']
    if label:
        tlo_uris.setdefault(label, []).append(uri)

# One row per identifier with its URIs as a comma-separated string.
tlodict = {label: {'uris': ', '.join(uris)} for label, uris in tlo_uris.items()}
pd.set_option('display.max_colwidth', None)  # show the URI lists untruncated
pd.DataFrame(tlodict).T

In [None]:
# Collect the URIs per AO identifier; records without a 'countas_ao'
# value are skipped.
ao_uris = {}
for uri, record in requests_data.items():
    label = record['countas_ao']
    if label:
        ao_uris.setdefault(label, []).append(uri)

# One row per identifier with its URIs as a comma-separated string.
aodict = {label: {'uris': ', '.join(uris)} for label, uris in ao_uris.items()}
pd.set_option('display.max_colwidth', None)  # show the URI lists untruncated
pd.DataFrame(aodict).T

### TLO usage
The TLOs used are listed in the table below. Trivial cases like `owl` and `rdfs` are also included. The column `Sum` denotes the number of evaluated A-Boxes that use concepts belonging to the respective namespace.

In [None]:
pd.reset_option('display.max_colwidth')

# TLO identifier -> {'uris': [all URIs assigned to that identifier]}
tlodict = {}
for uri, record in requests_data.items():
    label = record['countas_tlo']
    if label:
        tlodict.setdefault(label, {'uris': []})['uris'].append(uri)

# 1 if the project lists at least one URI of the identifier among its
# namespaces, otherwise 0.
tlo_usage = {}
for proj, content in data.items():
    used = content['namespaces']['items']
    tlo_usage[proj] = {
        label: int(any(uri in used for uri in entry['uris']))
        for label, entry in tlodict.items()
    }

dftlo = pd.DataFrame(tlo_usage)
dftlo.insert(loc=dftlo.shape[1], column='Sum', value=dftlo.sum(axis=1))
dftlo['name'] = dftlo.index
# Most-used identifiers first; ties are broken alphabetically.
dftlo = dftlo.sort_values(by=['Sum', 'name'], ascending=[False, True])
dftlo

### Inter-AO usage
The table below shows the usage of concepts from other projects' namespaces. It can easily be seen that no concept usage between the projects is visible in the current state.

In [None]:
# AO identifier -> {'uris': [all URIs assigned to that identifier]}
aodict = dict()
for tkey, tval in requests_data.items():
    if not tval['countas_ao']:
        continue
    # Group by the AO identifier. Testing 'countas_tlo' here would either
    # overwrite the URIs already collected for an identifier or raise a
    # KeyError when appending.
    if tval['countas_ao'] not in aodict:
        aodict[tval['countas_ao']] = {'uris': [tkey]}
    else:
        aodict[tval['countas_ao']]['uris'].append(tkey)

# 1 if the project lists at least one URI of the identifier among its
# namespaces, otherwise 0.
dfao = pd.DataFrame({
    proj: {
        key: int(any(uri in data[proj]['namespaces']['items'] for uri in aodict[key]['uris']))
        for key in aodict.keys()
    }
    for proj in data.keys()
})
dfao.insert(loc=len(dfao.columns), column='Sum', value=dfao.sum(axis=1))
dfao['name'] = dfao.index
dfao.sort_values(by=['Sum', 'name'], ascending=[False, True], inplace=True)

# Show the project columns in the same order as the sorted rows. Identifiers
# without a matching project column are skipped instead of raising a KeyError.
ordered_columns = [label for label in dfao.index if label in dfao.columns]
dfao[ordered_columns + ['Sum']]
