In [1]:
from pathlib import Path

import pandas as pd
from pysmartdatamodels import pysmartdatamodels as sdm

In [2]:
# Load all data models through pysmartdatamodels.
# NOTE(review): `smartDataModels` is not referenced by any other cell visible here.
smartDataModels = sdm.load_all_datamodels()

In [3]:
# Load all attribute records through pysmartdatamodels. Judging by the output
# displayed further down, each record is a dict with keys such as 'property',
# 'dataModel', 'repoName', 'type' and 'description'.
attributes = sdm.load_all_attributes()

In [5]:
# Preview only the first few records; displaying the whole list floods the
# notebook output and bloats the saved file.
attributes[:3]

[{'_id': {'$oid': '60b98736367efc6c60d96807'},
  'property': 'id',
  'dataModel': 'Control',
  'repoName': 'dataModel.EnergyCIM',
  'schemaVersion': '0.0.1',
  'type': 'string',
  'description': 'Unique identifier of the entity',
  'typeNGSI': 'Property',
  'modelTags': ''},
 {'_id': {'$oid': '60b98736367efc6c60d96808'},
  'property': 'dateCreated',
  'dataModel': 'Control',
  'repoName': 'dataModel.EnergyCIM',
  'schemaVersion': '0.0.1',
  'type': 'string',
  'format': 'date-time',
  'description': 'Entity creation timestamp. This will usually be allocated by the storage platform.',
  'typeNGSI': 'Property',
  'modelTags': ''},
 {'_id': {'$oid': '60b98736367efc6c60d96809'},
  'property': 'dateModified',
  'dataModel': 'Control',
  'repoName': 'dataModel.EnergyCIM',
  'schemaVersion': '0.0.1',
  'type': 'string',
  'format': 'date-time',
  'description': 'Timestamp of the last modification of the entity. This will usually be allocated by the storage platform.',
  'typeNGSI': 'Property'

In [8]:
import random

# Display one attribute record picked at random to get a feel for the schema.
# No seed is set in this cell, so the record shown can differ between runs.
random.choice(attributes)

{'_id': {'$oid': '6400ac3032dedbf617eac5e7'},
 'property': 'address',
 'dataModel': 'RotatingMachineDynamics',
 'repoName': 'dataModel.EnergyCIM',
 'modelTags': '',
 'license': 'https://github.com/smart-data-models/dataModel.EnergyCIM/blob/master/RotatingMachineDynamics/LICENSE.md',
 'schemaVersion': '0.0.1',
 'type': 'object',
 'description': 'The mailing address',
 'typeNGSI': 'Property',
 'model': 'https://schema.org/address'}

In [13]:
import requests
from pprint import pprint
import tqdm

def query_lov(query, timeout=30):
    """Search the Linked Open Vocabularies (LOV) term index for ``query``.

    Sends a GET request to the LOV v2 ``term/search`` endpoint, asking for up
    to 3000 hits in a single page.

    Args:
        query: Search string, sent as the ``q`` request parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The decoded JSON body on success. If the request fails, the server
        answers with an HTTP error status, or the body is not valid JSON, the
        sentinel string ``'There was and error'`` is returned instead. The
        sentinel is kept verbatim (typo included) so existing callers that
        rely on it keep working.
    """
    # Possible types: [class, property, datatype, instance]
    payload = {'q': query, 'page_size': 3000}
    try:
        r = requests.get(
            'https://lov.linkeddata.es/dataset/lov/api/v2/term/search',
            params=payload,
            timeout=timeout,
        )
        # Treat 4xx/5xx answers as failures instead of trying to parse them.
        r.raise_for_status()
        return r.json()
    except (requests.RequestException, ValueError):
        # Only request/decoding problems are swallowed; KeyboardInterrupt and
        # programming errors still propagate.
        return 'There was and error'

def search_lov(query):
    """Run a LOV term search and return the hits as a tidy DataFrame.

    Args:
        query: Search string forwarded to ``query_lov``.

    Returns:
        A DataFrame with the columns ``prefixedName``, ``uri``, ``score`` and
        ``type``, one row per hit. ``prefixedName`` and ``uri`` hold the first
        element of the corresponding value in each hit. When the search has no
        hits, an empty DataFrame with the same four columns is returned.

    Raises:
        RuntimeError: If ``query_lov`` did not return a decoded JSON object
            (it returns an error string when the lookup fails).
    """
    columns = ['prefixedName', 'uri', 'score', 'type']

    lov_results = query_lov(query)
    if not isinstance(lov_results, dict):
        # Fail with a clear message instead of an opaque string-index error.
        raise RuntimeError(f"LOV search for {query!r} failed: {lov_results}")

    hits = lov_results.get('results') or []
    if not hits:
        # Selecting the columns from an empty frame would raise KeyError.
        return pd.DataFrame(columns=columns)

    # Copy the column subset so the assignments below write to an independent
    # frame rather than to a slice of the full result table.
    df = pd.DataFrame(hits)[columns].copy()
    df['uri'] = df['uri'].apply(lambda x: x[0])
    df['prefixedName'] = df['prefixedName'].apply(lambda x: x[0])
    return df

def lov_printer(result):
    """Print a short, human-readable summary of a LOV search response.

    Writes the query string and the total hit count. When there is at least
    one hit, it also prints selected fields of the first entry in
    ``result['results']``, one per line. The summary always ends with a blank
    line followed by a rule of twenty ``#`` characters.

    Args:
        result: Mapping with the keys ``queryString``, ``total_results`` and
            ``results`` as read below.
    """
    print(f"Search: {result['queryString']}")
    hit_count = result['total_results']
    print(f"Results: {hit_count}")

    if hit_count > 0:
        print("Entities: ")
        top_hit = result['results'][0]
        # Fields are printed in this fixed order, raw (no unwrapping of lists).
        for field in (
            'prefixedName',
            'score',
            'type',
            'uri',
            'vocabulary.prefix',
            'metrics.reusedByDatasets',
        ):
            print(top_hit[field])

    print('\n' + '#' * 20)


In [18]:
# Parallel column lists, one entry per accepted LOV match. The DataFrame cell
# that follows turns them into table columns, so they must stay equal in length.
properties = []
results = []
prefix_names = []
scores = []
types = []
uris = []
vocab_prefixs = []
reusedby = []

# Property names that were already looked up; each distinct name is queried once.
seen_attributes = []

# Only matches scoring strictly above this threshold are kept.
MIN_SCORE = 0.2

# NOTE: the slice limits the run to the first three attribute records.
for a in tqdm.tqdm(attributes[:3]):
    if a['property'] in seen_attributes:
        continue

    try:
        lov_result = query_lov(a['property'])
        print(lov_result['total_results'])

        for r in lov_result['results']:
            if r['score'] > MIN_SCORE:
                # Read every field BEFORE touching the lists: if a key is
                # missing the row is dropped as a whole, instead of leaving
                # some lists one element longer than the others.
                row = (
                    a['property'],
                    lov_result['total_results'],
                    r['prefixedName'][0],
                    r['score'],
                    r['type'],
                    r['uri'][0],
                    r['vocabulary.prefix'][0],
                    r['metrics.reusedByDatasets'][0],
                )
                for column, value in zip(
                    (properties, results, prefix_names, scores,
                     types, uris, vocab_prefixs, reusedby),
                    row,
                ):
                    column.append(value)
    except Exception as exc:
        # Covers a failed lookup (query_lov returns a string, so indexing it
        # raises) as well as malformed results. KeyboardInterrupt is not
        # caught, so the loop can still be stopped by hand.
        print(f"Error: {a['property']} ({exc!r})")

    # Mark the name as handled even after an error so it is not retried.
    seen_attributes.append(a['property'])

 33%|███▎      | 1/3 [00:04<00:09,  4.95s/it]

2948


 67%|██████▋   | 2/3 [00:06<00:02,  2.82s/it]

10


100%|██████████| 3/3 [00:07<00:00,  2.61s/it]

7





In [16]:
# Combine the parallel result lists into a single table: each list becomes one
# column, in the order given here.
sdm_lov_df = pd.DataFrame(dict(zip(
    ('property', 'num_results', 'prefix_name', 'score',
     'type', 'uri', 'vocab_prefix', 'reused_by'),
    (properties, results, prefix_names, scores,
     types, uris, vocab_prefixs, reusedby),
)))

In [38]:
# Write the table to CSV without the index column. The target folder is
# created first because to_csv does not create missing parent directories.
output_path = Path('datasets/sdm_lov_df.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
sdm_lov_df.to_csv(output_path, index=False)