In [2]:
import json
import pandas as pd
import re
import urllib.request

In [4]:
'''
1. Load the Data_Request(DR)
    1.1 identify the UOA
2. For the UOA identify the optimal cluster (OCF)
    2.1 Select from the NRRT only the cluster_id 
    where (all_RF in DR=TRUE) 
    and among the TRUE one where count(rf) = max (RF count)
3. Selecting the instances of the UOA that satisfy the DR
    3.1 within the cluster we have identified select the select the right instances based on score
        1. all the ATTR & MEAS requested in the DR MUST BE PRESENT but LINK can be inherited
        # eg in the NRRT we only cluster instances of Cars by MODEL & YEAR 
        # if a DR wants  {car.model, car.year, car.[produced_by]->(Company.name)
        # the instances must have all the ATTR & MEAS but they can miss an outgoing link to Company.
        # Indeed the instance can derive the connection from the incoming_links_ranking section present in the NRRT.
4. Select the instances that are 1 link away from the UOA
    4.1 clear distinction between DIRECT_LINKS vs BEST_RATED_LINKS
        if DIRECT_LINKS then the landing node must have all the ATTR & MEAS but the links can be inherited.
        If it has the ATTR & MEAS then you consider the cluster at that LOG and seek for more links
            eg. (Car)-[produced_by]->(Company) and we want to know the company name and foundation_year
                then there must be a clustering in the NRRT with rf = name, foundation_year
                we then look in which cluster that instace is in and it han inherit the links
        The reason why you dont inherit MEAS & ATTR is beacuse the direct link need to be 100% correct
        if the UOA instance is using a BEST_RATED_LINKS we look at the cluster it lands to and 
        even if there are not all the ATTR & MEAS we can still look for instances within the landed cluster that have those info
        eg
        (Car)-[produced_by]->(Company)
        if the DR want to have the Company.name and Company.foundation_year
        if there is DIRECT_LINKS and the landing instance of Companyhas no founded year then we exclude the Car instance
        if there is BEST_RATED_LINKS linking to a cluster by just the name (eg name=FIAT) we can look within
            the cluster of company with name=FIAT to see if there is an instance that contain the foundation_year.
            and if we find it we can keep the car data
        
'''
dr = {
  "UOA": "Car",
  "Car": {
    "ATTR": [
      "model_name",
      "color"
    ],
    "MEAS": [
      "max_speed",
      "consumption"
    ],
    "LINK": [
      "(Car)<-[PRODUCED]-(Company)",
      "(Car)<-[OWNED]-(Person)"
    ]
  },
  "Company": {
    "ATTR": [
      "name",
      "website"
    ],
    "MEAS": [
      "revenue"
    ],
    "LINK": [
      "(Company)-[LOCATED]->(Country)"
    ]
  },
  "Country": {
    "ATTR": [
      "name",
      "capital"
    ],
    "MEAS": [
      "size"
    ]
  },
  "Person": {
    "ATTR": [
      "name",
      "surname"
    ],
    "MEAS": [
      "weight",
      "height"
    ]
  }
}

In [5]:
print(dr)

{'UOA': 'Car', 'Car': {'ATTR': ['model_name', 'color'], 'MEAS': ['max_speed', 'consumption'], 'LINK': ['(Car)<-[PRODUCED]-(Company)', '(Car)<-[OWNED]-(Person)']}, 'Company': {'ATTR': ['name', 'website'], 'MEAS': ['revenue'], 'LINK': ['(Company)-[LOCATED]->(Country)']}, 'Country': {'ATTR': ['name', 'capital'], 'MEAS': ['size']}, 'Person': {'ATTR': ['name', 'surname'], 'MEAS': ['weight', 'height']}}


In [15]:
for x in dr:
    for y in dr[x]:
        if x != 'UOA':
            for z in dr[x][y]:
                print(z)

model_name
color
max_speed
consumption
(Car)<-[PRODUCED]-(Company)
(Car)<-[OWNED]-(Person)
name
website
revenue
(Company)-[LOCATED]->(Country)
name
capital
size
name
surname
weight
height


In [74]:
# selecting the features of the data_request
for key in dr:
    for data_type in dr[key]:
        if key != 'UOA':
            for feature in dr[key][data_type]:
                print(feature)

model_name
color
max_speed
consumption
(Car)<-[PRODUCED]-(Company)
(Car)<-[OWNED]-(Person)
name
website
revenue
(Company)-[LOCATED]->(Country)
name
capital
size
name
surname
weight
height


In [201]:
nrrt = {
  "Car": [
    {
      "NRRT_ID": "1",
      "rf": [
        {
          "feature_name": "max_speed",
          "feature_type": "MEAS",
          "feature_value": "3/5"
        },
        {
          "feature_name": "max_speed",
          "feature_type": "MEAS",
          "feature_value": "January 2020"
        },
        {
          "feature_name": "(Car)<-[PRODUCED]-(Company)",
          "feature_type": "LINK",
          "feature_value": "Product/123123"
        }
      ],
      "num_node_available": 2,
      "node_ranking": [
        {
          "node_address": "https://gooe.com/Review/1",
          "node_abm": "https://onesto.world/abm/Review/1",
          "trust_score": 0.1,
          "frequency_score": 1
        },
        {
          "node_address": "https://gooe.com/Review/2",
          "node_abm": "https://onesto.world/abm/Review/1",
          "trust_score": 0.1,
          "frequency_score": 1
        }
      ],
      "link_ranking": [
        {
          "origin": "Allert",
          "incoming_node_address": [
            "https://hello.com/Alert/123",
            "https://hello.com/Alert/123",
            "https://hello.com/Alert/123",
            "https://hello.com/Alert/123"
          ]
        }
      ]
    },
    {
      "NRRT_ID": "2",
      "rf": [
        {
          "feature_name": "score",
          "feature_type": "MEAS",
          "feature_value": "3/5"
        },
        {
          "feature_name": "date",
          "feature_type": "MEAS",
          "feature_value": "Feb 2020"
        },
        {
          "feature_name": "[ABOUT]->(Product)",
          "feature_type": "LINK",
          "feature_value": "Product/123123"
        }
      ],
      "num_node_available": 2121,
      "node_ranking": [
        {
          "node_address": "https://gooe.com/Review/4",
          "node_abm": "https://onesto.world/abm/Review/1",
          "trust_score": 0.1,
          "frequency_score": 1
        },
        {
          "node_address": "https://gooe.com/Review/5",
          "node_abm": "https://onesto.world/abm/Review/2",
          "trust_score": 0.1,
          "frequency_score": 1
        }
      ],
      "link_ranking": [
        {
          "origin": "Allert",
          "incoming_node_address": [
            "https://hello.com/Alert/123",
            "https://hello.com/Alert/123",
            "https://hello.com/Alert/123",
            "https://hello.com/Alert/123"
          ]
        }
      ]
    }
  ]
}


In [82]:
for nrrt_item_name in nrrt:
    for cluster in nrrt[nrrt_item_name]:
        for key in cluster:
            print(z)

Car


In [84]:
'''
For the UOA identify the optimal cluster (OCF)
    2.1 Select from the NRRT only the cluster_id 
    where (all_RF in DR=TRUE) 
    and among the TRUE one where count(rf) = max (RF count)
'''
for key in dr:
    for data_type in dr[key]:
        for nrrt_item_name in nrrt:
            if key == nrrt_item_name:
                print(data_type, key)
            

ATTR Car
MEAS Car
LINK Car


In [103]:
for key in dr:
# for every key(where the keys are UOA, and all the items that the user is requesting eg. Car,Review,Video,etc.) 
# in the Data_Request
    if key != 'UOA':
        # 1. get the NRRT associated with the key (eg. the NRRT/Car)
        for nrrt_item_name in nrrt.get(key):
            # 2.  Select from the NRRT only the cluster_id where (all_RF in DR=TRUE) 
            #     and among the TRUE one where count(rf) = max (RF count) 
            for rf in nrrt_item_name.get('rf'):
                print(rf['feature_name'])
                if all(item in optimal_cluster[x] for item in ls):
                selected_cluster_id.append(y['NRRT_ID'])
    

score
date
[ABOUT]->(Product)
score
date
[ABOUT]->(Product)


TypeError: 'NoneType' object is not iterable

In [95]:
for x in nrrt.get('Car'):
    print(x)

{'NRRT_ID': '1', 'rf': [{'feature_name': 'score', 'feature_type': 'MEAS', 'feature_value': '3/5'}, {'feature_name': 'date', 'feature_type': 'MEAS', 'feature_value': 'January 2020'}, {'feature_name': '[ABOUT]->(Product)', 'feature_type': 'LINK', 'feature_value': 'Product/123123'}], 'num_node_available': 2, 'node_ranking': [{'node_address': 'https://gooe.com/Review/1', 'node_abm': 'https://onesto.world/abm/Review/1', 'trust_score': 0.1, 'frequency_score': 1}, {'node_address': 'https://gooe.com/Review/2', 'node_abm': 'https://onesto.world/abm/Review/1', 'trust_score': 0.1, 'frequency_score': 1}], 'link_ranking': [{'origin': 'Allert', 'incoming_node_address': ['https://hello.com/Alert/123', 'https://hello.com/Alert/123', 'https://hello.com/Alert/123', 'https://hello.com/Alert/123']}]}
{'NRRT_ID': '2', 'rf': [{'feature_name': 'score', 'feature_type': 'MEAS', 'feature_value': '3/5'}, {'feature_name': 'date', 'feature_type': 'MEAS', 'feature_value': 'Feb 2020'}, {'feature_name': '[ABOUT]->(Pr

In [116]:
'''
Select from the NRRT only the cluster_id where (all_RF in DR=TRUE) 
and among the TRUE one where count(rf) = max (RF count)
'''

# for every cluster in the rf
for x in dr.get('Car'):
    print(x,dr.get('Car')[x])
    
for nrrt_item_name in nrrt.get('Car'):
            # 2.  Select from the NRRT only the cluster_id where (all_RF in DR=TRUE) 
            #     and among the TRUE one where count(rf) = max (RF count) 
            for rf in nrrt_item_name.get('rf'):
                print(rf['feature_name'])
if all(item in optimal_cluster[x] for item in ls):
                selected_cluster_id.append(y['NRRT_ID'])

ATTR ['model_name', 'color']
MEAS ['max_speed', 'consumption']
LINK ['(Car)<-[PRODUCED]-(Company)', '(Car)<-[OWNED]-(Person)']
score
date
[ABOUT]->(Product)
score
date
[ABOUT]->(Product)


In [119]:
for nrrt_item_name in nrrt.get('Car'):
            # 2.  Select from the NRRT only the cluster_id where (all_RF in DR=TRUE) 
            #     and among the TRUE one where count(rf) = max (RF count) 
            for cluster in nrrt_item_name:
                ls = []
                ls.append(print(rf['feature_name'])

NRRT_ID
rf
num_node_available
node_ranking
link_ranking
NRRT_ID
rf
num_node_available
node_ranking
link_ranking


In [139]:
for cluster in nrrt.get('Car'):
    ls = []
    for rf in cluster['rf']:
        ls.append(rf.get('feature_name'))
    print(ls)


['score', 'date', '[ABOUT]->(Product)']
['score', 'date', '[ABOUT]->(Product)']


In [140]:
for cluster in nrrt.get('Car'):
    ls = []
    for rf in cluster['rf']:
        print(rf)


{'feature_name': 'score', 'feature_type': 'MEAS', 'feature_value': '3/5'}
{'feature_name': 'date', 'feature_type': 'MEAS', 'feature_value': 'January 2020'}
{'feature_name': '[ABOUT]->(Product)', 'feature_type': 'LINK', 'feature_value': 'Product/123123'}
{'feature_name': 'score', 'feature_type': 'MEAS', 'feature_value': '3/5'}
{'feature_name': 'date', 'feature_type': 'MEAS', 'feature_value': 'Feb 2020'}
{'feature_name': '[ABOUT]->(Product)', 'feature_type': 'LINK', 'feature_value': 'Product/123123'}


In [141]:
print(dr)

{'UOA': 'Car', 'Car': {'ATTR': ['model_name', 'color'], 'MEAS': ['max_speed', 'consumption'], 'LINK': ['(Car)<-[PRODUCED]-(Company)', '(Car)<-[OWNED]-(Person)']}, 'Company': {'ATTR': ['name', 'website'], 'MEAS': ['revenue'], 'LINK': ['(Company)-[LOCATED]->(Country)']}, 'Country': {'ATTR': ['name', 'capital'], 'MEAS': ['size']}, 'Person': {'ATTR': ['name', 'surname'], 'MEAS': ['weight', 'height']}}


In [143]:
for x in dr:
    print(x)

UOA
Car
Company
Country
Person


In [144]:
# 2 Select from the NRRT only the cluster_id where (all_RF in DR=TRUE)
    # for each cluster check if all the rf in the cluster are also in the DR
    # for the true one, return the cluster_id and the count of  the ranking features (id, count)(1223,4)
            #     and among the TRUE one where count(rf) = max (RF count) 
for cluster in nrrt_item_name:

SyntaxError: unexpected EOF while parsing (<ipython-input-144-44b864d0b955>, line 5)

In [171]:
a= {'ATTR': ['attr_1', 'attr_2'], 'MEAS': ['meas_1', 'meas_2'], 'LINK':['link_1','link_2']}
b = {"rf": [
        {
          "feature_name": "attr_1",
          "feature_type": "ATTR",
          "feature_value": "hello"
        },
        {
          "feature_name": "meas_1",
          "feature_type": "MEAS",
          "feature_value": "Feb 2020"
        },
        {
          "feature_name": "link_1",
          "feature_type": "LINK",
          "feature_value": "Product/123123"
        }]}

In [172]:
ATTR = []
MEAS = []
LINK = []
nuber_of_rf = 0
for x in b.get('rf'):
    if x['feature_type']=='ATTR':
        ATTR.append(x['feature_name'])
    if x['feature_type']=='MEAS':
        MEAS.append(x['feature_name'])
    if x['feature_type']=='LINK':
        LINK.append(x['feature_name'])
    nuber_of_rf+=1
print(ATTR,MEAS,LINK,nuber_of_rf)

['attr_1'] ['meas_1'] ['link_1'] 3


In [173]:
true_false = []
for x in a:
    if x == 'ATTR':
        true_false.append(all(item in a[x] for item in ATTR))
    if x == 'MEAS':
        true_false.append(all(item in a[x] for item in MEAS))
    if x == 'LINK':
        true_false.append(all(item in a[x] for item in LINK))
print(all(true_false)) #all(return true if all values are true)

True


In [166]:
# for each cluster:
# for each rf
#check if the feature_name isin the feature_type list of the data_request
for x in a:
    print(x)

ATTR
MEAS
LINK


In [174]:
# for each cluster of the NRRT
#first step is to group all the rf(Ranking Features) by data_type(ATTR,MEAS,LINK)
ATTR = []
MEAS = []
LINK = []
nuber_of_rf = 0
for x in b.get('rf'):
    if x['feature_type']=='ATTR':
        ATTR.append(x['feature_name'])
    if x['feature_type']=='MEAS':
        MEAS.append(x['feature_name'])
    if x['feature_type']=='LINK':
        LINK.append(x['feature_name'])
    nuber_of_rf+=1
#after we have grouped all the rf(Ranking Features) by data_type(ATTR,MEAS,LINK)
# we need to look at the DR and see if all the ATTR,MEAS and LINK in the cluster_id are also in the DR

true_false = []
for x in a:
    if x == 'ATTR':
        true_false.append(all(item in a[x] for item in ATTR))
    if x == 'MEAS':
        true_false.append(all(item in a[x] for item in MEAS))
    if x == 'LINK':
        true_false.append(all(item in a[x] for item in LINK))

#if all the attr/meas/link in the cluster are present in the DR then we append the (cluster_id, nuber_of_rf)
# the cluster_id is used to retrive the nodes
# the nuber_of_rf is used as we are going to use the max number of rf available
if all(true_false)==True:
    #append the (cluster_id, nuber_of_rf)(1223,4)
    

SyntaxError: unexpected EOF while parsing (<ipython-input-174-3648a3e946ce>, line 32)

In [203]:
selected_clusters = []
for key in dr:
    if key==dr.get('UOA'):
        # load the NRRT associated with the UOA
        nrrt= nrrt #load the NRRT associated with the UOA for example the https://onesto.world/NRRT/{Item}
        item_dr = dr.get(key) # for example the item=Car of the dr
        for cluster in nrrt[key]:
            # for each cluster of the NRRT
            #first step is to group all the rf(Ranking Features) by data_type(ATTR,MEAS,LINK)
            ATTR = []
            MEAS = []
            LINK = []
            nuber_of_rf = 0
            for x in cluster.get('rf'):
                if x['feature_type']=='ATTR':
                    ATTR.append(x['feature_name'])
                if x['feature_type']=='MEAS':
                    MEAS.append(x['feature_name'])
                if x['feature_type']=='LINK':
                    LINK.append(x['feature_name'])
                nuber_of_rf+=1

            #after we have grouped all the rf(Ranking Features) by data_type(ATTR,MEAS,LINK)
            # we need to look at the DR and see if all the ATTR,MEAS and LINK in the cluster_id are also in the DR
            true_false = []
            for x in item_dr:
                if x == 'ATTR':
                    true_false.append(all(item in item_dr[x] for item in ATTR))
                if x == 'MEAS':
                    true_false.append(all(item in item_dr[x] for item in MEAS))
                if x == 'LINK':
                    true_false.append(all(item in item_dr[x] for item in LINK))
                        
            #if all the attr/meas/link in the cluster are present in the DR then we append the (cluster_id, nuber_of_rf)
            # the cluster_id is used to retrive the nodes
            # the nuber_of_rf is used as we are going to use the max number of rf available
            if all(true_false)==True:
                #append the (cluster_id, nuber_of_rf)(1223,4)
                selected_clusters.append((int(cluster.get('NRRT_ID')),nuber_of_rf))
print(selected_clusters)

[(1, 3)]


In [195]:
a_list = []
a_list.append((1, 2)) 
a_list.append((1, 2)) 

In [196]:
print(a_list)

[(1, 2), (1, 2)]


In [198]:
for cluster in nrrt.get('Car'):
    print(cluster.get('NRRT_ID'))

1
2
