In [1]:
import pandas as pd
import pandas.io.sql as sqlio
import pprint as pp
import psycopg2
import sqlalchemy
import os
import json
import uuid

In [2]:
# Connection settings are taken from the environment so that credentials do not
# have to live in the notebook; the fallbacks are the values of the local
# development database this notebook was written against.
conn = psycopg2.connect(
    user=os.environ.get("DCAT_DB_USER", "dcat"),
    password=os.environ.get("DCAT_DB_PASSWORD", "dcat"),
    host=os.environ.get("DCAT_DB_HOST", "127.0.0.1"),
    port=os.environ.get("DCAT_DB_PORT", "54320"),
    database=os.environ.get("DCAT_DB_NAME", "dcat"),
)

In [3]:
# One cursor on the shared connection, used for the DDL and insert statements.
cur = conn.cursor()

In [4]:
# Print the connection's DSN parameters as a quick sanity check.
print ( conn.get_dsn_parameters(),"\n")

{'user': 'dcat', 'dbname': 'dcat', 'host': '127.0.0.1', 'port': '54320', 'tty': '', 'options': '', 'sslmode': 'prefer', 'sslcompression': '0', 'gssencmode': 'prefer', 'krbsrvname': 'postgres', 'target_session_attrs': 'any'} 



In [5]:
# Round-trip to the server: fetch its version string and print it.
cur.execute("SELECT version();")
record = cur.fetchone()
print("You are connected to - ", record,"\n")

You are connected to -  ('PostgreSQL 12.1 (Debian 12.1-1.pgdg100+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 8.3.0-6) 8.3.0, 64-bit',) 



In [6]:
# pgcrypto supplies gen_random_uuid(), which tbl_node.guid uses as its default.
cur.execute("CREATE EXTENSION IF NOT EXISTS pgcrypto;")

In [7]:
# Commit the currently open transaction on the shared connection.
conn.commit()

In [8]:
# tbl_edge has foreign keys to tbl_node, so its indexes and the table itself
# are removed before the node table is dropped and recreated.
cur.execute("drop index if exists idx_tbl_edge_n1")
cur.execute("drop index if exists idx_tbl_edge_n2")
cur.execute("drop table if exists tbl_edge")

## Node Table

In [1]:
# (Re)create the node table. Columns: `id` is the primary key, `guid` defaults
# to gen_random_uuid(), `prop` holds the node properties as JSONB and `created`
# defaults to now(). Needs the `cur` cursor created earlier in the notebook.
cur.execute("drop table if exists tbl_node")
cur.execute("""
create table tbl_node 
(  
  id VARCHAR(500) PRIMARY KEY,
  guid UUID default gen_random_uuid(),
  prop JSONB, 
  created TIMESTAMP default now() 
);  
""")

NameError: name 'cur' is not defined

In [10]:
# Read the node table back through pandas and preview its first rows.
sql = "select * from tbl_node"
df = sqlio.read_sql_query(sql, conn)
df.head()

Unnamed: 0,id,guid,prop,created


In [11]:
# Commit the open transaction, then load the database users from the system
# catalog into df_users.
conn.commit()
sql = 'select * from pg_catalog.pg_user'
df_users = sqlio.read_sql_query(sql, conn)

In [12]:
# (Re)create the edge table. n1 and n2 both reference tbl_node(id); the check
# constraint rejects self-loops and the unique constraint allows one edge per
# ordered (n1, n2) pair. `prop` holds the edge properties as JSONB.
cur.execute("drop table if exists tbl_edge")
cur.execute("""
create table tbl_edge     -- What is relationship between ID1 to ID2      
(  
  n1 varchar references tbl_node(id),    
  n2 varchar references tbl_node(id),
  prop JSONB, 
  created timestamp default now(),  
  check (n1<>n2),  
  unique (n1,n2)  
);  
""")

In [13]:
# Read the edge table back through pandas and preview its first rows.
sql = "select * from tbl_edge"
df = sqlio.read_sql_query(sql, conn)
df.head()

Unnamed: 0,n1,n2,prop,created


In [14]:
# Index on the edge start node n1; dropped first so the cell can be re-run.
cur.execute("drop index if exists idx_tbl_edge_n1")
cur.execute("create index idx_tbl_edge_n1 on tbl_edge(n1)")

In [15]:

# Index on the edge end node n2; dropped first so the cell can be re-run.
cur.execute("drop index if exists idx_tbl_edge_n2")
cur.execute("create index idx_tbl_edge_n2 on tbl_edge(n2)")

In [16]:
# Commit the currently open transaction on the shared connection.
conn.commit()

## Populate Table

In [17]:
# Same connectivity check as earlier: fetch and print the server version.
cur.execute("SELECT version();")
record = cur.fetchone()
print("You are connected to - ", record,"\n")

You are connected to -  ('PostgreSQL 12.1 (Debian 12.1-1.pgdg100+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 8.3.0-6) 8.3.0, 64-bit',) 



### Get Data

In [18]:
# Open the workbook once; later cells parse individual sheets from `xls`.
xls = pd.ExcelFile("../data/testdata.xlsx")
# Display the sheet names; later cells select sheets by position in this list.
xls.sheet_names

['Formål GML',
 'Formål',
 'Ark1',
 'data-catalog-backend',
 'data-catalog-policies',
 'Category']

### Populate tables

In [19]:
def get_id(row):
    """Return the node id ``<type>.<NAME>`` for a row, with the name upper-cased.

    `row` is anything indexable by the keys 'type' and 'name' (for example a
    DataFrame row); both values are expected to be strings.
    """
    prefix = row['type'] + '.'
    return prefix + row['name'].upper()

def get_uuid(row):
    """Return a node id ``<type>.<UUID4>`` with a fresh, upper-cased random UUID.

    Only row['type'] is read; every call yields a different id.
    """
    prefix = row['type'] + '.'
    random_part = str(uuid.uuid4())
    return prefix + random_part.upper()

### Processing Activities

In [20]:
# Build the processing-activity nodes from the second sheet of the workbook.
sheet = xls.sheet_names[1]

# No `encoding` argument: the workbook is .xlsx, so a text encoding has no
# bearing on how it is parsed, and recent pandas releases do not accept the keyword.
df_formål = pd.read_excel(xls, sheet)
# .copy() so the column assignments below act on an independent frame.
df_formål = df_formål[['Forslag til endelig formålskode', 'Formålsbeskrivelse']].copy()
df_formål.columns = ['code', 'description']
df_formål['name'] = df_formål['code'].str.upper()

# Placeholder node for rows that have no purpose. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
not_available = pd.DataFrame([{'name': 'NOT AVAILABLE', 'description': 'Not Available -Verdi mangler'}])
df_formål = pd.concat([df_formål.drop(columns=['code']), not_available], ignore_index=True)

df_formål['label'] = 'Processing activity'
df_formål['type'] = 'GDPR.PROCESSING_ACTIVITY'
# The node id is "<type>.<NAME>" (see get_id).
df_formål['id'] = df_formål.apply(get_id, axis=1)
df_formål.tail()

Unnamed: 0,description,name,label,type,id
33,Behandle og vurdere rett til uføretrygd som sk...,UFØRETRYGD,Processing activity,GDPR.PROCESSING_ACTIVITY,GDPR.PROCESSING_ACTIVITY.UFØRETRYGD
34,Behandle og vurdere rett til ventelønn som ska...,VENTELØNN,Processing activity,GDPR.PROCESSING_ACTIVITY,GDPR.PROCESSING_ACTIVITY.VENTELØNN
35,Behandle og vurdere rett til ytelser ved yrkes...,YRKESSKADE- OG SYKDOM,Processing activity,GDPR.PROCESSING_ACTIVITY,GDPR.PROCESSING_ACTIVITY.YRKESSKADE- OG SYKDOM
36,Behandle og vurdere rett til ytelser til famil...,YTELSER TIL FAMILIEPLEIER,Processing activity,GDPR.PROCESSING_ACTIVITY,GDPR.PROCESSING_ACTIVITY.YTELSER TIL FAMILIEPL...
37,Not Available -Verdi mangler,NOT AVAILABLE,Processing activity,GDPR.PROCESSING_ACTIVITY,GDPR.PROCESSING_ACTIVITY.NOT AVAILABLE


In [21]:
# Store every processing activity as a node; the full row is serialised to JSON
# and kept in the JSONB `prop` column.
# The values are bound as query parameters instead of being formatted into the
# SQL text, so a quote or percent sign in the spreadsheet text can neither break
# nor alter the statement.
insert_node_sql = "insert into tbl_node (id, prop) values (%s, %s)"
for index, row in df_formål.iterrows():
    cur.execute(insert_node_sql, (row['id'], row.to_json()))

conn.commit()


In [22]:
# Preview the last rows of tbl_node to confirm the inserts.
df = sqlio.read_sql_query("select * from tbl_node", conn)
df.tail()

Unnamed: 0,id,guid,prop,created
33,GDPR.PROCESSING_ACTIVITY.UFØRETRYGD,7b4798b3-8f8f-4ff9-bc3c-8a453df5275a,"{'id': 'GDPR.PROCESSING_ACTIVITY.UFØRETRYGD', ...",2020-01-06 14:48:21.736978
34,GDPR.PROCESSING_ACTIVITY.VENTELØNN,5e110e27-3601-4e7c-915e-8341e80273cf,"{'id': 'GDPR.PROCESSING_ACTIVITY.VENTELØNN', '...",2020-01-06 14:48:21.736978
35,GDPR.PROCESSING_ACTIVITY.YRKESSKADE- OG SYKDOM,9f4ff93c-018e-49c5-9a71-7804ab659e18,{'id': 'GDPR.PROCESSING_ACTIVITY.YRKESSKADE- O...,2020-01-06 14:48:21.736978
36,GDPR.PROCESSING_ACTIVITY.YTELSER TIL FAMILIEPL...,f75c1ddb-cba8-4990-95f3-9a9f35a30f16,{'id': 'GDPR.PROCESSING_ACTIVITY.YTELSER TIL F...,2020-01-06 14:48:21.736978
37,GDPR.PROCESSING_ACTIVITY.NOT AVAILABLE,315a1c55-4752-4904-a99c-f05ae07bd1df,{'id': 'GDPR.PROCESSING_ACTIVITY.NOT AVAILABLE...,2020-01-06 14:48:21.736978


### Purpose

In [23]:
# Purposes reuse the processing-activity rows with their own type and label;
# assign() returns a new frame, so df_formål is left untouched.
df_purpose = df_formål.assign(type='GDPR.PURPOSE', label='Purpose')
# Recompute the id so it carries the new type prefix.
df_purpose['id'] = df_purpose.apply(get_id, axis=1)

In [24]:
# Store every purpose as a node, with the full row as its JSONB `prop`.
# Values are bound as query parameters instead of being formatted into the SQL
# text, so quotes or percent signs in the data cannot break or alter the statement.
insert_node_sql = "insert into tbl_node (id, prop) values (%s, %s)"
for i, row in df_purpose.iterrows():
    cur.execute(insert_node_sql, (row['id'], row.to_json()))

conn.commit()

In [25]:
# Preview the last rows of tbl_node to confirm the inserts.
df = sqlio.read_sql_query("select * from tbl_node", conn)
df.tail()

Unnamed: 0,id,guid,prop,created
71,GDPR.PURPOSE.UFØRETRYGD,a3bed3c8-7f8b-477f-b7da-75f5ce2207da,"{'id': 'GDPR.PURPOSE.UFØRETRYGD', 'name': 'UFØ...",2020-01-06 14:48:23.590065
72,GDPR.PURPOSE.VENTELØNN,95bc9053-c600-4177-857c-47436c8b2ff5,"{'id': 'GDPR.PURPOSE.VENTELØNN', 'name': 'VENT...",2020-01-06 14:48:23.590065
73,GDPR.PURPOSE.YRKESSKADE- OG SYKDOM,d36b0e57-cd88-47f7-8a45-e25d46a5dab4,"{'id': 'GDPR.PURPOSE.YRKESSKADE- OG SYKDOM', '...",2020-01-06 14:48:23.590065
74,GDPR.PURPOSE.YTELSER TIL FAMILIEPLEIER,a412f5e5-c20a-41a0-be89-cd1885366031,{'id': 'GDPR.PURPOSE.YTELSER TIL FAMILIEPLEIER...,2020-01-06 14:48:23.590065
75,GDPR.PURPOSE.NOT AVAILABLE,77f0ea98-0cea-47f2-8505-8168b65c85bb,"{'id': 'GDPR.PURPOSE.NOT AVAILABLE', 'name': '...",2020-01-06 14:48:23.590065


### Legal Basis

In [26]:
# Build the legal-basis nodes from the distinct legalBasisDescription values in
# the fifth sheet of the workbook.
sheet = xls.sheet_names[4]

# No `encoding` argument: the workbook is .xlsx, so a text encoding has no
# bearing on how it is parsed, and recent pandas releases do not accept the keyword.
df_legal_basis = pd.read_excel(xls, sheet)[['legalBasisDescription']].drop_duplicates()
df_legal_basis.columns = ['name']

# Placeholder node for missing values. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
not_available = pd.DataFrame([{'name': 'NOT AVAILABLE'}])
df_legal_basis = pd.concat([df_legal_basis, not_available], ignore_index=True)

df_legal_basis['type'] = 'GDPR.LEGAL_BASIS'
df_legal_basis['label'] = 'Legal basis'
# Ids here are "<type>.<random UUID>" (see get_uuid), so they change on every run.
df_legal_basis['id'] = df_legal_basis.apply(get_uuid, axis=1)
df_legal_basis.head()

Unnamed: 0,name,type,label,id
0,Ftrl. § 11-20,GDPR.LEGAL_BASIS,Legal basis,GDPR.LEGAL_BASIS.AD34311B-23A5-44FF-B2BA-C5529...
1,"Ftrl. §§ 3-16, 3-24, 3-25, 20-8.",GDPR.LEGAL_BASIS,Legal basis,GDPR.LEGAL_BASIS.FD51813B-D37B-4347-99AB-77300...
2,"Kapittel 3, 19, 20 0g 22",GDPR.LEGAL_BASIS,Legal basis,GDPR.LEGAL_BASIS.7C503650-82F4-4428-AEFA-BF866...
3,Ftrl § 3-2 og § 3-3,GDPR.LEGAL_BASIS,Legal basis,GDPR.LEGAL_BASIS.CE73DCE7-FDFF-4B31-A3A5-55D04...
4,Barnetrygdloven § 9,GDPR.LEGAL_BASIS,Legal basis,GDPR.LEGAL_BASIS.0FA8C40A-49AE-40EC-AC30-D4FFC...


In [27]:
# Store every legal basis as a node, with the full row as its JSONB `prop`.
# Values are bound as query parameters instead of being formatted into the SQL
# text, so quotes or percent signs in the data cannot break or alter the statement.
# The loop no longer increments an `index` counter: it was never read and only
# existed if an earlier cell's loop had already defined it.
insert_node_sql = "insert into tbl_node (id, prop) values (%s, %s)"
for i, row in df_legal_basis.iterrows():
    cur.execute(insert_node_sql, (row['id'], row.to_json()))

conn.commit()

In [28]:
# Preview the last rows of tbl_node to confirm the inserts.
df = sqlio.read_sql_query("select * from tbl_node", conn)
df.tail()

Unnamed: 0,id,guid,prop,created
232,GDPR.LEGAL_BASIS.E8FCBE84-E52A-4A46-B6DE-1CFE9...,1edd8999-177b-407d-84d0-4ccf23d2a120,{'id': 'GDPR.LEGAL_BASIS.E8FCBE84-E52A-4A46-B6...,2020-01-06 14:48:24.507296
233,GDPR.LEGAL_BASIS.BC0BCE09-D8A6-442A-9503-5E341...,f0a47658-4c35-4afd-9a4c-c9b691d28841,{'id': 'GDPR.LEGAL_BASIS.BC0BCE09-D8A6-442A-95...,2020-01-06 14:48:24.507296
234,GDPR.LEGAL_BASIS.FFE410AE-F5AC-4E49-A38C-6E96C...,5466cabe-f942-43fd-97a9-7276f557945b,{'id': 'GDPR.LEGAL_BASIS.FFE410AE-F5AC-4E49-A3...,2020-01-06 14:48:24.507296
235,GDPR.LEGAL_BASIS.A4DB9587-C019-44DF-B115-9D45E...,b4f71045-9de7-4bfb-b4ce-83d9b88f32c0,{'id': 'GDPR.LEGAL_BASIS.A4DB9587-C019-44DF-B1...,2020-01-06 14:48:24.507296
236,GDPR.LEGAL_BASIS.DF8F28EA-D2C7-4671-A65D-A28F3...,b8d96201-aa8c-4ccd-96cc-6d2a6b614cf8,{'id': 'GDPR.LEGAL_BASIS.DF8F28EA-D2C7-4671-A6...,2020-01-06 14:48:24.507296


### Information Types

In [29]:

# Build the information-type nodes from the title/description columns of the
# fourth sheet of the workbook.
sheet = xls.sheet_names[3]

# No `encoding` argument: the workbook is .xlsx, so a text encoding has no
# bearing on how it is parsed, and recent pandas releases do not accept the keyword.
# .copy() so the column assignments below act on an independent frame.
df_concepts = pd.read_excel(xls, sheet)[['title', 'description']].copy()
df_concepts.columns = ['name', 'description']
# Normalise names so that ids are stable regardless of spacing and case.
df_concepts['name'] = df_concepts['name'].str.strip().str.upper()

# Placeholder node for missing values. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
not_available = pd.DataFrame([{'name': 'NOT AVAILABLE', 'description': 'Not Available -Verdi mangler'}])
df_concepts = pd.concat([df_concepts, not_available], ignore_index=True).drop_duplicates()

df_concepts['label'] = 'Information type'
df_concepts['type'] = 'GDPR.INFORMATION_TYPE'
# The node id is "<type>.<NAME>" (see get_id).
df_concepts['id'] = df_concepts.apply(get_id, axis=1)
df_concepts.head()

Unnamed: 0,name,description,label,type,id
0,SIVILSTAND,En overordnet kategori som beskriver en person...,Information type,GDPR.INFORMATION_TYPE,GDPR.INFORMATION_TYPE.SIVILSTAND
1,ARBEIDSFORHOLD,"Avtaleforhold hvor den ene part, arbeidstakere...",Information type,GDPR.INFORMATION_TYPE,GDPR.INFORMATION_TYPE.ARBEIDSFORHOLD
2,KJØNN,TODO - mangler i begrepskatalogen og i MFNs be...,Information type,GDPR.INFORMATION_TYPE,GDPR.INFORMATION_TYPE.KJØNN
3,NAVN,I Norge skal alle ha fornavn og ett enkelt ell...,Information type,GDPR.INFORMATION_TYPE,GDPR.INFORMATION_TYPE.NAVN
4,FØDSELSDATO,Datoen personen er født.,Information type,GDPR.INFORMATION_TYPE,GDPR.INFORMATION_TYPE.FØDSELSDATO


In [30]:

# Store every information type as a node, with the full row as its JSONB `prop`.
# Values are bound as query parameters instead of being formatted into the SQL
# text, so quotes or percent signs in the data cannot break or alter the statement.
insert_node_sql = "insert into tbl_node (id, prop) values (%s, %s)"
for i, row in df_concepts.iterrows():
    cur.execute(insert_node_sql, (row['id'], row.to_json()))

conn.commit()

In [31]:
# Preview the last rows of tbl_node to confirm the inserts.
df = sqlio.read_sql_query("select * from tbl_node", conn)
df.tail()

Unnamed: 0,id,guid,prop,created
253,GDPR.INFORMATION_TYPE.FLYKTNINGSTATUS,5219591f-7a66-4a00-aee7-0f353ee5439e,{'id': 'GDPR.INFORMATION_TYPE.FLYKTNINGSTATUS'...,2020-01-06 14:48:25.303604
254,GDPR.INFORMATION_TYPE.FORELDREANSVAR,020a88e3-9196-4e1a-ad6c-525d767da6b7,"{'id': 'GDPR.INFORMATION_TYPE.FORELDREANSVAR',...",2020-01-06 14:48:25.303604
255,GDPR.INFORMATION_TYPE.FORSTERFORELDRE,abb459e4-5eb0-409a-8ee5-bba8f03dc5d7,{'id': 'GDPR.INFORMATION_TYPE.FORSTERFORELDRE'...,2020-01-06 14:48:25.303604
256,GDPR.INFORMATION_TYPE.FULLMAKT,f83d1715-32db-4ec1-a5e4-c9620fed5596,"{'id': 'GDPR.INFORMATION_TYPE.FULLMAKT', 'name...",2020-01-06 14:48:25.303604
257,GDPR.INFORMATION_TYPE.NOT AVAILABLE,5d892952-5b26-4102-a16a-14d7e06a5076,"{'id': 'GDPR.INFORMATION_TYPE.NOT AVAILABLE', ...",2020-01-06 14:48:25.303604


### Categories

In [32]:
# Build the information-type-category nodes from the `categories` column of the
# fourth sheet of the workbook.
sheet = xls.sheet_names[3]

# No `encoding` argument: the workbook is .xlsx, so a text encoding has no
# bearing on how it is parsed, and recent pandas releases do not accept the keyword.
# .copy() so the column assignments below act on an independent frame.
df_categories = pd.read_excel(xls, sheet)[['categories']].copy()
df_categories.columns = ['name']
# Normalise names so that ids are stable regardless of spacing and case.
df_categories['name'] = df_categories['name'].str.strip().str.upper()

# Placeholder node for missing values. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0. Duplicates are dropped afterwards because
# many rows of the sheet share a category.
not_available = pd.DataFrame([{'name': 'NOT AVAILABLE'}])
df_categories = pd.concat([df_categories, not_available], ignore_index=True).drop_duplicates()

df_categories['label'] = 'Information type category'
df_categories['type'] = 'GDPR.INFORMATION_TYPE_CATEGORY'
# The node id is "<type>.<NAME>" (see get_id).
df_categories['id'] = df_categories.apply(get_id, axis=1)
df_categories.head()

Unnamed: 0,name,label,type,id
0,PERSONALIA,Information type category,GDPR.INFORMATION_TYPE_CATEGORY,GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA
1,ARBEIDSFORHOLD,Information type category,GDPR.INFORMATION_TYPE_CATEGORY,GDPR.INFORMATION_TYPE_CATEGORY.ARBEIDSFORHOLD
6,KONTAKTOPPLYSNINGER,Information type category,GDPR.INFORMATION_TYPE_CATEGORY,GDPR.INFORMATION_TYPE_CATEGORY.KONTAKTOPPLYSNI...
9,FAMILIERELASJONER,Information type category,GDPR.INFORMATION_TYPE_CATEGORY,GDPR.INFORMATION_TYPE_CATEGORY.FAMILIERELASJONER
16,FOLKETRYGD,Information type category,GDPR.INFORMATION_TYPE_CATEGORY,GDPR.INFORMATION_TYPE_CATEGORY.FOLKETRYGD


In [33]:
# Store every category as a node, with the full row as its JSONB `prop`.
# Values are bound as query parameters instead of being formatted into the SQL
# text, so quotes or percent signs in the data cannot break or alter the statement.
# The loop no longer increments an `index` counter: it was never read and only
# existed if an earlier cell's loop had already defined it.
insert_node_sql = "insert into tbl_node (id, prop) values (%s, %s)"
for i, row in df_categories.iterrows():
    cur.execute(insert_node_sql, (row['id'], row.to_json()))

conn.commit()

### Nodes

In [34]:
# Load all nodes and lift `name` and `type` out of the `prop` dictionaries into
# their own columns, so the edge cells can look ids up by name and type.
df_nodes = sqlio.read_sql_query("select * from tbl_node", conn)
node_props = df_nodes['prop']
df_nodes['name'] = node_props.map(lambda prop: prop['name'])
df_nodes['type'] = node_props.map(lambda prop: prop['type'])
df_nodes.head()

Unnamed: 0,id,guid,prop,created,name,type
0,GDPR.PROCESSING_ACTIVITY.ALDERSPENSJON,1e57d727-99e0-475c-8769-2197a8e25a3a,{'id': 'GDPR.PROCESSING_ACTIVITY.ALDERSPENSJON...,2020-01-06 14:48:21.736978,ALDERSPENSJON,GDPR.PROCESSING_ACTIVITY
1,GDPR.PROCESSING_ACTIVITY.ARBEIDSAVKLARINGSPENG...,d484cc62-77a9-4f32-962f-a6fab17a8d9d,{'id': 'GDPR.PROCESSING_ACTIVITY.ARBEIDSAVKLAR...,2020-01-06 14:48:21.736978,ARBEIDSAVKLARINGSPENGER (AAP),GDPR.PROCESSING_ACTIVITY
2,GDPR.PROCESSING_ACTIVITY.AVTALEFESTET PENSJON ...,29b3d233-58be-41b2-88aa-a981366173d8,{'id': 'GDPR.PROCESSING_ACTIVITY.AVTALEFESTET ...,2020-01-06 14:48:21.736978,AVTALEFESTET PENSJON (AFP) FOR PRIVAT SEKTOR,GDPR.PROCESSING_ACTIVITY
3,GDPR.PROCESSING_ACTIVITY.AVTALEFESTET PENSJON ...,7a4f31c2-c96e-4346-93fd-ea0c3d3b764a,{'id': 'GDPR.PROCESSING_ACTIVITY.AVTALEFESTET ...,2020-01-06 14:48:21.736978,AVTALEFESTET PENSJON (AFP) FOR STATLIG OG KOMM...,GDPR.PROCESSING_ACTIVITY
4,GDPR.PROCESSING_ACTIVITY.BARNEBIDRAG,a009e97a-11a3-46b6-9ea1-57e75f4fd195,"{'id': 'GDPR.PROCESSING_ACTIVITY.BARNEBIDRAG',...",2020-01-06 14:48:21.736978,BARNEBIDRAG,GDPR.PROCESSING_ACTIVITY


### Edges

In [35]:
# Edges information type -> category, taken from the title/categories columns
# of the fourth sheet of the workbook.
sheet = xls.sheet_names[3]

# No `encoding` argument: the workbook is .xlsx, so a text encoding has no
# bearing on how it is parsed, and recent pandas releases do not accept the keyword.
# .copy() so the column assignments below act on an independent frame.
df_edges = pd.read_excel(xls, sheet)[['title', 'categories']].copy()
df_edges.columns = ['source', 'target']
# Normalise both ends the same way the node names were normalised.
for column in ('source', 'target'):
    df_edges[column] = df_edges[column].str.strip().str.upper()

# Deduplicate AFTER normalising and keep the result: rows that differ only in
# case or surrounding whitespace would otherwise produce the same (n1, n2) pair
# twice and violate the unique constraint on tbl_edge.
df_edges = df_edges.drop_duplicates()

df_edges.head()

Unnamed: 0,source,target
0,SIVILSTAND,PERSONALIA
1,ARBEIDSFORHOLD,ARBEIDSFORHOLD
2,KJØNN,PERSONALIA
3,NAVN,PERSONALIA
4,FØDSELSDATO,PERSONALIA


In [36]:
# Commit the currently open transaction on the shared connection.
conn.commit()

In [37]:
# Start from an empty edge table so the cell can be re-run.
cur.execute("delete from tbl_edge")
conn.commit()

# Row masks per node type; they do not depend on the edge, so build them once.
is_information_type = df_nodes['type'] == 'GDPR.INFORMATION_TYPE'
is_category = df_nodes['type'] == 'GDPR.INFORMATION_TYPE_CATEGORY'

# Every edge in this direction carries the same property document.
edge_prop = json.dumps({"edge": "array[5]"})
insert_edge_sql = "insert into tbl_edge (n1, n2, prop) values (%s, %s, %s)"

for _, row in df_edges.iterrows():
    # .item() raises unless exactly one node matches, which surfaces missing or
    # ambiguous names instead of silently inserting a wrong edge.
    source_id = df_nodes.loc[is_information_type & (df_nodes['name'] == row['source']), 'id'].item()
    target_id = df_nodes.loc[is_category & (df_nodes['name'] == row['target']), 'id'].item()
    # Bound parameters rather than string formatting, so ids containing quotes
    # cannot break or alter the statement.
    cur.execute(insert_edge_sql, (source_id, target_id, edge_prop))

conn.commit()

In [38]:
# Edges in the opposite direction, category -> information type, from the same
# sheet. The sheet is selected here explicitly so the cell does not depend on a
# `sheet` value left behind by an earlier cell.
sheet = xls.sheet_names[3]

# No `encoding` argument: the workbook is .xlsx, so a text encoding has no
# bearing on how it is parsed, and recent pandas releases do not accept the keyword.
# .copy() so the column assignments below act on an independent frame.
df_edges = pd.read_excel(xls, sheet)[['categories', 'title']].copy()
df_edges.columns = ['source', 'target']
# Normalise both ends the same way the node names were normalised.
for column in ('source', 'target'):
    df_edges[column] = df_edges[column].str.strip().str.upper()

# Deduplicate AFTER normalising and keep the result: rows that differ only in
# case or surrounding whitespace would otherwise produce the same (n1, n2) pair
# twice and violate the unique constraint on tbl_edge.
df_edges = df_edges.drop_duplicates()

df_edges.head()

Unnamed: 0,source,target
0,PERSONALIA,SIVILSTAND
1,ARBEIDSFORHOLD,ARBEIDSFORHOLD
2,PERSONALIA,KJØNN
3,PERSONALIA,NAVN
4,PERSONALIA,FØDSELSDATO


In [39]:
# Row masks per node type; they do not depend on the edge, so build them once.
is_category = df_nodes['type'] == 'GDPR.INFORMATION_TYPE_CATEGORY'
is_information_type = df_nodes['type'] == 'GDPR.INFORMATION_TYPE'

# Every edge in this direction carries the same property document.
edge_prop = json.dumps({"edge": "array[6]"})
insert_edge_sql = "insert into tbl_edge (n1, n2, prop) values (%s, %s, %s)"

for _, row in df_edges.iterrows():
    # .item() raises unless exactly one node matches, which surfaces missing or
    # ambiguous names instead of silently inserting a wrong edge.
    source_id = df_nodes.loc[is_category & (df_nodes['name'] == row['source']), 'id'].item()
    target_id = df_nodes.loc[is_information_type & (df_nodes['name'] == row['target']), 'id'].item()
    # Bound parameters rather than string formatting, so ids containing quotes
    # cannot break or alter the statement.
    cur.execute(insert_edge_sql, (source_id, target_id, edge_prop))

conn.commit()

In [40]:
# Read the edge table back through pandas and preview its first rows.
sql = "select * from tbl_edge"
df = sqlio.read_sql_query(sql, conn)
df.head()

Unnamed: 0,n1,n2,prop,created
0,GDPR.INFORMATION_TYPE.SIVILSTAND,GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,{'edge': 'array[5]'},2020-01-06 14:48:29.640396
1,GDPR.INFORMATION_TYPE.ARBEIDSFORHOLD,GDPR.INFORMATION_TYPE_CATEGORY.ARBEIDSFORHOLD,{'edge': 'array[5]'},2020-01-06 14:48:29.640396
2,GDPR.INFORMATION_TYPE.KJØNN,GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,{'edge': 'array[5]'},2020-01-06 14:48:29.640396
3,GDPR.INFORMATION_TYPE.NAVN,GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,{'edge': 'array[5]'},2020-01-06 14:48:29.640396
4,GDPR.INFORMATION_TYPE.FØDSELSDATO,GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,{'edge': 'array[5]'},2020-01-06 14:48:29.640396


In [108]:
# Commit the currently open transaction on the shared connection.
conn.commit()

In [109]:
# Remove any existing graph_search1 with this exact argument signature before
# it is defined again below, then commit the drop.
cur.execute("DROP FUNCTION if exists graph_search1(character varying,integer,bigint)")
conn.commit()

In [113]:
# Define graph_search1(i_root, i_depth, i_limit): walk tbl_edge outwards from
# the node `i_root` with a recursive CTE and return one row per edge reached,
# with the path so far, both end points, the edge property, the concatenated
# properties along the path and the depth.
#
# The query is assembled with format() and run with EXECUTE. The root id is
# inserted with %L, which emits a properly quoted SQL literal, instead of being
# wrapped in quotes by hand as '%s'; an id containing a single quote therefore
# can no longer break or alter the generated query.
sql = """
create or replace function graph_search1(
  IN i_root varchar,                   -- The node that the search is based on
  IN i_depth int  default 99999,       -- the tier to search (the depth limit)
  IN i_limit int8 default 2000000000,  -- limit the number of records returned for each tier
  OUT o_path varchar[],                -- output: path, an array of IDs
  OUT o_point1 varchar,                -- output: point 1 ID
  OUT o_point2 varchar,                -- output: point 2 ID
  OUT o_link_prop JSONB,               -- output: the connection property between the two current points
  OUT o_link_prop_all text,            -- output: the connection property from the starting node to the current node
  OUT o_depth int                      -- output: current depth (tier)
) returns setof record as
$$

declare
  sql text;
begin
sql := format($_$
WITH RECURSIVE search_graph(
  n1,     -- point 1
  n2,     -- point 2
  prop,   -- current edge property
  all_prop,  -- properties of all edges
  depth,  -- current depth, starting from 1
  path    -- path, stored as an array
) AS (
        select n1,n2,prop,all_prop,depth,path from (
        SELECT                               -- ROOT node query
          g.n1,                              -- point 1
          g.n2,                              -- point 2
          g.prop,                            -- edge property
      g.prop::text as all_prop,              -- properties of all edges
          1 depth,                           -- initial depth=1
          ARRAY[g.n1, g.n2] path             -- initial path
        FROM tbl_edge AS g
        WHERE
          n1 = %L                            -- ROOT node, inserted as a quoted literal
          limit %s                           -- How many records are limited at each tier?
        ) t
      UNION ALL
        select n1,n2,prop,all_prop,depth,path from (
        SELECT                               -- recursive clause
          g.n1,                              -- point 1
          g.n2,                              -- point 2
          g.prop,                            -- edge property
      sg.all_prop || g.prop::text as all_prop,    -- properties of all edges
          sg.depth + 1 depth,                   -- depth +1
          sg.path || g.n2 path                 -- Add a new point to the path
        FROM tbl_edge AS g, search_graph AS sg    -- circular INNER JOIN
        WHERE
          g.n1 = sg.n2                       -- recursive JOIN condition
          AND (g.n2 <> ALL(sg.path))         -- Prevent loop, determine whether it is a loop and judge if the new point is already in the previous path
          AND sg.depth <= %s                 -- search depth =?
          limit %s                           -- How many records are limited at each tier?
        ) t
)
SELECT path as o_path, n1 as o_point1, n2 as o_point2, prop as o_link_prop, all_prop as o_link_prop_all, depth as o_depth
FROM search_graph;                           -- query a recursive table. You can add LIMIT output or use a cursor
$_$, i_root, i_limit, i_depth, i_limit
);

return query execute sql;

end;

$$
 language plpgsql strict;
"""

cur.execute(sql)

In [114]:
# Commit the currently open transaction on the shared connection.
conn.commit()

In [117]:
%%time
# Time one traversal starting at the PERSONALIA category node, using the
# function's default depth and per-tier limit.
sql = "select * from graph_search1('GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA');"
df = sqlio.read_sql_query(sql, conn)

CPU times: user 2.24 ms, sys: 903 µs, total: 3.15 ms
Wall time: 9.76 ms


In [119]:
# Same traversal as the timed cell, run again to load the result into `df`.
sql = "select * from graph_search1('GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA');"
df = sqlio.read_sql_query(sql, conn)

In [120]:
# Display the traversal result: one row per edge reached from the root node.
df

Unnamed: 0,o_path,o_point1,o_point2,o_link_prop,o_link_prop_all,o_depth
0,"[GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA, GD...",GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,GDPR.INFORMATION_TYPE.SIVILSTAND,{'edge': 'array[6]'},"{""edge"": ""array[6]""}",1
1,"[GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA, GD...",GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,GDPR.INFORMATION_TYPE.KJØNN,{'edge': 'array[6]'},"{""edge"": ""array[6]""}",1
2,"[GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA, GD...",GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,GDPR.INFORMATION_TYPE.NAVN,{'edge': 'array[6]'},"{""edge"": ""array[6]""}",1
3,"[GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA, GD...",GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,GDPR.INFORMATION_TYPE.FØDSELSDATO,{'edge': 'array[6]'},"{""edge"": ""array[6]""}",1
4,"[GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA, GD...",GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,GDPR.INFORMATION_TYPE.FØDSELSNUMMER,{'edge': 'array[6]'},"{""edge"": ""array[6]""}",1
