In [1]:
import pandas as pd
import pandas.io.sql as sqlio
import pprint as pp
import psycopg2
import sqlalchemy
import os
import json
import uuid

In [2]:
# Open the PostgreSQL connection used by the rest of the notebook.
# Settings are read from DCAT_DB_* environment variables so credentials do not
# have to live in the notebook; the fallbacks are the local development values.
conn = psycopg2.connect(
    user=os.environ.get("DCAT_DB_USER", "dcat"),
    password=os.environ.get("DCAT_DB_PASSWORD", "dcat"),
    host=os.environ.get("DCAT_DB_HOST", "127.0.0.1"),
    port=os.environ.get("DCAT_DB_PORT", "54320"),
    database=os.environ.get("DCAT_DB_NAME", "dcat"),
)

In [3]:
# Single cursor reused by every cur.execute(...) call in the cells below.
cur = conn.cursor()

In [4]:
# Show the connection parameters actually in effect (sanity check).
print ( conn.get_dsn_parameters(),"\n")

{'user': 'dcat', 'dbname': 'dcat', 'host': '127.0.0.1', 'port': '54320', 'tty': '', 'options': '', 'sslmode': 'prefer', 'sslcompression': '0', 'gssencmode': 'prefer', 'krbsrvname': 'postgres', 'target_session_attrs': 'any'} 



In [5]:
# Round-trip a trivial query to confirm the connection works and to record
# which PostgreSQL server version the notebook ran against.
cur.execute("SELECT version();")
record = cur.fetchone()
print("You are connected to - ", record,"\n")

You are connected to -  ('PostgreSQL 12.1 (Debian 12.1-1.pgdg100+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 8.3.0-6) 8.3.0, 64-bit',) 



In [6]:
# tbl_node.guid defaults to gen_random_uuid(); on the PostgreSQL 12 server shown
# above that function is provided by the pgcrypto extension.
cur.execute("CREATE EXTENSION IF NOT EXISTS pgcrypto;")

In [7]:
# Commit the open transaction so the extension creation is persisted.
conn.commit()

In [8]:
# Remove the edge indexes and the edge table before tbl_node is dropped:
# tbl_edge is created with foreign keys that reference tbl_node(id).
cur.execute("drop index if exists idx_tbl_edge_n1")
cur.execute("drop index if exists idx_tbl_edge_n2")
cur.execute("drop table if exists tbl_edge")

## Node Table

In [9]:
# Recreate the node table from scratch.
#   id      - application-level key (string primary key)
#   guid    - random UUID generated by the database
#   prop    - the node's properties as a JSONB document
#   created - insertion timestamp
cur.execute("drop table if exists tbl_node")
cur.execute("""
create table tbl_node 
(  
  id VARCHAR(500) PRIMARY KEY,
  guid UUID default gen_random_uuid(),
  prop JSONB, 
  created TIMESTAMP default now() 
);  
""")

In [10]:
# Read the freshly created node table back to confirm its columns.
sql = "select * from tbl_node"
df = sqlio.read_sql_query(sql, conn)
df.head()

Unnamed: 0,id,guid,prop,created


In [11]:
# Commit the table creation, then load the database user catalogue.
# NOTE(review): df_users is not referenced again in the visible part of this notebook.
conn.commit()
sql = 'select * from pg_catalog.pg_user'
df_users = sqlio.read_sql_query(sql, conn)

In [12]:
# Recreate the edge table: one row per directed relation n1 -> n2.
# Both endpoints must exist in tbl_node, self-loops are rejected by the check
# constraint, and each ordered (n1, n2) pair may appear only once.
cur.execute("drop table if exists tbl_edge")
cur.execute("""
create table tbl_edge     -- What is relationship between ID1 to ID2      
(  
  n1 varchar references tbl_node(id),    
  n2 varchar references tbl_node(id),
  prop JSONB, 
  created timestamp default now(),  
  check (n1<>n2),  
  unique (n1,n2)  
);  
""")

In [13]:
# Read the freshly created edge table back to confirm its columns.
sql = "select * from tbl_edge"
df = sqlio.read_sql_query(sql, conn)
df.head()

Unnamed: 0,n1,n2,prop,created


In [14]:
# Index the edge source column; the graph search filters and joins on n1.
cur.execute("drop index if exists idx_tbl_edge_n1")
cur.execute("create index idx_tbl_edge_n1 on tbl_edge(n1)")

In [15]:

# Index the edge target column as well.
cur.execute("drop index if exists idx_tbl_edge_n2")
cur.execute("create index idx_tbl_edge_n2 on tbl_edge(n2)")

In [16]:
# Persist the edge table and its indexes.
conn.commit()

## Populate Table

In [17]:
# Re-check that the connection is still alive before loading data.
cur.execute("SELECT version();")
record = cur.fetchone()
print("You are connected to - ", record,"\n")

You are connected to -  ('PostgreSQL 12.1 (Debian 12.1-1.pgdg100+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 8.3.0-6) 8.3.0, 64-bit',) 



### Get Data

In [18]:
# Open the test workbook once; later cells pick sheets by position from
# xls.sheet_names, so the sheet order listed below matters.
xls = pd.ExcelFile("../data/testdata.xlsx")
xls.sheet_names

['Formål GML',
 'Formål',
 'Ark1',
 'data-catalog-backend',
 'data-catalog-policies',
 'Category']

### Populate tables

In [19]:
def get_id(row):
    """Build a node id from a row: '<type>.<NAME>' with the name upper-cased."""
    prefix = row['type'] + '.'
    return prefix + row['name'].upper()

def get_uuid(row):
    """Build a node id from a row: '<type>.<UUID4>' with a fresh upper-cased random UUID."""
    prefix = row['type'] + '.'
    random_part = str(uuid.uuid4())
    return prefix + random_part.upper()

### Processing Activities

In [20]:
# Build the processing-activity nodes from the second sheet of the workbook.
sheet = xls.sheet_names[1]
# No `encoding` argument: current pandas read_excel does not accept it.
df_formål = pd.read_excel(xls, sheet)
df_formål = df_formål[['Forslag til endelig formålskode', 'Formålsbeskrivelse']]
df_formål.columns = ['code', 'description']
df_formål['name'] = df_formål['code'].apply(lambda x: x.upper())
# Add a placeholder node for missing values. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
df_formål = pd.concat(
    [
        df_formål,
        pd.DataFrame([{'name': 'NOT AVAILABLE', 'description': 'Not Available -Verdi mangler'}]),
    ],
    ignore_index=True,
    sort=False,
)
# The raw code is only needed to derive `name`.
df_formål = df_formål.drop(columns=['code'])
df_formål['label'] = 'Processing activity'
df_formål['type'] = 'GDPR.PROCESSING_ACTIVITY'
df_formål['id'] = df_formål.apply(get_id, axis=1)
df_formål.tail()

Unnamed: 0,description,name,label,type,id
33,Behandle og vurdere rett til uføretrygd som sk...,UFØRETRYGD,Processing activity,GDPR.PROCESSING_ACTIVITY,GDPR.PROCESSING_ACTIVITY.UFØRETRYGD
34,Behandle og vurdere rett til ventelønn som ska...,VENTELØNN,Processing activity,GDPR.PROCESSING_ACTIVITY,GDPR.PROCESSING_ACTIVITY.VENTELØNN
35,Behandle og vurdere rett til ytelser ved yrkes...,YRKESSKADE- OG SYKDOM,Processing activity,GDPR.PROCESSING_ACTIVITY,GDPR.PROCESSING_ACTIVITY.YRKESSKADE- OG SYKDOM
36,Behandle og vurdere rett til ytelser til famil...,YTELSER TIL FAMILIEPLEIER,Processing activity,GDPR.PROCESSING_ACTIVITY,GDPR.PROCESSING_ACTIVITY.YTELSER TIL FAMILIEPL...
37,Not Available -Verdi mangler,NOT AVAILABLE,Processing activity,GDPR.PROCESSING_ACTIVITY,GDPR.PROCESSING_ACTIVITY.NOT AVAILABLE


In [21]:
# Insert one tbl_node row per processing activity; the whole row is stored as
# the JSONB `prop` document.
# Values are bound as query parameters instead of being pasted into the SQL
# text, so apostrophes or percent signs in the data cannot break the statement.
insert_query = "insert into tbl_node (id, prop) values (%s, %s)"
# The loop variable keeps the name `index` because later cells in this
# notebook may still read it.
for index, row in df_formål.iterrows():
    cur.execute(insert_query, (row['id'], row.to_json()))

conn.commit()


In [22]:
# Read the node table back to verify the processing-activity inserts.
df = sqlio.read_sql_query("select * from tbl_node", conn)
df.tail()

Unnamed: 0,id,guid,prop,created
33,GDPR.PROCESSING_ACTIVITY.UFØRETRYGD,aa9ecc4f-97a1-4697-8415-46a09a65eab0,"{'id': 'GDPR.PROCESSING_ACTIVITY.UFØRETRYGD', ...",2020-01-06 14:19:04.744794
34,GDPR.PROCESSING_ACTIVITY.VENTELØNN,e6dd5da0-fc01-496e-a1a9-e507da8fd1de,"{'id': 'GDPR.PROCESSING_ACTIVITY.VENTELØNN', '...",2020-01-06 14:19:04.744794
35,GDPR.PROCESSING_ACTIVITY.YRKESSKADE- OG SYKDOM,f264c926-078d-4611-977f-5d99956fb35b,{'id': 'GDPR.PROCESSING_ACTIVITY.YRKESSKADE- O...,2020-01-06 14:19:04.744794
36,GDPR.PROCESSING_ACTIVITY.YTELSER TIL FAMILIEPL...,058653cc-4161-42f5-a542-c654f02b5634,{'id': 'GDPR.PROCESSING_ACTIVITY.YTELSER TIL F...,2020-01-06 14:19:04.744794
37,GDPR.PROCESSING_ACTIVITY.NOT AVAILABLE,bacd8a1b-1712-4d31-a2da-5eec2dc69da7,{'id': 'GDPR.PROCESSING_ACTIVITY.NOT AVAILABLE...,2020-01-06 14:19:04.744794


### Purpose

In [23]:
# Purpose nodes reuse the processing-activity rows under a different type and
# label; the id is recomputed so it carries the new type prefix.
df_purpose = df_formål.assign(type='GDPR.PURPOSE', label='Purpose')
df_purpose['id'] = df_purpose.apply(get_id, axis=1)

In [24]:
# Insert one tbl_node row per purpose; the whole row is stored as the JSONB
# `prop` document.
# Values are bound as query parameters instead of being pasted into the SQL
# text, so apostrophes or percent signs in the data cannot break the statement.
insert_query = "insert into tbl_node (id, prop) values (%s, %s)"
for i, row in df_purpose.iterrows():
    cur.execute(insert_query, (row['id'], row.to_json()))

conn.commit()

In [25]:
# Read the node table back to verify the purpose inserts.
df = sqlio.read_sql_query("select * from tbl_node", conn)
df.tail()

Unnamed: 0,id,guid,prop,created
71,GDPR.PURPOSE.UFØRETRYGD,c126b68e-487c-4395-af68-41c5ea2b22b8,"{'id': 'GDPR.PURPOSE.UFØRETRYGD', 'name': 'UFØ...",2020-01-06 14:19:06.305707
72,GDPR.PURPOSE.VENTELØNN,0204b07a-677e-4198-9a00-99f0baaaff35,"{'id': 'GDPR.PURPOSE.VENTELØNN', 'name': 'VENT...",2020-01-06 14:19:06.305707
73,GDPR.PURPOSE.YRKESSKADE- OG SYKDOM,ebf25c70-d90d-4f83-a82f-98e392117864,"{'id': 'GDPR.PURPOSE.YRKESSKADE- OG SYKDOM', '...",2020-01-06 14:19:06.305707
74,GDPR.PURPOSE.YTELSER TIL FAMILIEPLEIER,ace8c5b8-e5e5-42c1-8e72-1d21ff6e6e13,{'id': 'GDPR.PURPOSE.YTELSER TIL FAMILIEPLEIER...,2020-01-06 14:19:06.305707
75,GDPR.PURPOSE.NOT AVAILABLE,5d935b16-1606-434c-afba-eb1f7d19fedd,"{'id': 'GDPR.PURPOSE.NOT AVAILABLE', 'name': '...",2020-01-06 14:19:06.305707


### Legal Basis

In [26]:
# Build the legal-basis nodes from the distinct legal basis descriptions in the
# fifth sheet of the workbook.
sheet = xls.sheet_names[4]
# No `encoding` argument: current pandas read_excel does not accept it.
df_legal_basis = pd.read_excel(xls, sheet)
df_legal_basis = df_legal_basis[['legalBasisDescription']].drop_duplicates()
df_legal_basis.columns = ['name']
# Add a placeholder node for missing values. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
df_legal_basis = pd.concat(
    [df_legal_basis, pd.DataFrame([{'name': 'NOT AVAILABLE'}])],
    ignore_index=True,
)
df_legal_basis['type'] = 'GDPR.LEGAL_BASIS'
df_legal_basis['label'] = 'Legal basis'
# Ids are random (get_uuid), so they differ on every run of this cell.
df_legal_basis['id'] = df_legal_basis.apply(get_uuid, axis=1)
df_legal_basis.head()

Unnamed: 0,name,type,label,id
0,Ftrl. § 11-20,GDPR.LEGAL_BASIS,Legal basis,GDPR.LEGAL_BASIS.4347E15F-BDE9-4CB2-877E-B5CA2...
1,"Ftrl. §§ 3-16, 3-24, 3-25, 20-8.",GDPR.LEGAL_BASIS,Legal basis,GDPR.LEGAL_BASIS.64FC3FE2-981E-4FFC-80C8-0029C...
2,"Kapittel 3, 19, 20 0g 22",GDPR.LEGAL_BASIS,Legal basis,GDPR.LEGAL_BASIS.73A6431D-9BD3-4B8F-8187-F4469...
3,Ftrl § 3-2 og § 3-3,GDPR.LEGAL_BASIS,Legal basis,GDPR.LEGAL_BASIS.A5C0A4A5-AE15-494D-8634-1F61D...
4,Barnetrygdloven § 9,GDPR.LEGAL_BASIS,Legal basis,GDPR.LEGAL_BASIS.09A695C0-DC93-4240-ABB3-37B80...


In [27]:
# Insert one tbl_node row per legal basis; the whole row is stored as the JSONB
# `prop` document.
# Values are bound as query parameters instead of being pasted into the SQL
# text: legal-basis names are free text and must not be able to break the statement.
insert_query = "insert into tbl_node (id, prop) values (%s, %s)"
for i, row in df_legal_basis.iterrows():
    cur.execute(insert_query, (row['id'], row.to_json()))

conn.commit()

In [28]:
# Read the node table back to verify the legal-basis inserts.
df = sqlio.read_sql_query("select * from tbl_node", conn)
df.tail()

Unnamed: 0,id,guid,prop,created
232,GDPR.LEGAL_BASIS.33180537-743B-4338-A019-F8B1A...,d274abd1-ebd6-449f-9b03-260b188e4fa6,{'id': 'GDPR.LEGAL_BASIS.33180537-743B-4338-A0...,2020-01-06 14:19:07.095261
233,GDPR.LEGAL_BASIS.69DC1965-3BF4-46A0-B9E1-A92C3...,86b7db02-fc2f-4c1c-b7f3-5999f7373c9e,{'id': 'GDPR.LEGAL_BASIS.69DC1965-3BF4-46A0-B9...,2020-01-06 14:19:07.095261
234,GDPR.LEGAL_BASIS.8447EACB-9EA5-4116-A672-376D8...,678816aa-74b3-4684-9f29-f8221265180f,{'id': 'GDPR.LEGAL_BASIS.8447EACB-9EA5-4116-A6...,2020-01-06 14:19:07.095261
235,GDPR.LEGAL_BASIS.2BC9C1C8-A4AB-437E-8F1C-EC5CF...,151d9cbd-128b-420b-9a0f-8ad2ca599e69,{'id': 'GDPR.LEGAL_BASIS.2BC9C1C8-A4AB-437E-8F...,2020-01-06 14:19:07.095261
236,GDPR.LEGAL_BASIS.0663F29E-E43A-4AE8-9DFA-C12E6...,a9381f16-f1b6-4489-a2df-9f51fb7c3959,{'id': 'GDPR.LEGAL_BASIS.0663F29E-E43A-4AE8-9D...,2020-01-06 14:19:07.095261


### Information Types

In [29]:

# Build the information-type nodes from the fourth sheet of the workbook.
sheet = xls.sheet_names[3]

# No `encoding` argument: current pandas read_excel does not accept it.
df_concepts = pd.read_excel(xls, sheet)
df_concepts = df_concepts[['title', 'description']]
df_concepts.columns = ['name', 'description']
# Normalise names so that ids built from them are stable.
df_concepts['name'] = df_concepts['name'].apply(lambda x: x.strip().upper())
# Add a placeholder node for missing values, then drop exact duplicate rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_concepts = pd.concat(
    [
        df_concepts,
        pd.DataFrame([{'name': 'NOT AVAILABLE', 'description': 'Not Available -Verdi mangler'}]),
    ],
    ignore_index=True,
).drop_duplicates()
df_concepts['label'] = 'Information type'
df_concepts['type'] = 'GDPR.INFORMATION_TYPE'
df_concepts['id'] = df_concepts.apply(get_id, axis=1)
df_concepts.head()

Unnamed: 0,name,description,label,type,id
0,SIVILSTAND,En overordnet kategori som beskriver en person...,Information type,GDPR.INFORMATION_TYPE,GDPR.INFORMATION_TYPE.SIVILSTAND
1,ARBEIDSFORHOLD,"Avtaleforhold hvor den ene part, arbeidstakere...",Information type,GDPR.INFORMATION_TYPE,GDPR.INFORMATION_TYPE.ARBEIDSFORHOLD
2,KJØNN,TODO - mangler i begrepskatalogen og i MFNs be...,Information type,GDPR.INFORMATION_TYPE,GDPR.INFORMATION_TYPE.KJØNN
3,NAVN,I Norge skal alle ha fornavn og ett enkelt ell...,Information type,GDPR.INFORMATION_TYPE,GDPR.INFORMATION_TYPE.NAVN
4,FØDSELSDATO,Datoen personen er født.,Information type,GDPR.INFORMATION_TYPE,GDPR.INFORMATION_TYPE.FØDSELSDATO


In [30]:

# Insert one tbl_node row per information type; the whole row is stored as the
# JSONB `prop` document.
# Values are bound as query parameters instead of being pasted into the SQL
# text: descriptions are free text and must not be able to break the statement.
insert_query = "insert into tbl_node (id, prop) values (%s, %s)"
for i, row in df_concepts.iterrows():
    cur.execute(insert_query, (row['id'], row.to_json()))

conn.commit()

In [31]:
# Read the node table back to verify the information-type inserts.
df = sqlio.read_sql_query("select * from tbl_node", conn)
df.tail()

Unnamed: 0,id,guid,prop,created
253,GDPR.INFORMATION_TYPE.FLYKTNINGSTATUS,4ac491eb-b40d-4ab2-8b24-eb72d606abe9,{'id': 'GDPR.INFORMATION_TYPE.FLYKTNINGSTATUS'...,2020-01-06 14:19:07.904161
254,GDPR.INFORMATION_TYPE.FORELDREANSVAR,354d2e74-c69a-440f-89eb-a69d77d4aade,"{'id': 'GDPR.INFORMATION_TYPE.FORELDREANSVAR',...",2020-01-06 14:19:07.904161
255,GDPR.INFORMATION_TYPE.FORSTERFORELDRE,ffa98221-f943-4780-a999-30432cf9282a,{'id': 'GDPR.INFORMATION_TYPE.FORSTERFORELDRE'...,2020-01-06 14:19:07.904161
256,GDPR.INFORMATION_TYPE.FULLMAKT,4b638fcc-3d27-4bc8-b6ee-6b5b0591ad45,"{'id': 'GDPR.INFORMATION_TYPE.FULLMAKT', 'name...",2020-01-06 14:19:07.904161
257,GDPR.INFORMATION_TYPE.NOT AVAILABLE,78124bc7-cc93-475c-b2e2-a9163a56aa15,"{'id': 'GDPR.INFORMATION_TYPE.NOT AVAILABLE', ...",2020-01-06 14:19:07.904161


### Categories

In [32]:
# Build the information-type category nodes from the `categories` column of
# the fourth sheet of the workbook.
sheet = xls.sheet_names[3]
# No `encoding` argument: current pandas read_excel does not accept it.
df_categories = pd.read_excel(xls, sheet)
df_categories = df_categories[['categories']]
df_categories.columns = ['name']
df_categories['name'] = df_categories['name'].apply(lambda x: x.strip().upper())
# Add a placeholder node for missing values, then keep each category once.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_categories = pd.concat(
    [df_categories, pd.DataFrame([{'name': 'NOT AVAILABLE'}])],
    ignore_index=True,
).drop_duplicates()
df_categories['label'] = 'Information type category'
df_categories['type'] = 'GDPR.INFORMATION_TYPE_CATEGORY'
df_categories['id'] = df_categories.apply(get_id, axis=1)
df_categories.head()

Unnamed: 0,name,label,type,id
0,PERSONALIA,Information type category,GDPR.INFORMATION_TYPE_CATEGORY,GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA
1,ARBEIDSFORHOLD,Information type category,GDPR.INFORMATION_TYPE_CATEGORY,GDPR.INFORMATION_TYPE_CATEGORY.ARBEIDSFORHOLD
6,KONTAKTOPPLYSNINGER,Information type category,GDPR.INFORMATION_TYPE_CATEGORY,GDPR.INFORMATION_TYPE_CATEGORY.KONTAKTOPPLYSNI...
9,FAMILIERELASJONER,Information type category,GDPR.INFORMATION_TYPE_CATEGORY,GDPR.INFORMATION_TYPE_CATEGORY.FAMILIERELASJONER
16,FOLKETRYGD,Information type category,GDPR.INFORMATION_TYPE_CATEGORY,GDPR.INFORMATION_TYPE_CATEGORY.FOLKETRYGD


In [33]:
# Insert one tbl_node row per information-type category; the whole row is
# stored as the JSONB `prop` document.
# Values are bound as query parameters instead of being pasted into the SQL
# text, so apostrophes or percent signs in the data cannot break the statement.
insert_query = "insert into tbl_node (id, prop) values (%s, %s)"
for i, row in df_categories.iterrows():
    cur.execute(insert_query, (row['id'], row.to_json()))

conn.commit()

### Nodes

In [34]:
# Load every node and lift `name` and `type` out of the JSONB document into
# their own columns, so edges can be resolved by (name, type) lookups.
df_nodes = sqlio.read_sql_query("select * from tbl_node", conn).assign(
    name=lambda nodes: nodes['prop'].map(lambda props: props['name']),
    type=lambda nodes: nodes['prop'].map(lambda props: props['type']),
)
df_nodes.head()

Unnamed: 0,id,guid,prop,created,name,type
0,GDPR.PROCESSING_ACTIVITY.ALDERSPENSJON,c791ef41-2496-4496-9f73-2f8fa5de8c41,{'id': 'GDPR.PROCESSING_ACTIVITY.ALDERSPENSJON...,2020-01-06 14:19:04.744794,ALDERSPENSJON,GDPR.PROCESSING_ACTIVITY
1,GDPR.PROCESSING_ACTIVITY.ARBEIDSAVKLARINGSPENG...,b85d1d83-0596-4ba8-8329-eb5b3be6c14c,{'id': 'GDPR.PROCESSING_ACTIVITY.ARBEIDSAVKLAR...,2020-01-06 14:19:04.744794,ARBEIDSAVKLARINGSPENGER (AAP),GDPR.PROCESSING_ACTIVITY
2,GDPR.PROCESSING_ACTIVITY.AVTALEFESTET PENSJON ...,f66a1752-148a-4367-af7c-747a46449dbd,{'id': 'GDPR.PROCESSING_ACTIVITY.AVTALEFESTET ...,2020-01-06 14:19:04.744794,AVTALEFESTET PENSJON (AFP) FOR PRIVAT SEKTOR,GDPR.PROCESSING_ACTIVITY
3,GDPR.PROCESSING_ACTIVITY.AVTALEFESTET PENSJON ...,c7f3fb59-57ab-4dee-b319-9ab3bc0c061d,{'id': 'GDPR.PROCESSING_ACTIVITY.AVTALEFESTET ...,2020-01-06 14:19:04.744794,AVTALEFESTET PENSJON (AFP) FOR STATLIG OG KOMM...,GDPR.PROCESSING_ACTIVITY
4,GDPR.PROCESSING_ACTIVITY.BARNEBIDRAG,d73f4841-c705-4fd1-9485-b6e9e981cb3f,"{'id': 'GDPR.PROCESSING_ACTIVITY.BARNEBIDRAG',...",2020-01-06 14:19:04.744794,BARNEBIDRAG,GDPR.PROCESSING_ACTIVITY


### Edges

In [35]:
# Build the information type -> category edge list from the fourth sheet.
sheet = xls.sheet_names[3]
# No `encoding` argument: current pandas read_excel does not accept it.
df_edges = pd.read_excel(xls, sheet)
df_edges = df_edges[['title', 'categories']].drop_duplicates()
df_edges.columns = ['source', 'target']
df_edges['source'] = df_edges['source'].apply(lambda x: x.strip().upper())
df_edges['target'] = df_edges['target'].apply(lambda x: x.strip().upper())
# Normalisation can turn distinct raw rows into identical pairs, and tbl_edge
# has a unique (n1, n2) constraint. drop_duplicates returns a new frame, so the
# result has to be assigned to take effect.
df_edges = df_edges.drop_duplicates()

df_edges.head()

Unnamed: 0,source,target
0,SIVILSTAND,PERSONALIA
1,ARBEIDSFORHOLD,ARBEIDSFORHOLD
2,KJØNN,PERSONALIA
3,NAVN,PERSONALIA
4,FØDSELSDATO,PERSONALIA


In [36]:
# End the current transaction before the edge tables are rewritten.
conn.commit()

In [37]:
# Start from an empty edge table so the cell can be re-run.
cur.execute("delete from tbl_edge")
conn.commit()

# Insert one edge per (information type -> category) pair.
# The type masks do not depend on the row, so they are computed once.
is_information_type = df_nodes['type'] == 'GDPR.INFORMATION_TYPE'
is_category = df_nodes['type'] == 'GDPR.INFORMATION_TYPE_CATEGORY'
edge_prop = json.dumps({"edge": "array[5]"})
# Values are bound as query parameters instead of being pasted into the SQL text.
insert_edge = "insert into tbl_edge (n1, n2, prop) values (%s, %s, %s)"

for _, row in df_edges.iterrows():
    # .item() raises unless exactly one node matches, which surfaces missing
    # or ambiguous nodes immediately.
    source_id = df_nodes.loc[(df_nodes['name'] == row['source']) & is_information_type, 'id'].item()
    target_id = df_nodes.loc[(df_nodes['name'] == row['target']) & is_category, 'id'].item()
    cur.execute(insert_edge, (source_id, target_id, edge_prop))

conn.commit()

In [38]:
# Build the reverse edge list (category -> information type) from the fourth sheet.
# The sheet is selected explicitly so this cell does not rely on `sheet` left
# over from a previous cell.
sheet = xls.sheet_names[3]
# No `encoding` argument: current pandas read_excel does not accept it.
df_edges = pd.read_excel(xls, sheet)
df_edges = df_edges[['categories', 'title']].drop_duplicates()
df_edges.columns = ['source', 'target']
df_edges['source'] = df_edges['source'].apply(lambda x: x.strip().upper())
df_edges['target'] = df_edges['target'].apply(lambda x: x.strip().upper())
# Normalisation can turn distinct raw rows into identical pairs, and tbl_edge
# has a unique (n1, n2) constraint. drop_duplicates returns a new frame, so the
# result has to be assigned to take effect.
df_edges = df_edges.drop_duplicates()

df_edges.head()

Unnamed: 0,source,target
0,PERSONALIA,SIVILSTAND
1,ARBEIDSFORHOLD,ARBEIDSFORHOLD
2,PERSONALIA,KJØNN
3,PERSONALIA,NAVN
4,PERSONALIA,FØDSELSDATO


In [39]:
# Insert one edge per (category -> information type) pair.
# The type masks do not depend on the row, so they are computed once.
is_category = df_nodes['type'] == 'GDPR.INFORMATION_TYPE_CATEGORY'
is_information_type = df_nodes['type'] == 'GDPR.INFORMATION_TYPE'
edge_prop = json.dumps({"edge": "array[6]"})
# Values are bound as query parameters instead of being pasted into the SQL text.
insert_edge = "insert into tbl_edge (n1, n2, prop) values (%s, %s, %s)"

for _, row in df_edges.iterrows():
    # .item() raises unless exactly one node matches, which surfaces missing
    # or ambiguous nodes immediately.
    source_id = df_nodes.loc[(df_nodes['name'] == row['source']) & is_category, 'id'].item()
    target_id = df_nodes.loc[(df_nodes['name'] == row['target']) & is_information_type, 'id'].item()
    cur.execute(insert_edge, (source_id, target_id, edge_prop))

conn.commit()

In [40]:
# Read the edge table back to verify the inserts.
sql = "select * from tbl_edge"
df = sqlio.read_sql_query(sql, conn)
df.head()

Unnamed: 0,n1,n2,prop,created
0,GDPR.INFORMATION_TYPE.SIVILSTAND,GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,{'edge': 'array[5]'},2020-01-06 14:19:10.400986
1,GDPR.INFORMATION_TYPE.ARBEIDSFORHOLD,GDPR.INFORMATION_TYPE_CATEGORY.ARBEIDSFORHOLD,{'edge': 'array[5]'},2020-01-06 14:19:10.400986
2,GDPR.INFORMATION_TYPE.KJØNN,GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,{'edge': 'array[5]'},2020-01-06 14:19:10.400986
3,GDPR.INFORMATION_TYPE.NAVN,GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,{'edge': 'array[5]'},2020-01-06 14:19:10.400986
4,GDPR.INFORMATION_TYPE.FØDSELSDATO,GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,{'edge': 'array[5]'},2020-01-06 14:19:10.400986


In [73]:
# End the current transaction before changing the search function.
conn.commit()

In [74]:
# Drop any previous version of the search function; the argument types given
# here match the IN parameters of the definition created in the next cell.
cur.execute("DROP FUNCTION if exists graph_search1(text,integer,bigint)")
conn.commit()

In [84]:
# (Re)create graph_search1: a PL/pgSQL function that walks tbl_edge from a root
# node with a recursive CTE and returns one row per traversed edge.
#
# Two details matter for text node ids:
#   * the root id goes into the dynamic SQL with %L (quoted literal); %s would
#     paste it unquoted, producing invalid and injectable SQL;
#   * the final SELECT casts to the declared OUT types (text / text[]), because
#     RETURN QUERY requires the column types to match exactly, while tbl_edge
#     stores varchar ids and a jsonb property.
sql = """
create or replace function graph_search1(
  IN i_root text,                       -- The node that the search is based on
  IN i_depth int  default 99999,        -- the tier to search (the depth limit)
  IN i_limit int8 default 2000000000,   -- limit the number of records returned for each tier
  OUT o_path text[],                    -- output: path, an array of IDs
  OUT o_point1 text,                    -- output: point 1 ID
  OUT o_point2 text,                    -- output: point 2 ID
  OUT o_link_prop text,                 -- output: the connection property between the two current points
  OUT o_link_prop_all text,             -- output: the connection property from the starting node to the current node
  OUT o_depth int                       -- output: current depth (tier)
) returns setof record as
$$

declare
  sql text;
begin
sql := format($_$
WITH RECURSIVE search_graph(
  n1,     -- point 1
  n2,     -- point 2
  prop,   -- current edge property
  all_prop,  -- properties of all edges
  depth,  -- current depth, starting from 1
  path    -- path, stored as an array
) AS (
        select n1,n2,prop,all_prop,depth,path from (
        SELECT                               -- ROOT node query
          g.n1,                              -- point 1
          g.n2,                              -- point 2
          g.prop,                            -- edge property
          g.prop::text as all_prop,          -- properties of all edges
          1 depth,                           -- initial depth=1
          ARRAY[g.n1, g.n2] path             -- initial path
        FROM tbl_edge AS g
        WHERE
          n1 = %L                            -- ROOT node=? (inserted as a quoted literal)
          limit %s                           -- How many records are limited at each tier?
        ) t
      UNION ALL
        select n1,n2,prop,all_prop,depth,path from (
        SELECT                               -- recursive clause
          g.n1,                              -- point 1
          g.n2,                              -- point 2
          g.prop,                            -- edge property
          sg.all_prop || g.prop::text as all_prop,    -- properties of all edges
          sg.depth + 1 depth,                   -- depth +1
          sg.path || g.n2 path                 -- Add a new point to the path
        FROM tbl_edge AS g, search_graph AS sg    -- circular INNER JOIN
        WHERE
          g.n1 = sg.n2                       -- recursive JOIN condition
          AND (g.n2 <> ALL(sg.path))                      -- Prevent loop, determine whether it is a loop and judge if the new point is already in the previous path
          AND sg.depth <= %s                 -- search depth =?
          limit %s                           -- How many records are limited at each tier?
        ) t
)
SELECT path::text[] as o_path, n1::text as o_point1, n2::text as o_point2, prop::text as o_link_prop, all_prop as o_link_prop_all, depth as o_depth
FROM search_graph;                           -- query a recursive table. You can add LIMIT output or use a cursor
$_$, i_root, i_limit, i_depth, i_limit
);

return query execute sql;

end;

$$
 language plpgsql strict;
"""

cur.execute(sql)

In [85]:
# Persist the function definition.
conn.commit()

In [86]:
%%time
sql = 'select * from graph_search1("GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA");'
df = sqlio.read_sql_query(sql, conn)

DatabaseError: Execution failed on sql 'select * from graph_search1("GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA");': column "GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA" does not exist
LINE 1: select * from graph_search1("GDPR.INFORMATION_TYPE_CATEGORY....
                                    ^


In [67]:
# Cross-check: list the edges that start at the PERSONALIA category node
# directly from tbl_edge.
sql = "select * from tbl_edge where n1 = 'GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA'"
df = sqlio.read_sql_query(sql, conn)
df.head()

Unnamed: 0,n1,n2,prop,created
0,GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,GDPR.INFORMATION_TYPE.SIVILSTAND,{'edge': 'array[6]'},2020-01-06 14:19:10.790199
1,GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,GDPR.INFORMATION_TYPE.KJØNN,{'edge': 'array[6]'},2020-01-06 14:19:10.790199
2,GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,GDPR.INFORMATION_TYPE.NAVN,{'edge': 'array[6]'},2020-01-06 14:19:10.790199
3,GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,GDPR.INFORMATION_TYPE.FØDSELSDATO,{'edge': 'array[6]'},2020-01-06 14:19:10.790199
4,GDPR.INFORMATION_TYPE_CATEGORY.PERSONALIA,GDPR.INFORMATION_TYPE.FØDSELSNUMMER,{'edge': 'array[6]'},2020-01-06 14:19:10.790199
