# Below is a step-by-step example that how the pharma_subgraph KG used for SQID is built

Set up environment variables with location of the input files

In [3]:
kgtk_file = "kgtk_files/pharma_subgraph.tsv"

In [4]:
%env ttl_files=ttl_files
%env ES_DATA_FOLDER_PATH = flask/es/es_data

env: ttl_files=ttl_files
env: ES_DATA_FOLDER_PATH=flask/es/es_data


## Building Wikidata triples and Mediawiki json files

pharma_subgraph.tsv link: https://drive.google.com/file/d/11oAhqCNR8vAIKXv0IhuIcy-L6NuDFxKF/view?usp=sharing

### Generate Wikidata Triples (under sqid/ttl_files) from KGTK TSV (pharma_subgraph) -- kgtk generate_wikidata_triples

In [7]:
import os
out = os.system("cat {} \
| kgtk generate_wikidata_triples -pf flask/flask_data/wikidata_properties.tsv -w yes -pd yes > subgraph.ttl"
          .format(kgtk_file))
if out == 0:
    print ("TTL generated")
else:
    print ("Fail to generate TTL")

TTL generated


In [8]:
# stat of output ttl file
!stat subgraph.ttl

  File: subgraph.ttl
  Size: 6020115   	Blocks: 11768      IO Block: 4096   regular file
Device: 10305h/66309d	Inode: 280344      Links: 1
Access: (0664/-rw-rw-r--)  Uid: ( 1004/mingyuet)   Gid: ( 1004/mingyuet)
Access: 2020-10-27 03:29:01.136354267 +0000
Modify: 2020-10-27 03:29:35.220355441 +0000
Change: 2020-10-27 03:29:35.220355441 +0000
 Birth: -


In [9]:
!head -n 5 subgraph.ttl

@prefix wikibase: <http://wikiba.se/ontology#> .
@prefix wd: <http://www.wikidata.org/entity/> .
@prefix wdt: <http://www.wikidata.org/prop/direct/> .
@prefix wdtn: <http://www.wikidata.org/prop/direct-normalized/> .
@prefix wdno: <http://www.wikidata.org/prop/novalue/> .


In [10]:
!mv *.ttl ttl_files/

### Generate Mediawiki JSON (under sqid/flask/es/es_data/) from KGTK TSV -- kgtk generate-mediawiki-jsons

In [23]:
file = kgtk_file.split("/")[-1]
os.system("cp {} {}/{}".format(kgtk_file, os.getenv("ES_DATA_FOLDER_PATH"), file))
 
out = os.system("cat {}/{} \
| kgtk generate-mediawiki-jsons -pf flask/flask_data/wikidata_properties.tsv -w yes -pd yes"
          .format(os.getenv("ES_DATA_FOLDER_PATH"), file))
if out == 0:
    print ("Media JSON file generated")
else:
    print ("Media JSON file generate failed")

Media JSON file generated


In [24]:
!mv *.jsonl $ES_DATA_FOLDER_PATH/

In [25]:
# stats of how many files in ES_DATA_FOLDER_PATH
!cd $ES_DATA_FOLDER_PATH/ && ls | wc -l

34


## Load trile file into Blazegraph

Given triple files loaded into Blazegraph

## Load json file into ES

### Install all required libraries

In [9]:
!python3 -m venv sqid-env
!source sqid-env/bin/activate

!cd flask && pip install -r requirements.txt
!npm install .

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
[K[?25h[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m bootstrap@4.5.2 requires a peer of jquery@1.9.1 - 3 but none is installed. You must install peer dependencies yourself.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35moptional[0m SKIPPING OPTIONAL DEPENDENCY: fsevents@2.1.3 (node_modules/fsevents):
[0m[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35mnotsup[0m SKIPPING OPTIONAL DEPENDENCY: Unsupported platform for fsevents@2.1.3: wanted {"os":"darwin","arch":"any"} (current: {"os":"linux","arch":"x64"})
[0m[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35moptional[0m SKIPPING OPTIONAL DEPENDENCY: fsevents@1.2.13 (node_modules/jest-haste-map/node_modules/fsevents):
[0m[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35mnotsup[0m SKIPPING OPTIONAL DEPENDENCY: Unsupported platform for fsevents@1.2.13: want

### Modify Configuration file (Global configure)
https://github.com/mtang724/sqid/blob/master/global_config.py

Parameters:
- HOST = "localhost"
- SQID_PORT = 8051
- SPARQL_PORT = 10002
- FLASK_PORT = 5556
- ES_INDEX = "kgtk_files"
- ELASTICSEARCH_PORT = 9200
- ES_DATA_FOLDER_PATH = "/home/mingyuet/sqid/flask/es/es_data"

In [10]:
!head -n 15 global_config.py

import argparse

HOST = "localhost"
SQID_PORT = 8051
SPARQL_PORT = 11102
FLASK_PORT = 5556
ES_INDEX = "kgtk_files"
ELASTICSEARCH_PORT = 9200

# Flask application config
PROPERTY_FILES = [
	"flask_data/wikidata_properties.tsv"
]
ES_INDEX = ES_INDEX
ELASTICSEARCH_HOST = "http://{}:{}".format(HOST, str(ELASTICSEARCH_PORT))


### Create Index for ES

In [11]:
from elasticsearch import Elasticsearch
import os
from global_config import ES_INDEX

In [12]:
es = Elasticsearch()

In [13]:
# index name is defined in global_config file, ES_INDEX, please change this accordingly
if not es.indices.exists(index=ES_INDEX):
    os.system("cd flask/es/ && python create_index.py")

### import index data from es_data (mediawiki json files)

Mediawiki json files path(es_data) is defined in global_config file

In [14]:
!cd flask/es/ && python import_data.py

100%|███████████████████████████████████████████| 32/32 [00:03<00:00,  8.10it/s]


delete all imported data

In [15]:
!rm -rf $ES_DATA_FOLDER_PATH/*.jsonl

Output some example indexes

In [16]:
res = es.search(index=ES_INDEX, body={"query": {"match_all": {}}})
print("Got %d Hits:" % res['hits']['total']['value'])
for hit in res['hits']['hits']:
    print(hit["_source"])

Got 10000 Hits:
{'labels': {'en': {'languange': 'en', 'value': 'Dextromethorphan/quinidine'}}, 'descriptions': {}, 'aliases': {}, 'claims': {}, 'sitelinks': {}, 'type': 'item', 'id': 'Q5268496', 'pageid': -1, 'ns': -1, 'title': 'Q5268496', 'lastrevid': '2000-01-01T00:00:00Z'}
{'labels': {'en': {'languange': 'en', 'value': 'Lewy body dementia'}}, 'descriptions': {}, 'aliases': {}, 'claims': {}, 'sitelinks': {}, 'type': 'item', 'id': 'Q52697931', 'pageid': -1, 'ns': -1, 'title': 'Q52697931', 'lastrevid': '2000-01-01T00:00:00Z'}
{'labels': {'en': {'languange': 'en', 'value': 'one-of qualifier value property constraint'}}, 'descriptions': {}, 'aliases': {}, 'claims': {}, 'sitelinks': {}, 'type': 'item', 'id': 'Q52712340', 'pageid': -1, 'ns': -1, 'title': 'Q52712340', 'lastrevid': '2000-01-01T00:00:00Z'}
{'labels': {'en': {'languange': 'en', 'value': 'diflorasone'}}, 'descriptions': {}, 'aliases': {}, 'claims': {}, 'sitelinks': {}, 'type': 'item', 'id': 'Q5275451', 'pageid': -1, 'ns': -1, '

### Run Global Configure File to generate endpoints for SQID and ES

In [17]:
# generate endpoints config file for SQID
!python global_config.py --is_generate True

### Run ES index system using Flask

and

### Run SQID through npm run serve (development enviorment)

In [18]:
import subprocess
flask_app= subprocess.Popen("cd flask/&&python app.py", shell=True)
sqid_interface = subprocess.Popen("npm run serve", shell=True)