**QuickGO API data to SQLite DB for CONLL dataset entries**

- This module calls the QuickGO API to fetch data on entities found in the MLEE conll dataset. First it finds the list of all entities, then it calls the QuickGO API to fetch additional data, which is then stored in SQLite.
- The reason is that the HTTP API has rate limits, so it cannot be called reliably during the training/testing processes. It is better to store the data in a local SQLite instance and fetch it from there as needed.
- This module should only run once to fetch the data. 

In [None]:
import os
import sqlite3
import sys
import time
from urllib.parse import quote

import requests as http

In [None]:
class CONLLQuickGOAPIToSQLiteDB:
    """Fetch QuickGO ontology data for CoNLL entity tokens and store it in SQLite.

    Typical use: call ``extract_tokens_and_tags()`` to collect the tagged
    tokens from the ``.conll`` files under ``input_dir``, then
    ``call_QuickGO_API_for_all_tokens()`` to query QuickGO once per token and
    insert one row per token into the ``QuickGOCONLL`` table.

    NOTE: this class only inserts into ``QuickGOCONLL``; it never creates the
    table, so the table must already exist in the database at ``sqlite_path``
    with the columns ``entity, GO_id, GO_name, GO_definition, GO_aspect``.
    """

    # Commit and pause after this many requests (see
    # call_QuickGO_API_for_all_tokens).
    _BATCH_SIZE = 50
    _PAUSE_SECONDS = 1

    def __init__(self, input_dir, sqlite_path, request_timeout=30):
        """Store the configuration and open the SQLite connection.

        Args:
            input_dir: Directory searched recursively for ``.conll`` files.
            sqlite_path: Path handed to ``sqlite3.connect``.
            request_timeout: Seconds to wait for each QuickGO HTTP request
                before giving up, so a stalled connection cannot hang the run.
        """
        self.req_base_url = "https://www.ebi.ac.uk/QuickGO/services/ontology/go/search?query="
        self.req_suffix = "&limit=1&page=1"
        self.req_headers = {"Accept": "application/json"}
        self.request_timeout = request_timeout
        self.input_dir = input_dir
        self.sqlite_conn = sqlite3.connect(sqlite_path)
        # token -> entity tag, filled by extract_tokens_and_tags().
        self.token_entity_mappings = {}

    def close(self):
        """Close the underlying SQLite connection."""
        self.sqlite_conn.close()

    def call_QuickGO_API(self, entity):
        """Search QuickGO for ``entity`` and return the decoded JSON response.

        Args:
            entity: Token text used as the search query.

        Returns:
            The parsed JSON body of the response.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.RequestException: On connection problems or timeout.
        """
        # Percent-encode the token so characters such as '&', '#', '+' or
        # spaces cannot change the meaning of the query string.
        request_url = self.req_base_url + quote(entity, safe="") + self.req_suffix
        # The headers must be passed by keyword: the second positional
        # argument of requests.get() is ``params``, not ``headers``.
        resp = http.get(
            request_url,
            headers=self.req_headers,
            timeout=self.request_timeout,
        )
        resp.raise_for_status()
        return resp.json()

    def extract_tokens_and_tags(self):
        """Collect every token whose tag is not 'O' from the ``.conll`` files.

        Walks ``self.input_dir`` recursively. Only lines with exactly four
        tab-separated fields are used: the first field is the token and the
        last one is its tag. Tokens of a single character are ignored. Results
        are added to ``self.token_entity_mappings``; when a token occurs more
        than once, the tag read last wins. A short summary is printed.
        """
        num_files = 0
        entity_types = set()
        for dirpath, _, filenames in os.walk(self.input_dir):
            for filename in filenames:
                if not filename.endswith(".conll"):
                    continue
                num_files += 1
                file_path = os.path.join(dirpath, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    for line in file:
                        if not line.strip():
                            continue  # blank line between sentences
                        parts = line.split("\t")
                        if len(parts) != 4:
                            continue
                        token, entity_type = parts[0], parts[-1].strip()
                        if entity_type != 'O' and len(token) > 1:
                            entity_types.add(entity_type)
                            self.token_entity_mappings[token] = entity_type

        print(f"Read {num_files} from the input_directory recursively. ")
        print(f"There are {len(self.token_entity_mappings)} TOKEN:ENTITY correspondances")
        print(f"There are {len(entity_types)} unique entity types")

    def call_QuickGO_API_for_all_tokens(self):
        """Query QuickGO for every collected token and store each answer.

        After every ``_BATCH_SIZE`` requests the pending rows are committed
        and the loop sleeps for ``_PAUSE_SECONDS`` to throttle the request
        rate. Rows inserted so far are committed even when a request fails,
        so an error does not discard the current partial batch.

        Raises:
            Whatever ``call_QuickGO_API`` or ``parse_and_save_to_DB`` raise.
        """
        print("KBs Concept Embedding for detected CONLL tokens")
        request_count = 0
        try:
            for request_count, key in enumerate(self.token_entity_mappings, start=1):
                token_response = self.call_QuickGO_API(key)
                self.parse_and_save_to_DB(key, token_response)

                if request_count % self._BATCH_SIZE == 0:  # for compliance with QuickGO API terms
                    self.sqlite_conn.commit()
                    print(f"Saved {request_count} responses to DB")
                    time.sleep(self._PAUSE_SECONDS)
        finally:
            # Persist whatever was inserted, also on the error path.
            self.sqlite_conn.commit()
        print(f"Saved {request_count} responses to DB")

    def parse_and_save_to_DB(self, key, response):
        """Insert one row for ``key`` built from a QuickGO search response.

        When ``response["results"]`` holds exactly one hit, its id, name,
        definition text and aspect are stored. Otherwise a row with empty
        strings is stored so the token is still recorded. A hit without a
        name, definition or aspect gets an empty string for that field
        instead of aborting the run.

        Args:
            key: Token text the response belongs to.
            response: Decoded JSON mapping with a ``"results"`` list.
        """
        results = response["results"]
        if len(results) != 1:
            self.save_API_response_to_DB(key, "", "", "", "")
            return

        resp_info = results[0]
        definition = resp_info.get("definition") or {}
        self.save_API_response_to_DB(
            key,
            resp_info["id"],
            resp_info.get("name", ""),
            definition.get("text", ""),
            resp_info.get("aspect", ""),
        )

    def save_API_response_to_DB(self, key, GO_id, GO_name, GO_definition, GO_aspect):
        """Insert one row into ``QuickGOCONLL``; the caller commits.

        Args:
            key: Token text, stored in the ``entity`` column.
            GO_id: Value for the ``GO_id`` column.
            GO_name: Value for the ``GO_name`` column.
            GO_definition: Value for the ``GO_definition`` column.
            GO_aspect: Value for the ``GO_aspect`` column.
        """
        # Connection.execute() manages its own temporary cursor, so no cursor
        # object is left open per inserted row.
        self.sqlite_conn.execute(
            '''
            INSERT INTO QuickGOCONLL (entity, GO_id, GO_name, GO_definition, GO_aspect) VALUES (?, ?, ?, ?, ?)
            ''',
            (key, GO_id, GO_name, GO_definition, GO_aspect),
        )
        
                                        

In [None]:
# Directory that is searched recursively for the MLEE ``.conll`` files.
input_dir = '../BME Corpora/MLEE-1.0.2-rev1/conll'
# SQLite database file that the fetched QuickGO rows are written to.
sqlite_path = '../QuickGO.db'

In [None]:
# Build the fetcher (this opens the SQLite connection) and send one live
# request as a smoke test before starting the full crawl.
api_to_db = CONLLQuickGOAPIToSQLiteDB(input_dir, sqlite_path)
test_API_response = api_to_db.call_QuickGO_API("VEGF") # TEST
# Show the first hit; this raises IndexError if the query returned no results.
print(test_API_response["results"][0])

In [None]:
# Walk input_dir and collect every token (longer than one character) whose
# tag is not 'O' into api_to_db.token_entity_mappings.
api_to_db.extract_tokens_and_tags()

In [None]:
# Query QuickGO once per collected token and insert one row per token into
# the SQLite database. Network-bound; commits after every 50 requests.
api_to_db.call_QuickGO_API_for_all_tokens()