# 1.0-load-gfe-db / Load Database

Development notebook for executing Cypher transactions through the Neo4j HTTP API.

Activities:
- Generate S3 pre-signed URL for each CSV
- Update the load script with the pre-signed URL
- Execute the load script
- Validate that the load script has been executed or is running

In [43]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [44]:
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv());

In [45]:
import os
import sys
sys.path.append("../") if "../" not in sys.path else None
import logging
import re
import base64
import json
import requests
import boto3

In [46]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)

## Generate Pre-signed URLs for CSV files

In [47]:
# Environment variables
root = os.environ["ROOT"]
scripts_dir = os.environ["BIN_DIR"]
src_dir = os.environ["SRC_DIR"]
data_dir = os.environ["DATA_DIR"]
logs_dir = os.environ["LOGS_DIR"]
cypher_dir = os.environ["CYPHER_PATH"]
load_script = os.environ["SCRIPT"]

s3_bucket = os.environ["GFE_BUCKET"]
release = os.environ["RELEASES"]

In [48]:
release

'3440'

In [49]:
path = "/".join([logs_dir, "s3CopyLog.txt"])
path = "." + path

def parseS3CopyLog(path):
    
    logger.info(f"Parsing file: {path}")
    
    # Identify S3 urls in text
    pattern = r's3:\/\/([^/]+)\/([\w\W]+)\.(.*)'
    s3_urls = []
    
    try:
        with open(path, "r") as file:
            for line in file.readlines():
                if "upload" in line:
                    s3_url = re.search(pattern, line).group(0)
                    s3_urls.append(s3_url)
                    logger.info(f"Found: {s3_url}")
    except Exception as err:
        logger.error("Could not parse file")
        raise err
        
    return s3_urls

In [50]:
s3_urls = parseS3CopyLog(path)

In [51]:
s3_urls

['s3://gfe-db-4498/data/3440/csv/all_groups.3440.csv',
 's3://gfe-db-4498/data/3440/csv/all_cds.3440.csv',
 's3://gfe-db-4498/data/3440/csv/gfe_sequences.3440.csv',
 's3://gfe-db-4498/data/3440/csv/all_features.3440.csv',
 's3://gfe-db-4498/data/3440/csv/all_alignments.3440.csv']

In [52]:
# Get the service client.
s3 = boto3.client('s3')

def generate_presigned_urls(s3_urls, expire=3600):
    """Accepts a list of S3 URLs or paths and returns
    a dictionary of pre-signed URLs for each"""
    
    logger.info(f"Generating pre-signed URLs...")
    
    s3_urls = [s3_urls] if not isinstance(s3_urls, list) else s3_urls
    
    presigned_urls = {}
    
    for s3_url in s3_urls:
        
        i = 2 if "s3://" in s3_url else 0
        
        bucket = s3_url.split("/")[i]
        key = "/".join(s3_url.split("/")[i + 1:])
        
        # Generate the URL to get 'key-name' from 'bucket-name'
        url = s3.generate_presigned_url(
            ClientMethod='get_object',
            Params={
                'Bucket': bucket,
                'Key': key
            },
            ExpiresIn=expire
        )
        
        presigned_urls[s3_url] = url
        
    return presigned_urls

In [53]:
presigned_urls = generate_presigned_urls(s3_urls)

## Update the load script with the pre-signed URL

In [54]:
cypher_path = "/".join([root, f'{cypher_dir}/{load_script}'])
cypher_path = "." + cypher_path

In [55]:
cypher_path

'../neo4j/cypher/load.cyp'

In [56]:
def update_cypher(cypher_path):

    with open(cypher_path, "r") as file:
        cypher_script = file.read()

    for s3_url in s3_urls:

        csv_prefix = s3_url.split("/")[-1].split(".")[0]
        cypher_script = cypher_script.replace(f'file:///{csv_prefix}.RELEASE.csv', presigned_urls[s3_url])
        
    return cypher_script

In [57]:
cypher_script = update_cypher(cypher_path)

## HTTP Request to load GFE DB

In [58]:
protocol = 'http'
host = "18.206.14.94"
username = "neo4j"
password = "gfedb"
port = "7474"
endpoint = "db/neo4j/tx/commit"
url = f'{protocol}://{host}:{port}/{endpoint}'

In [59]:
# cypher = "MATCH (m:Movie) RETURN m LIMIT 3"
cypher = list(filter(lambda x: x != "\n", cypher_script.split(";")))
cypher = list(map(lambda x: "".join([x, ";"]), cypher))

In [60]:
def run_cypher(cypher, debug=False):
    
    payload = {
        "statements": [
            {
                "statement": cypher,
                "params": {}
            }
        ]
    }
    
    # Headers
    headers = { 
        "Accept": "application/json;charset=UTF-8",
        "Content-Type": "application/json",
        "Authorization": f"Basic {base64.b64encode(':'.join([username, password]).encode()).decode()}"
    }

    # Send requests
    response = requests.post(
        url, 
        data=json.dumps(payload), 
        headers=headers)
    
    response = json.loads(response.content)
    
    # if debug:
    #     print(f"statement: {statement}")
    #     print(f"response: {response}\n\n")
    
    if len(response['errors']) > 0:
        logger.error(response)
    else:
        logger.info(response)
    
    return response

In [62]:
cypher[3:4]

["\nUSING PERIODIC COMMIT 50000\nLOAD CSV WITH HEADERS FROM 'https://gfe-db-4498.s3.amazonaws.com/data/3440/csv/gfe_sequences.3440.csv?AWSAccessKeyId=ASIAXXVOWRIZL5RW2UYC&Signature=rVr%2F1QjkkP4FDsWyoHDedQGFm6E%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEIb%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLWVhc3QtMSJGMEQCIH%2FInTeqWC%2FuJuyZ01uimybfphX3YHzqgTXeGgVrx%2BOXAiAfMpRAcgILMassE%2Fa8IxAAO8aiUa9uMQDtGVlVnfVGWir3Agh%2FEAIaDDUzMTg2ODU4NDQ5OCIMiOzjQAx4Y1mScErEKtQCffUamuame0x7Z%2Bpn8AGdL1lXTsg8R5rv5Z4oKwXQskOvGSg4jtLgdDne2QrmeL5uJdERqtUlO%2FuzStluVes79rgODRiVON1w4oVjNnp712fx8Rp8pGeD6YvDJ7rLJdJrteUX82NqGuM5P5v%2Bi%2FwR1fDP7g343SSxHDwPUDHdl5JjHoIcYbZ9Uw%2Bh6aUwGTWFxTy11U3FkZdWDcATCnnynRpbJbGDQxLVho6zT10VgTIG6HhEuWmGQ1Db%2FUMNhe5avBJDVlwnVI0uq%2B%2BAEx3uOu3q3b8pr6zla1ztpbHzzvOYlUxJJIxvLh%2BVATNA13SNnT9V56Dv%2FyBiCy3jVtWfWn6pySwGcRjJ%2BH6L%2FuPPq%2Bxs9aBtJcPeY900UiQcLCQC%2FH3tqi13mnRyNyPEvu7gRkA1ytd2wtznWCq%2Fg9QxjXnEtWev6%2BYiPwtOLKqBqq0AHKuahDCoxNKHBjqnARtNpBqKvqhvyRTUnGUZdjnxZEy7joYNreREom4A52OCZiW

In [63]:
# limit = 10

for idx, statement in enumerate(cypher[3:4]):
    response = run_cypher(statement, debug=True)
    #print(response)
    logger.info(response)
    
    # if limit and idx + 1 == limit:
        # break

In [22]:
response

{'results': [{'columns': [], 'data': []}], 'errors': []}

## Test Neo4j Server

In [73]:
url = f"http://{host}:{port}"

# Send requests
response = requests.get(
    url, 
    headers={
        "Accept": "application/json"
    })

print(response.text)

{
  "bolt_routing" : "neo4j://18.206.14.94:7687",
  "transaction" : "http://18.206.14.94:7474/db/{databaseName}/tx",
  "bolt_direct" : "bolt://18.206.14.94:7687",
  "neo4j_version" : "4.3.2",
  "neo4j_edition" : "community"
}
