# 1.1-load-gfe-db / Load Database

Development notebook for executing Cypher transactions through the Neo4j HTTP API.

Activities:
- Generate an S3 pre-signed URL for each CSV
- Update the load script with the pre-signed URLs
- Execute the load script via Run Command
- Validate that the load script has been executed or is running

In [5]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [6]:
# Locate a .env file with find_dotenv() and load its contents via load_dotenv().
# The trailing semicolon keeps the call's return value out of the cell output.
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv());

In [7]:
import os
import sys
sys.path.append("../") if "../" not in sys.path else None
import logging
import re
import time
import base64
import json
import requests
import boto3

In [8]:
# Notebook-wide root logger at INFO level.
# basicConfig attaches a stderr handler only when the root logger has none
# (it is a no-op otherwise). Without any handler, INFO records would fall
# through to logging's last-resort handler, which only emits WARNING and above.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [11]:
!ls ..

Dockerfile         config             nodes-original.txt summary_agg.txt
EC2INSTRUCTIONS.md data               notebooks          summary_diff.txt
LICENSE            logs               requirements.txt
README.md          neo4j              scripts
cfn                nodes-new.txt      src


## Generate Pre-signed URLs for CSV files

In [9]:
# Environment variables (expected to be provided by the .env file loaded earlier
# in the notebook).
required_env_vars = (
    "ROOT",
    "BIN_DIR",
    "SRC_DIR",
    "DATA_DIR",
    "LOGS_DIR",
    "CYPHER_PATH",
    "SCRIPT",
    "GFE_BUCKET",
    "RELEASES",
)

# Report every missing variable at once instead of failing on the first lookup
# with an uninformative `KeyError: 'ROOT'`.
missing_env_vars = [name for name in required_env_vars if name not in os.environ]
if missing_env_vars:
    raise KeyError(
        f"Missing environment variable(s): {', '.join(missing_env_vars)}. "
        "Check that the .env file was found and loaded."
    )

root = os.environ["ROOT"]
scripts_dir = os.environ["BIN_DIR"]
src_dir = os.environ["SRC_DIR"]
data_dir = os.environ["DATA_DIR"]
logs_dir = os.environ["LOGS_DIR"]
cypher_dir = os.environ["CYPHER_PATH"]
load_script = os.environ["SCRIPT"]

s3_bucket = os.environ["GFE_BUCKET"]
release = os.environ["RELEASES"]

KeyError: 'ROOT'

In [6]:
# Show the release value read from the RELEASES environment variable.
release

'3440'

In [7]:
# Location of the S3 copy log: "." + LOGS_DIR + "/s3CopyLog.txt".
path = f".{logs_dir}/s3CopyLog.txt"

def parseS3CopyLog(path):
    """Extract the S3 URLs of uploaded objects from a copy log.

    Only lines containing the text "upload" are inspected. For each of them the
    first ``s3://...`` match is collected; an "upload" line that carries no
    S3 URL is skipped with a warning instead of aborting the whole parse.

    Args:
        path: Path of the log file to read.

    Returns:
        List of S3 URL strings in the order they appear in the file.

    Raises:
        Exception: Any error raised while opening or reading the file is
            logged and re-raised unchanged.
    """

    logger.info(f"Parsing file: {path}")

    # Identify S3 urls in text
    pattern = re.compile(r's3:\/\/([^/]+)\/([\w\W]+)\.(.*)')
    s3_urls = []

    try:
        with open(path, "r") as file:
            # Iterate lazily instead of materialising every line with readlines().
            for line in file:
                if "upload" not in line:
                    continue

                match = pattern.search(line)
                if match is None:
                    # e.g. a failure message that mentions "upload" but no URL
                    logger.warning(f"No S3 URL found in upload line: {line.rstrip()}")
                    continue

                s3_url = match.group(0)
                s3_urls.append(s3_url)
                logger.info(f"Found: {s3_url}")
    except Exception:
        logger.error("Could not parse file")
        raise

    return s3_urls

In [8]:
# Collect the S3 URLs found on the "upload" lines of the copy log.
s3_urls = parseS3CopyLog(path)

In [9]:
# Inspect the parsed URLs before generating pre-signed URLs for them.
s3_urls

['s3://gfe-db-4498/data/3440/csv/all_groups.3440.csv',
 's3://gfe-db-4498/data/3440/csv/all_cds.3440.csv',
 's3://gfe-db-4498/data/3440/csv/gfe_sequences.3440.csv',
 's3://gfe-db-4498/data/3440/csv/all_features.3440.csv',
 's3://gfe-db-4498/data/3440/csv/all_alignments.3440.csv']

In [10]:
# Create the S3 service client; generate_presigned_urls reads this
# module-level `s3` object when signing each URL.
s3 = boto3.client('s3')

def generate_presigned_urls(s3_urls, expire=3600):
    """Generate a pre-signed GET URL for each S3 object.

    Args:
        s3_urls: A single string or any iterable of strings. Each item is
            either an ``s3://bucket/key`` URL or a plain ``bucket/key`` path.
        expire: Value passed as ``ExpiresIn`` when signing each URL.

    Returns:
        Dictionary mapping every input string to its pre-signed URL.

    Raises:
        ValueError: If a bucket or key cannot be derived from an item.
    """

    logger.info("Generating pre-signed URLs...")

    # Only a bare string is wrapped; lists, tuples and other iterables are
    # iterated as-is (previously any non-list was treated as one URL).
    if isinstance(s3_urls, str):
        s3_urls = [s3_urls]

    scheme = "s3://"
    presigned_urls = {}

    for s3_url in s3_urls:

        # Strip the scheme only when it is an actual prefix of the string.
        if s3_url.startswith(scheme):
            location = s3_url[len(scheme):]
        else:
            location = s3_url.lstrip("/")

        bucket, _, key = location.partition("/")
        if not bucket or not key:
            raise ValueError(f"Cannot determine bucket and key from: {s3_url!r}")

        # Generate the URL to get 'key-name' from 'bucket-name'
        url = s3.generate_presigned_url(
            ClientMethod='get_object',
            Params={
                'Bucket': bucket,
                'Key': key
            },
            ExpiresIn=expire
        )

        presigned_urls[s3_url] = url

    return presigned_urls

In [11]:
# Map each S3 URL to a pre-signed URL using the default expiry (expire=3600).
presigned_urls = generate_presigned_urls(s3_urls)

## Update the load script with the pre-signed URL

In [None]:
# Location of the load script: "." + ROOT + "/" + CYPHER_PATH + "/" + SCRIPT.
cypher_path = f".{root}/{cypher_dir}/{load_script}"

In [None]:
# Inspect the resolved location of the load script.
cypher_path

In [14]:
def update_cypher(cypher_path, url_map=None):
    """Return the load script with its CSV file references replaced by
    S3 pre-signed URLs.

    Every occurrence of "file:///{csv_prefix}.RELEASE.csv" is replaced, where
    ``csv_prefix`` is the file name of an S3 URL up to its first ".".

    Args:
        cypher_path: Path of the Cypher script to read.
        url_map: Optional mapping of S3 URL -> pre-signed URL. When omitted,
            the notebook-level ``presigned_urls`` mapping is used, so existing
            single-argument calls keep working.

    Returns:
        The script text with the replacements applied. The file itself is not
        modified.
    """

    if url_map is None:
        # Backward-compatible default: the mapping built earlier in the notebook.
        url_map = presigned_urls

    with open(cypher_path, "r") as file:
        cypher_script = file.read()

    for s3_url, presigned_url in url_map.items():

        csv_prefix = s3_url.split("/")[-1].split(".")[0]
        cypher_script = cypher_script.replace(f'file:///{csv_prefix}.RELEASE.csv', presigned_url)

    return cypher_script

In [15]:
# Read the load script with its CSV file references swapped for pre-signed URLs.
cypher_script = update_cypher(cypher_path)

## HTTP Request to load GFE DB

In [16]:
# Neo4j HTTP API connection settings.
# Each value can be overridden through the environment (e.g. the .env file);
# the defaults reproduce the values previously hardcoded in this cell.
protocol = os.environ.get("NEO4J_PROTOCOL", 'http')
host = os.environ.get("NEO4J_HOST", "18.206.14.94")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
# NOTE(review): the fallback keeps the notebook working as before, but a
# password literal should not live in the notebook. Set NEO4J_PASSWORD and
# remove this default before sharing.
password = os.environ.get("NEO4J_PASSWORD", "gfedb")
port = os.environ.get("NEO4J_PORT", "7474")
# Transaction endpoint: begins and commits a transaction in a single request.
endpoint = "db/neo4j/tx/commit"
url = f'{protocol}://{host}:{port}/{endpoint}'

In [17]:
# Split the script into individual statements, each terminated with ";".
# Chunks that are empty or whitespace-only (such as whatever follows the final
# ";") are dropped so that no bare ";" statement is sent to the server.
# NOTE(review): a plain split on ";" assumes no statement contains a literal
# semicolon inside a string or comment.
cypher = [f"{chunk};" for chunk in cypher_script.split(";") if chunk.strip()]

In [18]:
def run_cypher(cypher, debug=False, connect_timeout=10.0):
    """Send one Cypher statement to the Neo4j HTTP transaction endpoint.

    The statement is POSTed to the notebook-level ``url`` with HTTP Basic
    credentials built from ``username`` and ``password``. The decoded response
    is logged at ERROR level when it reports errors and at INFO level otherwise.

    Args:
        cypher: Text of the Cypher statement to execute.
        debug: Accepted for backward compatibility; currently has no effect.
        connect_timeout: Seconds to wait while establishing the connection.

    Returns:
        The JSON-decoded response body.

    Raises:
        ValueError: If the response body is not valid JSON (logged with the
            HTTP status before being re-raised).
    """

    payload = {
        "statements": [
            {
                "statement": cypher,
                "params": {}
            }
        ]
    }

    # Headers
    credentials = base64.b64encode(':'.join([username, password]).encode()).decode()
    headers = {
        "Accept": "application/json;charset=UTF-8",
        "Content-Type": "application/json",
        "Authorization": f"Basic {credentials}"
    }

    # Send requests
    # Only the connection phase is bounded: bulk statements such as CSV loads
    # may run for a long time, so no read timeout is applied.
    http_response = requests.post(
        url,
        data=json.dumps(payload),
        headers=headers,
        timeout=(connect_timeout, None))

    try:
        response = json.loads(http_response.content)
    except ValueError:
        # json.JSONDecodeError is a ValueError; add context, keep the exception.
        logger.error(
            f"Non-JSON response (HTTP {http_response.status_code}): {http_response.text[:500]}"
        )
        raise

    # .get() tolerates a body without an 'errors' key instead of raising KeyError.
    if response.get('errors'):
        logger.error(response)
    else:
        logger.info(response)

    return response

In [23]:
# Preview the first two statements of the split script.
# NOTE(review): the statements embed pre-signed URLs, so this cell's output
# exposes the access key id, signature and security token. Clear the output
# before sharing or committing the notebook.
cypher[:2]

["// Nodes\nRETURN '(:GFE)' AS `Creating nodes...`;",
 "\nUSING PERIODIC COMMIT 50000\nLOAD CSV WITH HEADERS FROM 'https://gfe-db-4498.s3.amazonaws.com/data/3440/csv/gfe_sequences.3440.csv?AWSAccessKeyId=ASIAXXVOWRIZGCB4DOSY&Signature=yT4Srg0c%2B9HUliDlR6aMn3k%2FoeQ%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEIz%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLWVhc3QtMSJHMEUCIGuXxxE56WS1fn6lX4FDsWTZllP%2BvFXFjTtGrEz2LIxtAiEA2hSzhg7gCOG66BSi6P6Tt%2FaJKVuCVZU0%2BVIQQIihCxUqgAMIhf%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARACGgw1MzE4Njg1ODQ0OTgiDEA0MVBESt0j7bcZGirUAghP6oKhUfR3%2BOgG9zbS0JFrEDo5yke7jhXoCW2S6EVExsl%2BwTWmYowTsNCvpfNNJ6%2BEY%2B3ns9ZYs%2FkJ54BK1mxHvmB8ZO2MAsPqw4Z0WQu0NMsIWoo7Wbqzw2FwqXeLEbPl3EImXwRGx%2B%2FGfZ7a78TJJPTKW%2FjUjRAAPEHkbs%2F6o4fqc%2FX9NCVRUupBVo0J7ukD%2BzebgBXwVLzW7p0SyLPKwXKOW2vYNe194%2BxZgyvxQEdPzWyA9isQkXs2fdpcm%2B5x7nYbl%2B68oGn5xKo2Hss6hvSLv4Ok%2BUQ7xvFhs2tzT8t5bLUw1W%2FAV5be5AjlyWcc8kU9%2Bb1IxUn0Vj8Lb9INzb00Bbb0rlWJAwGo8rqjEitEG8CRtYhbQT8JpBOjNIJz2ZRccImWV1baggtTW%2FvkxanpStn4uwrVpjB

In [29]:
# Execute the load script one statement at a time and time each request.
# Set `limit` to an integer to run only the first N statements while testing;
# None runs every statement.
# NOTE(review): statements embed pre-signed URLs, so the printed and logged
# output contains their signatures. Clear the output before sharing.
limit = None

# perf_counter is monotonic, so the intervals are not affected by clock changes.
start = time.perf_counter()
for statement in cypher[:limit]:
    print(f'Executing statement: {statement}')
    statement_start = time.perf_counter()
    response = run_cypher(statement, debug=True)
    statement_elapsed_time = time.perf_counter() - statement_start
    logger.info(f'Statement: {statement}\nTime elapsed: {statement_elapsed_time}\nResponse: {response}')
    print(f'Time elapsed: {statement_elapsed_time}\nResponse: {response}\n\n')

time_elapsed = time.perf_counter() - start
print(f"Time elapsed: {time_elapsed} seconds")

Statement: // Nodes
RETURN '(:GFE)' AS `Creating nodes...`;
Time elapsed: 0.21080613136291504
Response: {'results': [{'columns': ['Creating nodes...'], 'data': [{'row': ['(:GFE)'], 'meta': [None]}]}], 'errors': []}


Statement: 
USING PERIODIC COMMIT 50000
LOAD CSV WITH HEADERS FROM 'https://gfe-db-4498.s3.amazonaws.com/data/3440/csv/gfe_sequences.3440.csv?AWSAccessKeyId=ASIAXXVOWRIZGCB4DOSY&Signature=yT4Srg0c%2B9HUliDlR6aMn3k%2FoeQ%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEIz%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLWVhc3QtMSJHMEUCIGuXxxE56WS1fn6lX4FDsWTZllP%2BvFXFjTtGrEz2LIxtAiEA2hSzhg7gCOG66BSi6P6Tt%2FaJKVuCVZU0%2BVIQQIihCxUqgAMIhf%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARACGgw1MzE4Njg1ODQ0OTgiDEA0MVBESt0j7bcZGirUAghP6oKhUfR3%2BOgG9zbS0JFrEDo5yke7jhXoCW2S6EVExsl%2BwTWmYowTsNCvpfNNJ6%2BEY%2B3ns9ZYs%2FkJ54BK1mxHvmB8ZO2MAsPqw4Z0WQu0NMsIWoo7Wbqzw2FwqXeLEbPl3EImXwRGx%2B%2FGfZ7a78TJJPTKW%2FjUjRAAPEHkbs%2F6o4fqc%2FX9NCVRUupBVo0J7ukD%2BzebgBXwVLzW7p0SyLPKwXKOW2vYNe194%2BxZgyvxQEdPzWyA9isQkXs2fdpcm%2B5x7nY

KeyboardInterrupt: 

In [26]:
# Inspect the response returned for the most recently executed statement.
response

{'results': [{'columns': [], 'data': []}], 'errors': []}

## Test Neo4j Server

In [73]:
# Query the server's root endpoint to confirm that Neo4j is reachable.
# A dedicated name is used here so the transaction `url` that run_cypher posts
# to is not overwritten when this cell runs.
server_url = f"{protocol}://{host}:{port}"

# Send requests
response = requests.get(
    server_url,
    headers={
        "Accept": "application/json"
    },
    # Fail fast when the server is unreachable instead of waiting indefinitely.
    timeout=10)

print(response.text)

{
  "bolt_routing" : "neo4j://18.206.14.94:7687",
  "transaction" : "http://18.206.14.94:7474/db/{databaseName}/tx",
  "bolt_direct" : "bolt://18.206.14.94:7687",
  "neo4j_version" : "4.3.2",
  "neo4j_edition" : "community"
}
