In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/neo4j-partners/neo4j-google-cloud-dataflow/blob/main/notebook/neo4j_dataflow_bigquery.ipynb" target="_blank">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/neo4j-partners/neo4j-google-cloud-dataflow/blob/main/notebook/neo4j_dataflow_bigquery.ipynb" target="_blank">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/neo4j-partners/neo4j-google-cloud-dataflow/main/notebook/neo4j_dataflow_bigquery.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">Open in Vertex AI Workbench
    </a>
</td>
</table>

# Overview
This notebook walks you through setting up everything you need to run the Neo4j Dataflow hands-on lab.
Once setup is complete, you will follow the rest of the tutorial and learn how to use Google Cloud Dataflow to extract, transform, and load data from Google BigQuery into a Neo4j graph database instance.

# Setup

## Set up your development environment
We suggest you use Colab for this notebook.

## Deploy your Neo4j Instance

If you haven't done so already you will need to deploy an instance of Neo4j. This can be running anywhere in the world as long as it is accessible from the Internet. 

For this lab, however, we recommend deploying an instance of Aura from Google Cloud Platform. The links below show you how to deploy an instance of AuraDS from Google Cloud Marketplace. For this lab you can also deploy AuraDB Free instead.

- Tutorial link: https://github.com/neo4j-partners/hands-on-lab-neo4j-and-vertex-ai/tree/main/Lab%201%20-%20Deploy%20Neo4j#lab-1---deploy-neo4j
- Video link: https://youtu.be/27PMDtlSP4w

## Prepare your template files


## Install additional Packages
First, you'll need to install a few additional packages. The kernel is restarted afterwards so that the new packages can be imported.

In [None]:
!pip install --quiet google-cloud-storage
!pip install --quiet python-dotenv

try:
    import google.colab
    import IPython
    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)
except:
    print('Please restart the kernel manually')

## Get the sample job specification file 
We'll need to download the sample job spec from the GitHub repo so that we can upload it to our Google Cloud Storage bucket later.

In [None]:
import os
import subprocess

# Location of the sample job specification in the GitHub repository.
job_spec_url = "https://raw.githubusercontent.com/neo4j-partners/neo4j-google-cloud-dataflow/main/templates/bq-northwind-jobspec.json"

# Local file names; both are referenced again by later cells.
jobspec_filename = 'job_spec.json'
connection_filename = 'neo4j_connection.json'

# Download the sample job spec template from GitHub.
# The command is passed as an argument list (no shell string building) and
# `check=True` raises CalledProcessError when wget exits with a non-zero
# status, so a failed download is not silently ignored.
download_args = ["wget", "-O", jobspec_filename, job_spec_url]
download_cmd = " ".join(download_args)
print('download_cmd:', download_cmd)
subprocess.run(download_args, check=True)

## Create the Neo4j connection file
If you created an Aura instance for this lab, you can upload its credentials file below and it will automatically be converted to the correct JSON format. If you don't have the credentials file, you will be prompted to enter the connection values manually.

In [None]:
import json
from getpass import getpass

# Detect whether the notebook is running on Colab; the upload widget used
# below is only available there.
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# Keys that must be present before the connection file can be built.
REQUIRED_KEYS = ('NEO4J_URI', 'NEO4J_USERNAME', 'NEO4J_PASSWORD')
# Placeholder shown instead of secret values in the notebook output.
REDACTED = '********'

neo4j_creds = {}
creds_uploaded = False

if IN_COLAB:
    from google.colab import files
    from dotenv import dotenv_values
    uploaded = files.upload()
    # Only the file names are printed: the dict values hold the uploaded
    # file contents, which would put the password into the cell output.
    print('uploaded:', list(uploaded.keys()))
    if uploaded:
        filename = list(uploaded.keys())[0]
        print("Uploaded file:", filename)
        try:
            aura_creds = dotenv_values(filename)
        except Exception as err:
            # Best effort: an unreadable file falls back to manual entry.
            print('Could not parse', filename, '-', err)
            aura_creds = {}
        for item in aura_creds:
            shown = REDACTED if 'PASSWORD' in item.upper() else aura_creds[item]
            print(item, '=', shown)
            neo4j_creds[item] = aura_creds[item]
        # Treat the upload as usable only if every required key has a value;
        # otherwise the lookups below would fail with a KeyError.
        creds_uploaded = all(neo4j_creds.get(key) for key in REQUIRED_KEYS)
        if not creds_uploaded:
            print('The uploaded file does not contain all of:', ', '.join(REQUIRED_KEYS))

if not creds_uploaded:
    print('Enter the Aura connection values:')
    neo4j_creds['NEO4J_URI'] = input('Neo4j URI: ').strip()
    neo4j_creds['NEO4J_USERNAME'] = input('Neo4j Username ("neo4j"): ') or 'neo4j'
    # getpass keeps the password out of the notebook output.
    neo4j_creds['NEO4J_PASSWORD'] = getpass('Neo4j Password: ')

# Adding some default values needed for the Dataflow template.
# A database name supplied by the credentials file is kept; 'neo4j' is only
# used when none was given.
neo4j_creds['NEO4J_DATABASE'] = neo4j_creds.get('NEO4J_DATABASE') or 'neo4j'
neo4j_creds['NEO4J_AUTHTYPE'] = 'basic'

# Convert the values to the required key names and convert to JSON
neo4j_auth = {
    'server_url': neo4j_creds['NEO4J_URI'],
    'database': neo4j_creds['NEO4J_DATABASE'],
    'auth_type': neo4j_creds['NEO4J_AUTHTYPE'],
    'username': neo4j_creds['NEO4J_USERNAME'],
    'pwd': neo4j_creds['NEO4J_PASSWORD'],
}

connection_json = json.dumps(neo4j_auth, indent=4)

# Show the structure of the file with the password masked; the real value is
# only written to disk.
print('connection_json:')
print(json.dumps(dict(neo4j_auth, pwd=REDACTED), indent=4))
print('')

# Write to the same file name that the upload cell reads later
# (`connection_filename` is defined in the download cell above).
outfile = connection_filename
print('Writing to file', outfile)

with open(outfile, "w") as f:
    f.write(connection_json)



## Define Google Cloud variables
You'll need to set a few variables for your GCP environment.  PROJECT_ID and STORAGE_BUCKET are most critical.  The others will probably work with the defaults given.

In [None]:
# --- Required: replace both placeholders with your own values ---
PROJECT_ID = "YOUR-PROJECT-ID"
STORAGE_BUCKET = "YOUR-BUCKET-NAME"

# --- Optional: these defaults can usually be left unchanged ---
REGION = "us-central1"
STORAGE_PATH = "dataflow_templates"

In [None]:
import os

# Expose the project ID to the Google Cloud client libraries through the
# environment. GOOGLE_CLOUD_PROJECT is the current variable name;
# GCLOUD_PROJECT is the legacy name and is still set for tools that read it.
os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID
os.environ["GCLOUD_PROJECT"] = PROJECT_ID

## Authenticate your Google Cloud account


In [None]:
# Authenticate interactively when running on Colab. Only the missing
# `google.colab` package is tolerated; an error raised by the authentication
# call itself is allowed to surface here instead of being silently ignored.
try:
    from google.colab import auth as google_auth
except ImportError:
    print('Not running in Colab; skipping interactive authentication.')
else:
    google_auth.authenticate_user()

## Upload to a GCP Cloud Storage Bucket

To make the job spec and Neo4j connection files available to Dataflow, we first upload them to a Cloud Storage bucket.

In [None]:
from google.cloud import storage

# Reuse the bucket when it already exists so that this cell can be re-run;
# calling create_bucket on an existing bucket raises a conflict error.
# lookup_bucket returns None when the bucket is not found.
client = storage.Client()
bucket = client.lookup_bucket(STORAGE_BUCKET)
if bucket is None:
    bucket = client.create_bucket(STORAGE_BUCKET)

import posixpath

# NOTE(review): not populated in the visible part of this cell — presumably
# filled in by later code; confirm.
template_files = []

# Upload our files to that bucket.
# Object names always use forward slashes, so posixpath.join is used instead
# of os.path.join (which would insert backslashes on Windows).
for filename in [jobspec_filename, connection_filename]:
    upload_path = posixpath.join(STORAGE_PATH, filename)
    blob = bucket.blob(upload_path)
    blob.upload_from_filename(filename)