diff --git a/scripts/emr/Makefile b/scripts/emr/Makefile
index c1e0c592b9..1a689b8df0 100644
--- a/scripts/emr/Makefile
+++ b/scripts/emr/Makefile
@@ -27,10 +27,13 @@ terraform-init:
 create-cluster:
 	cd terraform; terraform apply
 
+create-jupyter-cluster:
+	cd terraform; TF_VAR_install_jupyter="true" terraform apply
+
 destroy-cluster:
 	cd terraform; terraform destroy
 
-proxy: 
+proxy:
 	cd terraform; aws emr socks --cluster-id ${CLUSTER_ID} --key-pair-file ${KEY_PAIR_FILE}
 
 ssh:
diff --git a/scripts/emr/README.md b/scripts/emr/README.md
index c0242ee18e..7f987a8379 100644
--- a/scripts/emr/README.md
+++ b/scripts/emr/README.md
@@ -16,62 +16,138 @@ You need to install [Terraform 0.10.8](https://github.com/hashicorp/terraform/re
 
 ## Makefile
 
-| Command          | Description
-|------------------|------------------------------------------------------------|
-|terraform-init    |`terraform` init, if it's the first run                      |
-|create-cluster    |Create EMR cluster with configurations                       |
-|destroy-cluster   |Destroy EMR cluster                                          |
-|create-cluster    |Create EMR cluster with configurations                       |
-|proxy             |Create SOCKS proxy for active cluster                        |
-|ssh               |SSH into cluster master                                      |
-|cleanup-zeppelin  |Cleanups all GeoTrellis jars from Zeppelin classpath         |
-|restart-zeppelin  |Restart Zeppelin                                             |
-|stop-zeppelin     |Stop Zeppelin                                                |
-|start-zeppelin    |Start Zeppelin                                               |
+| Command               | Description                                                 |
+|-----------------------|-------------------------------------------------------------|
+|terraform-init         |`terraform` init, if it's the first run                      |
+|create-cluster         |Create EMR cluster with configurations                       |
+|create-jupyter-cluster |Create EMR cluster with jupyter-scala available              |
+|destroy-cluster        |Destroy EMR cluster                                          |
+|upload-assembly        |Send spark-etl assembly jar to cluster                       |
+|proxy                  |Create SOCKS proxy for active cluster                        |
+|ssh                    |SSH into cluster master                                      |
+|cleanup-zeppelin       |Cleans up all GeoTrellis jars from Zeppelin classpath        |
+|restart-zeppelin       |Restart Zeppelin                                             |
+|stop-zeppelin          |Stop Zeppelin                                                |
+|start-zeppelin         |Start Zeppelin                                               |
 
 ## Running
 
-Create a cluster and upload assembly on EMR master node:
+### Creating a Zeppelin Cluster
+
+The Makefile in this directory provides commands to easily set up an EMR
+cluster, but doing so requires a minimal amount of configuration. It will be
+necessary to provide your AWS credentials to the Terraform script.
+
+Begin by issuing the commands
 
 ```bash
 make terraform-init && \
-make create-cluster && \
-make upload-assembly
+make create-cluster
 ```
 
-It will be necessary to provide your AWS credentials to the Terraform script.
-Terraform will prompt for the access key, the secret key, and the PEM path for
-the current account. You may enter these explicitly, or you may choose to set
-environment variables to avoid having to repeatedly fill out the prompts. If
-`TF_VAR_access_key`, `TF_VAR_secret_key`, and `TF_VAR_pem_path`, these will be
-discovered by the Terraform script and you will not be prompted at startup.
-The same mechanism can be used to set other variables. `TF_VAR_spot_price`
-and `TF_VAR_worker_count` are useful values.
+Terraform will prompt for the S3 access key and secret key, as well as the PEM
+path for the current account. You may enter these explicitly, or you may
+choose to set environment variables to avoid having to repeatedly fill out the
+prompts. If `TF_VAR_access_key`, `TF_VAR_secret_key`, and `TF_VAR_pem_path`
+are set, these will be discovered by the Terraform script and you will not be
+prompted at startup. The same mechanism can be used to set other variables;
+`TF_VAR_spot_price` and `TF_VAR_worker_count` are useful values.
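+
+For example, a hypothetical shell setup might look like the following (all
+values shown are placeholders, not real credentials):
+
+```bash
+# Example only: substitute your own credentials, key path, and cluster sizing
+export TF_VAR_access_key="AKIAXXXXXXXXXXXXXXXX"
+export TF_VAR_secret_key="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+export TF_VAR_pem_path="$HOME/.ssh/my-emr-key.pem"
+export TF_VAR_spot_price="0.10"   # bid price for spot instances
+export TF_VAR_worker_count="4"    # number of worker nodes
+```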
 
-Note: long startup times (greater than 5 or 6 minutes) probably indicates that
-you have chosen a spot price that is too low.
+**Note:** Long startup times (10 minutes or more) probably indicate that you
+have chosen a spot price that is too low.
 
-Make proxy and access Zeppelin though UI:
+**Note:** Due to the lack of sophistication in Terraform, you will also be
+prompted to set the value of `TF_VAR_s3_notebook_bucket` and
+`TF_VAR_s3_notebook_prefix`. These variables are only used by Jupyter-enabled
+clusters and can be set arbitrarily for Zeppelin-only clusters.
+
+This basic cluster will have a running Zeppelin interface that can be accessed
+by first creating an SSH tunnel with the command
 
 ```bash
 make proxy
 ```
 
+and then browsing to port 8890 of the cluster.
+
 ![Zeppelin Welcome](./images/zeppelin-welcome.png)
 
-Create a new notebook:
+This cluster will not have access to GeoTrellis code until the command
 
-![Zeppelin GeoTrellis Notebook](./images/zeppelin-geotrellis-notebook.png)
+```bash
+make upload-assembly
+```
 
-Go into interpreters tab:
+is issued. Upon doing so, you must configure Zeppelin to recognize this
+resource by going to the interpreters tab:
 
 ![Zeppelin interpreters](./images/zeppelin-interpreters.png)
 
-Edit spark interpreter, and add GeoTrellis jar into deps (make sure that you uploaded GeoTrellis
-jar via `make upload-assembly` into `/tmp/geotrellis-spark-etl-assembly-1.2.0-SNAPSHOT.jar` directory):
+Edit the spark interpreter settings by adding the GeoTrellis jar to the
+classpath (`make upload-assembly` copies the fat jar to, e.g.,
+`/tmp/geotrellis-spark-etl-assembly-1.2.0-SNAPSHOT.jar`):
 
 ![Zeppelin interpreter edit](./images/zeppelin-interpreter-edit.png)
 
-After that GeoTrellis deps can be imported:
+You may then create a new notebook:
+
+![Zeppelin GeoTrellis Notebook](./images/zeppelin-geotrellis-notebook.png)
+
+wherein the GeoTrellis dependencies can be imported:
 
 ![Zeppelin GeoTrellis example](./images/zeppelin-geotrellis-example.png)
+
+### Creating a Jupyter Scala cluster
+
+An alternative to Zeppelin that may be more user-friendly is to create a
+cluster with the Jupyter notebook server installed. To do so, set
+`TF_VAR_s3_notebook_bucket` and `TF_VAR_s3_notebook_prefix` to appropriate
+values (for example, `geotrellis-work` and `user/notebooks`, respectively).
+This will set a location on S3 for persistent storage of notebooks. Then
+issue
+
+```bash
+make terraform-init && \
+make create-jupyter-cluster
+```
+
+**Note:** Long startup times (10 minutes or more) probably indicate that you
+have chosen a spot price that is too low. Set `TF_VAR_spot_price`
+appropriately.
+
+After completing the startup process, navigate to port 8000 of the new
+cluster. You will be greeted with a login prompt. Log in as `user` with the
+password `password`:
+
+![Jupyter login](./images/jupyter-login.png)
+
+You will then be presented with the file browser, where you may open existing
+notebooks or create new ones. To create a new notebook, choose `Scala` from
+the `New` dropdown in the upper right:
+
+![Create new Jupyter Scala notebook](./images/jupyter-new-notebook.png)
+
+**Note:** Python 3 is available as an option, but it will not have
+GeoTrellis/Spark capabilities.
+
+In the resulting notebook, you may enter Scala code, but neither GeoTrellis
+nor a SparkContext is yet available. As this notebook interface is based on
+Ammonite, there is no `build.sbt` in which to set up imports; you must use the
+Ammonite import syntax, as illustrated below.
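+
+For instance, dependencies are declared directly in a notebook cell using
+`$ivy` imports. The coordinates below are the same ones used by the
+boilerplate extension described next, shown here only to illustrate the
+syntax:
+
+```scala
+// Ammonite-style dependency imports take the place of build.sbt entries;
+// `::` inserts the Scala binary version into the artifact name, while
+// `:` denotes a plain (non-Scala) Maven coordinate.
+import $ivy.`org.apache.spark::spark-sql:2.1.0`
+import $ivy.`org.apache.hadoop:hadoop-aws:2.6.4`
+```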
+
+To ease initial setup, an extension is provided to load the standard
+boilerplate code for setting up Spark and GeoTrellis. Click the star icon in
+the toolbar to insert three new cells at the top of your notebook:
+
+![Insert standard boilerplate](./images/jupyter-insert-boilerplate.png)
+
+The first cell can be executed as is, though it may take a few minutes to
+download the required packages. This will create the `implicit val sc:
+SparkContext` required by many GeoTrellis operations.
+
+To import GeoTrellis, choose **one** of the following two cells to execute.
+Either execute the cell that loads a published version of GeoTrellis from the
+LocationTech repo, _or_ execute the cell that loads the `spark-etl` fat jar
+made available on the cluster via the `make upload-assembly` directive on your
+local machine.
+
+After this, it will be possible to import and use GeoTrellis code, as in the
+short example below. Notebooks saved during this session will persist in the
+given S3 directory.
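+
+As a quick check that everything is wired up, something like the following (a
+minimal sketch; the precise API depends on the GeoTrellis version you loaded)
+should run in a fresh cell:
+
+```scala
+import geotrellis.raster._
+
+// Build a 10x10 tile of ones, then add one to every cell
+val tile = IntArrayTile(Array.fill(100)(1), 10, 10)
+tile.map(_ + 1).get(0, 0)  // evaluates to 2
+```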
diff --git a/scripts/emr/images/jupyter-insert-boilerplate.png b/scripts/emr/images/jupyter-insert-boilerplate.png
new file mode 100644
index 0000000000..e7924d7915
Binary files /dev/null and b/scripts/emr/images/jupyter-insert-boilerplate.png differ
diff --git a/scripts/emr/images/jupyter-login.png b/scripts/emr/images/jupyter-login.png
new file mode 100644
index 0000000000..30dc412aee
Binary files /dev/null and b/scripts/emr/images/jupyter-login.png differ
diff --git a/scripts/emr/images/jupyter-new-notebook.png b/scripts/emr/images/jupyter-new-notebook.png
new file mode 100644
index 0000000000..2fc768f4ad
Binary files /dev/null and b/scripts/emr/images/jupyter-new-notebook.png differ
diff --git a/scripts/emr/terraform/boilerplate.js b/scripts/emr/terraform/boilerplate.js
new file mode 100644
index 0000000000..e691e26716
--- /dev/null
+++ b/scripts/emr/terraform/boilerplate.js
@@ -0,0 +1,68 @@
+// file boilerplate/main.js
+
+define([
+    'base/js/namespace'
+], function(
+    Jupyter
+) {
+    function load_ipython_extension() {
+        var handler = function () {
+            var spark_cell = Jupyter.notebook.insert_cell_above('code', 0);
+            spark_cell.set_text('import $exclude.`org.slf4j:slf4j-log4j12`, $ivy.`org.slf4j:slf4j-nop:1.7.21` // Quiets the logger\n\
+import $profile.`hadoop-2.6`\n\
+import $ivy.`org.apache.spark::spark-sql:2.1.0`\n\
+import $ivy.`org.apache.hadoop:hadoop-aws:2.6.4`\n\
+import $ivy.`org.jupyter-scala::spark:0.4.2`\n\
+\n\
+import org.apache.spark._\n\
+import org.apache.spark.rdd.RDD\n\
+import jupyter.spark.session._\n\
+\n\
+val sparkSession =\n\
+  JupyterSparkSession\n\
+    .builder()\n\
+    .jupyter() // Must be called immediately after builder()\n\
+    .master("local[*]")\n\
+    .appName("testing")\n\
+    .getOrCreate()\n\
+\n\
+implicit val sc = sparkSession.sparkContext');
+            var lt_gt_cell = Jupyter.notebook.insert_cell_below('code', 0);
+            lt_gt_cell.set_text('// Execute to load GeoTrellis from LocationTech repo\n\
+\n\
+import ammonite._, Resolvers._\n\
+\n\
+val locationtech = Resolver.Http(\n\
+  "locationtech-releases",\n\
+  "https://repo.locationtech.org/content/groups/releases",\n\
+  MavenPattern,\n\
+  true // Declares whether the organization is dot- (false) or slash- (true) delimited\n\
+)\n\
+\n\
+interp.resolvers() = interp.resolvers() :+ locationtech\n\
+\n\
+import $ivy.`org.locationtech.geotrellis::geotrellis-spark-etl:1.2.0`');
+            var local_gt_cell = Jupyter.notebook.insert_cell_below('code', 1);
+            local_gt_cell.set_text('// Execute to load GeoTrellis from a fat jar uploaded to the local file system\n\
+\n\
+import ammonite._\n\
+interp.load.cp(ops.Path("/tmp/geotrellis-spark-etl-assembly-1.2.0-SNAPSHOT.jar"))');
+        };
+
+        var action = {
+            icon : 'fa-star-o', // a font-awesome class used on buttons, etc.
+            help : 'Insert standard boilerplate',
+            help_index : 'zz',
+            handler : handler
+        };
+        var prefix = 'boilerplate';
+        var action_name = 'insert_standard_code';
+
+        var full_action_name = Jupyter.actions.register(action, action_name, prefix);
+        Jupyter.toolbar.add_buttons_group([full_action_name]);
+    }
+
+    return {
+        load_ipython_extension: load_ipython_extension
+    };
+});
diff --git a/scripts/emr/terraform/bootstrap.sh b/scripts/emr/terraform/bootstrap.sh
index a2efbef24c..4550f8def3 100644
--- a/scripts/emr/terraform/bootstrap.sh
+++ b/scripts/emr/terraform/bootstrap.sh
@@ -4,6 +4,7 @@ S3_ACCESS_KEY=$1
 S3_SECRET_KEY=$2
 S3_NOTEBOOK_BUCKET=$3
 S3_NOTEBOOK_PREFIX=$4
+EXECUTE_SCRIPT=$5
 
 # Parses a configuration file put in place by EMR to determine the role of this node
 is_master() {
@@ -14,7 +15,7 @@ is_master() {
   fi
 }
 
-if is_master; then
+if is_master && [ "$EXECUTE_SCRIPT" == "true" ]; then
   echo "Installing system software ..."
   curl -sL https://rpm.nodesource.com/setup_6.x | sudo -E bash -
   sudo yum install -y -q nodejs
@@ -91,5 +92,13 @@ EOF
   sudo mv /tmp/jupyter_profile.sh /etc/profile.d
   . /etc/profile.d/jupyter_profile.sh
 
+  # Install boilerplate extension
+  cd /tmp
+  mkdir boilerplate
+  mv bp.js boilerplate/main.js
+  sudo npm install requirejs
+  sudo /usr/local/bin/jupyter nbextension install --system boilerplate
+  sudo /usr/local/bin/jupyter nbextension enable --system boilerplate/main
+
   echo "Running at host $AWS_DNS_NAME"
 fi
diff --git a/scripts/emr/terraform/emr-spark.tf b/scripts/emr/terraform/emr-spark.tf
index 8b4e3c33b4..40198706e2 100644
--- a/scripts/emr/terraform/emr-spark.tf
+++ b/scripts/emr/terraform/emr-spark.tf
@@ -73,10 +73,21 @@ resource "aws_emr_cluster" "emr-spark-cluster" {
     }
   }
 
+  provisioner "file" {
+    source      = "boilerplate.js"
+    destination = "/tmp/bp.js"
+    connection {
+      type        = "ssh"
+      user        = "hadoop"
+      host        = "${aws_emr_cluster.emr-spark-cluster.master_public_dns}"
+      private_key = "${file("${var.pem_path}")}"
+    }
+  }
+
   provisioner "remote-exec" {
     inline=[
       "chmod +x /tmp/bootstrap.sh",
-      "/tmp/bootstrap.sh ${var.access_key} ${var.secret_key} ${var.s3_notebook_bucket} ${var.s3_notebook_prefix}"
+      "/tmp/bootstrap.sh ${var.access_key} ${var.secret_key} ${var.s3_notebook_bucket} ${var.s3_notebook_prefix} ${var.install_jupyter}"
     ]
     connection {
       type = "ssh"
diff --git a/scripts/emr/terraform/variables.tf.json b/scripts/emr/terraform/variables.tf.json
index 7834d9a8a6..44629ebfdd 100644
--- a/scripts/emr/terraform/variables.tf.json
+++ b/scripts/emr/terraform/variables.tf.json
@@ -1,6 +1,6 @@
-{ 
+{
   "variable": {
-    "access_key": { 
+    "access_key": {
       "description": "From your `~/.aws/credentials" 
     },
     "secret_key": {
@@ -15,6 +15,10 @@
     "s3_notebook_prefix": {
       "description": "The prefix path inside the S3 notebook bucket"
     },
+    "install_jupyter": {
+      "default": "false",
+      "description": "Install jupyter-scala during cluster creation"
+    },
     "region": {
       "default": "us-east-1",
       "description": "Can be overridden if necessary"