adding history server; mount volumes to persist logs;
pavanpkulkarni committed May 10, 2018
1 parent 61fe4aa commit 05e0c21
Showing 14 changed files with 80,361 additions and 104 deletions.
Binary file added Docker_WordCount_Spark-1.0.jar
8 changes: 3 additions & 5 deletions Dockerfile
@@ -1,7 +1,5 @@
FROM pavanpkulkarni/spark_image_2.0.1:first_image
MAINTAINER pavan.git@gmail.com
FROM img_221
LABEL authors="pavanpkulkarni@pavanpkulkarni.com"

COPY SparkDocker-1.0.jar /opt/SparkDocker-1.0.jar
COPY Docker_WordCount_Spark-1.0.jar /opt/Docker_WordCount_Spark-1.0.jar
COPY sample.txt /opt/sample.txt

CMD /opt/spark/bin/spark-submit --class $SPARK_CLASS --master spark://master:7077 /opt/Docker_WordCount_Spark-1.0.jar /opt/sample.txt
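For context, the old workflow ran this image directly with `docker run`; a rough equivalent for the renamed image and job class might be (the network name assumes compose's default `<directory>_default` naming for this repo):

```sh
# Sketch: exercise the image's CMD by hand against the compose network.
docker run --net createandrunsparkjob_default \
    -e "SPARK_CLASS=com.pavanpkulkarni.dockerwordcount.DockerWordCount" \
    img_221
```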
10 changes: 5 additions & 5 deletions README.md
@@ -1,4 +1,4 @@
# Spark-2.1.0 Create and Run Jobs on Docker
# Create n-node Cluster and Run Jobs on Docker

### Prerequisites
1. java - [Installation Instruction](https://www.java.com/en/download/help/download_options.xml)
@@ -11,14 +11,14 @@

### RunSparkJobOnDocker.sh Details
This repository contains all the files required to create an n-node Spark cluster and run a simple app on it. In this project, the script [RunSparkJobOnDocker.sh](blob/master/RunSparkJobOnDocker.sh) does the following:
1. Pull the image from [docker-hub](https://hub.docker.com/r/pavanpkulkarni/spark_image_2.0.1/)
1. Pull the image from [docker-hub](https://hub.docker.com/r/pavanpkulkarni/spark_image/)
2. Create an n-node cluster. In our case, we create a 5-node cluster; the node count can be changed by specifying `docker-compose scale slave=n` (see the sketch after this list).
3. Next, we build an image that acts as an external client to submit the job to the cluster we just created.
4. Run the job on the cluster. You can either pull this [Source Code](https://github.com/pavanpkulkarni/SparkDocker) or try something of your own.
4. Run the job on the cluster. You can either pull this [Source Code](https://github.com/pavanpkulkarni/Spark_WordCount_Gradle) and build it with Gradle, or try something of your own.
5. Finally, when the job is executed, we bring down the cluster.
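For reference, the cluster lifecycle the script automates can be reproduced by hand; a minimal sketch, assuming this repo's docker-compose.yml is in the working directory:

```sh
# Build the image and start a master plus five slaves.
docker-compose build
docker-compose up -d --scale slave=5

# Sanity checks: containers are up and the master UI answers on the host.
docker ps
curl -s http://localhost:8080 | head

# Tear everything down when finished.
docker-compose down
```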

### Spark Job Description
This is a simple Spark job in Scala that reads a file - [sample.txt](blob/master/sample.txt) - and performs a basic word count on it.
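Since the compose file further down mounts `./docker-volume/spark-output/` from the slaves, the results can be read on the host after a run; a sketch, assuming the script's default output path `op1`:

```sh
# Inspect the word-count output written through the mounted volume.
ls docker-volume/spark-output/op1/
cat docker-volume/spark-output/op1/part-*    # Spark writes one or more part-* files
```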

### Image Dockerfile
[Dockerfile](https://github.com/pavanpkulkarni/docker-spark-image_2.1.0)
### Spark Dockerfile
[Dockerfile](https://github.com/pavanpkulkarni/docker-spark-image)
43 changes: 34 additions & 9 deletions RunSparkJobOnDocker.sh
@@ -1,16 +1,41 @@
#!/bin/bash

#pull docker image
docker pull pavanpkulkarni/spark_image_2.0.1:first_image
currtime=$(date +"%Y%m%d")
LogFile=dockerRun_${currtime}.log

#create a 5 node cluster with default configurations
docker-compose build && docker-compose up -d && docker-compose scale slave=5
#pull spark docker image
echo "***** Begin Pulling image ***** " >>$LogFile 2>&1
docker pull pavanpkulkarni/spark_image:2.2.1
echo "***** End Pulling image *****" >>$LogFile 2>&1

#spawn an image from where the job will be submitted to cluster
docker build -t pavanpkulkarni/spark_image_2.0.1:first_image .
#create a n node cluster with default configurations
number_of_nodes=3
echo "***** Creating " $number_of_nodes " node cluster *****" >>$LogFile 2>&1
docker-compose build >>$LogFile 2>&1
docker-compose up -d --scale slave=$number_of_nodes >>$LogFile 2>&1

#run the job on the cluster
docker run --net createandrunsparkjob_default -e "SPARK_CLASS=com.pavanpkulkarni.docker.SampleSparkScalaCode" pavanpkulkarni/spark_image_2.0.1:first_image
echo "***** Executing job on " $number_of_nodes "node cluster *****" >>$LogFile 2>&1
docker exec master /opt/spark/bin/spark-submit \
--class com.pavanpkulkarni.dockerwordcount.DockerWordCount \
--master spark://master:6066 \
--deploy-mode cluster \
--verbose \
/opt/Docker_WordCount_Spark-1.0.jar /opt/sample.txt /opt/output/op1 >>$LogFile 2>&1

#remove the cluster
docker-compose down

#Uncomment the lines below to bring down the cluster after the Spark job finishes

# submissionId=$(grep submissionId $LogFile | cut -d \" -f4)

# echo "submission Id is : " $submissionId >>$LogFile 2>&1

# driverState=$(curl http://localhost:6066/v1/submissions/status/$submissionId | grep driverState | cut -d \" -f4) >>$LogFile 2>&1

# echo "driver state is : " $driverState >>$LogFile 2>&1

# #remove the cluster only if the driver has finished
# if [[ "$driverState" == "FINISHED" ]]; then
#     echo "driver has successfully finished execution.. bringing down the cluster !!! " >>$LogFile 2>&1
#     docker-compose down >>$LogFile 2>&1
# fi
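A sketch of that teardown logic with polling added, assuming the spark-submit REST response (which carries the submissionId) landed in $LogFile as above:

```sh
# Poll the standalone master's REST API until the driver reaches a
# terminal state, then bring the cluster down.
submissionId=$(grep -m1 submissionId "$LogFile" | cut -d '"' -f4)
echo "submission Id is : $submissionId" >>"$LogFile" 2>&1

driverState=""
until [[ "$driverState" == "FINISHED" || "$driverState" == "FAILED" || "$driverState" == "KILLED" ]]; do
    sleep 5
    driverState=$(curl -s "http://localhost:6066/v1/submissions/status/$submissionId" \
        | grep driverState | cut -d '"' -f4)
    echo "driver state is : $driverState" >>"$LogFile" 2>&1
done

docker-compose down >>"$LogFile" 2>&1
```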
Binary file removed SparkDocker-1.0.jar
31 changes: 27 additions & 4 deletions docker-compose.yml
@@ -1,4 +1,4 @@
version: '1'
version: '3'

###############################################################################
#DOCKER-COMPOSE
@@ -14,16 +14,39 @@ version: '1'
services:
master:
build: .
image: pavanpkulkarni/spark_image_2.0.1:first_image
image: img_221
container_name: master
ports:
- "8080:8080"
- 4040:4040
- 7077:7077
- 8080:8080
- 6066:6066
command: ["/usr/bin/supervisord", "--configuration=/opt/conf/master.conf"]

#---------------------------------------
# SLAVE CONFIGURATION
#---------------------------------------
slave:
image: pavanpkulkarni/spark_image_2.0.1:first_image
image: img_221
depends_on:
- master
ports:
- "8081"
command: ["/usr/bin/supervisord", "--configuration=/opt/conf/slave.conf"]
volumes:
- ./docker-volume/spark-output/:/opt/output
- ./docker-volume/spark-events/:/opt/spark-events

#---------------------------------------
# HISTORY SERVER CONFIGURATION
#---------------------------------------
history-server:
image: img_221
container_name: history-server
depends_on:
- master
ports:
- "18080:18080"
command: ["/usr/bin/supervisord", "--configuration=/opt/conf/history-server.conf"]
volumes:
- ./docker-volume/spark-events:/opt/spark-events
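The slaves and the history server share the `./docker-volume/spark-events` mount, so event logs outlive the containers and stay browsable on port 18080. This presumes event logging in `img_221` points at that directory; a sketch of the assumed spark-defaults.conf settings and a quick check:

```sh
# Assumed spark-defaults.conf inside img_221 (not shown in this commit):
#   spark.eventLog.enabled         true
#   spark.eventLog.dir             file:/opt/spark-events
#   spark.history.fs.logDirectory  file:/opt/spark-events

# After a job has run, the history server should list the application:
curl -s http://localhost:18080/api/v1/applications | head
ls docker-volume/spark-events/    # e.g. app-20180510051218-0000, as committed below
```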
53 changes: 53 additions & 0 deletions docker-volume/spark-events/app-20180510051218-0000

Binary file added docker-volume/spark-output/op1/._SUCCESS.crc
