Upgrade Spark tutorial with the latest Spark & Hadoop versions
maystery committed Feb 17, 2021
1 parent d076ce3 commit f3ca73e
Showing 11 changed files with 196 additions and 323 deletions.
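
For reference, the full change set can be inspected locally. The repository is occopus/docs (inferred from the raw.githubusercontent.com URLs in the cloud-init scripts below); the commands are an illustrative sketch, not part of the commit:

    # Clone the docs repo and inspect commit f3ca73e locally (paths taken from this commit).
    git clone https://github.com/occopus/docs.git
    cd docs
    git show --stat f3ca73e        # per-file addition/deletion summary
    git show f3ca73e -- tutorials/spark-cluster-with-python/infra-spark-cluster.yaml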
Binary file modified tutorials/spark-cluster-with-python.tar.gz

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions tutorials/spark-cluster-with-python/infra-spark-cluster.yaml
@@ -5,20 +5,20 @@ nodes:
- &M
name: spark-master
type: spark_master_node
- &W
name: spark-worker
type: spark_worker_node
- &S
name: spark-slave
type: spark_slave_node
scaling:
min: 2
max: 10

variables:
HADOOP_VERSION: 2.10.1
SPARK_VERSION: 2.4.7
SPARK_HADOOP_VERSION: 2.7
CONSUL_VERSION: 1.9.1
HADOOP_VERSION: 3.3.0
SPARK_VERSION: 3.0.1
SPARK_HADOOP_VERSION: 3.2
CONSUL_VERSION: 1.9.3
CONSUL_TEMPLATE_VERSION: 0.25.1

dependencies:
-
connection: [ *W, *M ]
connection: [ *S, *M ]
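
The version bump above (Hadoop 2.10.1 to 3.3.0, Spark 2.4.7 to 3.0.1, Consul 1.9.1 to 1.9.3) is consumed by the cloud-init scripts of the individual nodes. As a rough illustration, and assuming standard Apache/HashiCorp archive layouts (the tutorial's scripts may use different mirrors or paths), these variables expand into download URLs like:

    # Illustrative expansion of the variables above; the mirror and path layout are assumptions.
    HADOOP_VERSION=3.3.0
    SPARK_VERSION=3.0.1
    SPARK_HADOOP_VERSION=3.2
    CONSUL_VERSION=1.9.3
    wget "https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz"
    wget "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${SPARK_HADOOP_VERSION}.tgz"
    wget "https://releases.hashicorp.com/consul/${CONSUL_VERSION}/consul_${CONSUL_VERSION}_linux_amd64.zip"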
tutorials/spark-cluster-with-python/nodes/cloud_init_spark_master.yaml
@@ -8,6 +8,7 @@ write_files:
content: |
#!/bin/bash
chage -d 2020-08-04 ubuntu
set -ex
HADOOP_VERSION={{variables.HADOOP_VERSION}}
SPARK_VERSION={{variables.SPARK_VERSION}}
@@ -37,8 +38,8 @@ write_files:
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y openjdk-8-jdk openjdk-8-jre python3-pip unzip
su - sparkuser -c 'pip3 install pyspark==2.4.6 notebook==6.0.3 jupyter-contrib-nbextensions==0.5.1 matplotlib==3.2.1'
apt-get install -yq openjdk-8-jdk openjdk-8-jre python3-pip unzip
su - sparkuser -c 'pip3 install pyspark==3.0.1 notebook==6.2.0 jupyter-contrib-nbextensions==0.5.1 matplotlib==3.3.4'
echo "Install requirement packages starts."
@@ -88,7 +89,6 @@ write_files:
chown sparkuser:sparkuser /home/sparkuser/.bashrc
echo export PATH="/home/sparkuser/hadoop/bin:$PATH" >> /home/sparkuser/.bashrc
mv /tmp/hadoop/configs/* /home/sparkuser/hadoop/etc/hadoop
mv /tmp/hadoop/webconfigs/* /home/sparkuser/hadoop/share/hadoop/hdfs/webapps/hdfs/WEB-INF/
echo "spark: lpds, admin" >> /home/sparkuser/hadoop/etc/hadoop/realm.properties
echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre" >> /home/sparkuser/hadoop/etc/hadoop/hadoop-env.sh
echo "export HADOOP_PID_DIR=/home/sparkuser/hadoop" >> /home/sparkuser/hadoop/etc/hadoop/hadoop-env.sh
@@ -154,8 +154,8 @@ write_files:
echo "Creating example"
su - sparkuser -c 'mkdir /home/sparkuser/example'
su - sparkuser -c 'wget https://raw.githubusercontent.com/occopus/docs/devel/tutorials/spark-cluster-with-python/example/Spark_cluster_and_HDFS_cluster_test.ipynb -O /home/sparkuser/example/Spark_cluster_and_HDFS_cluster_test.ipynb'
su - sparkuser -c 'wget https://raw.githubusercontent.com/occopus/docs/devel/tutorials/spark-cluster-with-python/example/sztaki_logo.jpg -O /home/sparkuser/example/sztaki_logo.jpg'
su - sparkuser -c 'wget https://raw.githubusercontent.com/occopus/docs/master/tutorials/spark-cluster-with-python/example/Spark_cluster_and_HDFS_cluster_test.ipynb -O /home/sparkuser/example/Spark_cluster_and_HDFS_cluster_test.ipynb'
su - sparkuser -c 'wget https://raw.githubusercontent.com/occopus/docs/master/tutorials/spark-cluster-with-python/example/sztaki_logo.jpg -O /home/sparkuser/example/sztaki_logo.jpg'
sed -i "s/xxxSPARKMASTERIPxxx/$MASTERIP/g" /home/sparkuser/example/Spark_cluster_and_HDFS_cluster_test.ipynb
echo "Example created"
@@ -175,7 +175,7 @@ write_files:
echo "Launch HADOOP starts."
echo 'Y' | /home/sparkuser/hadoop/bin/hdfs namenode -format hdfs_cluster
/home/sparkuser/hadoop/sbin/hadoop-daemon.sh start namenode
/home/sparkuser/hadoop/bin/hdfs --daemon start namenode
echo "Launch HADOOP finished."
@@ -203,15 +203,10 @@ write_files:
<configuration>
<property>
<name>dfs.namenode.http-address</name>
<value>HadoopMaster:50070</value>
<value>HadoopMaster:9870</value>
</property>
<property>
<name>dfs.name.dir</name>
<value>/tmp</value>
<final>true</final>
</property>
<property>
<name>dfs.permissions</name>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<property>
@@ -241,46 +236,12 @@ write_files:
content: |
<configuration>
<property>
<name>fs.default.name</name>
<name>fs.defaultFS</name>
<value>hdfs://HadoopMaster:9000</value>
</property>
</configuration>
permissions: '644'

- path: /tmp/hadoop/webconfigs/web.xml
content: |
<?xml version="1.0" encoding="UTF-8"?>
<web-app version="2.4" xmlns="http://java.sun.com/xml/ns/j2ee">
<security-constraint>
<web-resource-collection>
<web-resource-name>Protected</web-resource-name>
<url-pattern>/*</url-pattern>
</web-resource-collection>
<auth-constraint>
<role-name>admin</role-name>
</auth-constraint>
</security-constraint>
<login-config>
<auth-method>BASIC</auth-method>
<realm-name>realm</realm-name>
</login-config>
</web-app>
permissions: '644'

- path: /tmp/hadoop/webconfigs/jetty-web.xml
content: |
<Configure class="org.mortbay.jetty.webapp.WebAppContext">
<Get name="securityHandler">
<Set name="userRealm">
<New class="org.mortbay.jetty.security.HashUserRealm">
<Set name="name">realm</Set>
<Set name="config">/home/sparkuser/hadoop/etc/hadoop/realm.properties</Set>
</New>
</Set>
</Get>
</Configure>
permissions: '644'

- path: /home/sparkuser/consul/hosts.ctmpl
content: |
127.0.0.1 localhost
@@ -356,8 +317,8 @@ write_files:
if [ `diff /etc/hosts $FILE_LOCATION | grep '>' | wc -l` -gt 0 ]; then
echo -e `date +%Y-%m-%d` `date +"%T"` "Downscale detected. Restarting name node service..."
su - sparkuser -c '/home/sparkuser/hadoop/sbin/hadoop-daemon.sh stop namenode'
su - sparkuser -c '/home/sparkuser/hadoop/sbin/hadoop-daemon.sh start namenode'
su - sparkuser -c '/home/sparkuser/hadoop/bin/hdfs --daemon stop namenode'
su - sparkuser -c '/home/sparkuser/hadoop/bin/hdfs --daemon start namenode'
echo -e `date +%Y-%m-%d` `date +"%T"` "Namenode restarted!"
cp /etc/hosts $FILE_LOCATION
else [ `diff /etc/hosts $FILE_LOCATION | grep '<' | wc -l` -gt 0 ]
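
The master cloud-init changes above follow Hadoop 3 conventions: hadoop-daemon.sh is replaced by the hdfs --daemon command, the NameNode web UI moves from port 50070 to 9870, and fs.default.name and dfs.permissions give way to fs.defaultFS and dfs.permissions.enabled. A minimal post-deployment sanity check, assuming HadoopMaster resolves on the node as the configs above imply, could look like this:

    # Sketch only: verify the NameNode under the Hadoop 3 defaults configured above.
    curl -sf http://HadoopMaster:9870/ >/dev/null && echo "NameNode web UI reachable (9870 replaces 50070)"
    su - sparkuser -c '/home/sparkuser/hadoop/bin/hdfs dfsadmin -report'          # lists registered DataNodes
    su - sparkuser -c '/home/sparkuser/hadoop/bin/hdfs dfs -ls hdfs://HadoopMaster:9000/'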
tutorials/spark-cluster-with-python/nodes/cloud_init_spark_slave.yaml
@@ -8,6 +8,7 @@ write_files:
content: |
#!/bin/bash
chage -d 2020-08-04 ubuntu
set -ex
HADOOP_VERSION={{variables.HADOOP_VERSION}}
SPARK_VERSION={{variables.SPARK_VERSION}}
@@ -34,7 +35,7 @@ write_files:
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y openjdk-8-jdk openjdk-8-jre unzip r-base
apt-get install -yq --no-install-recommends openjdk-8-jdk openjdk-8-jre unzip
echo "Install requirement packages starts."
@@ -131,11 +132,10 @@ write_files:
#!/bin/bash
set -ex
MASTERIP=`hostname -I | col1`
echo "Launch HADOOP starts."
/home/sparkuser/hadoop/sbin/hadoop-daemon.sh start datanode
/home/sparkuser/hadoop/bin/hdfs --daemon start datanode
echo "Launch HADOOP finished."
@@ -155,21 +155,20 @@ write_files:
<configuration>
<property>
<name>dfs.namenode.http-address</name>
<value>{{getprivip('spark-master')}}:50070</value>
</property>
<property>
<name>dfs.name.dir</name>
<value>/tmp</value>
<final>true</final>
<value>HadoopMaster:9870</value>
</property>
<property>
<name>dfs.permissions</name>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<property>
<name>dfs.datanode.du.reserved</name>
<value>500000000</value>
</property>
<property>
<name>dfs.hosts.exclude</name>
<value>/home/sparkuser/hadoop/etc/hadoop/dfs.exclude</value>
</property>
<property>
<name>dfs.client.use.datanode.hostname</name>
<value>true</value>
@@ -189,7 +188,7 @@ write_files:
content: |
<configuration>
<property>
<name>fs.default.name</name>
<name>fs.defaultFS</name>
<value>hdfs://{{getprivip('spark-master')}}:9000</value>
</property>
</configuration>
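
On the worker (now slave) nodes the DataNode is likewise started via hdfs --daemon. A quick, purely illustrative check that the daemon came up on a node, using the tutorial's install path and user:

    # Sketch: confirm the DataNode JVM is running under the sparkuser account on a slave node.
    su - sparkuser -c 'jps | grep -i datanode' && echo "DataNode process is up"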
tutorials/spark-cluster-with-python/nodes/node_definitions.yaml
@@ -21,9 +21,9 @@
health_check:
ports:
- 8080
timeout: 2000
timeout: 1000

'node_def:spark_worker_node':
'node_def:spark_slave_node':
-
resource:
type: nova
@@ -40,4 +40,6 @@
contextualisation:
type: cloudinit
context_template: !yaml_import
url: file://cloud_init_spark_worker.yaml
url: file://cloud_init_spark_slave.yaml
health_check:
ping: False
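
The health_check above probes TCP port 8080, the Spark master's default web UI port, while ping: False disables Occopus' default ping check on the slave nodes. A manual equivalent of the port probe (SPARK_MASTER_IP is a placeholder, not something defined by the tutorial):

    # Illustrative manual probe of the Spark master web UI; SPARK_MASTER_IP is hypothetical.
    curl -sf "http://SPARK_MASTER_IP:8080/" >/dev/null && echo "Spark master web UI is up"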
Binary file modified tutorials/spark-cluster-with-r.tar.gz
18 changes: 9 additions & 9 deletions tutorials/spark-cluster-with-r/infra-spark-cluster.yaml
@@ -5,22 +5,22 @@ nodes:
- &M
name: spark-master
type: spark_master_node
- &W
name: spark-worker
type: spark_worker_node
- &S
name: spark-slave
type: spark_slave_node
scaling:
min: 2
max: 10

variables:
HADOOP_VERSION: 2.10.1
SPARK_VERSION: 2.4.7
SPARK_HADOOP_VERSION: 2.7
CONSUL_VERSION: 1.9.1
HADOOP_VERSION: 3.3.0
SPARK_VERSION: 3.0.1
SPARK_HADOOP_VERSION: 3.2
CONSUL_VERSION: 1.9.3
CONSUL_TEMPLATE_VERSION: 0.25.1
RSTUDIO_VERSION: 1.3.1073-amd64
RSTUDIO_VERSION: 1.4.1103-amd64

dependencies:
-
connection: [ *W, *M ]
connection: [ *S, *M ]
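
The R-flavoured tutorial receives the same Spark, Hadoop and Consul bumps plus RStudio Server 1.4.1103. Assuming the tutorial keeps RStudio Server's default port 8787, a quick reachability check on the master could be (SPARK_MASTER_IP is again a placeholder):

    # Sketch: RStudio Server's default port is 8787; SPARK_MASTER_IP is hypothetical.
    curl -sf "http://SPARK_MASTER_IP:8787/" >/dev/null && echo "RStudio Server is up"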

56 changes: 9 additions & 47 deletions tutorials/spark-cluster-with-r/nodes/cloud_init_spark_master.yaml
@@ -8,6 +8,7 @@ write_files:
content: |
#!/bin/bash
chage -d 2020-08-04 ubuntu
set -ex
HADOOP_VERSION={{variables.HADOOP_VERSION}}
SPARK_VERSION={{variables.SPARK_VERSION}}
@@ -40,7 +41,7 @@ write_files:
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y openjdk-8-jdk openjdk-8-jre unzip r-base gdebi-core libpango1.0-dev libcurl4-gnutls-dev libssl-dev libxml2-dev
apt-get install -yq openjdk-8-jdk openjdk-8-jre unzip r-base gdebi-core libpango1.0-dev libcurl4-gnutls-dev libssl-dev libxml2-dev
echo "Install requirement packages starts."
@@ -74,6 +75,7 @@ write_files:
echo "Install RSTUDIO starts."
wget -nc "https://download2.rstudio.org/server/xenial/amd64/rstudio-server-$RSTUDIO_VERSION.deb" -O "/home/sparkuser/rstudio-server-$RSTUDIO_VERSION.deb"
gdebi -n /home/sparkuser/rstudio-server-$RSTUDIO_VERSION.deb
rm /home/sparkuser/rstudio-server-$RSTUDIO_VERSION.deb
echo "Install RSTUDIO finished."
@@ -96,7 +98,6 @@ write_files:
chown sparkuser:sparkuser /home/sparkuser/.bashrc
echo export PATH="/home/sparkuser/hadoop/bin:$PATH" >> /home/sparkuser/.bashrc
mv /tmp/hadoop/configs/* /home/sparkuser/hadoop/etc/hadoop
mv /tmp/hadoop/webconfigs/* /home/sparkuser/hadoop/share/hadoop/hdfs/webapps/hdfs/WEB-INF/
echo "spark: lpds, admin" >> /home/sparkuser/hadoop/etc/hadoop/realm.properties
echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre" >> /home/sparkuser/hadoop/etc/hadoop/hadoop-env.sh
echo "export HADOOP_PID_DIR=/home/sparkuser/hadoop" >> /home/sparkuser/hadoop/etc/hadoop/hadoop-env.sh
@@ -165,7 +166,7 @@ write_files:
echo "Launch HADOOP starts."
echo 'Y' | /home/sparkuser/hadoop/bin/hdfs namenode -format hdfs_cluster
/home/sparkuser/hadoop/sbin/hadoop-daemon.sh start namenode
/home/sparkuser/hadoop/bin/hdfs --daemon start namenode
echo "Launch HADOOP finished."
@@ -185,15 +186,10 @@ write_files:
<configuration>
<property>
<name>dfs.namenode.http-address</name>
<value>HadoopMaster:50070</value>
<value>HadoopMaster:9870</value>
</property>
<property>
<name>dfs.name.dir</name>
<value>/tmp</value>
<final>true</final>
</property>
<property>
<name>dfs.permissions</name>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<property>
@@ -223,46 +219,12 @@ write_files:
content: |
<configuration>
<property>
<name>fs.default.name</name>
<name>fs.defaultFS</name>
<value>hdfs://HadoopMaster:9000</value>
</property>
</configuration>
permissions: '644'

- path: /tmp/hadoop/webconfigs/web.xml
content: |
<?xml version="1.0" encoding="UTF-8"?>
<web-app version="2.4" xmlns="http://java.sun.com/xml/ns/j2ee">
<security-constraint>
<web-resource-collection>
<web-resource-name>Protected</web-resource-name>
<url-pattern>/*</url-pattern>
</web-resource-collection>
<auth-constraint>
<role-name>admin</role-name>
</auth-constraint>
</security-constraint>
<login-config>
<auth-method>BASIC</auth-method>
<realm-name>realm</realm-name>
</login-config>
</web-app>
permissions: '644'

- path: /tmp/hadoop/webconfigs/jetty-web.xml
content: |
<Configure class="org.mortbay.jetty.webapp.WebAppContext">
<Get name="securityHandler">
<Set name="userRealm">
<New class="org.mortbay.jetty.security.HashUserRealm">
<Set name="name">realm</Set>
<Set name="config">/home/sparkuser/hadoop/etc/hadoop/realm.properties</Set>
</New>
</Set>
</Get>
</Configure>
permissions: '644'

- path: /home/sparkuser/consul/hosts.ctmpl
content: |
127.0.0.1 localhost
@@ -338,8 +300,8 @@ write_files:
if [ `diff /etc/hosts $FILE_LOCATION | grep '>' | wc -l` -gt 0 ]; then
echo -e `date +%Y-%m-%d` `date +"%T"` "Downscale detected. Restarting name node service..."
su - sparkuser -c '/home/sparkuser/hadoop/sbin/hadoop-daemon.sh stop namenode'
su - sparkuser -c '/home/sparkuser/hadoop/sbin/hadoop-daemon.sh start namenode'
su - sparkuser -c '/home/sparkuser/hadoop/bin/hdfs --daemon stop namenode'
su - sparkuser -c '/home/sparkuser/hadoop/bin/hdfs --daemon start namenode'
echo -e `date +%Y-%m-%d` `date +"%T"` "Namenode restarted!"
cp /etc/hosts $FILE_LOCATION
else [ `diff /etc/hosts $FILE_LOCATION | grep '<' | wc -l` -gt 0 ]
