# Spark Cluster Configuration

### Importing Required Libraries ::
* __boto3__: Required to connect to AWS and operate on its resources
* __botocore__: Required to handle the exceptions related to boto3 tasks
* __paramiko__: Required to run commands inside EC2 instances
* __json__: To convert python native dictionaries to string, to write in files
* __datetime__, __pprint__, __sys__, __time__: General purpose use

In [1]:
import boto3, botocore, paramiko
from datetime import datetime
import pprint, sys, time, json
from botocore.exceptions import ClientError

### Creating boto3 session, clients and resources ::

In [2]:
# Build one regional session and derive both the low-level client and the
# object-oriented resource handle for EC2 from it.
try:
    session = boto3.Session(region_name='us-east-1')
    ec2_client = session.client(service_name='ec2')
    ec2_resource = session.resource(service_name='ec2')
except ClientError as e:
    # Nothing below can work without these handles, so stop here.
    print("Unexpected error while creating boto3 session, client and resources: {}".format(e))
    exit()

### Declaring the hardcoded information ::

In [3]:
# --- Run identity ---------------------------------------------------------
# Timestamp (YYYYmmddHHMMSS, local clock) taken when this cell is executed.
run_id = '{:%Y%m%d%H%M%S}'.format(datetime.now())
# Value matched against the 'User' tag when searching for this user's nodes.
user = 'root'

# config_dir = '/Volumes/WorkSpace/POC/SparkClusterEC2/ConfigDir'
# config_file_name = config_dir + '/' + user + '_node_details.dat'

# --- Spark installation on the nodes --------------------------------------
# Directory on each EC2 node whose conf/ files are edited further below.
spark_home = '/home/ec2-user/spark-2.4.5-bin-hadoop2.7'

# --- SSH key used to log in to the nodes ----------------------------------
# The private key is read from <cluster_key_pair_path>/<cluster_key_pair_name>.pem
cluster_key_pair_name = 'SparkCluster'
cluster_key_pair_path = '/Volumes/WorkSpace/AWS/Access_Keys'

# --- EC2 placement settings -----------------------------------------------
# NOTE(review): not referenced by the cells visible in this notebook;
# presumably shared with the notebook that launches the nodes - confirm.
cluster_instance_type = 't2.micro'
cluster_subnet_id = 'subnet-070cddc01a126f07f'
cluster_security_group_list = ['sg-05ee7f205f173862c']

### Check for running Master Node for current user ::
* To start any cluster, first master node needs to be detected.

In [4]:
# Locate the running Master node tagged for the current user and remember
# its instance id in `master_node_id`; stop the notebook if there is none.
try:
    # The filtered collection is lazy and queries EC2 every time it is
    # iterated. Materialise it exactly once so that the emptiness check and
    # the `[0]` lookup below are guaranteed to see the same result.
    master_instance_details = list(ec2_resource.instances.filter(
        Filters=[
            {
                'Name': 'instance-state-name',
                'Values': ['running']
            },
            {
                'Name': 'tag:Project',
                'Values': ['SparkCluster']
            },
            {
                'Name': 'tag:User',
                'Values': [user]
            },
            {
                'Name': 'tag:NodeType',
                'Values': ['Master']
            }
        ]
    ))
    if master_instance_details:
        # If several instances match, the first one returned is used.
        master_node_id = master_instance_details[0].id
        print("Master node: Instance('" + master_node_id + "').")
    else:
        print("No running master node for User('" + user + "'). Quitting process.")
        exit()
except ClientError as e:
    print("Unexpected error while looking for already running Master node EC2 instance for user-'" + user + "': " + str(e))
    exit()

Master node: Instance('i-0a27306c67986fd4b').


### Fetching required information of the Master Node ::
* Need to iterate and probe a few times to check whether the node is up before we can extract the information

In [5]:
# Collect the addressing details of the Master node into `master_node`.
try:
    master_reservations = ec2_client.describe_instances(InstanceIds=[master_node_id])['Reservations']
    master_node_temp = master_reservations[0]['Instances'][0]

    # Copy only the fields the configuration step needs, then label the node.
    master_node = {
        field: master_node_temp[field]
        for field in ('InstanceId', 'PublicDnsName', 'PublicIpAddress', 'PrivateIpAddress')
    }
    master_node['NodeName'] = 'master'

    pprint.pprint(master_node)
except Exception as e:
    # Covers API errors as well as missing keys in the response.
    print("Unexpected error while extracting Spark Cluster Master node details: {}".format(e))
    exit()

{'InstanceId': 'i-0a27306c67986fd4b',
 'NodeName': 'master',
 'PrivateIpAddress': '172.75.0.12',
 'PublicDnsName': 'ec2-3-95-251-159.compute-1.amazonaws.com',
 'PublicIpAddress': '3.95.251.159'}


### Check for running Slave Nodes for current user ::
* Before the cluster can be configured, the running slave nodes of the current user need to be detected.

In [6]:
# Collect the instance ids of all running Slave nodes tagged for the current
# user into `slave_node_id_list`; stop the notebook if there are none.
try:
    slave_node_id_list = []
    # The filtered collection is lazy and queries EC2 every time it is
    # iterated. Materialise it exactly once so that the emptiness check and
    # the loop below work on the same set of instances.
    slave_instance_details = list(ec2_resource.instances.filter(
        Filters=[
            {
                'Name': 'instance-state-name',
                'Values': ['running']
            },
            {
                'Name': 'tag:Project',
                'Values': ['SparkCluster']
            },
            {
                'Name': 'tag:User',
                'Values': [user]
            },
            {
                'Name': 'tag:NodeType',
                'Values': ['Slave']
            }
        ]
    ))
    if slave_instance_details:
        for slave_instance in slave_instance_details:
            slave_node_id_list.append(slave_instance.id)
            print("Slave node: Instance('" + slave_instance.id + "').")
    else:
        print("No running slave node for User('" + user + "'). Quitting process.")
        exit()
except ClientError as e:
    print("Unexpected error while looking for already running Slave node EC2 instance for user-'" + user + "': " + str(e))
    exit()

Slave node: Instance('i-0d1b1bbaf19af5f60').
Slave node: Instance('i-0d199f2f7e4a24207').


### Fetching required information of the Slave Nodes ::
* Need to iterate and probe a few times to check whether the nodes are up before we can extract the information

In [7]:
# Collect the addressing details of every Slave node into `slave_node_list`
# and give each one a sequential name (SlaveNode1, SlaveNode2, ...).
try:
    slave_node_list = []
    slave_reservations = ec2_client.describe_instances(InstanceIds=slave_node_id_list)['Reservations']
    # Instances are grouped by reservation in the response, and nodes that
    # were launched by separate requests end up in separate reservations.
    # Walk all of them; reading only Reservations[0] would silently drop
    # every slave that lives in another reservation.
    slave_node_list_temp = [
        instance
        for reservation in slave_reservations
        for instance in reservation['Instances']
    ]
    if not slave_node_list_temp:
        # Keep failing fast instead of continuing with a cluster without slaves.
        raise LookupError("no instance details were returned for " + str(slave_node_id_list))
    for node_number, slave_details in enumerate(slave_node_list_temp, start=1):
        slave_node_list.append({
            'InstanceId': slave_details['InstanceId'],
            'PublicDnsName': slave_details['PublicDnsName'],
            'PublicIpAddress': slave_details['PublicIpAddress'],
            'PrivateIpAddress': slave_details['PrivateIpAddress'],
            'NodeName': 'SlaveNode' + str(node_number)
        })
    pprint.pprint(slave_node_list)
except Exception as e:
    # Covers API errors as well as missing keys in the response.
    print("Unexpected error while extracting Spark Cluster Slave node details: " + str(e))
    exit()

[{'InstanceId': 'i-0d1b1bbaf19af5f60',
  'NodeName': 'SlaveNode1',
  'PrivateIpAddress': '172.75.0.22',
  'PublicDnsName': 'ec2-35-175-225-147.compute-1.amazonaws.com',
  'PublicIpAddress': '35.175.225.147'},
 {'InstanceId': 'i-0d199f2f7e4a24207',
  'NodeName': 'SlaveNode2',
  'PrivateIpAddress': '172.75.0.102',
  'PublicDnsName': 'ec2-18-234-92-252.compute-1.amazonaws.com',
  'PublicIpAddress': '18.234.92.252'}]


### Configuring all created nodes and create Spark Cluster ::
* Login using pre-defined .pem file
* Connect using ssh protocol
* Configure /etc/hosts file of each node(master and slave)
* Configure passwordless ssh between master and slave nodes
* Configure all spark related properties in spark-env.sh (only Master Ip is configured here).
* Configure all slave nodes in SPARK_HOME/conf/slaves file

In [8]:
def _run_remote(ssh_client, command, failure_message):
    """Run ``command`` on the connected node and fail on a non-zero exit status.

    Args:
        ssh_client: Connected ``paramiko.SSHClient`` of the node to configure.
        command: Shell command line to execute on the node.
        failure_message: Description of the step, reported when it fails.

    Raises:
        RuntimeError: If the command exits with a status other than 0. The
            message is ``failure_message`` followed by whatever the command
            wrote to stderr, so the cause is visible to the caller.
    """
    _, stdout, stderr = ssh_client.exec_command(command)
    # recv_exit_status() blocks until the remote command has finished.
    if stdout.channel.recv_exit_status() != 0:
        remote_error = stderr.read().decode('utf-8', 'replace').strip()
        if remote_error:
            failure_message = failure_message + " Remote stderr: " + remote_error
        raise RuntimeError(failure_message)


# Configure the master and every slave node, one after the other:
#   1. back up /etc/hosts and append the master and slave host entries,
#   2. on the master only: back up conf/slaves and append the slave names,
#   3. substitute the master private ip into conf/spark-env.sh,
#   4. substitute the master private ip into ~/.ssh/authorized_keys.
# NOTE: all host/slave entries are appended with '>>', so running this cell
# again on the same nodes adds the same lines a second time.
try:
    # The same key file is used for every node, so read it only once.
    privkey = paramiko.RSAKey.from_private_key_file(cluster_key_pair_path + '/' + cluster_key_pair_name + '.pem')
    connect_limit = 5
    master_private_ip = master_node['PrivateIpAddress']
    env_failure = "Unexpected error occured while putting Master node private ip in '" + spark_home + "/conf/spark-env.sh' file."
    auth_failure = "Unexpected error occured while putting Master node private ip in '/home/ec2-user/.ssh/authorized_keys' file."

    for node in [master_node] + slave_node_list:
        print("Starting to configure Spark Node("+ node['PublicDnsName'] +" :: " + node['NodeName'] + ").")
        is_master = node['NodeName'] == 'master'
        ssh = paramiko.SSHClient()
        try:
            ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            # Try up to `connect_limit` times; a freshly started node may not
            # accept SSH connections yet.
            for _ in range(connect_limit):
                try:
                    ssh.connect(node['PublicDnsName'], username='ec2-user', pkey=privkey)
                    break
                except Exception:
                    print("Unexpected error while trying to connect Spark Cluster Master Node: '" + node['PublicDnsName'] + "'. Retrying after 5 secs...")
                    time.sleep(5)
            else:
                print("Maximum connection try limit exceeded, still could not connect to master node. Check AWS Management console for further details.")
                print(node)
                exit()

            # --- 1. /etc/hosts -------------------------------------------
            _run_remote(
                ssh,
                "sudo cp -p /etc/hosts /etc/hosts_bkp",
                "Unexpected error occured while creating back up of '/etc/hosts' file."
            )
            # NOTE(review): the appends below run without sudo, so they rely
            # on /etc/hosts being writable by ec2-user on these nodes - confirm.
            _run_remote(
                ssh,
                "echo '" + master_private_ip + " master' >> /etc/hosts",
                "Unexpected error occured while adding master node private ip to '/etc/hosts' file."
            )

            # --- 2. conf/slaves (master only) ----------------------------
            if is_master:
                _run_remote(
                    ssh,
                    "cp -p " + spark_home + "/conf/slaves " + spark_home + "/conf/slaves_bkp",
                    "Unexpected error occured while taking back up of '" + spark_home + "/conf/slaves' file."
                )

            for slave_node in slave_node_list:
                _run_remote(
                    ssh,
                    "echo '" + slave_node['PrivateIpAddress'] + " " + slave_node['NodeName'] + "' >> /etc/hosts",
                    "Unexpected error occured while adding slave node private ip('" + slave_node['PrivateIpAddress'] + "') to '/etc/hosts' file."
                )
                if is_master:
                    _run_remote(
                        ssh,
                        "echo '" + slave_node['NodeName'] + "' >> " + spark_home + "/conf/slaves",
                        "Unexpected error occured while adding slave node name(" + slave_node['NodeName'] + ") to '/conf/slaves' file."
                    )

            # --- 3. conf/spark-env.sh ------------------------------------
            # Write the substituted copy first, then swap it in; the previous
            # file is kept as spark-env-bkp.sh.
            spark_env_commands = [
                "sed 's/<SPARK_CLUSTER_MASTER_PRIVATE_IP>/" + master_private_ip + "/g' " + spark_home + "/conf/spark-env.sh > " + spark_home + "/conf/spark-env-new.sh",
                "chmod 755 " + spark_home + "/conf/spark-env-new.sh",
                "mv " + spark_home + "/conf/spark-env.sh  " + spark_home + "/conf/spark-env-bkp.sh",
                "mv " + spark_home + "/conf/spark-env-new.sh " + spark_home + "/conf/spark-env.sh",
            ]
            for spark_env_command in spark_env_commands:
                _run_remote(ssh, spark_env_command, env_failure)

            # --- 4. ~/.ssh/authorized_keys -------------------------------
            # Same write-then-swap approach; the previous file is kept as
            # authorized_keys_bkp and the new one is restricted to mode 600.
            auth_key_commands = [
                "sed 's/<SPARK_CLUSTER_MASTER_PRIVATE_IP>/" + master_private_ip + "/g' /home/ec2-user/.ssh/authorized_keys > /home/ec2-user/.ssh/authorized_keys_new",
                "mv /home/ec2-user/.ssh/authorized_keys  /home/ec2-user/.ssh/authorized_keys_bkp",
                "mv /home/ec2-user/.ssh/authorized_keys_new /home/ec2-user/.ssh/authorized_keys",
                "chmod 600 /home/ec2-user/.ssh/authorized_keys",
            ]
            for auth_key_command in auth_key_commands:
                _run_remote(ssh, auth_key_command, auth_failure)
        finally:
            # Release the connection on success, on failure and on exit().
            ssh.close()

        print("Spark Node("+ node['PublicDnsName'] +" :: " + node['NodeName'] + ") is ready to be used.")

    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("Spark cluster configuration is done. Run 'spark_cluster_start.ipynb' to start the cluster.")
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
except Exception as e:
    # `e` carries the description of the failed step (and remote stderr).
    print("Unexpected error while changing spark env in Spark Cluster Master Node: " + str(e))
    exit()

Starting to configure Spark Node(ec2-3-95-251-159.compute-1.amazonaws.com :: master).
Spark Node(ec2-3-95-251-159.compute-1.amazonaws.com :: master) is ready to be used.
Starting to configure Spark Node(ec2-35-175-225-147.compute-1.amazonaws.com :: SlaveNode1).
Spark Node(ec2-35-175-225-147.compute-1.amazonaws.com :: SlaveNode1) is ready to be used.
Starting to configure Spark Node(ec2-18-234-92-252.compute-1.amazonaws.com :: SlaveNode2).
Spark Node(ec2-18-234-92-252.compute-1.amazonaws.com :: SlaveNode2) is ready to be used.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Spark cluster configuration is done. Run 'spark_cluster_start.ipynb' to start the cluster.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
