# Spark Cluster Stop

### Importing Required Libraries ::
* __boto3__: Required to connect to AWS and operate AWS tasks
* __botocore__: Required to handle the exceptions related to boto3 tasks
* __paramiko__: Required to run commands inside EC2 instances
* __json__: To convert python native dictionaries to string, to write in files
* __datetime__, __pprint__, __sys__, __time__: General purpose use

In [1]:
import boto3, botocore, paramiko
from datetime import datetime
import pprint, sys, time, json
from botocore.exceptions import ClientError

### Creating boto3 session, clients and resources ::

In [2]:
# Open one boto3 session pinned to us-east-1 and derive from it the two EC2
# handles used by the later cells: the low-level client and the resource API.
try:
    session = boto3.session.Session(region_name='us-east-1')
    ec2_client = session.client(service_name='ec2')
    ec2_resource = session.resource(service_name='ec2')
except ClientError as session_error:
    # Nothing below can run without these handles, so report and stop here.
    print("Unexpected error while creating boto3 session, client and resources: {}".format(session_error))
    exit()

### Declaring the hardcoded information ::

In [3]:
# ---- Run identity -----------------------------------------------------------
# Timestamp of this run, formatted as YYYYMMDDHHMMSS from the local clock.
run_id = "{:%Y%m%d%H%M%S}".format(datetime.now())
# Value matched against the 'User' tag when the master node is looked up.
user = "root"

# ---- Disabled settings (kept for reference) ---------------------------------
# config_dir = '/Volumes/WorkSpace/POC/SparkClusterEC2/ConfigDir'
# config_file_name = config_dir + '/' + user + '_node_details.dat'

# ---- Spark installation on the nodes ----------------------------------------
# Directory whose sbin/ scripts are invoked on the master node.
spark_home = "/home/ec2-user/spark-2.4.5-bin-hadoop2.7"

# ---- Cluster / EC2 settings -------------------------------------------------
# The key pair path and name are joined into '<path>/<name>.pem' to load the
# SSH private key; the remaining values are declared here but not read by the
# cells of this notebook.
cluster_key_pair_path = "/Volumes/WorkSpace/AWS/Access_Keys"
cluster_key_pair_name = "SparkCluster"
cluster_instance_type = "t2.micro"
cluster_subnet_id = "subnet-070cddc01a126f07f"
cluster_security_group_list = ["sg-05ee7f205f173862c"]

### Check for running Master Node for current user ::
* To stop a cluster, its running master node first needs to be detected.

In [4]:
# Find the running master node that belongs to `user`.
#
# Instances are selected by state ('running') and by three tags:
# Project=SparkCluster, User=<user> and NodeType=Master. On success the id of
# the first match is stored in `master_node_id`; if nothing matches, or the
# AWS call fails, a message is printed and the process exits.
try:
    master_instance_details = ec2_resource.instances.filter(
        Filters=[
            {
                'Name': 'instance-state-name',
                'Values': ['running']
            },
            {
                'Name': 'tag:Project',
                'Values': ['SparkCluster']
            },
            {
                'Name': 'tag:User',
                'Values': [user]
            },
            {
                'Name': 'tag:NodeType',
                'Values': ['Master']
            }
        ]
    )
    # A boto3 collection is lazy: every iteration sends a new request to AWS.
    # Materialise it exactly once so the emptiness check and the element access
    # work on the same snapshot (no duplicate API call, and no IndexError if
    # the instance leaves the 'running' state between two separate queries).
    running_masters = list(master_instance_details)
    if running_masters:
        master_node_id = running_masters[0].id
        print("Master node: Instance('" + master_node_id + "').")
    else:
        print("No running master node for User('" + user + "'). Quitting process.")
        exit()
except ClientError as e:
    print("Unexpected error while looking for already running Master node EC2 instance for user-'" + user + "': " + str(e))
    exit()

Master node: Instance('i-0a27306c67986fd4b').


### Fetching required information of the Master Node ::
* Need to iterate and probe a few times to check whether the node is up before we can extract the information

In [5]:
# Describe the master instance once and keep only the fields the later cells
# use, plus a fixed 'NodeName' label. Any failure (AWS error, missing key,
# empty response) is reported and ends the process.
try:
    describe_response = ec2_client.describe_instances(InstanceIds=[master_node_id])
    master_node_temp = describe_response['Reservations'][0]['Instances'][0]

    # Copied in this order so the resulting dict keeps the same key order.
    copied_fields = ('InstanceId', 'PublicDnsName', 'PublicIpAddress', 'PrivateIpAddress')
    master_node = {field: master_node_temp[field] for field in copied_fields}
    master_node['NodeName'] = 'master'

    pprint.pprint(master_node)
except Exception as lookup_error:
    print("Unexpected error while extracting Spark Cluster Master node details: {}".format(lookup_error))
    exit()

{'InstanceId': 'i-0a27306c67986fd4b',
 'NodeName': 'master',
 'PrivateIpAddress': '172.75.0.12',
 'PublicDnsName': 'ec2-3-95-251-159.compute-1.amazonaws.com',
 'PublicIpAddress': '3.95.251.159'}


### Stop the cluster ::
* Login using pre-defined .pem file
* Connect using ssh protocol
* Run stop-all.sh

In [6]:
# Stop the Spark cluster by running <spark_home>/sbin/stop-all.sh on the master
# node over SSH.
#
# Steps:
#   1. Load the private key '<cluster_key_pair_path>/<cluster_key_pair_name>.pem'.
#   2. Try to connect to the master's public DNS name as 'ec2-user', up to
#      `connect_limit` times with a 5 second pause after each failure.
#   3. Run stop-all.sh and wait for its exit status.
# The SSH client is always closed in the `finally` clause, whichever path is
# taken (success, command failure, retry limit reached, unexpected error).
ssh = None  # lets the finally-clause tell whether a client was ever created
try:
    ssh = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without
    # verification; confirm this is acceptable for these nodes.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    privkey = paramiko.RSAKey.from_private_key_file(cluster_key_pair_path + '/' + cluster_key_pair_name + '.pem')
    connect_limit = 5
    # Honour connect_limit: the loop previously used range(1, 5), i.e. only
    # four attempts, and never read connect_limit.
    for _ in range(connect_limit):
        try:
            ssh.connect(master_node['PublicDnsName'], username='ec2-user', pkey=privkey)
            break
        except Exception as e:
            print("Unexpected error while trying to connect Spark Cluster Master Node: '" + master_node['PublicDnsName'] + "'. Retrying after 5 secs...")
            # Surface the cause instead of silently discarding it.
            print("Connection error: " + str(e))
            time.sleep(5)
    else:
        # Reached only when no attempt ended in `break`.
        print("Maximum connection try limit exceeded, still could not connect to master node. Check AWS Management console for further details.")
        print(master_node)
        exit()

    cluster_down_cmd = spark_home + "/sbin/stop-all.sh"
    print(cluster_down_cmd)
    _, stdout, stderr = ssh.exec_command(cluster_down_cmd)
    # Blocks until the remote command has finished.
    exit_status = stdout.channel.recv_exit_status()
    if exit_status != 0:
        print("Spark cluster stop command failed on master node Node(" + master_node['PublicDnsName'] + "). Please log in manualy to the node and do the needfull.")
        # Show what the remote script reported, if anything.
        error_output = stderr.read().decode(errors='replace').strip()
        if error_output:
            print(error_output)
    else:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("Spark cluster is stoped. Please clean up the nodes by running 'slave_nodes_termination.ipynb' and 'master_nodes_termination.ipynb' notebooks in provided sequence.")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
except Exception as e:
    print("Unexpected error while stopping Spark cluster from Spark Cluster Master Node: " + str(e))
    exit()
finally:
    # Single clean-up point; safe even when SSHClient() itself failed.
    if ssh is not None:
        ssh.close()

/home/ec2-user/spark-2.4.5-bin-hadoop2.7/sbin/stop-all.sh
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Spark cluster is stoped. Please clean up the nodes by running 'slave_nodes_termination.ipynb' and 'master_nodes_termination.ipynb' notebooks in provided sequence.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
