In [4]:
import boto3

In [5]:
# Shared low-level EMR client; every later cell issues its API calls through it.
emr_client = boto3.client(service_name='emr')

In [8]:
# Launch a two-instance EMR cluster with Spark installed. The cluster is
# configured to stay alive when it has no steps, so work can be submitted later.
response = emr_client.run_job_flow(
    Name='pavan-cluster-boto3-1',
    ReleaseLabel='emr-6.14.0',
    Applications=[dict(Name='Spark')],
    Instances=dict(
        # Same instance type for both node roles; two instances in total.
        MasterInstanceType='m5.xlarge',
        SlaveInstanceType='m5.xlarge',
        InstanceCount=2,
        # Keep the cluster up with an empty step queue and allow termination.
        KeepJobFlowAliveWhenNoSteps=True,
        TerminationProtected=False,
        # Key pair, security groups and subnet for the cluster's EC2 instances.
        Ec2KeyName='pavan-emr-key1',
        EmrManagedMasterSecurityGroup='sg-0c5c379d574383ae0',
        EmrManagedSlaveSecurityGroup='sg-0e7f534e5302aacd9',
        Ec2SubnetId='subnet-0a5473cae43eb4043',
    ),
    VisibleToAllUsers=True,
    # JobFlowRole takes the EC2 instance profile; ServiceRole is the EMR service role.
    JobFlowRole='arn:aws:iam::032731649829:instance-profile/pavan-emr-ec2-role',
    ServiceRole='arn:aws:iam::032731649829:role/pavan-emr-service-role',
)

In [9]:
# List the EMR clusters that are currently in the WAITING state.
response = emr_client.list_clusters(
    ClusterStates=['WAITING']
)
print(response)

# The listing is not limited to the cluster created earlier in this notebook:
# any WAITING cluster visible to this client is returned. Blindly taking
# index 0 could therefore target someone else's cluster, and it fails with an
# unhelpful IndexError when nothing is WAITING yet. Select the cluster by the
# name it was created with and fail with a clear message when it is absent.
target_cluster_name = 'pavan-cluster-boto3-1'
matching_cluster_ids = [
    cluster['Id']
    for cluster in response['Clusters']
    if cluster['Name'] == target_cluster_name
]
if not matching_cluster_ids:
    raise IndexError(
        f"No WAITING EMR cluster named {target_cluster_name!r} was found; "
        "it may still be starting, or it may already have terminated."
    )

# If several clusters share the name, the first one listed is used.
cluster_id = matching_cluster_ids[0]
print(cluster_id)

{'Clusters': [{'Id': 'j-11N7J1MFJI3AI', 'Name': 'pavan-cluster-boto3-1', 'Status': {'State': 'WAITING', 'StateChangeReason': {'Message': 'Cluster ready to run steps.'}, 'Timeline': {'CreationDateTime': datetime.datetime(2023, 10, 18, 18, 6, 4, 824000, tzinfo=tzlocal()), 'ReadyDateTime': datetime.datetime(2023, 10, 18, 18, 9, 56, 590000, tzinfo=tzlocal())}}, 'NormalizedInstanceHours': 0, 'ClusterArn': 'arn:aws:elasticmapreduce:us-east-1:032731649829:cluster/j-11N7J1MFJI3AI'}, {'Id': 'j-2A1S9OTUBQOL0', 'Name': 'superman-cluster-boto3-04', 'Status': {'State': 'WAITING', 'StateChangeReason': {'Message': 'Cluster ready to run steps.'}, 'Timeline': {'CreationDateTime': datetime.datetime(2023, 10, 18, 17, 18, 30, 740000, tzinfo=tzlocal()), 'ReadyDateTime': datetime.datetime(2023, 10, 18, 17, 21, 50, 75000, tzinfo=tzlocal())}}, 'NormalizedInstanceHours': 0, 'ClusterArn': 'arn:aws:elasticmapreduce:us-east-1:032731649829:cluster/j-2A1S9OTUBQOL0'}], 'ResponseMetadata': {'RequestId': 'df757e7a-7d3

In [10]:
# NOTE(review): this pins the target cluster explicitly and overrides the ID
# derived from the list_clusters response in the previous cell. The value is
# specific to one run and will be stale once that cluster terminates - confirm
# before re-running.
cluster_id = 'j-11N7J1MFJI3AI'

In [13]:
# Submit the customer-segmentation PySpark script to the cluster as one EMR step.
step_name = 'pavan_cust_segmentation_transform2'
script_location = 's3://pavan-emr-boto3-customer/cus-script/customer_segmentation_transform.py'

# Positional arguments passed to the script after its path:
# the sales CSV, the customer CSV, then the output prefix.
arguments = [
    's3://pavan-emr-boto3-customer/input/sales/sales_data.csv',
    's3://pavan-emr-boto3-customer/input/customer/customer_data.csv',
    's3://pavan-emr-boto3-customer/output/',
]

step_response = emr_client.add_job_flow_steps(
    JobFlowId=cluster_id,
    Steps=[
        dict(
            Name=step_name,
            # A failure of this step does not stop the cluster or later steps.
            ActionOnFailure='CONTINUE',
            HadoopJarStep=dict(
                Jar='command-runner.jar',
                # Resulting command line: spark-submit <script> <arg> <arg> <arg>
                Args=['spark-submit', script_location, *arguments],
            ),
        ),
    ],
)

# NOTE: only `spark-submit <script> <args>` is sent; spark-submit flags such as
# `--deploy-mode cluster` or `--master` are not included in Args here.
# Exactly one step was submitted, so its ID is the first entry of StepIds.
step_id = step_response['StepIds'][0]
print(f"Step added with ID: {step_id}")

Step added with ID: s-09423983T1YR63MHMRLF
