# Problem Statement: You have two data files stored in an Amazon S3 bucket. Your goal is to create a data processing pipeline using PySpark that reads these files from S3, performs some transformations, and classifies customers based on their purchases, and then to submit the PySpark script as an EMR step using boto3.

In [None]:
pip install boto3

In [None]:
import boto3
from get_creds import fetch_credentials

In [None]:
# Create an EMR client (not an S3 client): it is the handle used for every
# EMR API call made through `emr_client`.
# Fetch the credentials once so both keys come from the same lookup instead
# of invoking fetch_credentials() separately for each keyword argument.
# Index 0 is passed as the access key ID and index 1 as the secret access key.
aws_credentials = fetch_credentials()
emr_client = boto3.client(
    'emr',
    aws_access_key_id=aws_credentials[0],
    aws_secret_access_key=aws_credentials[1],
    region_name='us-east-1',
)

In [None]:
# Launch the EMR cluster that will run the Spark job.
# The EC2-level settings are collected first so the run_job_flow call itself
# stays short and readable.
instance_config = {
    # Two m5.xlarge instances in total.
    'MasterInstanceType': 'm5.xlarge',
    'SlaveInstanceType': 'm5.xlarge',
    'InstanceCount': 2,
    # Keep the cluster up when it has no steps, so steps can be added later.
    'KeepJobFlowAliveWhenNoSteps': True,
    'TerminationProtected': False,
    # Key pair, security groups and subnet the instances are launched with.
    'Ec2KeyName': 'first_keypair',
    'EmrManagedMasterSecurityGroup': 'sg-0309ac88be23385a2',
    'EmrManagedSlaveSecurityGroup': 'sg-0a19af4ba0a963cca',
    'Ec2SubnetId': 'subnet-090cd51dd31d81d26',
}

response = emr_client.run_job_flow(
    Name='ds-cluster-boto3',
    ReleaseLabel='emr-6.14.0',
    Instances=instance_config,
    # Spark is the only application requested on the cluster.
    Applications=[{'Name': 'Spark'}],
    VisibleToAllUsers=True,
    # JobFlowRole is the EC2 instance profile; ServiceRole is the EMR service role.
    JobFlowRole='arn:aws:iam::182424271996:instance-profile/EMR_EC2_DefaultRole',
    ServiceRole='arn:aws:iam::182424271996:role/EMR_Default_Role',
)

In [None]:
# Find a cluster that is in the WAITING state and remember its ID.
# NOTE(review): this takes the first WAITING cluster the API returns; if
# several clusters are waiting it may not be the one created by this
# notebook -- confirm, or filter on the cluster name.
response = emr_client.list_clusters(
    ClusterStates=['WAITING']
)
print(response)

waiting_clusters = response['Clusters']
if not waiting_clusters:
    # Without this guard an empty result surfaces as an opaque IndexError.
    # A cluster that is still starting up is not reported as WAITING yet.
    raise RuntimeError(
        "No EMR cluster is in the WAITING state yet; "
        "wait for the cluster to finish starting and re-run this cell."
    )

cluster_id = waiting_clusters[0]['Id']
print(cluster_id)

In [None]:
# Submit the PySpark script to the cluster as a single EMR step.
step_name = 'cust_segmentation_transform_v2'
script_location = 's3://cab-data-science-demo/BOTO3/Scripts/customer_segmentation_transform.py'

# Arguments passed to the script after its path: the two input CSV
# locations followed by the output location.
arguments = [
    's3://cab-data-science-demo/BOTO3/Input/sales_data.csv',
    's3://cab-data-science-demo/BOTO3/Input/customer_data.csv',
    's3://cab-data-science-demo/BOTO3/Output/',
]

# Full command line handed to command-runner.jar:
#   spark-submit --deploy-mode cluster <script> <arguments...>
spark_submit_command = [
    'spark-submit',
    "--deploy-mode",
    "cluster",
    script_location,
    *arguments,
]

step_definition = {
    'Name': step_name,
    'ActionOnFailure': 'CONTINUE',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': spark_submit_command,
    },
}

step_response = emr_client.add_job_flow_steps(
    JobFlowId=cluster_id,
    Steps=[step_definition],
)

# Exactly one step was submitted, so its ID is the first entry in StepIds.
step_id = step_response['StepIds'][0]
print(f"Step added with ID: {step_id}")