### Loading packages from config/init.py

In [1]:
# Execute the shared setup script inside this kernel's namespace.
# NOTE(review): later cells use os, json, uuid, CONFIG, RESULTS and DATASET without
# defining them, so they are presumably provided by init.py — confirm.
%run ../config/init.py

### Testing AWS-cli configuration

In [16]:
access_key = !aws configure get aws_access_key_id
secret_access_key = !aws configure get aws_secret_access_key
if access_key and secret_access_key:
    print('Using access_key: ........{}'.format(access_key[0][10:]))
else:
    print('Please, configure AWS-cli before running this notebook')
    print('Open a Terminal and run: aws configure')

Using access_key: ........ZYMLD6V5WV


### Defining variables

Edit the AWS region and zone variables according to your geographical location.

In [180]:
# Region used when creating the S3 buckets, and availability zone used for the
# subnet and the EC2 instances launched further down.
REGION = 'us-east-1'
ZONE = 'us-east-1c'
# Store the region in the AWS CLI configuration so subsequent CLI calls use it.
!aws configure set region {REGION}

### Defining global Tags for identifying resources

Associating AWS tags with each resource created by this notebook helps to compile the total cost incurred on AWS.
If it exists, the notebook uses the file **aws-tags.json** in the **CONFIG/aws** folder (and **aws-tags-s3.json**, in the same folder, for the S3 buckets).

In [313]:
# Locate the optional tag files and read the project name used to label resources.
# Every name is initialised up front so later cells can test them safely even when
# the tag files are absent (previously TAGDICT and PROJECT were left undefined).
TAGFILE_S3 = None   # path of the tag file applied to S3 buckets, if present
TAGFILE = None      # path of the tag file applied to EC2 resources, if present
TAGDIR = None
TAGDICT = None      # parsed content of TAGFILE
PROJECT = None      # value of the 'Project' tag found in TAGDICT

s3_tag_path = os.path.join(CONFIG, "aws", "aws-tags-s3.json")
if os.path.exists(s3_tag_path):
    TAGFILE_S3 = s3_tag_path

tag_path = os.path.join(CONFIG, "aws", "aws-tags.json")
if os.path.exists(tag_path):
    TAGFILE = tag_path
    with open(TAGFILE) as fin:
        TAGDICT = json.load(fin)
    # A file without a 'Tags' list simply yields no project instead of a KeyError.
    for k in TAGDICT.get('Tags', []):
        if k['Key'] == 'Project':
            PROJECT = k['Value']

if PROJECT:
    print("Using project tag: {}".format(PROJECT))
else:
    # PROJECT is interpolated into resource names and tag filters in later cells.
    print("WARNING: no 'Project' tag found in {}".format(tag_path))

Using project tag: cbb-research-dl


### AWS machine types

| Instance Size | vCPU | Memory (GiB) | Instance Storage (GiB) | Network Bandwidth (Gbps) | EBS Bandwidth (Mbps) | $/Hour |
|---------|----------|----------|-------------|---------------|---------------|-----------|
| m5d.4xlarge | 16 | 64 | 2 x 300 NVMe SSD | Up to 10 | 4,750 | 0.904 |
| m5d.8xlarge | 32 | 128 | 2 x 600 NVMe SSD | 10 | 6,800 | 1.808 |
| m5d.16xlarge | 64 | 256 | 4 x 600 NVMe SSD | 20 | 13,600 | 3.616 |
| m5dn.4xlarge | 16 | 64 | 2 x 300 NVMe SSD | Up to 25 | 4,750 | 1.088 |
| m5dn.8xlarge | 32 | 128 | 2 x 600 NVMe SSD | 25 | 6,800| 2.176 |
| m5dn.16xlarge | 64 | 256 | 4 x 600 NVMe SSD | 75 | 13,600 | 4.352 |


 

In [19]:
# Query sizes (number of sequences) benchmarked by this notebook.
QUERY_SIZES = [2000, 6000, 10000]

# Instance families and vCPU counts being compared.
MACHINE_TYPES = ['m5d', 'm5dn']
CPUs = [16, 32, 64]

# On-demand price in $/hour, indexed by instance family and then by vCPU count.
# Prices from 03/04/2020
PRICE = {
    'm5d': {
        16: 0.904,
        32: 1.808,
        64: 3.616
    },
    'm5dn': {
        16: 1.088,
        32: 2.176,
        64: 4.352
    }
}
# The table was originally keyed 'n1'/'n2', which matched none of MACHINE_TYPES.
# The old keys are kept as aliases so existing lookups keep working.
PRICE['n1'] = PRICE['m5d']
PRICE['n2'] = PRICE['m5dn']


In [21]:
# Work from a per-dataset results directory; relative paths used by later cells
# (e.g. the FASTA upload) are resolved against it.
result_dir = os.path.join(RESULTS, DATASET)
# makedirs also creates a missing RESULTS parent and tolerates an existing directory.
os.makedirs(result_dir, exist_ok=True)
os.chdir(result_dir)
print('Using as output directory: {}'.format(result_dir))

Using as output directory: /panfs/pan1.be-md.ncbi.nlm.nih.gov/alt_splicing/cloud-transcriptome-annotation/results/PRJNA320545


### Create or retrieve AWS S3 storage bucket

In [47]:
bucket_list = !aws s3 ls | awk '{print $3}'
buckets = {}
for q in QUERY_SIZES:
    prefix = 'nopal-' + str(q) + '-'
    suffix = None
    for l in bucket_list:
        if prefix in l:
            suffix = l.replace('nopal-' + str(q) + '-','')
            break
    if suffix:
        buckets[q] = suffix    

for q in QUERY_SIZES:
    if q not in buckets:
        suffix = str(uuid.uuid4())
        inbucket = 'nopal-' + str(q) + '-' + suffix
        outbucket = 'nopal-results-' + str(q) + '-' + suffix
        buckets[q] = suffix 
        
        !aws s3 mb s3://{inbucket} --region {REGION}          
        !aws s3 mb s3://{outbucket} --region {REGION}
        if TAGFILE_S3:
            !aws s3api put-bucket-tagging --bucket {inbucket} --tagging file://{TAGFILE_S3} 
            !aws s3api put-bucket-tagging --bucket {outbucket} --tagging file://{TAGFILE_S3} 
        !aws s3 cp {q}/fasta/ s3://{inbucket}/ --recursive
        
for q in buckets:
    print('Query size: {0}\n\tin-bucket: nopal-{0}-{1}\n\tout-bucket: nopal-results{0}-{1}'.format(q, buckets[q]))

make_bucket: nopal-2000-97f602dc-d34a-4f98-8d37-492abfb0d83a
make_bucket: nopal-results-2000-97f602dc-d34a-4f98-8d37-492abfb0d83a
upload: 2000/fasta/2000_12.fa to s3://nopal-2000-97f602dc-d34a-4f98-8d37-492abfb0d83a/2000_12.fa
upload: 2000/fasta/2000_1.fa to s3://nopal-2000-97f602dc-d34a-4f98-8d37-492abfb0d83a/2000_1.fa
upload: 2000/fasta/2000_11.fa to s3://nopal-2000-97f602dc-d34a-4f98-8d37-492abfb0d83a/2000_11.fa
upload: 2000/fasta/2000_15.fa to s3://nopal-2000-97f602dc-d34a-4f98-8d37-492abfb0d83a/2000_15.fa
upload: 2000/fasta/2000_13.fa to s3://nopal-2000-97f602dc-d34a-4f98-8d37-492abfb0d83a/2000_13.fa
upload: 2000/fasta/2000_16.fa to s3://nopal-2000-97f602dc-d34a-4f98-8d37-492abfb0d83a/2000_16.fa
upload: 2000/fasta/2000_14.fa to s3://nopal-2000-97f602dc-d34a-4f98-8d37-492abfb0d83a/2000_14.fa
upload: 2000/fasta/2000_10.fa to s3://nopal-2000-97f602dc-d34a-4f98-8d37-492abfb0d83a/2000_10.fa
upload: 2000/fasta/2000_17.fa to s3://nopal-2000-97f602dc-d34a-4f98-8d37-492abfb0d83a/2000_17.fa

### Creating an AWS Batch unmanaged cluster

### Creating an Internet gateway.

https://docs.aws.amazon.com/cli/latest/reference/ec2/create-internet-gateway.html

In [181]:
igw = !aws ec2 describe-internet-gateways --filters Name=tag:Project,Values={PROJECT}
igw = json.loads(''.join(igw))   
if 'InternetGateways' in igw and len(igw['InternetGateways']) > 0:
    igw = igw['InternetGateways'][0]
    print('Using Internet Gateway: {}'.format(igw['InternetGatewayId']))
else:
    igw = !aws ec2 create-internet-gateway
    igw = json.loads(''.join(igw))
    if 'InternetGateway' in igw and 'InternetGatewayId' in igw['InternetGateway']:
        igw = igw['InternetGateway']
        print('Created Internet Gateway: {}'.format(igw['InternetGatewayId']))
        if TAGFILE:
            igw_id = igw['InternetGatewayId']
            !aws ec2 create-tags --resources {igw_id} --cli-input-json file://{TAGFILE}

Created Internet Gateway: igw-0941f7aeb58334343


### Creating the Amazon Virtual Private Cloud (VPC) and all its components

* VPC: https://docs.aws.amazon.com/cli/latest/reference/ec2/create-vpc.html
* ACL: https://docs.aws.amazon.com/cli/latest/reference/ec2/create-network-acl.html
* Route Table: https://docs.aws.amazon.com/cli/latest/reference/ec2/create-route-table.html
* Subnet: https://docs.aws.amazon.com/cli/latest/reference/ec2/create-subnet.html

In [191]:
vpc = !aws ec2 describe-vpcs --filters Name=tag:Project,Values={PROJECT}
vpc = json.loads(''.join(vpc))
if 'Vpcs' in vpc and len(vpc['Vpcs']) > 0:
    vpc = vpc['Vpcs'][0]
    vpc_id = vpc['VpcId']
    print('Using VPC: {}'.format(vpc['VpcId']))
    subnet = !aws ec2 describe-subnets --filters "Name=vpc-id,Values={vpc_id}"
    subnet = json.loads(''.join(subnet))
    if 'Subnets' in subnet:
        subnet = subnet['Subnets'][0]   
        subnet_id = subnet['SubnetId']

        print('Subnet {} attached to VPC {}'.format(subnet_id, vpc_id))    
else:
    print('No VPC, creating it ..... ')
    vpc = !aws ec2 create-vpc --cidr-block 10.0.0.0/16 --amazon-provided-ipv6-cidr-block 
    vpc = json.loads(''.join(vpc))
    if 'Vpc' in vpc:
        vpc = vpc['Vpc']
        vpc_id = vpc['VpcId']
        print('Created VPC: {}'.format(vpc_id))
        # adding Tags if file exists
        if TAGFILE:        
            !aws ec2 create-tags --resources {vpc_id} --cli-input-json file://{TAGFILE}

        # Attaching igw
        igw_id = igw['InternetGatewayId']
        print('Attaching IGW {} to the VPC: {}'.format(igw_id, vpc_id))
        !aws ec2 attach-internet-gateway --internet-gateway-id {igw_id} --vpc-id {vpc_id}

        # Retrieving created ACL
        acl = !aws ec2 describe-network-acls --filters Name=vpc-id,Values={vpc_id}
        acl = json.loads(''.join(acl))
        if 'NetworkAcls' in acl and len(acl['NetworkAcls']) == 1:
            acl = acl['NetworkAcls'][0]
            # adding Tags if file exists
            if TAGFILE:                
                acl_id = acl['NetworkAclId']
                print('Tagging ACL {}'.format(acl_id))
                !aws ec2 create-tags --resources {acl_id} --cli-input-json file://{TAGFILE}

        # Retrieving created routes
        route = !aws ec2 describe-route-tables --filters Name=vpc-id,Values={vpc_id}
        route = json.loads(''.join(route))
        if 'RouteTables' in route and len(route['RouteTables']) == 1:
            route = route['RouteTables'][0]
            route_id = route['RouteTableId']
                
            route_igw = !aws ec2 create-route --route-table-id {route_id} --destination-cidr-block 0.0.0.0/0 --gateway-id {igw_id}
            route_igw = json.loads(''.join(route_igw))
            if 'Return' in route_igw and route_igw['Return']:
                print('IGW {} attached to route {}'.format(igw_id, route_id))
            
            # adding Tags if file exists
            if TAGFILE:
                print('Tagging Route {}'.format(route_id))
                !aws ec2 create-tags --resources {route_id} --cli-input-json file://{TAGFILE}

        # Creating Subnets
        subnet = !aws ec2 create-subnet --vpc-id {vpc_id} --cidr-block 10.0.0.0/16 --availability-zone {ZONE}
        subnet = json.loads(''.join(subnet))
        if 'Subnet' in subnet:
            subnet = subnet['Subnet']   
            subnet_id = subnet['SubnetId']
            
            print('Subnet {} attached to VPC {}'.format(subnet_id, vpc_id))
            !aws ec2 modify-subnet-attribute --subnet-id {subnet_id} --map-public-ip-on-launch
            print('Public IPs enable on subnet {}'.format(subnet_id))
            
            # adding Tags if file exists
            if TAGFILE:
                print('Tagging subnet {}'.format(subnet_id))
                !aws ec2 create-tags --resources {subnet_id} --cli-input-json file://{TAGFILE}

Using VPC: vpc-0811acb4a1a497102
Subnet subnet-076bdd7bddb7c51cc attached to VPC vpc-0811acb4a1a497102


### Getting the ARN for AWSBatchServiceRole

In [271]:
batchRole = !aws iam get-role --role-name AWSBatchServiceRole
try:
    batchRole = json.loads(''.join(batchRole))
    if 'Role' in batchRole:
        batchRole = batchRole['Role']
        batchRole_arn = batchRole['Arn']
        print('Using AWSBatchServiceRole Arn: {}'.format(batchRole_arn))
    
    batchRole = !aws iam get-role --role-name AWSBatchServiceRole
except:
    print('AWSBatchServiceRole does not exists.')
    print('Please create the role as described here: https://docs.aws.amazon.com/batch/latest/userguide/service_IAM_role.html')
    print('Then, run this cell again')
    print('ERROR: {}'.format(batchRole))

s3_rolefile = os.path.join(CONFIG, 'aws', 's3-role.json') 
s3_role = None
with open(s3_rolefile) as fin:
    s3_role = json.loads(fin.read())
    s3_role['Statement'][0]['Resource'] = []
    for q in buckets:
        s3_role['Statement'][0]['Resource'].append('arn:aws:s3:::nopal-{0}-{1}'.format(q,buckets[q]))
        s3_role['Statement'][0]['Resource'].append('arn:aws:s3:::nopal-results-{0}-{1}'.format(q,buckets[q]))
if s3_role:
    with open(s3_rolefile, 'w') as fout:
        fout.write(json.dumps(s3_role, indent=4) + '\n')
    
output = !aws iam get-role --role-name {PROJECT}-batch-role  
try:
    s3_role = json.loads(''.join(output))
    if 'Role' in s3_role:
        s3_role = s3_role['Role']
        s3_role_arn = s3_role['Arn']
        print('Using s3_role Arn: {}'.format(s3_role_arn))
    else:
        output = !aws iam create-role --role-name {PROJECT}-batch-role --assume-role-policy-document file://{s3_rolefile}
        s3_role = json.loads(''.join(output))
except:
    print('Project Role does not exists.')
    print('Creating the role')
    try:
        output = !aws iam create-role --role-name {PROJECT}-batch-role --assume-role-policy-document file://{s3_rolefile}
        s3_role = json.loads(''.join(output))
    except:
        print('ERROR: {}'.format(output))

# Delete this after testing the role creation         
output = !aws iam get-role --role-name cbb-research-db-batch-role
try:
    s3_role = json.loads(''.join(output))
    if 'Role' in s3_role:
        s3_role = s3_role['Role']
        s3_role_arn = s3_role['Arn']
        print('Using s3_role Arn: {}'.format(s3_role_arn))    
except:
    print('Project Role does not exists.')

Using AWSBatchServiceRole Arn: arn:aws:iam::250813660784:role/service-role/AWSBatchServiceRole
Project Role does not exists.
Creating the role
ERROR: ['', 'An error occurred (AccessDenied) when calling the CreateRole operation: User: arn:aws:iam::250813660784:user/veraalva is not authorized to perform: iam:CreateRole on resource: arn:aws:iam::250813660784:role/cbb-research-dl-batch-role']
Using s3_role Arn: arn:aws:iam::250813660784:role/cbb-research-db-batch-role


### Creating AWS Batch Components

 * Computational environment: https://docs.aws.amazon.com/cli/latest/reference/batch/create-compute-environment.html
 * Batch queue: https://docs.aws.amazon.com/cli/latest/reference/batch/create-job-queue.html

In [302]:
comp_env = None
queue = None
output = !aws batch describe-compute-environments
try:
    comp_envs = json.loads(''.join(output))    
    if 'computeEnvironments' in comp_envs:
        for c in comp_envs['computeEnvironments']:
            if c['computeEnvironmentName'] == '{}-unmanaged'.format(PROJECT):
                comp_env = c
                break
    if not comp_env:
        print('Unmanaged compute environment does not exist. Creating it ....')
        output = !aws batch create-compute-environment --compute-environment-name {PROJECT}-unmanaged --type UNMANAGED --state ENABLED --service-role {batchRole_arn}
        comp_env = json.loads(''.join(output))
    
    if comp_env:
        print('Compute environment: {}'.format(comp_env['computeEnvironmentArn']))
        output = !aws batch describe-job-queues
        queues = json.loads(''.join(output))  
        queue = None
        if 'jobQueues' in queues:
            for c in queues['jobQueues']:
                if c['jobQueueName'] == '{}-queue'.format(PROJECT):
                    queue = c
                    break
        if queue:
            print('Queue: {}'.format(queue['jobQueueArn']))
        else:
            print('Queue does not exist. Creating it ....') 
            compu_env_arn = comp_env['computeEnvironmentArn']
            output = !aws batch create-job-queue --job-queue-name {PROJECT}-queue --state ENABLED --priority 1 --compute-environment-order order=1,computeEnvironment={compu_env_arn}
            queue = json.loads(''.join(output))    
            print('Queue {}'.format(queue['jobQueueArn']))
        if not queue:
            print('ERROR: No AWS Batch queue available. Please, check possible errors')    
except ex:
    print('ERROR: {}'.format(output))  
    print(e)

if not comp_env:
    print('ERROR: No computational environment available. Please, check possible errors')
if not queue:
    print('ERROR: No AWS Batch queue available. Please, check possible errors')     

Unmanaged compute environment does not exist. Creating it ....
Compute environment: arn:aws:batch:us-east-1:250813660784:compute-environment/cbb-research-dl-unmanaged
Queue does not exist. Creating it ....
Queue arn:aws:batch:us-east-1:250813660784:job-queue/cbb-research-dl-queue


### Getting ECS cluster name

In [305]:
ecs_cluster = None
output = !aws batch describe-compute-environments
try:
    comp_envs = json.loads(''.join(output))    
    if 'computeEnvironments' in comp_envs:
        for c in comp_envs['computeEnvironments']:
            if c['computeEnvironmentName'] == '{}-unmanaged'.format(PROJECT):
                comp_env = c
                ecs_cluster = c['ecsClusterArn']
                break
except ex:
    print('ERROR: {}'.format(output))  
    print(e)

if not ecs_cluster:
    print('ERROR: No ECS cluster available. Please, check possible errors')
print('ECS cluster: {}'.format(ecs_cluster))

ECS cluster: arn:aws:ecs:us-east-1:250813660784:cluster/cbb-research-dl-unmanaged_Batch_ad56ff41-45e2-36e9-855f-7f87e2251d68


### Creating EC2 components

 * Security Group: https://docs.aws.amazon.com/cli/latest/reference/ec2/create-security-group.html

In [192]:
sg = !aws ec2 describe-security-groups --filters Name=tag:Project,Values={PROJECT}
sg = json.loads(''.join(sg))
if 'SecurityGroups' in sg and len(sg['SecurityGroups']) >= 1:
    sg = sg['SecurityGroups'][0]
    sg_id = sg['GroupId']
    print('Using sg {}'.format(sg_id))
else:
    sg_name = PROJECT
    sg_descr = 'Security Group for project: ' + PROJECT
    sg = !aws ec2 create-security-group --group-name {sg_name}  --description "{sg_descr}" --vpc-id {vpc_id}
    sg = json.loads(''.join(sg))
    if 'GroupId' in sg:
        sg_id = sg['GroupId']
        print('SG {} created'.format(sg_id))
        
        print('Adding SSH inbound to the sg {}'.format(sg_id))
        !aws ec2 authorize-security-group-ingress --group-id {sg_id} --protocol tcp --port 22 --cidr 0.0.0.0/0
        
        # adding Tags if file exists
        if TAGFILE:
            print('Tagging sg {}'.format(sg_id))
            !aws ec2 create-tags --resources {sg_id} --cli-input-json file://{TAGFILE}

Using sg sg-05c7b9ccd93afc9c1


### Creating ECS cluster init script for the instances

In [340]:
# Build the instance bootstrap script: the base script (user-data-ecs.txt) followed
# by the commands that write the ECS cluster into the agent configuration and
# enable the ECS service.
ecs_commands = [
    'echo ECS_CLUSTER={} >> /etc/ecs/ecs.config'.format(comp_env['ecsClusterArn']),
    'systemctl enable --now --no-block ecs.service',
]
user_data = '\n'.join(ecs_commands) + '\n'

user_datafile = os.path.join(CONFIG, 'aws', 'user-data.txt')
user_datafile_ecs = os.path.join(CONFIG, 'aws', 'user-data-ecs.txt')

with open(user_datafile_ecs) as fin:
    base_script = fin.read()
with open(user_datafile, 'w') as fout:
    # Each part is terminated by its own newline.
    fout.write(base_script + '\n' + user_data + '\n')
        
        

### Launching the EC2 instances for the cluster (count set by `NO_INSTANCES`)

Using the default Amazon Linux 2 AMI (HVM), SSD Volume Type (ami-0fc61db8544a617ed).
The user data script installs and configures the following:

 * The Amazon ECS client
 * The /dev/nvme1n1 device is partitioned, formatted (XFS) and mounted on /data
 * The user ec2-user is added to the docker group
 * The ECS service is configured and started
 
Accessing the instances over SSH requires an SSH key pair created in the EC2 console (see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html)

In [341]:
# Number of instances to launch
NO_INSTANCES = 1

# Instance type
INSTANCE_TYPE = 'm5d.16xlarge'

# If SSH key created you can added it to the instance for login
SSH_KEY = "cbb-research-dl-pk"

# Amazon image
IMAGE = "ami-0fc61db8544a617ed"

cmd = 'aws ec2 run-instances --image-id {} --count {} --instance-type {} '.format(IMAGE, NO_INSTANCES, INSTANCE_TYPE)
cmd += '--placement "AvailabilityZone={}" '.format(ZONE)
cmd += '--key-name {} '.format(SSH_KEY)
cmd += '--iam-instance-profile Name=ecsInstanceRole '
cmd += '--security-group-ids {} --subnet-id {} '.format(sg_id, subnet_id)
cmd += '--user-data file://{} '.format(user_datafile)
cmd += '--tag-specifications '

#Using defined tags in the instances
if TAGDICT:
    tags = ''
    for t in TAGDICT['Tags']:
        if tags:
            tags += ','
        tags += '{' + 'Key={0},Value={1}'.format(t['Key'], t['Value']) + '}'
    cmd += '\'ResourceType=instance,Tags=[{}]\' '.format(tags)
    cmd += '\'ResourceType=volume,Tags=[{}]\''.format(tags)

output = !{cmd}
try:
    instances = json.loads(''.join(output))    
except:
    print('ERROR: {}'.format(output))    

### Creating an AWS Batch job definition

In [330]:
# Load the job definition template and fill in the two project-specific fields:
# the definition name and the ARN of the role the job containers run with.
job_definitionfile = os.path.join(CONFIG, 'aws', 'transannot-job-definition.json')
job_definition = None
with open(job_definitionfile) as fin:
    job_definition = json.load(fin)

definition_name = '{}-transannotation-job-definition'.format(PROJECT)
job_definition['jobDefinitionName'] = definition_name
container_props = job_definition['containerProperties']
container_props['jobRoleArn'] = s3_role['Arn']

In [331]:
# Display the patched job definition for a visual check.
job_definition

{'jobDefinitionName': 'cbb-research-dl-transannotation-job-definition',
 'type': 'container',
 'parameters': {},
 'containerProperties': {'image': 'gcr.io/cbb-research-dl/transannot',
  'vcpus': 64,
  'memory': 252000,
  'command': ['/usr/envs/transannot/bin/aws-pipeline.sh'],
  'jobRoleArn': 'arn:aws:iam::250813660784:role/cbb-research-db-batch-role',
  'volumes': [{'host': {'sourcePath': '/data'}, 'name': 'data'}],
  'environment': [{'name': 'BLAST_S3_BUCKET_SAMPLE',
    'value': 'cbb-research-dl-blastdb'}],
  'mountPoints': [{'containerPath': '/data', 'sourceVolume': 'data'}],
  'privileged': True,
  'ulimits': [],
  'user': 'root'}}