In [1]:
import os
import logging
from io import BytesIO
import time
import zipfile
import numpy as np
import boto3
from tqdm import tqdm
from datetime import datetime, timezone
from time import gmtime, strftime
import json
import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [9]:
import math
import base64

In [3]:
# The difference between UTC and local timezone.
# The value is multiplied by 3600 and added to every epoch timestamp parsed from
# the CloudWatch query results further down.
# NOTE(review): presumably expressed in hours — confirm the sign convention.
timezone_offset = 0

### Function Name List

In [76]:
# Names of the functions that make up the application: "<prefix>_f1" .. "<prefix>_fN".
function_prefix = "TextProcessingApp"
function_count = 4
function_name_list = [
    f"{function_prefix}_f{position}"
    for position in range(1, function_count + 1)
]
print(function_name_list)

['TextProcessingApp_f1', 'TextProcessingApp_f2', 'TextProcessingApp_f3', 'TextProcessingApp_f4']


In [77]:
# Memory setting per function, keyed by the short function label; these values
# are passed as `mem=` when the simulated functions are built later on.
mem_config_list = dict(f1=256, f2=256, f3=256, f4=256)

# Execute the Application

In [6]:
# boto3 client for Step Functions; used below to start executions of the state machine.
sfn_client = boto3.client('stepfunctions')

In [11]:
# ARN of the state machine that every start_execution call below targets.
# NOTE(review): region and account id are hardcoded here — adjust before running elsewhere.
stateMachineArn='arn:aws:states:us-east-2:499537426559:stateMachine:TextProcessingApp'

## Test Run

In [14]:
# Smoke test: start one execution before the measured run. The execution input
# is a JSON document whose "input" field carries the base64-encoded sample text
# repeated 200 times. The call's response is the cell output.
sfn_client.start_execution(
    stateMachineArn=stateMachineArn,
    input = json.dumps({"input": base64.b64encode(str.encode("And like the baseless fabric of this vision, The cloud-capped towers, the gorgeous palaces, The solemn temples, the great globe itself— Yea, all which it inherit—shall dissolve, And like this insubstantial pageant faded, Leave not a rack behind. We are such stuff As dreams are made on, and our little life Is rounded with a sleep. Sir, I am vexed. Bear with my weakness. My old brain is troubled. Be not disturbed with my infirmity. If you be pleased, retire into my cell And there repose. A turn or two I’ll walk To still my beating mind."*200)).decode("utf-8")})
)

{'executionArn': 'arn:aws:states:us-east-2:499537426559:express:TextProcessingApp:5b56c799-406d-4188-ad7b-cc066c866d93:8102878b-8775-400c-bcb4-f16f75bca0df',
 'startDate': datetime.datetime(2022, 9, 29, 7, 29, 17, 695000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': 'e8a2846e-0868-4d6e-b821-c1436e40f791',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e8a2846e-0868-4d6e-b821-c1436e40f791',
   'date': 'Thu, 29 Sep 2022 07:29:17 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '185'},
  'RetryAttempts': 0}}

## Configure Logging

In [15]:
# Send INFO-level records to AppExecution.log. Each line starts with
# "<date> <time>.<milliseconds>", which a later cell parses back to find the
# start and end of the run.
logging.basicConfig(
    filename='AppExecution.log',
    encoding='utf-8',
    format='%(asctime)s.%(msecs)03d %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)

## Execute Sequence

In [7]:
# Seed NumPy's global random generator.
# NOTE(review): the visible cells that follow do not draw from the global
# generator (the simulation uses its own RandomState) — confirm this is still needed.
np.random.seed(256)

In [16]:
# The execution input is identical for every request, so encode it once instead
# of rebuilding the repeated text, its base64 form and the JSON on each iteration.
sample_text = "And like the baseless fabric of this vision, The cloud-capped towers, the gorgeous palaces, The solemn temples, the great globe itself— Yea, all which it inherit—shall dissolve, And like this insubstantial pageant faded, Leave not a rack behind. We are such stuff As dreams are made on, and our little life Is rounded with a sleep. Sir, I am vexed. Bear with my weakness. My old brain is troubled. Be not disturbed with my infirmity. If you be pleased, retire into my cell And there repose. A turn or two I’ll walk To still my beating mind."*200
execution_input = json.dumps({"input": base64.b64encode(str.encode(sample_text)).decode("utf-8")})

# Start 2000 executions, one every 10 seconds, and log
# "<sequence number> <HTTP status> <request id>" for each of them.
for i in range(2000):
    response = sfn_client.start_execution(stateMachineArn=stateMachineArn, input=execution_input)
    RequestId = response.get('ResponseMetadata', {}).get('RequestId')
    StatusCode = response.get('ResponseMetadata', {}).get('HTTPStatusCode', 'ERR')
    logging.info(f'{i+1} {StatusCode} {RequestId}')
    time.sleep(10)

## Get the start time and the end time

In [17]:
# Read the first and last log records directly rather than spawning `head` and
# `tail` through os.popen, which left the pipes unclosed and required a POSIX shell.
with open('AppExecution.log', encoding='utf-8') as log_file:
    log_records = [line for line in log_file if line.strip()]
if not log_records:
    raise ValueError('AppExecution.log contains no log records')

# The first two space-separated fields of a record are "<date> <time>.<msecs>".
app_exeuction_start_time = ' '.join(log_records[0].split(' ')[:2])
app_execution_end_time = ' '.join(log_records[-1].split(' ')[:2])
app_exeuction_start_time = datetime.strptime(app_exeuction_start_time, '%Y-%m-%d %H:%M:%S.%f')
app_execution_end_time = datetime.strptime(app_execution_end_time, '%Y-%m-%d %H:%M:%S.%f')

In [18]:
app_exeuction_start_time

datetime.datetime(2022, 9, 29, 7, 30, 38, 162000)

In [19]:
app_execution_end_time

datetime.datetime(2022, 9, 29, 13, 16, 48, 741000)

In [20]:
# Convert the parsed start datetime to whole epoch seconds; this value is later
# passed as `startTime` to the log queries.
# NOTE: the name is rebound from datetime to int, so re-running this cell on its
# own raises a TypeError.
app_exeuction_start_time = int(datetime.timestamp(app_exeuction_start_time))

In [21]:
# Convert the parsed end datetime to whole epoch seconds and extend it by 20 s;
# this value is later passed as `endTime` to the log queries.
# NOTE(review): the 20 s is presumably a margin so the last execution's log
# events fall inside the query window — confirm.
# NOTE: the name is rebound from datetime to int, so this cell is not re-runnable on its own.
app_execution_end_time = int(datetime.timestamp(app_execution_end_time)) + 20

# Retrieve Logs

In [22]:
# boto3 client for CloudWatch Logs; used below to start log queries and fetch their results.
logclient = boto3.client('logs')

## Query Step Functions Logs

In [56]:
# Submit a log query over the state machine's log group for the
# 'ExecutionStarted' and 'ExecutionSucceeded' events inside the run window.
# Only the query handle is returned here; the rows are fetched in a later cell
# through its 'queryId'.
# NOTE(review): the query sorts on `id`, which is not one of the selected
# fields — confirm it exists in these log events.
query_sfn_TextProcessing = logclient.start_query(
    logGroupName='/aws/vendedlogs/states/TextProcessingApp-Logs',
    queryString="fields type, @timestamp| filter type = 'ExecutionStarted' or type = 'ExecutionSucceeded' | sort id desc",
    startTime=app_exeuction_start_time,
    endTime=app_execution_end_time,
    limit = 10000
)

In [63]:
# Fetch the query results, polling until the query has finished. A single call
# made while the query is still 'Scheduled' or 'Running' returns partial or
# empty results, which would silently skew every statistic computed below.
query_results_sfn_TextProcessing = logclient.get_query_results(
    queryId=query_sfn_TextProcessing['queryId']
)
while query_results_sfn_TextProcessing['status'] in ('Scheduled', 'Running'):
    time.sleep(1)
    query_results_sfn_TextProcessing = logclient.get_query_results(
        queryId=query_sfn_TextProcessing['queryId']
    )
if query_results_sfn_TextProcessing['status'] != 'Complete':
    raise RuntimeError(
        'Step Functions log query ended with status '
        + str(query_results_sfn_TextProcessing['status'])
    )

In [64]:
def _sfn_event_epochs(event_type):
    """Return the sorted epoch timestamps of all query rows whose type equals `event_type`.

    Each row's first field is read as the event type and its second field as a
    '%Y-%m-%d %H:%M:%S.%f' timestamp; `timezone_offset` hours are added to the result.
    """
    epochs = []
    for row in query_results_sfn_TextProcessing['results']:
        if row[0]['value'] != event_type:
            continue
        parsed = datetime.strptime(row[1]['value'], '%Y-%m-%d %H:%M:%S.%f')
        epochs.append(datetime.timestamp(parsed)+timezone_offset*3600)
    return np.sort(epochs)


TextProcessing_starttimestamp = _sfn_event_epochs('ExecutionStarted')
TextProcessing_endtimestamp = _sfn_event_epochs('ExecutionSucceeded')

In [65]:
len(TextProcessing_starttimestamp)

2000

In [66]:
len(TextProcessing_endtimestamp)

2000

In [67]:
# Pair the i-th sorted start time with the i-th sorted end time and persist them.
# Both arrays must have the same length, otherwise the DataFrame constructor raises.
pd.DataFrame({'Start': TextProcessing_starttimestamp, 'End':TextProcessing_endtimestamp}).to_csv('TextProcessing_SFN_Logs.csv', index=False)


In [68]:
# Reload the Start/End table from the CSV written in the previous cell.
TextProcessing_sfn_logs = pd.read_csv('TextProcessing_SFN_Logs.csv', low_memory=False)

In [69]:
TextProcessing_sfn_logs.shape

(2000, 2)

## Query Lambda Function Logs

### Functions for parsing Logs

In [75]:
def lambda_report_log_to_dict(log):
    """Parse one REPORT row of a Lambda log query result into a flat dict.

    Args:
        log: A query result row. ``log[0]['value']`` is read as the event
            timestamp ('%Y-%m-%d %H:%M:%S.%f') and ``log[1]['value']`` as the
            tab-separated REPORT message, whose first five "label: value"
            fields are read by position.

    Returns:
        dict with keys RequestId, Duration, Billed_Duration, Memory_Size,
        Max_Memory_Used and UTC_Timestamp (epoch seconds plus
        ``timezone_offset`` hours).
    """
    fields = [item.split(': ') for item in log[1]['value'].split('\t')]
    logged_at = datetime.strptime(log[0]['value'], "%Y-%m-%d %H:%M:%S.%f")
    return {
        'RequestId': fields[0][1],
        # Numeric fields look like "10.17 ms" / "256 MB": keep only the number.
        'Duration': float(fields[1][1].split(' ')[0]),
        'Billed_Duration': int(fields[2][1].split(' ')[0]),
        'Memory_Size': int(fields[3][1].split(' ')[0]),
        'Max_Memory_Used': int(fields[4][1].split(' ')[0]),
        # datetime.timestamp keeps the fractional seconds that
        # time.mktime(...timetuple()) discarded, matching how the Step
        # Functions timestamps are parsed elsewhere in this notebook.
        'UTC_Timestamp': datetime.timestamp(logged_at) + timezone_offset*3600,
    }

### Prepare Logs

In [78]:
# Submit one REPORT-line query per Lambda log group, pausing 4 s between
# submissions, then wait a further 10 s before the results are fetched.
query_lambda = []
for function in function_name_list:
    started_query = logclient.start_query(
        logGroupName=f'/aws/lambda/{function}',
        queryString="fields @timestamp, @message| filter @message like 'REPORT'| sort @timestamp asc",
        startTime=app_exeuction_start_time,
        endTime=app_execution_end_time,
        limit=10000,
    )
    query_lambda.append(started_query)
    time.sleep(4)
time.sleep(10)

### Retrieve Logs

In [79]:
# Fetch the results of every Lambda log query, in submission order. Each query
# is polled until it has finished: a single call made while the query is still
# 'Scheduled' or 'Running' would return partial or empty results.
query_lambda_results = []
for q in query_lambda:
    lambda_query_result = logclient.get_query_results(queryId=q['queryId'])
    while lambda_query_result['status'] in ('Scheduled', 'Running'):
        time.sleep(1)
        lambda_query_result = logclient.get_query_results(queryId=q['queryId'])
    if lambda_query_result['status'] != 'Complete':
        raise RuntimeError(
            'Lambda log query ' + str(q['queryId'])
            + ' ended with status ' + str(lambda_query_result['status'])
        )
    query_lambda_results.append(lambda_query_result)
    # Keep the original pause between requests.
    time.sleep(4)

In [80]:
# Keep a pickled copy of the raw query responses on disk.
with open('query_lambda_results_sfn.pickle', 'wb') as f:
    pickle.dump(query_lambda_results, f)

In [82]:
# Parse every REPORT row of each function's query result and tag the resulting
# record with the short function label ('f1', 'f2', ...).
TextProcessing_lambda_logs_dict = {}
for i in range(1, function_count+1):
    function_label = 'f'+str(i)
    parsed_records = []
    for report_row in query_lambda_results[i-1]['results']:
        record = lambda_report_log_to_dict(report_row)
        record['Function'] = function_label
        parsed_records.append(record)
    TextProcessing_lambda_logs_dict[function_label] = parsed_records

In [83]:
len(TextProcessing_lambda_logs_dict['f1'])

2000

#### Convert Logs into DataFrame and Save as CSV

In [87]:
# Stack the per-function records into one frame with a fresh 0..n-1 index.
# A single pd.concat replaces the repeated DataFrame.append calls: `append` was
# removed in pandas 2.0 and re-copied the growing frame on every iteration.
TextProcessing_lambda_logs = pd.concat(
    [pd.DataFrame(TextProcessing_lambda_logs_dict['f'+str(i)]) for i in range(1, function_count+1)],
    ignore_index=True,
)
# Fix the column order before writing the CSV.
TextProcessing_lambda_logs = TextProcessing_lambda_logs[['Function', 'Memory_Size', 'Max_Memory_Used', 'Duration', 'Billed_Duration', 'UTC_Timestamp', 'RequestId']]
TextProcessing_lambda_logs.to_csv('TextProcessing_sfn_lambda_logs.csv',index=False)

In [88]:
# Reload the Lambda log table from the CSV written in the previous cell.
TextProcessing_lambda_logs = pd.read_csv('TextProcessing_sfn_lambda_logs.csv', low_memory=False)
# Column names are reassigned by position; the only change from the names
# written above is 'UTC_Timestamp' -> 'UTCTimestamp', which the later
# cost-window query relies on.
TextProcessing_lambda_logs.columns = ['Function', 'Memory_Size', 'Max_Memory_Used', 'Duration', 'Billed_Duration', 'UTCTimestamp', 'RequestId']

In [90]:
TextProcessing_lambda_logs.head()

Unnamed: 0,Function,Memory_Size,Max_Memory_Used,Duration,Billed_Duration,UTCTimestamp,RequestId
0,f1,256,39,10.17,11,1664437000.0,dcc992f8-3b00-415f-9c33-82539d39513c
1,f1,256,39,17.06,18,1664437000.0,b621bfc6-2e80-42b5-ab0b-d414b4d77e52
2,f1,256,39,15.44,16,1664437000.0,17530a70-7d10-41d5-8130-8868dc909e03
3,f1,256,39,4.87,5,1664437000.0,5473913f-938c-4391-b6cf-c1c479ac1788
4,f1,256,39,10.71,11,1664437000.0,97ceeab6-e674-4424-b6d9-80d0ecdc97b7


In [91]:
# For every function print: label, number of REPORT rows, mean Duration.
for i in range(1, function_count+1):
    rows_for_function = TextProcessing_lambda_logs.query(f"Function == 'f{i}'")
    print(f"f{i}", rows_for_function.shape[0], rows_for_function['Duration'].mean())

f1 2000 7.681019999999999
f2 2000 74.459995
f3 2000 55.15273
f4 2000 76.962785


In [92]:
def calculate_cost(rt: float, mem: float, pmms: float = 0.0000166667/1024/1000, ppi: float = 0.0000002) -> float:
    """Return the cost of one invocation.

    The duration `rt` is rounded up to a whole number, multiplied by the memory
    setting `mem` and by the unit price `pmms`; the flat per-invocation price
    `ppi` is then added.
    """
    billed_units = math.ceil(rt)
    duration_charge = billed_units * mem * pmms
    return duration_charge + ppi

def adjacent_values(vals, q1, q3):
    """Return the (lower, upper) adjacent values used for box-plot whiskers.

    Each whisker reaches 1.5 times the inter-quartile range beyond its quartile.
    The lower one is clipped to [vals[0], q1] and the upper one to
    [q3, vals[-1]], so `vals` is expected to be sorted in ascending order.
    """
    whisker_reach = (q3 - q1) * 1.5
    lower_adjacent_value = np.clip(q1 - whisker_reach, vals[0], q1)
    upper_adjacent_value = np.clip(q3 + whisker_reach, q3, vals[-1])
    return lower_adjacent_value, upper_adjacent_value

# End-to-end RT Reported by AWS

In [106]:
# End-to-end duration of each execution: (End - Start) multiplied by 1000,
# stored in a single 'Duration' column, written to CSV and read back.
elapsed = TextProcessing_sfn_logs['End'] - TextProcessing_sfn_logs['Start']
TextProcessing_duration = (elapsed*1000).to_frame(name='Duration')
TextProcessing_duration.to_csv('TextProcessing_duration_aws.csv', index=False)
TextProcessing_duration = pd.read_csv('TextProcessing_duration_aws.csv', low_memory=False)

In [107]:
# Summary statistics of the measured end-to-end durations.
measured_durations = TextProcessing_duration['Duration']
print('Number of Executions: ', len(measured_durations))

TextProcessing_avg_duration_aws = np.mean(measured_durations)
TextProcessing_mid_duration_aws = np.median(measured_durations)
TextProcessing_percentile10_aws = np.percentile(measured_durations, 10)
TextProcessing_percentile90_aws = np.percentile(measured_durations, 90)

print('Average Duration Reported by AWS: ', TextProcessing_avg_duration_aws, 'ms')
print('Median Duration Reported by AWS: ', TextProcessing_mid_duration_aws, 'ms')
print('10-th percentile of Duration Reported by AWS: ', TextProcessing_percentile10_aws, 'ms')
print('90-th percentile Duration Reported by AWS: ', TextProcessing_percentile90_aws, 'ms')
print('Standard Deviation of Duration Reported by AWS: ', np.std(measured_durations), 'ms')

Number of Executions:  2000
Average Duration Reported by AWS:  445.60450077056885 ms
Median Duration Reported by AWS:  417.9999828338623 ms
10-th percentile of Duration Reported by AWS:  358.0000400543213 ms
90-th percentile Duration Reported by AWS:  581.100082397461 ms
Standard Deviation of Duration Reported by AWS:  98.17248028092828 ms


# Cost Reported by AWS

In [93]:
TextProcessing_sfn_logs.head()

Unnamed: 0,Start,End
0,1664437000.0,1664437000.0
1,1664437000.0,1664437000.0
2,1664437000.0,1664437000.0
3,1664437000.0,1664437000.0
4,1664437000.0,1664437000.0


In [94]:
TextProcessing_lambda_logs.head()

Unnamed: 0,Function,Memory_Size,Max_Memory_Used,Duration,Billed_Duration,UTCTimestamp,RequestId
0,f1,256,39,10.17,11,1664437000.0,dcc992f8-3b00-415f-9c33-82539d39513c
1,f1,256,39,17.06,18,1664437000.0,b621bfc6-2e80-42b5-ab0b-d414b4d77e52
2,f1,256,39,15.44,16,1664437000.0,17530a70-7d10-41d5-8130-8868dc909e03
3,f1,256,39,4.87,5,1664437000.0,5473913f-938c-4391-b6cf-c1c479ac1788
4,f1,256,39,10.71,11,1664437000.0,97ceeab6-e674-4424-b6d9-80d0ecdc97b7


In [95]:
# Cost of each execution: add up the cost of every Lambda REPORT row whose
# timestamp lies strictly inside the execution's padded time window
# (Start - 2.5 - 4, End + 2.5 + 4). Each row's cost is multiplied by 1,000,000.
cost_list = []
for _, execution in TextProcessing_sfn_logs.iterrows():
    app_start = execution['Start'] - 2.5
    app_end = execution['End'] + 2.5
    after_window_start = TextProcessing_lambda_logs['UTCTimestamp'] > app_start-4
    before_window_end = TextProcessing_lambda_logs['UTCTimestamp'] < app_end+4
    lambda_logs = TextProcessing_lambda_logs[after_window_start & before_window_end]

    cost = 0
    for _, report in lambda_logs.iterrows():
        cost += calculate_cost(rt=report['Billed_Duration'], mem=report['Memory_Size']) * 1000000
    cost_list.append(cost)

In [96]:
# Summary statistics of the per-execution cost.
# Every entry of cost_list was multiplied by 1,000,000 when it was accumulated,
# so the figures are per one million executions; the labels now say so instead
# of the bare (and misleading) 'USD'.
TextProcessing_avg_cost_aws = np.mean(cost_list)
TextProcessing_mid_cost_aws = np.median(cost_list)
TextProcessing_percentile10_cost_aws = np.percentile(cost_list, 10)
TextProcessing_percentile90_cost_aws = np.percentile(cost_list, 90)
TextProcessing_std_cost_aws = np.std(cost_list)
print('Average Cost Reported by AWS: ', TextProcessing_avg_cost_aws, 'USD per 1M executions')
print('Median Cost Reported by AWS: ', TextProcessing_mid_cost_aws, 'USD per 1M executions')
print('10-th percentile of Cost Reported by AWS: ', TextProcessing_percentile10_cost_aws, 'USD per 1M executions')
print('90-th percentile Cost Reported by AWS: ', TextProcessing_percentile90_cost_aws, 'USD per 1M executions')
print('Standard Deviation of Cost Reported by AWS: ', TextProcessing_std_cost_aws, 'USD per 1M executions')

Average Cost Reported by AWS:  1.7011518022999998 USD
Median Cost Reported by AWS:  1.704168475 USD
10-th percentile of Cost Reported by AWS:  1.6208349750000002 USD
90-th percentile Cost Reported by AWS:  1.7791686249999998 USD
Standard Deviation of Cost Reported by AWS:  0.06229406601791994 USD


# End-to-end RT and Cost Derived from the Modeling Algorithm

In [97]:
import sys
sys.path.append('../../')
from slappsim.Structures import *
from slappsim.Function import *
from slappsim.PetriApp import *
from slappsim.States import *

In [187]:
# Inputs of the model: per-function profiling logs plus two arrays of delay
# samples, each read from its own CSV file.
profile_columns = ['Function', 'Memory_Size', 'Max_Memory_Used', 'Duration', 'Billed_Duration',
                   'UTCTimestamp', 'RequestId']
TextProcessing_lambda_profile_logs = pd.read_csv('TextProcessing_lambda_logs.csv', low_memory=False)
TextProcessing_lambda_profile_logs.columns = profile_columns

scheduling_overhead = np.array(
    pd.read_csv('Scheduling_Overhead.csv')['scheduling_overhead'].to_list()
)
function_execution_delay = np.array(
    pd.read_csv('Function_Execution_Delay.csv')['Duration'].to_list()
)

In [188]:
# Dedicated NumPy generator; its `choice` method supplies all sampling in the
# application definition below.
rs = np.random.RandomState(64)
# NOTE(review): `random` is not imported in any visible cell — presumably it is
# brought in by the `from slappsim... import *` lines; confirm which module this
# is and consider an explicit import.
random.seed(64)

## Define the application

In [189]:
# Build the model of the application: four functions executed in sequence,
# wired between a start place and an end place.
# NOTE(review): `partial` is not imported in any visible cell — presumably it
# comes from the slappsim star imports; confirm.

# Each function draws its duration from the profiled 'Duration' values of that
# function, using list positions 500 through 4500 only.
# NOTE(review): skipping the first 500 samples is presumably a warm-up cut — confirm.
f1_rt = np.array(TextProcessing_lambda_profile_logs.query(f"Function=='f1'")['Duration'].to_list()[500:4501])
f1_pp_fun = partial(rs.choice, a=f1_rt)
f1 = Function(pf_fun=f1_pp_fun, mem=mem_config_list['f1'], name='f1')
f2_rt = np.array(TextProcessing_lambda_profile_logs.query(f"Function=='f2'")['Duration'].to_list()[500:4501])
f2_pp_fun = partial(rs.choice, a=f2_rt)
f2 = Function(pf_fun=f2_pp_fun, mem=mem_config_list['f2'], name='f2')
f3_rt = np.array(TextProcessing_lambda_profile_logs.query(f"Function=='f3'")['Duration'].to_list()[500:4501])
f3_pp_fun = partial(rs.choice, a=f3_rt)
f3 = Function(pf_fun=f3_pp_fun, mem=mem_config_list['f3'], name='f3')
f4_rt = np.array(TextProcessing_lambda_profile_logs.query(f"Function=='f4'")['Duration'].to_list()[500:4501])
f4_pp_fun = partial(rs.choice, a=f4_rt)
f4 = Function(pf_fun=f4_pp_fun, mem=mem_config_list['f4'], name='f4')
# Delay samplers, each drawing from one of the arrays loaded from CSV above.
sfn_scheduling_overhead_fun = partial(rs.choice, a=scheduling_overhead)
function_execution_delay_fun = partial(rs.choice, a=function_execution_delay)
delays = {'FunctionExecution': function_execution_delay_fun, 'SchedulingOverhead': sfn_scheduling_overhead_fun}
start = Start()
end = End()
sequence1 = Sequence(actions=[f1, f2, f3, f4])
structures = [sequence1]
# Transition t1 connects the start place to the sequence's entry place.
i1 = InArc(place=start)
o1 = OutArc(place=sequence1.structure_start)
t1 = Transition(in_arcs=[i1], out_arcs=[o1])
# Transition t2 connects the sequence's exit place to the end place.
i2 = InArc(place=sequence1.structure_end)
o2 = OutArc(place=end)
t2 = Transition(in_arcs=[i2], out_arcs=[o2])
# The application gets the two connecting transitions plus the sequence's own.
transitions = [t1, t2]
transitions += sequence1.transitions
TextProcessingApp = PetriApp(transitions=transitions,
                  functions=[f1, f2, f3, f4],
                  structures=structures,
                  delays=delays)

## Run the modeling algorithm

In [198]:
# Run the model 100000 times, collecting the end-to-end response time and the
# cost of each run; the application is reset after every run.
ert = []
ec = []
for i in range(100000):
    rt, c, s, logs = TextProcessingApp.execute()
    ert.append(rt)
    # Cost is scaled by 1,000,000, matching the scaling of the measured costs.
    ec.append(c * 1000000)
    TextProcessingApp.reset()
TextProcessingApp_avg_cost_mdl = np.mean(ec)
TextProcessingApp_mid_cost_mdl = np.median(ec)
TextProcessingApp_percentile10_cost_mdl = np.percentile(ec, 10)
TextProcessingApp_percentile90_cost_mdl = np.percentile(ec, 90)
TextProcessingApp_std_cost_mdl = np.std(ec)
TextProcessingApp_avg_ert_mdl = np.mean(ert)
TextProcessingApp_mid_ert_mdl = np.median(ert)
TextProcessingApp_percentile10_ert_mdl = np.percentile(ert, 10)
TextProcessingApp_percentile90_ert_mdl = np.percentile(ert, 90)
TextProcessingApp_std_ert_mdl = np.std(ert)
print('Average Duration Reported by Algorithm: ', TextProcessingApp_avg_ert_mdl, 'ms')
print('Median Duration Reported by Algorithm: ', TextProcessingApp_mid_ert_mdl, 'ms')
print('10-th percentile of Duration Reported by Algorithm: ', TextProcessingApp_percentile10_ert_mdl, 'ms')
print('90-th percentile Duration Reported by Algorithm: ', TextProcessingApp_percentile90_ert_mdl, 'ms')
print('Standard Deviation of Duration Reported by Algorithm: ', TextProcessingApp_std_ert_mdl, 'ms')
# The cost figures carry the 1,000,000 scaling applied above, so they are
# labelled per one million executions rather than with a bare 'USD'.
print('Average Cost Reported by Algorithm: ', TextProcessingApp_avg_cost_mdl, 'USD per 1M executions')
print('Median Cost Reported by Algorithm: ', TextProcessingApp_mid_cost_mdl, 'USD per 1M executions')
print('10-th percentile of Cost Reported by Algorithm: ', TextProcessingApp_percentile10_cost_mdl, 'USD per 1M executions')
print('90-th percentile Cost Reported by Algorithm: ', TextProcessingApp_percentile90_cost_mdl, 'USD per 1M executions')
print('Standard Deviation of Cost Reported by Algorithm: ', TextProcessingApp_std_cost_mdl, 'USD per 1M executions')

Average Duration Reported by Algorithm:  445.49864957348854 ms
Median Duration Reported by Algorithm:  430.2648741149902 ms
10-th percentile of Duration Reported by Algorithm:  359.409997138977 ms
90-th percentile Duration Reported by Algorithm:  549.6712565383912 ms
Standard Deviation of Duration Reported by Algorithm:  82.782754399849 ms
Average Cost Reported by Algorithm:  1.7001918003799998 USD
Median Cost Reported by Algorithm:  1.704168475 USD
10-th percentile of Cost Reported by Algorithm:  1.6166683 USD
90-th percentile Cost Reported by Algorithm:  1.7791686249999998 USD
Standard Deviation of Cost Reported by Algorithm:  0.06581033333715856 USD


In [199]:
# Persist the measured costs and the simulated durations and costs, each as a
# single-column CSV file.
TextProcessingApp_cost_aws = pd.DataFrame({'Cost': cost_list})
TextProcessingApp_duration_model = pd.DataFrame({'Duration': ert})
TextProcessingApp_cost_model = pd.DataFrame({'Cost': ec})

TextProcessingApp_cost_aws.to_csv('TextProcessingApp_cost_aws.csv', index=False)
TextProcessingApp_duration_model.to_csv('TextProcessingApp_duration_model.csv', index=False)
TextProcessingApp_cost_model.to_csv('TextProcessingApp_cost_model.csv', index=False)