In [1]:
import os
import logging
from io import BytesIO
import time
import zipfile
import numpy as np
import boto3
from datetime import datetime, timezone
from time import gmtime, strftime
import json
import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [53]:
import math

In [2]:
# Difference between UTC and the local timezone, expressed in hours: later cells
# multiply it by 3600 and add it to the epoch-second timestamps parsed from log records.
timezone_offset = 0

### Function Name List

In [27]:
function_prefix = "Structures"
function_count = 4
# Function names follow the pattern "<prefix>_f<index>", with indices starting at 1.
function_name_list = [
    f"{function_prefix}_f{index}"
    for index in range(1, function_count + 1)
]
print(function_name_list)

['Structures_f1', 'Structures_f2', 'Structures_f3', 'Structures_f4']


In [28]:
# Memory setting of each function, keyed by its short name ('f1'..'f4').
# These values are passed as `mem` when the simulated functions are built further down.
mem_config_list = dict(f1=1280, f2=896, f3=1536, f4=1088)

# Execute the Application

In [3]:
# Step Functions client used to start executions of the state machine.
sfn_client = boto3.client('stepfunctions')

In [4]:
# ARN of the state machine to execute.
# NOTE(review): account id and region are hard-coded; change this to run against another deployment.
stateMachineArn='arn:aws:states:us-east-2:499537426559:stateMachine:Sequence'

The serverless application workflow can be found in README.md.

## Test Run

In [5]:
# Single smoke-test execution; the response is shown as the cell output.
sfn_client.start_execution(
    stateMachineArn=stateMachineArn
)

{'executionArn': 'arn:aws:states:us-east-2:499537426559:express:Sequence:6c20490c-5889-4083-a7f9-b837f7c17705:ef3627cb-93f2-4f03-b658-79aa9a306881',
 'startDate': datetime.datetime(2022, 1, 1, 21, 43, 11, 951000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': '84baa58a-23b4-4bbc-8fe4-f16856782a56',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '84baa58a-23b4-4bbc-8fe4-f16856782a56',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '176'},
  'RetryAttempts': 0}}

## Configure Logging

In [6]:
# Each record is written as "<date> <time>.<msec> <message>". A later cell reads the first and
# last lines of AppExecution.log and parses the first two space-separated tokens as the
# timestamp, so this format must stay in sync with that parsing.
logging.basicConfig(filename='AppExecution.log', encoding='utf-8', format='%(asctime)s.%(msecs)03d %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO)

## Execute Sequence

In [7]:
# Seed NumPy's global random generator.
# NOTE(review): no visible cell draws from the global generator (the simulation uses its own
# RandomState) — confirm whether this seed is still needed.
np.random.seed(256)

In [8]:
# Start 5000 executions, one every 10 seconds, and log "<run number> <HTTP status> <request id>"
# for each so the overall time window can be recovered from AppExecution.log afterwards.
for run_number in range(1, 5001):
    execution = sfn_client.start_execution(stateMachineArn=stateMachineArn)
    metadata = execution.get('ResponseMetadata', {})
    http_status = metadata.get('HTTPStatusCode', 'ERR')
    request_id = metadata.get('RequestId')
    logging.info(f'{run_number} {http_status} {request_id}')
    time.sleep(10)

## Get the start time and the end time

In [9]:
# Recover the wall-clock window of the experiment from the first and last records of the
# execution log. The file is read directly in Python (instead of shelling out to head/tail)
# so the cell is portable and leaves no open pipes behind.
# Each record starts with "<date> <time>.<msec>", i.e. its first two space-separated tokens.
# The misspelled name `app_exeuction_start_time` is kept because later cells refer to it.
with open('AppExecution.log', encoding='utf-8') as execution_log:
    execution_log_records = [line for line in execution_log if line.strip()]
app_exeuction_start_time = datetime.strptime(
    ' '.join(execution_log_records[0].split(' ')[:2]), '%Y-%m-%d %H:%M:%S.%f')
app_execution_end_time = datetime.strptime(
    ' '.join(execution_log_records[-1].split(' ')[:2]), '%Y-%m-%d %H:%M:%S.%f')

In [10]:
app_exeuction_start_time

datetime.datetime(2022, 1, 1, 21, 44, 2, 976000)

In [11]:
app_execution_end_time

datetime.datetime(2022, 1, 2, 11, 49, 54, 885000)

In [12]:
# Convert the start datetime to whole epoch seconds for use as a query startTime.
# The name is rebound from datetime to int, so this cell cannot be re-run on its own.
app_exeuction_start_time = int(datetime.timestamp(app_exeuction_start_time))

In [13]:
# Convert the end datetime to whole epoch seconds and pad it by 20 seconds so records emitted
# after the last logged start are still inside the query window.
# The name is rebound from datetime to int, so this cell cannot be re-run on its own.
app_execution_end_time = int(datetime.timestamp(app_execution_end_time)) + 20

# Retrieve Logs

In [14]:
# CloudWatch Logs client used for the log queries below.
logclient = boto3.client('logs')

## Query Step Functions Logs

In [15]:
# Start a log query over the state machine's log group, restricted to the experiment window,
# keeping only the ExecutionStarted / ExecutionSucceeded events (at most 10000 rows).
query_sfn_Sequence = logclient.start_query(
    logGroupName='/aws/vendedlogs/states/Sequence-Logs',
    startTime=app_exeuction_start_time,
    endTime=app_execution_end_time,
    queryString="fields type, @timestamp| filter type = 'ExecutionStarted' or type = 'ExecutionSucceeded' | sort id desc",
    limit=10000,
)

In [16]:
# The query runs asynchronously: fetching the results once may return a partial (or empty)
# result set while the query is still 'Scheduled' or 'Running'. Poll until it reaches a
# terminal status and fail loudly if that status is not 'Complete'.
while True:
    query_results_sfn_Sequence = logclient.get_query_results(
        queryId=query_sfn_Sequence['queryId']
    )
    if query_results_sfn_Sequence['status'] not in ('Scheduled', 'Running'):
        break
    time.sleep(2)
if query_results_sfn_Sequence['status'] != 'Complete':
    raise RuntimeError(
        "Step Functions log query ended with status " + query_results_sfn_Sequence['status']
    )

In [17]:
# For each event type, parse the @timestamp column (second field of every row) into epoch
# seconds, apply the timezone offset, and sort ascending. Starts and ends are later paired
# by position, which relies on both arrays having one entry per execution.
Sequence_starttimestamp, Sequence_endtimestamp = (
    np.sort([
        datetime.timestamp(datetime.strptime(record[1]['value'], '%Y-%m-%d %H:%M:%S.%f')) + timezone_offset*3600
        for record in query_results_sfn_Sequence['results']
        if record[0]['value'] == event_type
    ])
    for event_type in ('ExecutionStarted', 'ExecutionSucceeded')
)

In [18]:
# Persist the paired start/end timestamps so later cells can work from the CSV
# instead of re-querying the logs. Both arrays must have the same length.
pd.DataFrame({'Start': Sequence_starttimestamp, 'End':Sequence_endtimestamp}).to_csv('Sequence_SFN_Logs.csv', index=False)


In [19]:
# Reload the saved start/end timestamps (columns 'Start' and 'End', epoch seconds).
Sequence_sfn_logs = pd.read_csv('Sequence_SFN_Logs.csv', low_memory=False)

In [20]:
Sequence_sfn_logs.shape

(5000, 2)

## Query Lambda Function Logs

### Functions for parsing Logs

In [29]:
def lambda_report_log_to_dict(log):
    """Convert one query-result row holding a Lambda REPORT line into a flat dict.

    Parameters
    ----------
    log : sequence of dict
        One result row. ``log[0]['value']`` is the timestamp string in
        ``'%Y-%m-%d %H:%M:%S.%f'`` format and ``log[1]['value']`` is the
        tab-separated REPORT message whose first five fields are, in order,
        request id, duration, billed duration, memory size and max memory used,
        each written as ``"<label>: <value>[ <unit>]"``.

    Returns
    -------
    dict
        Keys ``RequestId`` (str), ``Duration`` (float), ``Billed_Duration``,
        ``Memory_Size``, ``Max_Memory_Used`` (int) and ``UTC_Timestamp``
        (float epoch seconds, shifted by ``timezone_offset`` hours).
    """
    fields = [part.split(': ') for part in log[1]['value'].split('\t')]

    def numeric_token(field):
        # "<number> <unit>" -> "<number>"
        return field[1].split(' ')[0]

    # datetime.timestamp keeps the sub-second part of the log timestamp (the previous
    # time.mktime(...timetuple()) conversion truncated it to whole seconds), which matches
    # how the Step Functions timestamps are parsed elsewhere in this notebook.
    logged_at = datetime.strptime(log[0]['value'], "%Y-%m-%d %H:%M:%S.%f")
    return {
        'RequestId': fields[0][1],
        'Duration': float(numeric_token(fields[1])),
        'Billed_Duration': int(numeric_token(fields[2])),
        'Memory_Size': int(numeric_token(fields[3])),
        'Max_Memory_Used': int(numeric_token(fields[4])),
        'UTC_Timestamp': datetime.timestamp(logged_at) + timezone_offset*3600,
    }

### Prepare Logs

In [31]:
# Start one log query per Lambda function for its REPORT lines inside the experiment window.
# Query starts are spaced 4 seconds apart, followed by a 10-second pause before the results
# are fetched in the next cell.
query_lambda = []
for function_name in function_name_list:
    pending_query = logclient.start_query(
        logGroupName=f'/aws/lambda/{function_name}',
        queryString="fields @timestamp, @message| filter @message like 'REPORT'| sort @timestamp asc",
        startTime=app_exeuction_start_time,
        endTime=app_execution_end_time,
        limit=10000,
    )
    query_lambda.append(pending_query)
    time.sleep(4)
time.sleep(10)

### Retrieve Logs

In [32]:
# Collect the results of every Lambda log query, in the same order as `query_lambda`.
# Queries run asynchronously, so each one is polled until it leaves the 'Scheduled'/'Running'
# states instead of relying on a fixed sleep; any terminal status other than 'Complete'
# aborts the cell rather than silently storing partial data.
query_lambda_results = []
for q in query_lambda:
    while True:
        lambda_query_result = logclient.get_query_results(
            queryId=q['queryId']
        )
        if lambda_query_result['status'] not in ('Scheduled', 'Running'):
            break
        time.sleep(4)
    if lambda_query_result['status'] != 'Complete':
        raise RuntimeError(
            "Lambda log query " + q['queryId'] + " ended with status " + lambda_query_result['status']
        )
    query_lambda_results.append(lambda_query_result)

In [33]:
# Keep a raw copy of the query responses on disk so they can be re-parsed without re-querying.
with open('query_lambda_results.pickle', 'wb') as pickle_file:
    pickle.dump(query_lambda_results, pickle_file)

In [34]:
# Parse every REPORT row of each function's query result and tag the resulting record with
# the short function name ('f1', 'f2', ...). Result i-1 belongs to function f<i>.
Sequence_lambda_logs_dict = {
    f'f{i}': [
        {**lambda_report_log_to_dict(item), 'Function': f'f{i}'}
        for item in query_lambda_results[i-1]['results']
    ]
    for i in range(1, function_count+1)
}

In [35]:
len(Sequence_lambda_logs_dict['f1'])

5000

#### Convert Logs into DataFrame and Save as CSV

In [36]:
# Stack the per-function records into one frame with a fresh 0..n-1 index, fix the column
# order, and save it. A single pd.concat replaces the repeated DataFrame.append calls:
# DataFrame.append is deprecated (removed in pandas 2.0) and copies the frame on every call.
Sequence_lambda_logs = pd.concat(
    [pd.DataFrame(Sequence_lambda_logs_dict['f'+str(i)]) for i in range(1, function_count+1)],
    ignore_index=True,
)
Sequence_lambda_logs = Sequence_lambda_logs[['Function', 'Memory_Size', 'Max_Memory_Used', 'Duration', 'Billed_Duration', 'UTC_Timestamp', 'RequestId']]
Sequence_lambda_logs.to_csv('Sequence_lambda_logs.csv', index=False)

In [37]:
# Reload the saved Lambda logs and rename the columns positionally; the only change from the
# saved header is 'UTC_Timestamp' -> 'UTCTimestamp', the name used by the later query strings.
Sequence_lambda_logs = pd.read_csv('Sequence_lambda_logs.csv', low_memory=False)
Sequence_lambda_logs.columns = ['Function', 'Memory_Size', 'Max_Memory_Used', 'Duration', 'Billed_Duration', 'UTCTimestamp', 'RequestId']

In [43]:
Sequence_lambda_logs.head()

Unnamed: 0,Function,Memory_Size,Max_Memory_Used,Duration,Billed_Duration,UTCTimestamp,RequestId
0,f1,1280,37,268.28,269,1641073000.0,9d490a02-f6d9-4b66-860b-fdeb99af4676
1,f1,1280,37,272.49,273,1641073000.0,a6b28794-6b7c-48ce-8b2b-ba3a2830183c
2,f1,1280,37,276.59,277,1641073000.0,fe5c2eb6-f740-4ac1-9f5d-4deabd741ad6
3,f1,1280,37,272.7,273,1641073000.0,91e47349-5d23-440c-a2b6-0e450461b6a1
4,f1,1280,37,277.07,278,1641073000.0,1503ffea-41ba-4d16-b549-0471571fab5c


In [44]:
# Print, for each function, the number of REPORT records and the mean Duration.
for i in range(1, function_count+1):
    function_rows = Sequence_lambda_logs.query(f"Function == 'f{i}'")
    record_count = function_rows.shape[0]
    mean_duration = function_rows['Duration'].mean()
    print(f"f{i}", record_count, mean_duration)

f1 5000 276.184512
f2 5000 207.657308
f3 5000 695.08304
f4 5000 297.17729199999997


In [122]:
def calculate_cost(rt: float, mem: float, pmms: float = 0.0000166667/1024/1000, ppi: float = 0.0000002) -> float:
    """Return the charge for one invocation.

    The run time `rt` is rounded up to the next whole unit, multiplied by the memory
    size `mem` and the unit price `pmms`, and the fixed per-invocation fee `ppi` is added.
    NOTE(review): the defaults look like a price per MB-millisecond and a per-request
    price in USD — confirm against the pricing the experiment assumes.
    """
    billed_units = math.ceil(rt)
    compute_charge = billed_units * mem * pmms
    return compute_charge + ppi

def adjacent_values(vals, q1, q3):
    """Return the (lower, upper) whisker ends for a sorted sample.

    Each end lies 1.5 interquartile ranges beyond its quartile, clipped so the upper
    end stays within [q3, vals[-1]] and the lower end within [vals[0], q1].
    `vals` is expected to be sorted ascending, since only its first and last items are read.
    """
    whisker_reach = (q3 - q1) * 1.5
    upper_end = np.clip(q3 + whisker_reach, q3, vals[-1])
    lower_end = np.clip(q1 - whisker_reach, vals[0], q1)
    return lower_end, upper_end

# End-to-end RT Reported by AWS

In [185]:
# End-to-end duration of every execution: (End - Start) seconds scaled by 1000, stored in a
# single 'Duration' column, written to CSV and read back from it.
elapsed_scaled = (Sequence_sfn_logs['End'] - Sequence_sfn_logs['Start']) * 1000
Sequence_duration = elapsed_scaled.to_frame(name='Duration')
Sequence_duration.to_csv('Sequence_duration_aws.csv', index=False)
Sequence_duration = pd.read_csv('Sequence_duration_aws.csv', low_memory=False)

In [186]:
# Summary statistics of the measured end-to-end durations. The mean, median and the
# 10th/90th percentiles are kept in module-level names; the standard deviation is only
# printed (np.std is called with its default arguments).
print('Number of Executions: ', len(Sequence_duration['Duration']))
Sequence_avg_duration_aws = np.mean(Sequence_duration['Duration'])
Sequence_mid_duration_aws = np.median(Sequence_duration['Duration'])
Sequence_percentile10_aws = np.percentile(Sequence_duration['Duration'], 10)
Sequence_percentile90_aws = np.percentile(Sequence_duration['Duration'], 90)
print('Average Duration Reported by AWS: ', Sequence_avg_duration_aws, 'ms')
print('Median Duration Reported by AWS: ', Sequence_mid_duration_aws, 'ms')
print('10-th percentile of Duration Reported by AWS: ', Sequence_percentile10_aws, 'ms')
print('90-th percentile Duration Reported by AWS: ', Sequence_percentile90_aws, 'ms')
print('Standard Deviation of Duration Reported by AWS: ', np.std(Sequence_duration['Duration']), 'ms')

Number of Executions:  5000
Average Duration Reported by AWS:  1563.7534010410309 ms
Median Duration Reported by AWS:  1569.999933242798 ms
10-th percentile of Duration Reported by AWS:  1462.9998207092283 ms
90-th percentile Duration Reported by AWS:  1658.9999437332153 ms
Standard Deviation of Duration Reported by AWS:  84.33829285086627 ms


# Cost Reported by AWS

In [126]:
Sequence_sfn_logs.head()

Unnamed: 0,Start,End
0,1641073000.0,1641073000.0
1,1641073000.0,1641073000.0
2,1641073000.0,1641073000.0
3,1641073000.0,1641073000.0
4,1641073000.0,1641073000.0


In [127]:
Sequence_lambda_logs.head()

Unnamed: 0,Function,Memory_Size,Max_Memory_Used,Duration,Billed_Duration,UTCTimestamp,RequestId
0,f1,1280,37,268.28,269,1641073000.0,9d490a02-f6d9-4b66-860b-fdeb99af4676
1,f1,1280,37,272.49,273,1641073000.0,a6b28794-6b7c-48ce-8b2b-ba3a2830183c
2,f1,1280,37,276.59,277,1641073000.0,fe5c2eb6-f740-4ac1-9f5d-4deabd741ad6
3,f1,1280,37,272.7,273,1641073000.0,91e47349-5d23-440c-a2b6-0e450461b6a1
4,f1,1280,37,277.07,278,1641073000.0,1503ffea-41ba-4d16-b549-0471571fab5c


In [165]:
# Cost of every execution, reconstructed from the Lambda REPORT records.
# Records are attributed to an execution purely by time: every Lambda record whose timestamp
# falls strictly between (Start - 2.5 - 4) and (End + 2.5 + 4) seconds is counted, regardless
# of function. NOTE(review): this assumes consecutive executions are far enough apart that
# their windows do not capture each other's records — confirm against the start interval.
cost_list = []
for index, row in Sequence_sfn_logs.iterrows():
    cost = 0
    app_start = row['Start'] - 2.5
    app_end = row['End'] + 2.5
    lambda_logs = Sequence_lambda_logs.query(f"""UTCTimestamp>{app_start-4} and UTCTimestamp<{app_end+4}""")
    for i, r in lambda_logs.iterrows():
        memory_size = r['Memory_Size']
        billed_duration = r['Billed_Duration']
        # Each invocation's cost is scaled by 1,000,000 before being accumulated.
        cost += calculate_cost(rt=billed_duration, mem=memory_size) * 1000000
    cost_list.append(cost)

In [166]:
# Summary statistics of the per-execution cost derived from the AWS logs.
# Every entry of cost_list was multiplied by 1,000,000 when it was accumulated, so the
# figures are in millionths of a USD; the labels say "micro-USD" instead of the previous,
# misleading "USD".
Sequence_avg_cost_aws = np.mean(cost_list)
Sequence_mid_cost_aws = np.median(cost_list)
Sequence_percentile10_cost_aws = np.percentile(cost_list, 10)
Sequence_percentile90_cost_aws = np.percentile(cost_list, 90)
Sequence_std_cost_aws = np.std(cost_list)
print('Average Cost Reported by AWS: ', Sequence_avg_cost_aws, 'micro-USD')
print('Median Cost Reported by AWS: ', Sequence_mid_cost_aws, 'micro-USD')
print('10-th percentile of Cost Reported by AWS: ', Sequence_percentile10_cost_aws, 'micro-USD')
print('90-th percentile Cost Reported by AWS: ', Sequence_percentile90_cost_aws, 'micro-USD')
print('Standard Deviation of Cost Reported by AWS: ', Sequence_std_cost_aws, 'micro-USD')

Average Cost Reported by AWS:  32.2608600049275 USD
Median Cost Reported by AWS:  32.547459328125 USD
10-th percentile of Cost Reported by AWS:  30.002975072500004 USD
90-th percentile Cost Reported by AWS:  34.588921744375 USD
Standard Deviation of Cost Reported by AWS:  1.8841740013152626 USD


# End-to-end RT and Cost Derived from the Modeling Algorithm

In [152]:
import sys
sys.path.append('../../')
from slappsim.Structures import *
from slappsim.Function import *
from slappsim.PetriApp import *
from slappsim.States import *

In [153]:
# Inputs of the modeling algorithm: per-function Lambda logs from the "structures" experiment
# plus two delay samples from the "sfn-delay" experiment, each loaded as a NumPy array.
Structures_lambda_logs = pd.read_csv('../structures/Structures_lambda_logs.csv', low_memory=False)
Structures_lambda_logs.columns = [
    'Function', 'Memory_Size', 'Max_Memory_Used', 'Duration', 'Billed_Duration',
    'UTCTimestamp', 'RequestId',
]
scheduling_overhead = np.array(
    pd.read_csv('../sfn-delay/Scheduling_Overhead.csv')['scheduling_overhead'].to_list()
)
function_execution_delay = np.array(
    pd.read_csv('../sfn-delay/Function_Execution_Delay.csv')['Duration'].to_list()
)

In [182]:
# Dedicated NumPy RandomState used by all sampling callables of the simulated application.
rs = np.random.RandomState(64)
# NOTE(review): `random` is not imported explicitly in the visible cells; it presumably
# arrives through one of the `from slappsim... import *` statements — confirm which module
# this actually is.
random.seed(64)

## Define the application

In [183]:
# Each function's response time is modeled by sampling from its measured 'Duration' values
# (entries 500..9500 of the per-function list) with the seeded RandomState `rs`.
# NOTE(review): `partial` is not imported explicitly in the visible cells; presumably it is
# functools.partial re-exported by a slappsim star import — confirm.
f1_rt = np.array(Structures_lambda_logs.query(f"Function=='f1'")['Duration'].to_list()[500:9501])
f1_pp_fun = partial(rs.choice, a=f1_rt)
f1 = Function(pf_fun=f1_pp_fun, mem=mem_config_list['f1'], name='f1')
f2_rt = np.array(Structures_lambda_logs.query(f"Function=='f2'")['Duration'].to_list()[500:9501])
f2_pp_fun = partial(rs.choice, a=f2_rt)
f2 = Function(pf_fun=f2_pp_fun, mem=mem_config_list['f2'], name='f2')
f3_rt = np.array(Structures_lambda_logs.query(f"Function=='f3'")['Duration'].to_list()[500:9501])
f3_pp_fun = partial(rs.choice, a=f3_rt)
f3 = Function(pf_fun=f3_pp_fun, mem=mem_config_list['f3'], name='f3')
f4_rt = np.array(Structures_lambda_logs.query(f"Function=='f4'")['Duration'].to_list()[500:9501])
f4_pp_fun = partial(rs.choice, a=f4_rt)
f4 = Function(pf_fun=f4_pp_fun, mem=mem_config_list['f4'], name='f4')
# Delay models: each call draws one value from the corresponding measured sample.
sfn_scheduling_overhead_fun = partial(rs.choice, a=scheduling_overhead)
function_execution_delay_fun = partial(rs.choice, a=function_execution_delay)
delays = {'FunctionExecution': function_execution_delay_fun, 'SchedulingOverhead': sfn_scheduling_overhead_fun}
# Wiring: start -> t1 -> sequence1 (f1, f2, f3, f4 in order) -> t2 -> end.
start = Start()
end = End()
sequence1 = Sequence(actions=[f1, f2, f3, f4])
structures = [sequence1]
i1 = InArc(place=start)
o1 = OutArc(place=sequence1.structure_start)
t1 = Transition(in_arcs=[i1], out_arcs=[o1])
i2 = InArc(place=sequence1.structure_end)
o2 = OutArc(place=end)
t2 = Transition(in_arcs=[i2], out_arcs=[o2])
# The application's transition list is the two connecting transitions followed by the
# transitions owned by the sequence structure.
transitions = [t1, t2]
transitions += sequence1.transitions
AppSequence = PetriApp(transitions=transitions,
                  functions=[f1, f2, f3, f4],
                  structures=structures,
                  delays=delays)

## Run the modeling algorithm

In [184]:
# Simulate 100000 executions of the modeled application. Each run yields a response time
# and a cost (the other two returned values are not used here); the application is reset
# after every run so the next one starts from its initial state.
ert = []
ec = []
for i in range(100000):
    rt, c, s, logs = AppSequence.execute()
    ert.append(rt)
    # Costs are scaled by 1,000,000, the same scaling applied to the AWS-derived costs.
    ec.append(c * 1000000)
    AppSequence.reset()
Sequence_avg_cost_mdl = np.mean(ec)
Sequence_mid_cost_mdl = np.median(ec)
Sequence_percentile10_cost_mdl = np.percentile(ec, 10)
Sequence_percentile90_cost_mdl = np.percentile(ec, 90)
Sequence_std_cost_mdl = np.std(ec)
Sequence_avg_ert_mdl = np.mean(ert)
Sequence_mid_ert_mdl = np.median(ert)
Sequence_percentile10_ert_mdl = np.percentile(ert, 10)
Sequence_percentile90_ert_mdl = np.percentile(ert, 90)
Sequence_std_ert_mdl = np.std(ert)
print('Average Duration Reported by Algorithm: ', Sequence_avg_ert_mdl, 'ms')
print('Median Duration Reported by Algorithm: ', Sequence_mid_ert_mdl, 'ms')
print('10-th percentile of Duration Reported by Algorithm: ', Sequence_percentile10_ert_mdl, 'ms')
print('90-th percentile Duration Reported by Algorithm: ', Sequence_percentile90_ert_mdl, 'ms')
print('Standard Deviation of Duration Reported by Algorithm: ', Sequence_std_ert_mdl, 'ms')
# The cost figures carry the 1,000,000 scaling applied above, so they are labeled in
# millionths of a USD ("micro-USD") rather than plain "USD".
print('Average Cost Reported by Algorithm: ', Sequence_avg_cost_mdl, 'micro-USD')
print('Median Cost Reported by Algorithm: ', Sequence_mid_cost_mdl, 'micro-USD')
print('10-th percentile of Cost Reported by Algorithm: ', Sequence_percentile10_cost_mdl, 'micro-USD')
print('90-th percentile Cost Reported by Algorithm: ', Sequence_percentile90_cost_mdl, 'micro-USD')
print('Standard Deviation of Cost Reported by Algorithm: ', Sequence_std_cost_mdl, 'micro-USD')

Average Duration Reported by Algorithm:  1580.8224330443036 ms
Median Duration Reported by Algorithm:  1581.2000801086424 ms
10-th percentile of Duration Reported by Algorithm:  1482.0801949310305 ms
90-th percentile Duration Reported by Algorithm:  1666.0011735687256 ms
Standard Deviation of Duration Reported by Algorithm:  176.79663144992318 ms
Average Cost Reported by Algorithm:  32.961944136259625 USD
Median Cost Reported by Algorithm:  33.1500647 USD
10-th percentile of Cost Reported by Algorithm:  30.724018181250003 USD
90-th percentile Cost Reported by Algorithm:  34.781317962500005 USD
Standard Deviation of Cost Reported by Algorithm:  3.2991070094304167 USD


In [188]:
# Persist the measured costs and the simulated durations/costs, one single-column CSV each.
Sequence_cost_aws = pd.DataFrame({'Cost': cost_list})
Sequence_duration_model = pd.DataFrame({'Duration': ert})
Sequence_cost_model = pd.DataFrame({'Cost': ec})

Sequence_cost_aws.to_csv('Sequence_cost_aws.csv', index=False)
Sequence_duration_model.to_csv('Sequence_duration_model.csv', index=False)
Sequence_cost_model.to_csv('Sequence_cost_model.csv', index=False)