In [None]:
# calculate potential savings in USD from moving your existing RDS fleet to Aurora Serverless V2
#
# BUGS
# - hard-wired Aurora Serverless V2 ACU pricing
# - assumes all On-Demand, does not factor in Reserved Instances
# - assumes 1 ACU = 0.25 vCPU which (probably) is fine for now but may change
# - doesn't work well for burstable instances (assumes that they are equivalent to regular instances)
# - not tested on account with large number of DB instances (does describe_db_instances paginate?)
# - requires that your AWS CLI is properly set up and with proper AWS access/secret keys and region
# - only queries RDS fleet in current region (defined in AWS CLI configuration)
# - does not factor in storage cost savings when moving from MAZ RDS (non-Aurora) to Aurora
#
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHOR OR COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# receiver of blame: orly.andico@gmail.com

import json
import math
import os
from datetime import date, timedelta, datetime, timezone

import boto3
import numpy as np
import pandas as pd

# region comes from the default AWS configuration (profile / environment variables)
sess = boto3.session.Session()
region = sess.region_name

# service clients: RDS for the instance inventory, CloudWatch for the CPU metrics
client = boto3.client('rds')
cw = boto3.client('cloudwatch')

print(region)

In [None]:
# describe_db_instances returns a limited number of instances per call (100 by default),
# so walk every page; the result keeps the single-response shape ({'DBInstances': [...]})
# that the following cell reads
response = {
    'DBInstances': [
        instance
        for page in client.get_paginator('describe_db_instances').paginate()
        for instance in page['DBInstances']
    ]
}

In [None]:
# one row per DB instance, one column per attribute returned by describe_db_instances;
# built directly from the records -- concatenating onto an empty seed frame adds nothing
# and leans on pandas' deprecated handling of empty entries when inferring dtypes
df = pd.DataFrame(response['DBInstances'])
df

In [None]:
# keep only the attributes the sizing needs; reindex (rather than filter) so that a column
# missing from the API response -- e.g. SecondaryAvailabilityZone when no instance is
# Multi-AZ, or every column when the region has no instances -- becomes an all-NaN column
# instead of making astype() raise a KeyError
wanted_columns = ['DBInstanceIdentifier', 'DBInstanceClass', 'Engine', 'DBInstanceStatus',
                  'AllocatedStorage', 'SecondaryAvailabilityZone']
df2 = df.reindex(columns=wanted_columns)
# a missing SecondaryAvailabilityZone becomes the string 'nan' here
df2 = df2.astype({"AllocatedStorage": int, "SecondaryAvailabilityZone": str})

# RDS reports community PostgreSQL with the engine id 'postgres'; relabel it to the
# 'postgresql' name used by the rest of this notebook so those instances are not dropped
df2 = df2.assign(Engine=df2['Engine'].replace({'postgres': 'postgresql'}))

# we need to remove DBInstanceClass = "db.serverless" which corresponds to serverless v2
# serverless v1 does not show up in describe_db_instances
is_serverless_v2 = df2['DBInstanceClass'] == 'db.serverless'

# we also need to remove any databases which aren't supported (i.e. not MySQL or PostgreSQL)
is_supported_engine = df2['Engine'].isin(['aurora-mysql', 'mysql', 'aurora-postgresql', 'postgresql'])

# reset the index only after ALL filtering so row labels are 0..n-1 and line up with
# row positions in the cells that follow
df2 = df2[~is_serverless_v2 & is_supported_engine].reset_index(drop=True)

df2

In [None]:
# extract pricing data
# this only really works for MySQL and PostgreSQL because we hard-wire the no license required
def get_rds_instance_hourly_price(region_name, instance_type, database_engine, deployment_option):
    """Look up the On-Demand hourly price of one RDS instance configuration.

    Queries the AWS Price List service (ServiceCode 'AmazonRDS') for the first product
    matching the given instance type, engine, deployment option and region, with the
    license model fixed to 'No License required'.

    Parameters:
        region_name: region code to price, e.g. 'us-east-1'.
        instance_type: DB instance class, e.g. 'db.m5.large'.
        database_engine: Pricing API engine name, e.g. 'MySQL' or 'Aurora PostgreSQL'.
        deployment_option: 'Single-AZ' or 'Multi-AZ'.

    Returns:
        dict with the keys 'vcpu', 'memory', 'pricePerUnit', 'instanceType',
        'databaseEngine' and 'deploymentOption'; every value is the raw string taken
        from the price list entry ('pricePerUnit' is the USD amount).

    Raises:
        IndexError: when the Price List service has no matching product. The type is the
            one the unguarded list access used to raise, now with a message naming
            the lookup that failed.
    """
    criteria = {
        'instanceType': instance_type,
        'databaseEngine': database_engine,
        'licenseModel': 'No License required',
        'deploymentOption': deployment_option,
        'regionCode': region_name,
    }
    filters = [
        {'Type': 'TERM_MATCH', 'Field': field, 'Value': value}
        for field, value in criteria.items()
    ]

    # the pricing endpoint is addressed in us-east-1 regardless of the region being priced
    pricing_client = boto3.client('pricing', region_name='us-east-1')
    response = pricing_client.get_products(ServiceCode='AmazonRDS', Filters=filters, MaxResults=1)

    price_list = response['PriceList']
    if not price_list:
        raise IndexError(
            "no AmazonRDS price list entry for instanceType=%r, databaseEngine=%r, "
            "deploymentOption=%r, regionCode=%r"
            % (instance_type, database_engine, deployment_option, region_name)
        )

    # each PriceList entry is a JSON document; take the first On-Demand offer and its
    # first price dimension
    j = json.loads(price_list[0])
    on_demand_offer = list(j['terms']['OnDemand'].values())[0]
    price_dimension = list(on_demand_offer['priceDimensions'].values())[0]
    attributes = j['product']['attributes']

    return {
        'vcpu': attributes['vcpu'],
        'memory': attributes['memory'],
        'pricePerUnit': price_dimension['pricePerUnit']['USD'],
        'instanceType': attributes['instanceType'],
        'databaseEngine': attributes['databaseEngine'],
        'deploymentOption': attributes['deploymentOption'],
    }

In [None]:
# fetch the pricing for every row in the RDS instances

# engine ids found in df2['Engine'] -> 'databaseEngine' values used by the Pricing API
pricing_engine_names = {
    'mysql': 'MySQL',
    'aurora-mysql': 'Aurora MySQL',
    'postgres': 'PostgreSQL',
    'postgresql': 'PostgreSQL',
    'aurora-postgresql': 'Aurora PostgreSQL',
}
pricing_columns = ['pricePerUnit', 'deploymentOption', 'vcpu', 'memory', 'pricePerMonth']
hours_per_month = 730

# one record per row of df2, in row order; rows that cannot be priced keep NaN values
pricing_records = []
for dbid, db, ic, az in zip(df2['DBInstanceIdentifier'], df2['Engine'],
                            df2['DBInstanceClass'], df2['SecondaryAvailabilityZone']):
    record = dict.fromkeys(pricing_columns, np.nan)
    pricing_records.append(record)

    # a missing secondary AZ was stringified to 'nan'; a real AZ name is longer than 4 chars
    deploymentOption = 'Multi-AZ' if len(str(az)) > 4 else 'Single-AZ'
    # unknown engines fall back to MySQL pricing (unchanged from the original if/elif chain)
    databaseEngine = pricing_engine_names.get(db, 'MySQL')

    try:
        r = get_rds_instance_hourly_price(region, ic, databaseEngine, deploymentOption)
    except IndexError:
        # no price list entry for this combination: report it and carry on with the rest
        print("WARNING: no price found for %s (%s, %s, %s); leaving it unpriced"
              % (dbid, ic, databaseEngine, deploymentOption))
        continue

    # sanity check that we got the correct match
    if (ic != r['instanceType'] or
            databaseEngine != r['databaseEngine'] or
            deploymentOption != r['deploymentOption']):
        print("WARNING: price list entry does not match %s (%s, %s, %s); leaving it unpriced"
              % (dbid, ic, databaseEngine, deploymentOption))
        continue

    r['memory'] = float(r['memory'].replace(" GiB", ""))
    r['pricePerUnit'] = float(r['pricePerUnit'])
    r['vcpu'] = float(r['vcpu'])
    print(r, "\n")

    record.update(
        pricePerUnit=r['pricePerUnit'],
        deploymentOption=r['deploymentOption'],
        vcpu=r['vcpu'],
        memory=r['memory'],
        # the price was looked up for this deploymentOption, and the Multi-AZ rate already
        # covers the standby instance, so it must not be doubled again here
        pricePerMonth=r['pricePerUnit'] * hours_per_month,
    )

# attach the results by df2's own index labels: mixing positional reads (.iloc[idx]) with
# label writes (.loc[idx]) put prices on the wrong rows whenever the index was not 0..n-1
pricing = pd.DataFrame(pricing_records, index=df2.index, columns=pricing_columns)
df2 = df2.assign(**{column: pricing[column] for column in pricing_columns})
                                                            


In [None]:
# display the instance inventory together with the pricing columns added by the previous cell
df2

In [None]:
# we can only fetch 1440 data points from Cloudwatch, so over a 2-week period (20160 minutes)
# our sampling interval is 14 minutes; also note we are fetching the *MAXIMUM* over each sampling period
# however, let's use a less aggressive 1-hour sampling interval

# one timezone-aware window shared by every instance: botocore interprets naive datetimes
# as UTC, so a local-time datetime.now() would shift the window by the local UTC offset
metric_end = datetime.now(timezone.utc)
metric_start = metric_end - timedelta(days=14)

# collect all datapoints first and build the frame once (concat inside the loop is quadratic)
datapoints = []
for dbid in df2['DBInstanceIdentifier']:
    stats = cw.get_metric_statistics(
        Namespace='AWS/RDS',
        Dimensions=[
            {
                'Name': 'DBInstanceIdentifier',
                'Value': dbid
            }
        ],
        MetricName='CPUUtilization',
        StartTime=metric_start,
        EndTime=metric_end,
        Period=3600,
        Statistics=[ 'Maximum' ])
    datapoints.extend(dict(point, DBInstanceIdentifier=dbid) for point in stats['Datapoints'])

# explicit columns keep the schema stable even when no instance returned any datapoint
dfutil = pd.DataFrame(datapoints, columns=['Timestamp', 'Maximum', 'Unit', 'DBInstanceIdentifier'])

In [None]:
# display the hourly maximum-CPU datapoints collected for every instance
dfutil

In [None]:
# summarise the hourly CPU maxima per instance: mean, standard deviation, peak and sample
# count, plus a mean + 3 sigma figure
df_agg = (
    dfutil
    .groupby("DBInstanceIdentifier")["Maximum"]
    .agg(["mean", "std", "max", "count"])
    .reset_index()
)
df_agg['threeSigma'] = df_agg['mean'] + 3*df_agg['std']
df_agg

In [None]:
# inner join on the only shared column, so instances without CPU statistics drop out
df_combined = df2.merge(df_agg, how="inner", on="DBInstanceIdentifier")
df_combined

In [None]:
# sizing rule: 2 GB RAM = 1 ACU and, very roughly, 1 ACU = 0.25 vCPU (hence the factor 4)
# the mean CPU percentage is divided by 50 rather than 100
# we currently do not recommend < 2 ACU for various reasons.. (although 0.5 ACU is the stated minimum)
estimated_acu = df_combined['mean'] * df_combined['vcpu'] * 4 / 50
# round down to a whole ACU, then add half an ACU so the estimate never falls below 0.5
df_combined['acu_usage'] = estimated_acu.apply(np.floor) + 0.5
df_combined

In [None]:
### Aurora Serverless V2 price per ACU-hour (USD), keyed by region code
### FIXME: hard-wired because extracting ACU pricing from the Pricing API is not figured out yet;
### currently APG/AMS Serverless V2 ACU pricing is identical
### also.. no GovCloud
### https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Concepts.RegionsAndAvailabilityZones.html

v2_pricing = {
    # Asia Pacific
    'ap-east-1': 0.22,
    'ap-northeast-1': 0.20,
    'ap-northeast-2': 0.20,
    'ap-south-1': 0.18,
    'ap-southeast-1': 0.20,
    'ap-southeast-2': 0.20,
    # Canada
    'ca-central-1': 0.14,
    # Europe
    'eu-central-1': 0.14,
    'eu-north-1': 0.14,
    'eu-west-1': 0.14,
    'eu-west-2': 0.14,
    'eu-west-3': 0.14,
    # South America
    'sa-east-1': 0.25,
    # United States
    'us-east-1': 0.12,
    'us-east-2': 0.12,
    'us-west-1': 0.16,
    'us-west-2': 0.12,
}

# two-column lookup table: one row per region
df_pricing = pd.DataFrame(list(v2_pricing.items()), columns=['regionCode', 'pricePerAcu'])
df_pricing

In [None]:
# prevent this from bombing out unceremoniously if the current region is not one where ServerlessV2 is available
# (via the above hard-wired pricing list)
# a region missing from the table yields an empty selection (indexing it raises IndexError,
# not KeyError), so test for emptiness instead of relying on an exception
region_prices = df_pricing.loc[df_pricing['regionCode'] == region, 'pricePerAcu']
if region_prices.empty:
    # large value to bloat the cost, which makes the potential savings come out as 0
    ppa = 999999
    print("WARNING: no Serverless V2 ACU price known for region %s; using placeholder price %d" % (region, ppa))
else:
    ppa = float(region_prices.iloc[0])

# monthly ACU cost (730 hours); anything that is not Single-AZ is costed as two instances
df_combined['acuPricePerMonth'] = np.where(df_combined['deploymentOption'] == 'Single-AZ',
                                           df_combined['acu_usage'] * ppa * 730,
                                           df_combined['acu_usage'] * ppa * 730 * 2)

# savings are only reported when Serverless V2 is cheaper than the current monthly price
df_combined['potentialSavings'] = np.where(df_combined['acuPricePerMonth'] < df_combined['pricePerMonth'],
                                           df_combined['pricePerMonth'] - df_combined['acuPricePerMonth'], 0)

# the process id keeps output files from different runs apart
pid = os.getpid()
outputfile = "aurora_serverless_tco_%06d.csv" % pid
df_combined.to_csv(outputfile, index=False)

print("Wrote output file %s" % outputfile)
df_combined