In [239]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
%matplotlib inline
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import boto3

In [240]:
pricing_df = pd.read_csv("../data/ec2-prices.csv")
pricing_df['price_per_ecu_on_demand'] = pricing_df['linux_on_demand_cost_hourly']/pricing_df['compute_units_ecu']
pricing_df.head()

Unnamed: 0,Name,InstanceType,memory_gb,compute_units_ecu,vcpu,gpus,fpga,enhanced_networking,linux_on_demand_cost_hourly,price_per_ecu_on_demand
0,R3 High-Memory Large,r3.large,15.25,6.5,2,0,0,Yes,0.17,0.026154
1,M4 Large,m4.large,8.0,6.5,2,0,0,Yes,0.1,0.015385
2,R4 High-Memory Large,r4.large,15.25,7.0,2,0,0,Yes,0.13,0.018571
3,C4 High-CPU Large,c4.large,3.75,8.0,2,0,0,Yes,0.1,0.0125
4,General Purpose GPU Extra Large,p2.xlarge,61.0,12.0,4,1,0,Yes,0.9,0.075


In [241]:
pricing_df.describe()

Unnamed: 0,memory_gb,compute_units_ecu,vcpu,gpus,fpga,linux_on_demand_cost_hourly,price_per_ecu_on_demand
count,53.0,53.0,53.0,53.0,53.0,50.0,50.0
mean,181.033019,65.679245,21.603774,0.603774,0.169811,2.2184,0.032753
std,333.033789,69.828436,24.473172,2.491058,1.104812,3.053673,0.025047
min,3.75,3.0,1.0,0.0,0.0,0.07,0.012045
25%,22.5,14.0,4.0,0.0,0.0,0.3475,0.015595
50%,61.0,35.0,16.0,0.0,0.0,0.98,0.022287
75%,160.0,99.0,32.0,0.0,0.0,2.735,0.045246
max,1952.0,349.0,128.0,16.0,8.0,14.4,0.131429


In [242]:
names = pricing_df["InstanceType"].to_dict()

In [243]:
list(names.values())[0:5]

['r3.large', 'm4.large', 'r4.large', 'c4.large', 'p2.xlarge']

In [244]:
client = boto3.client('ec2')
response =client.describe_spot_price_history(InstanceTypes = list(names.values()),
        ProductDescriptions = ["Linux/UNIX"])

In [245]:
spot_price_history = response['SpotPriceHistory']
spot_history_df = pd.DataFrame(spot_price_history)
spot_history_df.SpotPrice = spot_history_df.SpotPrice.astype(float)

In [246]:
spot_history_df.head()

Unnamed: 0,AvailabilityZone,InstanceType,ProductDescription,SpotPrice,Timestamp
0,us-west-2c,m3.2xlarge,Linux/UNIX,0.0947,2017-09-11 04:04:47+00:00
1,us-west-2c,r3.4xlarge,Linux/UNIX,0.2375,2017-09-11 04:04:46+00:00
2,us-west-2b,r3.xlarge,Linux/UNIX,0.0436,2017-09-11 04:04:46+00:00
3,us-west-2a,c3.4xlarge,Linux/UNIX,0.1786,2017-09-11 04:04:46+00:00
4,us-west-2c,m3.2xlarge,Linux/UNIX,0.0946,2017-09-11 04:04:45+00:00


In [247]:
df = spot_history_df.merge(pricing_df, how="inner", on="InstanceType")
df['price_memory_spot'] = df['SpotPrice']/df['memory_gb']
df['price_ecu_spot'] = df['SpotPrice']/df['compute_units_ecu']
df.describe()

Unnamed: 0,SpotPrice,memory_gb,compute_units_ecu,vcpu,gpus,fpga,linux_on_demand_cost_hourly,price_per_ecu_on_demand,price_memory_spot,price_ecu_spot
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.342192,101.688,67.5595,20.408,0.058,0.0,1.4742,0.022084,0.004523,0.004881
std,0.39051,140.587663,46.703838,14.965878,0.476301,0.0,1.452286,0.012523,0.002691,0.003863
min,0.0166,3.75,7.0,2.0,0.0,0.0,0.1,0.012045,0.000683,0.002263
25%,0.0974,30.0,27.0,8.0,0.0,0.0,0.53,0.015273,0.00263,0.003387
50%,0.2361,60.0,55.0,16.0,0.0,0.0,1.06,0.01963,0.003356,0.003822
75%,0.5565,122.0,104.0,32.0,0.0,0.0,1.68,0.025577,0.00646,0.005293
max,7.2,1952.0,349.0,128.0,8.0,0.0,13.34,0.076596,0.014754,0.076596


In [248]:
df_median = df.groupby("InstanceType").median()
df_median["InstanceType"] = df_median.index
df_median["price_ecu_spot"] = df_median.price_ecu_spot.round(3)
df_median["divide_SpotPrice"] = df_median.SpotPrice/100
df_median.sort_values("", inplace=True)
plt.subplots(figsize=(20,15))
ax = plt.axes()
sns.set_color_codes("muted")
sns.barplot(x="price_ecu_spot", y="InstanceType", data=df_median,
            label="Spot Price Per ECU", color="b")
sns.set_color_codes("pastel")
sns.barplot(x="divide_SpotPrice", y="InstanceType", data=df_median,
            label="Spot Price/100", color="b")

# Add a legend and informative axis label
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(xlim=(0, .1), ylabel="",
       xlabel="AWS Spot Pricing by Compute Units (ECU)")
sns.despine(left=True, bottom=True)

KeyError: ''

In [None]:
numerical_df = df_median.loc[:,["price_ecu_spot", "price_memory_spot"]]
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(numerical_df)
scaler.transform(numerical_df)
k_means = KMeans(n_clusters=3)
kmeans = k_means.fit(scaler.transform(numerical_df))
df_median["cluster"]=kmeans.labels_
df_median

In [250]:
g = sns.FacetGrid(df_median, col="cluster",  hue="InstanceType")
g = g.map(plt.scatter, "price_ecu_spot", "memory_gb", edgecolor="w")
g.set(xlim=(0, .015))

KeyError: 'cluster'