In [23]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
%matplotlib inline

## Cost vs. DAG run duration

In [24]:
# CSV exports to analyse; each file name carries the machine type it was recorded on.
filenames = [
    f'prom_data_{machine}.csv'
    for machine in ('e2-standard-2', 'e2-standard-4', 'n1-standard-4')
]

# Machine family -> (factor multiplied by the core count to get the machine RAM,
# price constant multiplied by half the core count). Both are read by cost_function.
machine_cost_conversion = dict(
    n1=(3.75, 0.04749975),
    e2=(4.0, 0.067006),
)

# Cost formula:
#   max{(number of cores / number of executors), (machine RAM / executor_memory * driver_memory)} * (dag_duration / 60) * const_cost

def cost_function(r, machine, cost_table=None):
    """Estimate the cost of a single DAG run on the given machine type.

    The value returned is::

        max(cores / executor_instances,
            machine_ram / executor_memory * driver_memory)
        * dag_duration / 60 * price

    Parameters
    ----------
    r : mapping or pandas.Series
        One run. Must provide numeric 'executor_instances',
        'executor_memory', 'driver_memory' and 'dag_duration' entries.
    machine : str
        Machine type such as 'e2-standard-4'. The first dash-separated field
        selects the family in the pricing table; the third is the core count.
    cost_table : dict, optional
        Family -> (RAM factor per core, price constant for two cores).
        Defaults to the module-level ``machine_cost_conversion``.

    Returns
    -------
    float
        Estimated cost of the run.

    Raises
    ------
    KeyError
        If the machine family is not present in the pricing table.
    IndexError, ValueError
        If ``machine`` has fewer than three dash-separated fields or the
        third field is not an integer.
    """
    if cost_table is None:
        # Fall back to the notebook-wide table so existing two-argument calls keep working.
        cost_table = machine_cost_conversion

    family, _, cores_field = machine.split('-')[:3]
    core_number = int(cores_field)
    ram_factor, base_price = cost_table[family]

    # Doubling the number of cores doubles the price.
    price = core_number / 2 * base_price
    # Machine RAM grows with the core count (roughly 4 units of RAM per core).
    ram_memory = ram_factor * core_number

    # NOTE(review): the RAM factor looks like it is expressed in GB while the notebook
    # strips an 'm' suffix from the memory columns, and the duration is divided by 60
    # although the plot labels it in seconds -- confirm the intended units.
    share = max(
        core_number / r['executor_instances'],
        ram_memory / r['executor_memory'] * r['driver_memory'],
    )
    return share * r['dag_duration'] / 60 * price

# For every export: parse the memory columns, price each run, and plot cost against duration.
for file in filenames:
    data = pd.read_csv(file)

    # Memory settings are stored as text with an 'm' marker (presumably megabytes,
    # e.g. '1024m' -- confirm); drop the marker so the values can be used in arithmetic.
    for column in ('executor_memory', 'driver_memory'):
        data[column] = pd.to_numeric(data[column].str.replace('m', '', regex=False))

    # Every row of a file is priced with the machine type found in its first row.
    machine_type = data['machine_type'].iloc[0]
    # One row-wise apply replaces the manual iterrows()/.at loop.
    data['cost'] = data.apply(cost_function, axis=1, args=(machine_type,))

    fig = px.scatter(
        data,
        x="cost",
        y="dag_duration",
        color="status",
        title=machine_type,
        labels={
            "cost": "Koszt w [$]",
            "dag_duration": "Czas trwania w [s]",
            "executor_instances": "Liczba executorów",
            "executor_memory": "Pamięć executora w [MB]",
            "driver_memory": "Pamięć drivera w [MB]",
        },
        color_discrete_map={
            "success": "green",
            "failed": "red",
        },
        hover_data=['executor_instances', 'executor_memory', 'driver_memory'],
    )
    fig.show()