In [1]:
import sys
import os
print(sys.executable)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import importlib
seaborn_found = importlib.util.find_spec('seaborn')
if seaborn_found is None:
    sys.write.stderr("[error] Seaborn package not found. exit")
    exit(-1)

import seaborn as sns
import pandas as pd

import matplotlib as mpl

mpl.use("pgf")

import matplotlib.pyplot as plt

plt.rcParams.update({
    "font.family": "serif",  # use serif/main font for text elements
    "text.usetex": False,     # use inline math for ticks
    "pgf.rcfonts": False,    # don't setup fonts from rc parameters
    "pgf.preamble": [
         "\\usepackage{units}",          # load additional packages
         "\\usepackage{metalogo}",
         "\\usepackage{unicode-math}",   # unicode math setup
         r"\setmathfont{xits-math.otf}",
         ]
})

# plt.rc('text', usetex=False)

plt.rcParams.update({
    "font.family": "serif",
    "font.serif": [],                    # use latex default serif font
    "font.size": "18",
})

plt.rcParams.update({
    "pgf.texsystem": "pdflatex",
    "pgf.preamble": [
         r"\usepackage[utf8x]{inputenc}",
         r"\usepackage[T1]{fontenc}",
         r"\usepackage{cmbright}",
         ]
})


sns.set(style="ticks", color_codes=True)
sns.set_context("paper")
sns.set_style("darkgrid", {"axes.facecolor": ".9", 'xtick.bottom': True, 'xtick.top': False, 'ytick.left': True, 'ytick.right': False})

mpl.pyplot.close("all")


tasks = ["STAGEIN_TIME_S", "RESAMPLE_TIME_S", "COMBINE_TIME_S", "MAKESPAN_S"]

/usr/local/opt/python/bin/python3.7


In [2]:
DIR="/Users/lpottier/research/usc-isi/projects/workflow-io-bb/data/traces/swarp/shared-cori/"
csv_file = DIR+"/"+"bb_runs2020-32c.csv"

# DIR="/Users/lpottier/research/usc-isi/projects/workflow-io-bb/tools/"
# csv_file = DIR+"/"+"swarp-run-1W-32c-raw.csv"

CORE="32"
PLOT_DIR="/Users/lpottier/research/usc-isi/projects/paper-workflow-bb/figures/plots/real-swarp"
OUTPUT = PLOT_DIR+"/"+"swarp-1w-"+CORE+"c/"


In [3]:

try:
    # Create target Directory
    os.mkdir(OUTPUT)
    print("Directory " , OUTPUT ,  " Created ") 
except FileExistsError:
    print("Directory " , OUTPUT ,  " already exists")


#csv_file = "swarp_test_switches.csv"
df_swarp = pd.read_csv(csv_file, sep=' ')

df_swarp_priv = df_swarp[df_swarp.BB_TYPE=="PRIVATE"]
df_swarp_strip = df_swarp[df_swarp.BB_TYPE=="STRIPED"]

print(df_swarp.BB_NB_FILES.unique())
df_swarp = df_swarp.loc[df_swarp['BB_NB_FILES'].isin([0,8,16,24,32])]
print(df_swarp.BB_NB_FILES.unique())

print(df_swarp.info())

Directory  /Users/lpottier/research/usc-isi/projects/paper-workflow-bb/figures/plots/real-swarp/swarp-1w-32c/  already exists
[ 0 16 24 32  8]
[ 0 16 24 32  8]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 300 entries, 0 to 299
Data columns (total 24 columns):
ID                     300 non-null int64
START                  0 non-null float64
END                    0 non-null float64
FITS                   300 non-null object
NB_PIPELINE            300 non-null int64
NB_CORES               300 non-null int64
AVG                    300 non-null int64
PIPELINE               300 non-null int64
BB_TYPE                300 non-null object
BB_ALLOC_SIZE_MB       300 non-null float64
TOTAL_NB_FILES         300 non-null int64
BB_NB_FILES            300 non-null int64
TOTAL_SIZE_FILES_MB    300 non-null float64
BB_SIZE_FILES_MB       300 non-null float64
MAKESPAN_S             300 non-null float64
WALLTIME_S             300 non-null float64
STAGEIN_TIME_S         300 non-null float64
STAGEIN

In [4]:
small_dt = df_swarp[["FITS", "BB_TYPE", "NB_CORES", "BB_NB_FILES","RESAMPLE_TIME_S", "COMBINE_TIME_S"]]
#small_dt = small_dt.sort_values(by=['NB_PIPELINE'])
print(small_dt.sort_values(by=['NB_CORES', 'BB_NB_FILES']))

    FITS  BB_TYPE  NB_CORES  BB_NB_FILES  RESAMPLE_TIME_S  COMBINE_TIME_S
0      N  PRIVATE        32            0           90.813          26.265
1      N  PRIVATE        32            0           94.641          28.077
2      N  PRIVATE        32            0           91.890          26.629
3      N  PRIVATE        32            0           90.923          25.972
4      N  PRIVATE        32            0           91.594          28.213
5      N  PRIVATE        32            0           95.387          25.274
6      N  PRIVATE        32            0           88.717          25.550
7      N  PRIVATE        32            0           95.490          25.124
8      N  PRIVATE        32            0           90.047          26.975
9      N  PRIVATE        32            0           90.191          25.455
10     N  PRIVATE        32            0          121.028          26.454
11     N  PRIVATE        32            0          103.485          27.713
12     N  PRIVATE        32           

In [5]:
small_dt = df_swarp[["NB_PIPELINE", "FITS", "BB_NB_FILES" ,"MAKESPAN_S"]]
#small_dt = small_dt.sort_values(by=['NB_PIPELINE'])
print(small_dt.sort_values(by=['NB_PIPELINE', 'FITS', "BB_NB_FILES"]))

     NB_PIPELINE FITS  BB_NB_FILES  MAKESPAN_S
0              1    N            0  117.172039
1              1    N            0  122.814628
2              1    N            0  118.663112
3              1    N            0  117.028844
4              1    N            0  120.190182
5              1    N            0  120.850907
6              1    N            0  114.438590
7              1    N            0  120.748271
8              1    N            0  117.174212
9              1    N            0  115.770284
10             1    N            0  147.647841
11             1    N            0  131.344710
12             1    N            0  113.013523
13             1    N            0  117.271874
14             1    N            0  134.498929
30             1    N            0  111.536143
31             1    N            0  111.201032
32             1    N            0  113.952142
33             1    N            0  112.891694
34             1    N            0  111.582203
35           

In [6]:
small_dt = df_swarp[["FITS", "BB_NB_FILES" ,"RESAMPLE_TIME_S", "COMBINE_TIME_S"]]
#small_dt = small_dt.sort_values(by=['NB_PIPELINE'])
print(small_dt.sort_values(by=['BB_NB_FILES']))

    FITS  BB_NB_FILES  RESAMPLE_TIME_S  COMBINE_TIME_S
0      N            0           90.813          26.265
32     N            0           87.423          26.378
33     N            0           87.471          25.279
34     N            0           85.763          25.664
35     N            0           85.535          30.542
36     N            0           88.414          26.305
37     N            0           85.911          27.384
38     N            0           84.598          28.146
39     N            0           84.628          26.540
40     N            0           84.705          27.945
41     N            0           88.054          27.881
42     N            0           85.334          26.356
43     N            0           88.465          27.490
44     N            0           89.178          27.394
45     Y            0          101.344          41.799
46     Y            0          115.338          46.195
47     Y            0           97.160          42.685
48     Y  

In [7]:
#grouped = small_dt.groupby('BB_NB_FILES', sort=True)
#print(grouped.groups)

In [8]:
# plt.figure()
# df_swarp_priv = df_swarp_priv.sort_values(by=['BB_NB_FILES'])
# tmp_df = df_swarp_priv[df_swarp_priv.FITS=="N"]
# plt.errorbar(tmp_df.BB_NB_FILES, tmp_df.RESAMPLE_MEAN_TIME_S, xerr=0, yerr=tmp_df.RESAMPLE_SD_TIME)
# #plt.title("")

# #g = sns.lineplot(x="BB_NB_FILES", y="RESAMPLE_MEAN_TIME_S", ci="RESAMPLE_SD_TIME", hue="FITS", style="FITS", markers=True, dashes=True, data=swarp_dt_priv)

In [9]:
#g = sns.lineplot(x="BB_NB_FILES", y="COMBINE_MEAN_TIME_S", hue="FITS", style="FITS", markers=True, dashes=True, data=swarp_dt_priv)

# swarp_dt_priv = swarp_dt_priv.sort_values(by=['BB_NB_FILES'])
# swarp_dt_priv_nofits = swarp_dt_priv[swarp_dt_priv.FITS=="N"]
# swarp_dt_priv_fits = swarp_dt_priv[swarp_dt_priv.FITS=="Y"]


# fig, axs = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=False)
# #plt.axis('scaled')

# ax = axs[0,0]
# ax.errorbar(swarp_dt_priv_fits.BB_NB_FILES, swarp_dt_priv_fits.MEAN_MAKESPAN_S, yerr=swarp_dt_priv_fits.SD_MAKESPAN, color="b")
# ax.errorbar(swarp_dt_priv_nofits.BB_NB_FILES, swarp_dt_priv_nofits.MEAN_MAKESPAN_S, yerr=swarp_dt_priv_nofits.SD_MAKESPAN, color="r")

# ax.set_title('MEAN_MAKESPAN_S')

# # With 4 subplots, reduce the number of axis ticks to avoid crowding.
# ax.locator_params(nbins=4)

# ax.minorticks_on()
# ax.grid()
# ax.set_ylabel('Time (s)')


# ax = axs[0,1]
# ax.errorbar(swarp_dt_priv_fits.BB_NB_FILES, swarp_dt_priv_fits.STAGEIN_MEAN_TIME_S, yerr=swarp_dt_priv_fits.STAGEIN_SD_TIME, color="b")
# ax.errorbar(swarp_dt_priv_nofits.BB_NB_FILES, swarp_dt_priv_nofits.STAGEIN_MEAN_TIME_S, yerr=swarp_dt_priv_nofits.STAGEIN_SD_TIME, color="r")
# ax.set_title('STAGEIN_MEAN_TIME_S')

# ax.minorticks_on()
# ax.grid()

# ax = axs[1,0]
# ax.errorbar(swarp_dt_priv_fits.BB_NB_FILES, swarp_dt_priv_fits.RESAMPLE_MEAN_TIME_S, yerr=swarp_dt_priv_fits.RESAMPLE_SD_TIME, color="b")
# ax.errorbar(swarp_dt_priv_nofits.BB_NB_FILES, swarp_dt_priv_nofits.RESAMPLE_MEAN_TIME_S, yerr=swarp_dt_priv_nofits.RESAMPLE_SD_TIME, color="r")

# ax.set_title('RESAMPLE_MEAN_TIME_S')

# ax.minorticks_on()
# ax.grid()
# ax.set_xlabel('Number of files in BB')
# ax.set_ylabel('Time (s)')

# ax = axs[1,1]
# ax.errorbar(swarp_dt_priv_fits.BB_NB_FILES, swarp_dt_priv_fits.COMBINE_MEAN_TIME_S, yerr=swarp_dt_priv_fits.COMBINE_SD_TIME, color="b")
# ax.errorbar(swarp_dt_priv_nofits.BB_NB_FILES, swarp_dt_priv_nofits.COMBINE_MEAN_TIME_S, yerr=swarp_dt_priv_nofits.COMBINE_SD_TIME, color="r")
# ax.set_title('COMBINE_MEAN_TIME_S')

# ax.minorticks_on()
# ax.grid()
# ax.set_xlabel('Number of files in BB')

# plt.show()

In [10]:
# sns.set_context("notebook", font_scale=1.2, rc={"lines.linewidth": 1.8})
#sns.set_style("darkgrid")

In [11]:
## Use the full data

is_FITS =  df_swarp['FITS']=="Y"
is_PRIVATE =  df_swarp['BB_TYPE']=="PRIVATE"


In [12]:
temp_df = df_swarp[is_FITS & is_PRIVATE]
temp_df.BB_NB_FILES = (temp_df.BB_NB_FILES*100)/32

xlabel='Input files in burst buffers (%)'
legend_label=['Private', 'Striped']

def plot_boxplot_all_tasks(df, output, print_dots=False):

    with sns.light_palette("green"):

        #print(test.info())

        f, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True)
        f.patch.set_alpha(0)

        g1 = sns.boxplot(x="BB_NB_FILES", y="STAGEIN_TIME_S", data=df, ax=axes[0,0])
        if print_dots:
            g1 = sns.swarmplot(x="BB_NB_FILES", y="STAGEIN_TIME_S", data=df, color=".25", ax=axes[0,0])
        g1.set(xlabel='', ylabel='Stage in time (s)')

        g2 = sns.boxplot(x="BB_NB_FILES", y="RESAMPLE_TIME_S", data=df, ax=axes[0,1])
        if print_dots:
            g2 = sns.swarmplot(x="BB_NB_FILES", y="RESAMPLE_TIME_S", data=df, color=".25", ax=axes[0,1])
        g2.set(xlabel='', ylabel='Resample time (s)')

        g3 = sns.boxplot(x="BB_NB_FILES", y="COMBINE_TIME_S", data=df, ax=axes[1,0])
        if print_dots:
            g3 = sns.swarmplot(x="BB_NB_FILES", y="COMBINE_TIME_S", data=df, color=".25", ax=axes[1,0])
        g3.set(xlabel=xlabel, ylabel='Combine time (s)')

        g4 = sns.boxplot(x="BB_NB_FILES", y="MAKESPAN_S", data=df, ax=axes[1,1])
        if print_dots:
            g4 = sns.swarmplot(x="BB_NB_FILES", y="MAKESPAN_S", data=df, color=".25", ax=axes[1,1])
        g4.set(xlabel=xlabel, ylabel='Makespan (s)')

        plt.tight_layout()
        plt.savefig(OUTPUT+output+'.pdf', facecolor=f.get_facecolor(), edgecolor='none')
        plt.savefig(OUTPUT+output+'.pgf', facecolor=f.get_facecolor(), edgecolor='none')
        #plt.close() 

        
plot_boxplot_all_tasks(temp_df, output="swarp-box-fits-private-1w-32c")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [13]:
temp_df = df_swarp[is_FITS & is_PRIVATE]
temp_df.BB_NB_FILES = (temp_df.BB_NB_FILES*100)/32

def plot_boxplot_per_task(df, col, output, print_dots=False):
    xlabel='Number of input files in burst buffers (%)'
    legend_label=['Private', 'Striped']
    name = col.split("_")[0].lower()
        
    with sns.light_palette("green"):

        #print(test.info())

        f = plt.figure(figsize=(4, 3))
        f.patch.set_alpha(0)

        g1 = sns.boxplot(x="BB_NB_FILES", y=col, data=df)
        if print_dots:
            g1 = sns.swarmplot(x="BB_NB_FILES", y=col, data=df, color=".25")
        g1.set(xlabel=xlabel, ylabel='Execution time (s)')

        plt.tight_layout()
        plt.savefig(OUTPUT+'swarp-box-'+name+output+'.pdf', facecolor=f.get_facecolor(), edgecolor='none')
        plt.savefig(OUTPUT+'swarp-box-'+name+output+'.pgf', facecolor=f.get_facecolor(), edgecolor='none')
        plt.show()

for task in tasks:
    print("Plot boxplot task:", task)
    plot_boxplot_per_task(temp_df, task, output="-fits-private-1w-32c")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Plot boxplot task: STAGEIN_TIME_S




Plot boxplot task: RESAMPLE_TIME_S




Plot boxplot task: COMBINE_TIME_S




Plot boxplot task: MAKESPAN_S




In [14]:
g = sns.catplot(x="BB_NB_FILES", y="MAKESPAN_S", hue="BB_TYPE", col="FITS", data=df_swarp, kind="box")

In [15]:
print("FITS")
temp_df = df_swarp[is_FITS]
temp_df.BB_NB_FILES = (temp_df.BB_NB_FILES*100)/32

def plot_lineplot_all_tasks_bbtype(df, output, label, legend_label):
    
    #print(test.info())

    f, axes = plt.subplots(2, 2, figsize=(10, 7), sharex=True)
    f.patch.set_alpha(0)

    g2 = sns.lineplot(x="BB_NB_FILES", y="STAGEIN_TIME_S", hue="BB_TYPE", style="BB_TYPE", data=df, ax=axes[0,0],markers=True)
    g2.legend(title='', labels=legend_label)
    g2.set(xlabel=label[0], ylabel='Stage in time (s)')

    g3 = sns.lineplot(x="BB_NB_FILES", y="RESAMPLE_TIME_S", hue="BB_TYPE", style="BB_TYPE", data=df, ax=axes[0,1],markers=True)
    #g3.legend_.remove()
    g3.legend(title='', labels=legend_label)
    g3.set(xlabel=label[0], ylabel='Resample time (s)')

    g4 = sns.lineplot(x="BB_NB_FILES", y="COMBINE_TIME_S", hue="BB_TYPE", style="BB_TYPE", data=df, ax=axes[1,0],markers=True)
    #g4.legend_.remove()
    g4.legend(title='', labels=legend_label)
    g4.set(xlabel=label[0], ylabel='Combine time (s)')

    g1 = sns.lineplot(x="BB_NB_FILES", y="MAKESPAN_S", hue="BB_TYPE", style="BB_TYPE", data=df, ax=axes[1,1],markers=True)
    #g1.legend_.remove()
    g1.legend(title='', labels=legend_label)
    g1.set(xlabel=label[0], ylabel='Makespan (s)')

    plt.tight_layout()
    plt.savefig(OUTPUT+output+'.pdf', facecolor=f.get_facecolor(), edgecolor='none')
    plt.savefig(OUTPUT+output+'.pgf', facecolor=f.get_facecolor(), edgecolor='none')
    #plt.close() 

xlabel='Input files in burst buffers (%)'
legend_label=['Private', 'Striped']
    
plot_lineplot_all_tasks_bbtype(temp_df, "swarp-line-fits-private-1w-32c", label=[xlabel, ''], legend_label=legend_label)


FITS


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [16]:
def plot_lineplot_per_task_bbtype(df, col, output, loc='upper left'):
    xlabel='Input files in burst buffers (%)'
    legend_label=['Private', 'Striped']
    name = col.split("_")[0].lower()

    f = plt.figure(figsize=(4, 3))
    f.patch.set_alpha(0)
    
    g = sns.lineplot(x="BB_NB_FILES", y=col, hue="BB_TYPE", style="BB_TYPE", data=df,markers=True)
    g.legend(title='', labels=legend_label)
    g.set(xlabel=xlabel, ylabel='Execution time (s)', xticks=df.BB_NB_FILES.unique())

    plt.tight_layout()
    plt.savefig(OUTPUT+'swarp-line-'+name+output+'.pdf', facecolor=f.get_facecolor(), edgecolor='none')
    plt.savefig(OUTPUT+'swarp-line-'+name+output+'.pgf', facecolor=f.get_facecolor(), edgecolor='none')
    plt.show()

loc = ['upper left', 'upper right', 'upper left', 'center left']
for i,task in enumerate(tasks):
    print("Plot line task:", task)
    plot_lineplot_per_task_bbtype(temp_df, task, output="-fits-1w-32c", loc=loc[i])
    

Plot line task: STAGEIN_TIME_S


  app.launch_new_instance()


Plot line task: RESAMPLE_TIME_S


  app.launch_new_instance()


Plot line task: COMBINE_TIME_S


  app.launch_new_instance()


Plot line task: MAKESPAN_S


  app.launch_new_instance()


In [17]:
print("NOFITS")
temp_df = df_swarp[~is_FITS]
temp_df.BB_NB_FILES = (temp_df.BB_NB_FILES*100)/32

loc = ['upper left', 'upper right', 'upper left', 'center left']
for i,task in enumerate(tasks):
    print("Plot line task:", task)
    plot_lineplot_per_task_bbtype(temp_df, task, output="-nofits-1w-32c", loc=loc[i])
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


NOFITS
Plot line task: STAGEIN_TIME_S


  app.launch_new_instance()


Plot line task: RESAMPLE_TIME_S


  app.launch_new_instance()


Plot line task: COMBINE_TIME_S


  app.launch_new_instance()


Plot line task: MAKESPAN_S


  app.launch_new_instance()


In [18]:
#g = sns.lineplot(x="BB_NB_FILES", y="RESAMPLE_TIME_S", hue="BB_TYPE", style="BB_TYPE", data=df_swarp[~is_FITS])

In [19]:
print("PRIVATE")
temp_df = df_swarp[is_PRIVATE]
temp_df.BB_NB_FILES = (temp_df.BB_NB_FILES*100)/32

def plot_lineplot_all_tasks_fits(df, output, label, legend_label, loc=['upper left', 'upper left', 'upper left', 'upper left']):
    
    f, axes = plt.subplots(2, 2, figsize=(10, 7), sharex=True)
    f.patch.set_alpha(0)

    g2 = sns.lineplot(x="BB_NB_FILES", y="STAGEIN_TIME_S", hue="FITS", style="FITS", data=df, ax=axes[0,0],markers=True)
    g2.legend(title='Intermediate files', labels=legend_label)
    g2.set(xlabel=xlabel, ylabel='Stage in time (s)')

    g3 = sns.lineplot(x="BB_NB_FILES", y="RESAMPLE_TIME_S", hue="FITS", style="FITS", data=df, ax=axes[0,1],markers=True)
    g3.legend(title='Intermediate files', labels=legend_label)
    g3.set(xlabel=xlabel, ylabel='Resample time (s)')

    g4 = sns.lineplot(x="BB_NB_FILES", y="COMBINE_TIME_S", hue="FITS", style="FITS", data=df, ax=axes[1,0],markers=True)
    g4.legend(title='Intermediate files', labels=legend_label)
    g4.set(xlabel=xlabel, ylabel='Combine time (s)')

    g1 = sns.lineplot(x="BB_NB_FILES", y="MAKESPAN_S", hue="FITS", style="FITS", data=df, ax=axes[1,1],markers=True)
    g1.legend(title='Intermediate files', labels=legend_label)
    g1.set(xlabel=xlabel, ylabel='Makespan (s)')

    plt.tight_layout()
    plt.savefig(OUTPUT+output+'.pdf', facecolor=f.get_facecolor(), edgecolor='none')
    plt.savefig(OUTPUT+output+'.pgf', facecolor=f.get_facecolor(), edgecolor='none')
    plt.show()
    plt.close() 

xlabel='Input files in burst buffers (%)'
ylabel='Execution time (s)'
legend_label=['Parallel File System', 'Burst Buffers']
loc=['upper left', 'upper right', 'center left', 'upper right']

plot_lineplot_all_tasks_fits(temp_df, "swarp-line-private-1w-32c", label=[xlabel, ylabel], legend_label=legend_label, loc=loc)


    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


PRIVATE




In [20]:
print("STRIPED")
temp_df = df_swarp[~is_PRIVATE]
temp_df.BB_NB_FILES = (temp_df.BB_NB_FILES*100)/32

xlabel='Input files in burst buffers (%)'
ylabel='Execution time (s)'
legend_label=['Parallel File System', 'Burst Buffers']
loc=['upper left', 'lower right', 'upper left', 'lower right']

plot_lineplot_all_tasks_fits(temp_df, "swarp-line-striped-1w-32c", label=[xlabel, ylabel], legend_label=legend_label,loc=loc)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


STRIPED




In [21]:
def plot_lineplot_per_task_fits(df, col, output, loc='upper left'):
    xlabel='Input files in burst buffers (%)'
    ylabel='Execution time (s)'
    legend_label=['Parallel File System', 'Burst Buffers']
    
    name = col.split("_")[0].lower()

    f = plt.figure(figsize=(4, 3))
    f.patch.set_alpha(0)
    
    g = sns.lineplot(x="BB_NB_FILES", y=col, hue="FITS", style="FITS", data=df,markers=True)
    g.legend(title='Intermediate files', labels=legend_label)
    g.set(xlabel=xlabel, ylabel=ylabel,xticks=df.BB_NB_FILES.unique())

    plt.tight_layout()
    plt.savefig(OUTPUT+'swarp-line-'+name+output+'.pdf', facecolor=f.get_facecolor(), edgecolor='none')
    plt.savefig(OUTPUT+'swarp-line-'+name+output+'.pgf', facecolor=f.get_facecolor(), edgecolor='none')
    plt.show()

loc=['upper left', 'lower right', 'upper left', 'lower right']

for i,task in enumerate(tasks):
    print("Plot line fits task:", task)
    plot_lineplot_per_task_fits(temp_df, task, output="-fits-private-1w-32c", loc=loc[i])
    

Plot line fits task: STAGEIN_TIME_S




Plot line fits task: RESAMPLE_TIME_S




Plot line fits task: COMBINE_TIME_S




Plot line fits task: MAKESPAN_S




In [22]:
agg_swarp = df_swarp.groupby(['ID', 'BB_NB_FILES','FITS','BB_SIZE_FILES_MB', 'BB_TYPE', 'AVG', 'NB_CORES'], as_index=False).agg({'MAKESPAN_S': 'max', 'COMBINE_TIME_S': 'max', 'RESAMPLE_TIME_S': 'max', 'STAGEIN_TIME_S': 'min'})


print_dots = True
xlabel='Number of input files staged in'
ylabel='Bandwidth (MB/s)'
legend_label=['Private', 'Striped']

agg_swarp['bw'] = agg_swarp['BB_SIZE_FILES_MB'] / agg_swarp['STAGEIN_TIME_S']

# bandwidth_swarp = df_swarp.groupby(['ID', 'NB_PIPELINE','FITS', 'BB_TYPE', 'AVG', 'NB_CORES'], as_index=False).agg(lambda g: g['BB_SIZE_FILES_MB'].max()/g['STAGEIN_TIME_S'].max())

#print(bandwidth_swarp)

# f, axes = plt.subplots(1, 2, figsize=(9, 4.5), sharex=True, sharey=True)
f = plt.figure(figsize=(8, 4))
f.patch.set_alpha(0)

g1 = sns.lineplot(x="BB_NB_FILES", y="bw", data=agg_swarp, hue="BB_TYPE", style="BB_TYPE", markers=True)
g1.set(xlabel=xlabel, ylabel=ylabel)

# g1.set_xscale('log')
# g1.set_xticks(agg_swarp.NB_PIPELINE.unique())
# g1.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())

g1.legend(title='', labels=legend_label)

plt.tight_layout()
plt.savefig(OUTPUT+'swarp-bandwidth-1w-32c-Xf.pdf', facecolor=f.get_facecolor(), edgecolor='none')
plt.savefig(OUTPUT+'swarp-bandwidth-1w-32c-Xf.pgf', facecolor=f.get_facecolor(), edgecolor='none')
plt.show()

