# Notebook to Create the `Job Completion Time` CSV File

In [1]:
import pandas as pd

# Loading the Sparrow CSV File

In [2]:
# Sparrow's per-task log. The column labels are supplied here through `names=`
# instead of being taken from the file.
SPARROW_CSV_FILE_NAME = "sparrow_YH_10_000.csv"
SPARROW_COLUMN_NAMES = [
    "Job ID",
    "Task ID",
    "Job Arrival Time",
    "Task Launch Time",
    "Task Duration (Trace)",
]

sparrow_df = pd.read_csv(
    SPARROW_CSV_FILE_NAME,
    names=SPARROW_COLUMN_NAMES,
).assign(
    **{
        # A task ends on its node at its launch time plus its trace duration.
        "Task End Time On Node": lambda tasks: (
            tasks["Task Launch Time"] + tasks["Task Duration (Trace)"]
        )
    }
)

sparrow_df

Unnamed: 0,Job ID,Task ID,Job Arrival Time,Task Launch Time,Task Duration (Trace),Task End Time On Node
0,1,0,7.527,7.5285,6,13.5285
1,1,1,7.527,7.5285,12,19.5285
2,1,2,7.527,7.5285,17,24.5285
3,1,3,7.527,7.5285,15,22.5285
4,1,4,7.527,7.5285,25,32.5285
...,...,...,...,...,...,...
968330,24262,15,181440.000,181440.0015,6,181446.0015
968331,24262,16,181440.000,181440.0015,9,181449.0015
968332,24262,17,181440.000,181440.0015,5,181445.0015
968333,24262,18,181440.000,181440.0015,6,181446.0015


In [3]:
# Show the dtype pandas inferred for each column of the Sparrow frame.
sparrow_df.dtypes

Job ID                     int64
Task ID                    int64
Job Arrival Time         float64
Task Launch Time         float64
Task Duration (Trace)      int64
Task End Time On Node    float64
dtype: object

# Loading the Megha CSV File

In [4]:
# Megha's per-task records. No `names=` is passed, so the column labels come
# from the CSV file itself.
MEGHA_CSV_FILE_NAME = "record-2021-12-09-21-03-12_jobs_info.csv"
megha_df = pd.read_csv(MEGHA_CSV_FILE_NAME)
megha_df

Unnamed: 0,Job ID,Task ID,Job Arrival Time,Task Launch Time,Task Duration (Trace),Task Duration (GM),Task Queuing Delay,Task End Time On Node
0,1,1,7.527,7.528,6,6.002,0.001,13.528
1,1,2,7.527,7.528,12,12.002,0.001,19.528
2,1,3,7.527,7.528,17,17.002,0.001,24.528
3,1,4,7.527,7.528,15,15.002,0.001,22.528
4,1,5,7.527,7.528,25,25.002,0.001,32.528
...,...,...,...,...,...,...,...,...
968330,24262,15,181440.000,181440.001,8,8.002,0.001,181448.001
968331,24262,16,181440.000,181440.001,6,6.002,0.001,181446.001
968332,24262,17,181440.000,181440.001,9,9.002,0.001,181449.001
968333,24262,18,181440.000,181440.001,5,5.002,0.001,181445.001


In [5]:
# Show the dtype pandas inferred for each column of the Megha frame.
megha_df.dtypes

Job ID                     int64
Task ID                    int64
Job Arrival Time         float64
Task Launch Time         float64
Task Duration (Trace)      int64
Task Duration (GM)       float64
Task Queuing Delay       float64
Task End Time On Node    float64
dtype: object

# Creating the Ideal DataFrame

In [6]:
# The "ideal" baseline reuses Megha's job/task identifiers, arrival times and
# trace durations, and ends each task at arrival time + trace duration (the
# job's arrival time is used in place of a task launch time).
IDEAL_SOURCE_COLUMNS = ["Job ID", "Task ID", "Job Arrival Time", "Task Duration (Trace)"]

ideal_df = megha_df.loc[:, IDEAL_SOURCE_COLUMNS].assign(
    **{
        "Task End Time On Node": lambda tasks: (
            tasks["Job Arrival Time"] + tasks["Task Duration (Trace)"]
        )
    }
)

ideal_df

Unnamed: 0,Job ID,Task ID,Job Arrival Time,Task Duration (Trace),Task End Time On Node
0,1,1,7.527,6,13.527
1,1,2,7.527,12,19.527
2,1,3,7.527,17,24.527
3,1,4,7.527,15,22.527
4,1,5,7.527,25,32.527
...,...,...,...,...,...
968330,24262,15,181440.000,8,181448.000
968331,24262,16,181440.000,6,181446.000
968332,24262,17,181440.000,9,181449.000
968333,24262,18,181440.000,5,181445.000


In [7]:
# Show the dtype of each column of the ideal frame.
ideal_df.dtypes

Job ID                     int64
Task ID                    int64
Job Arrival Time         float64
Task Duration (Trace)      int64
Task End Time On Node    float64
dtype: object

---

# Sparrow Job Completion Time

In [8]:
# Per-job maximum of every numeric column of the Sparrow task table.
#
# The first parameter of `GroupBy.max` is `numeric_only`, not a column label,
# so the flag is passed explicitly (a column name in that position was only
# ever interpreted as a truthy `numeric_only`).
#
# Each column is reduced independently, so the values in one output row need
# not come from the same task. The per-job maximum of "Task End Time On Node"
# is the time at which the job's last task ended.
sparrow_job_completion_time_df = sparrow_df.groupby("Job ID").max(numeric_only=True)

sparrow_job_completion_time_df

Unnamed: 0_level_0,Task ID,Job Arrival Time,Task Launch Time,Task Duration (Trace),Task End Time On Node
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,50,7.527,7.5285,27,34.5285
2,33,15.092,15.0935,17,32.0935
3,23,22.650,22.6515,35,57.6515
4,4,30.004,30.0055,2,32.0055
5,2,37.425,37.4265,29,66.4265
...,...,...,...,...,...
24258,80,181410.000,181410.0015,58,181468.0015
24259,6,181418.000,181418.0015,16,181434.0015
24260,25,181425.000,181425.0015,40,181465.0015
24261,0,181433.000,181433.0015,1085,182518.0015


# Megha Job Completion Time

In [9]:
# Per-job maximum of every numeric column of the Megha task table.
#
# The first parameter of `GroupBy.max` is `numeric_only`, not a column label,
# so the flag is passed explicitly (a column name in that position was only
# ever interpreted as a truthy `numeric_only`).
#
# Each column is reduced independently, so the values in one output row need
# not come from the same task. The per-job maximum of "Task End Time On Node"
# is the time at which the job's last task ended.
megha_job_completion_time_df = megha_df.groupby("Job ID").max(numeric_only=True)

megha_job_completion_time_df

Unnamed: 0_level_0,Task ID,Job Arrival Time,Task Launch Time,Task Duration (Trace),Task Duration (GM),Task Queuing Delay,Task End Time On Node
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,51,7.527,7.528,27,27.002,0.001,34.528
2,34,15.092,15.093,17,17.002,0.001,32.093
3,24,22.650,22.651,35,35.002,0.001,57.651
4,5,30.004,30.005,2,2.002,0.001,32.005
5,3,37.425,37.426,29,29.002,0.001,66.426
...,...,...,...,...,...,...,...
24258,81,181410.000,181410.001,58,58.002,0.001,181468.001
24259,7,181418.000,181418.001,16,16.002,0.001,181434.001
24260,26,181425.000,181425.001,40,40.002,0.001,181465.001
24261,1,181433.000,181433.001,1085,1085.002,0.001,182518.001


# Ideal Job Completion Time

In [10]:
# Per-job maximum of every numeric column of the ideal task table.
#
# The first parameter of `GroupBy.max` is `numeric_only`, not a column label,
# so the flag is passed explicitly (a column name in that position was only
# ever interpreted as a truthy `numeric_only`).
#
# Each column is reduced independently, so the values in one output row need
# not come from the same task. The per-job maximum of "Task End Time On Node"
# is the time at which the job's last task ended.
ideal_job_completion_time_df = ideal_df.groupby("Job ID").max(numeric_only=True)

ideal_job_completion_time_df

Unnamed: 0_level_0,Task ID,Job Arrival Time,Task Duration (Trace),Task End Time On Node
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,51,7.527,27,34.527
2,34,15.092,17,32.092
3,24,22.650,35,57.650
4,5,30.004,2,32.004
5,3,37.425,29,66.425
...,...,...,...,...
24258,81,181410.000,58,181468.000
24259,7,181418.000,16,181434.000
24260,26,181425.000,40,181465.000
24261,1,181433.000,1085,182518.000


---

In [11]:
# Output column label -> per-job frame it is taken from. The insertion order
# fixes the left-to-right column order of the final table.
completion_frames_by_header = {
    "Megha Job Completion Time": megha_job_completion_time_df,
    "Sparrow Job Completion Time": sparrow_job_completion_time_df,
    "Ideal Job Completion Time": ideal_job_completion_time_df,
}

headers = list(completion_frames_by_header)

# Only the per-job maximum task end time is carried over from each frame.
data = [
    per_job_frame["Task End Time On Node"]
    for per_job_frame in completion_frames_by_header.values()
]

# Place the three series side by side, aligned on the shared "Job ID" index.
final_df = pd.concat(data, axis=1, keys=headers)

final_df

Unnamed: 0_level_0,Megha Job Completion Time,Sparrow Job Completion Time,Ideal Job Completion Time
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,34.528,34.5285,34.527
2,32.093,32.0935,32.092
3,57.651,57.6515,57.650
4,32.005,32.0055,32.004
5,66.426,66.4265,66.425
...,...,...,...
24258,181468.001,181468.0015,181468.000
24259,181434.001,181434.0015,181434.000
24260,181465.001,181465.0015,181465.000
24261,182518.001,182518.0015,182518.000


In [12]:
# Write the combined table to disk. "Job ID" is the frame's index and is
# written as the first CSV column (pandas' default `index=True`).
final_df.to_csv("job_completion_times.csv")

---