# In [1]:
import os
import sys
import pandas as pd
import numpy as np
import plot


class EnvironmentPara:
    """Environment of one benchmark run: the nodes used and when it ran.

    The constructor stores its arguments as-is and derives ``date_time``
    from the two time stamps via :meth:`get_date_time`.
    """

    def __init__(self, node_num:int, node_list:list[str], start_time, end_time):
        """Store the node information and compute the minute-level time stamp.

        Args:
            node_num: Number of nodes used by the run.
            node_list: Nodes used by the run (stored unchanged).
            start_time: Start stamp formatted as ``YYYYMMDD-HHMMSS``.
            end_time: End stamp formatted as ``YYYYMMDD-HHMMSS``.
        """
        self.node_num:int = node_num
        self.node_list = node_list
        self.time_start = start_time
        self.time_end = end_time
        self.date_time = self.get_date_time(start_time, end_time)

    def __str__(self):
        # Only attributes that __init__ actually sets are used here; the
        # previous implementation read non-existent ``name``/``workload``
        # attributes and therefore always raised AttributeError.
        return (
            f"EnvironmentPara(node_num={self.node_num}, "
            f"node_list={self.node_list}, date_time={self.date_time})"
        )

    @staticmethod
    def _split_stamp(stamp):
        """Split a ``YYYYMMDD-HHMMSS`` stamp into ``(date, hour, minute)``."""
        parts = stamp.split('-')
        date_part, clock_part = parts[0], parts[1]
        return date_part, int(clock_part[:2]), int(clock_part[2:4])

    def get_date_time(self, start_time, end_time):
        """Return a ``YYYY-MM-DD HH:MM:00`` label for the run.

        The date always comes from ``end_time``. The hour and minute come
        from ``start_time`` when both stamps share the same date and hour,
        and from ``end_time`` otherwise. Seconds are always reported as 00.

        Args:
            start_time: Start stamp, e.g. ``20241227-230036``.
            end_time: End stamp, e.g. ``20241227-230048``.

        Returns:
            The formatted label as a string.

        Raises:
            IndexError: If a stamp does not contain a ``-`` separator.
            ValueError: If the hour or minute digits are not numeric.
        """
        s_date, s_hour, s_min = self._split_stamp(start_time)
        e_date, e_hour, e_min = self._split_stamp(end_time)

        date = f"{e_date[0:4]}-{e_date[4:6]}-{e_date[6:8]}"

        if s_date == e_date and s_hour == e_hour:
            hour, minute = s_hour, s_min
        else:
            hour, minute = e_hour, e_min

        return f"{date} {hour:02}:{minute:02}:00"

class ProgramPara:
    """Benchmark parameters and results parsed from one task's log lines.

    Every attribute starts at zero and is overwritten only when the
    corresponding line is found, so a log without a result table yields an
    object whose time and speed are zero.
    """

    # Header that precedes the single result row in the log.
    _RESULT_HEADER = "#   MsgSize        Time             KMsgs             MB           KMsg/S           MB/S"

    def __init__(self, lines:list[str]):
        """Scan ``lines`` for the message size, repeat count and result row.

        Args:
            lines: Log lines belonging to one task.

        Raises:
            ValueError: If a recognised line carries a non-numeric value.
        """
        self.vars = 0
        self.iteration = 0
        self.Time = 0
        # The next three are never assigned by the parsing below; they keep
        # their zero defaults.
        self.KBytesXchng_per_rank_max = 0.0
        self.MB_per_sec_per_rank = 0.0
        self.Msgs_per_sec = 0.0

        # Result-table columns; initialised here so they exist even when the
        # log contains no result table.
        self.KMsgs = 0.0
        self.MB = 0.0
        self.KMsg_per_sec = 0.0
        self.MB_per_sec = 0.0

        for i, line in enumerate(lines):
            if "Message Size" in line:
                tokens = line.split()
                # The value is the last token, or the one before it when the
                # last token is not a number.
                if tokens[-1].isdigit():
                    self.vars = int(tokens[-1])
                elif tokens[-2].isdigit():
                    self.vars = int(tokens[-2])
            elif "Repeats" in line:
                self.iteration = int(line.split()[-1])
            elif line.startswith(self._RESULT_HEADER):
                # The result row is the line right after the header. A log
                # cut off at the header, or a row with too few columns, is
                # skipped instead of raising IndexError.
                if i + 1 >= len(lines):
                    continue
                fields = lines[i + 1].split()
                if len(fields) < 6:
                    continue
                self.Time = float(fields[1])
                self.KMsgs = float(fields[2])
                self.MB = float(fields[3])
                self.KMsg_per_sec = float(fields[4])
                self.MB_per_sec = float(fields[5])

    def get_speed(self):
        """Return the per-rank speed if non-zero, otherwise the overall speed.

        Returns ``0.0`` (never ``None``) when neither speed was parsed, so
        callers can compare the result numerically.
        """
        return self.MB_per_sec_per_rank or self.MB_per_sec

    def get_time(self):
        """Return the time value read from the result row (0 if absent)."""
        return self.Time


class Task:
    """One benchmark run: its environment plus the parsed program results."""

    def __init__(self, env_para: EnvironmentPara, program_para: ProgramPara):
        """Keep both parameter objects and copy out the commonly used values.

        Args:
            env_para: Environment of the run; its ``date_time`` is copied.
            program_para: Parsed results; time and speed are read through
                its ``get_time`` / ``get_speed`` methods.
        """
        # The kernel name is fixed for every task.
        self.kernel_name = 'pingpong'

        self.env_para, self.program_para = env_para, program_para

        # Shortcuts so callers do not have to reach into the two objects.
        self.date_time = env_para.date_time
        self.Time = program_para.get_time()
        self.speed = program_para.get_speed()

class Tasks:
    """Table of benchmark tasks parsed from a directory of ``.txt`` logs.

    Parsed results live in ``self.data`` (one row per valid task, columns
    given by ``headers``) and are cached in ``data-<task_type>.csv`` in the
    current working directory. ``self.tasks`` holds the matching ``Task``
    objects when the logs were parsed in this session.
    """

    # Not referenced by the methods of this class.
    speed_map = {"incast":"MB_per_sec", "pingpong":"MB_per_sec"}
    headers = ['variables', 'node_num', 'node_list', 'date_time', 'task_type', 'total_time', 'MB/s']
    # Column dtypes, in the same order as ``headers``; used when reading the cache.
    types = [int, int, str, str, str, float, float]

    def __init__(self, log_path:str, task_type:str, update_tasks=True):
        """Load tasks by parsing the logs or by reading the cached CSV.

        Args:
            log_path: Directory containing the ``.txt`` log files.
            task_type: Label used in the cache file name and plot titles.
            update_tasks: When true, or when no cache file exists, the logs
                are parsed again and the cache is rewritten; otherwise the
                cache is read and ``log_path`` is not accessed.
        """
        self.tasks = []
        self.task_type = task_type
        self.data = pd.DataFrame(columns=Tasks.headers)

        cache_file = f'data-{task_type}.csv'
        if update_tasks or not os.path.exists(cache_file):
            # The directory is listed only when it is actually needed, so a
            # cached load works even if the log directory is gone.
            self.update_tasks_datas(os.listdir(log_path), log_path)
        else:
            self.data = pd.read_csv(cache_file, dtype=dict(zip(Tasks.headers, Tasks.types)))

    def update_tasks_datas(self, file_name_list, log_path):
        """Parse every ``.txt`` file, rebuild ``self.data`` and write the cache.

        Args:
            file_name_list: File names (not paths) inside ``log_path``.
            log_path: Directory the file names are relative to.
        """
        data_row = []
        # Sorted so the order of ``self.tasks`` does not depend on the
        # order in which the file names were supplied.
        for file_name in sorted(file_name_list):
            if not file_name.endswith('.txt'):
                continue
            with open(os.path.join(log_path, file_name), 'r') as f:
                lines = f.readlines()
            data_row.extend(self._parse_log_lines(lines))

        self.data = pd.DataFrame(data_row, columns=Tasks.headers)
        self.data.sort_values(['node_list', 'date_time', 'task_type', 'variables'], inplace=True)
        self.data.to_csv(f'data-{self.task_type}.csv', index=False)

    def _parse_log_lines(self, lines):
        """Return the table rows for all valid tasks found in one log file.

        A task is the span between a ``Start time:`` line and the following
        ``End time:`` line. Valid tasks are also appended to ``self.tasks``.
        """
        rows = []
        node_num = 0
        node_list = []
        task_type = ''
        start_time = ''
        start_line = 0

        for i, line in enumerate(lines):
            if line.startswith("Node Number:"):
                node_num = int(line.split()[-1])
            elif line.startswith("Node List:"):
                node_list = line.split()[-1]
            elif line.startswith("Task Type:"):
                task_type = line.split()[-1]
            elif line.startswith("Start time:"):
                start_time = line.split()[-1]
                start_line = i + 1
            elif line.startswith("End time:"):
                end_time = line.split()[-1]
                begin_time, begin_line = start_time, start_line
                # Each End line closes the current span, valid or not.
                start_time, start_line = '', 0

                # Skip spans with a missing or malformed stamp (no '-'),
                # e.g. an End line without a preceding Start line; parsing
                # such stamps would raise.
                if '-' not in begin_time or '-' not in end_time:
                    continue

                env_para = EnvironmentPara(node_num, node_list, begin_time, end_time)
                program_para = ProgramPara(lines[begin_line:i])
                task = Task(env_para, program_para)

                # A missing speed is treated as invalid instead of being
                # compared against 0, which would raise TypeError.
                speed = task.speed
                if program_para.vars > 0 and task.Time > 0 and speed is not None and speed > 0:
                    rows.append([program_para.vars, node_num, node_list, env_para.date_time,
                                 task_type, task.Time, speed])
                    self.tasks.append(task)
        return rows

    def plot_histogram(self, save_path:str, message_size:int=102400):
        """Draw one ``total_time`` histogram per distinct ``node_list`` value.

        Args:
            save_path: Directory handed to ``plot.plot_histogram``.
            message_size: Message size in bytes shown in the title. It only
                affects the title text; the data is not filtered by it.
        """
        title_base = f"Histogram of {self.task_type} with {message_size} Bytes"
        for node_list in self.data['node_list'].unique():
            data_node = self.data[self.data['node_list'] == node_list]
            title = f"{title_base}\n{node_list}"
            file_name = f"{self.task_type}-{node_list}"
            plot.plot_histogram(data=data_node, title=title, file_name=file_name, save_path=save_path, x='total_time')



# Root directory of the congestion-test workspace.
workspace_path = "/home/hpclqz/share/project/04_TPBench/congestion_test/workspace"

# Each log flavour lives in its own sub-directory of the workspace.
interleaved_path, single_path = (
    os.path.join(workspace_path, sub_dir)
    for sub_dir in ("logs-interleaved", "logs-single")
)

# update_tasks=True: parse the logs again rather than reading the CSV cache.
tasks_inter = Tasks(interleaved_path, task_type="interleaved", update_tasks=True)
tasks_single = Tasks(single_path, task_type="single", update_tasks=True)


# In [None]:
import os
import sys
import pandas as pd
import numpy as np



# Root directory of the congestion-test workspace.
workspace_path = "/home/hpclqz/share/project/04_TPBench/congestion_test/workspace"

# Figures go into a "figures" folder under the workspace; create it if missing.
figure_path = os.path.join(workspace_path, "figures")
os.makedirs(figure_path, exist_ok=True)

# Plot both task sets built earlier (tasks_inter, tasks_single).
tasks_inter.plot_histogram(figure_path)
tasks_single.plot_histogram(figure_path)
