In [1]:
import os
import sys
import pandas as pd
import numpy as np
import plot


class EnvironmentPara:
    """Node layout and time window of one benchmark run.

    Attributes:
        node_num: number of nodes taking part in the run.
        node_list: the nodes taking part in the run, stored as given.
        time_start: raw start stamp, formatted ``YYYYMMDD-HHMMSS``.
        time_end: raw end stamp, formatted ``YYYYMMDD-HHMMSS``.
        date_time: minute-resolution label produced by ``get_date_time``.
    """

    def __init__(self, node_num:int, node_list:list[str], start_time, end_time):
        self.node_num:int = node_num
        self.node_list = node_list
        self.time_start = start_time
        self.time_end = end_time
        self.date_time = self.get_date_time(start_time, end_time)

    def __str__(self):
        # Only attributes that __init__ actually sets are used here; the
        # previous implementation read attributes that do not exist.
        return (f"EnvironmentPara(node_num={self.node_num}, "
                f"node_list={self.node_list}, date_time={self.date_time})")

    @staticmethod
    def _split_stamp(stamp):
        """Split ``YYYYMMDD-HHMMSS`` into ``(date_digits, hour, minute)``."""
        parts = stamp.split('-')
        date_digits, clock = parts[0], parts[1]
        return date_digits, int(clock[:2]), int(clock[2:4])

    def get_date_time(self, start_time, end_time):
        """Return a ``YYYY-MM-DD HH:MM:00`` label for the run.

        The date always comes from ``end_time``. The hour and minute come from
        ``start_time`` when both stamps share the same date and hour, and from
        ``end_time`` otherwise. Seconds are always reported as ``00``.

        Args:
            start_time: stamp such as ``20241227-230036``.
            end_time: stamp such as ``20241227-230048``.

        Raises:
            IndexError: if a stamp contains no ``-`` separator.
            ValueError: if the hour or minute digits are not numeric.
        """
        s_date, s_hour, s_min = self._split_stamp(start_time)
        e_date, e_hour, e_min = self._split_stamp(end_time)

        date = f"{e_date[0:4]}-{e_date[4:6]}-{e_date[6:8]}"
        if s_date == e_date and s_hour == e_hour:
            hour, minute = s_hour, s_min
        else:
            hour, minute = e_hour, e_min
        return f"{date} {hour:02}:{minute:02}:00"

class ProgramPara:
    """Benchmark figures parsed from the output lines of one run.

    Every numeric attribute starts at zero and keeps that value when the
    corresponding line is absent or cannot be parsed, so callers can treat
    "zero" as "not measured".

    Attributes:
        vars: integer taken from the "Message Size" line.
        iteration: integer taken from the "Repeats" line.
        Time, KMsgs, MB, KMsg_per_sec, MB_per_sec: columns 2-6 of the row
            that follows the result header.
        KBytesXchng_per_rank_max, MB_per_sec_per_rank, Msgs_per_sec: never
            filled by this parser; kept at 0.0.
    """

    # Column header that precedes the result row in the benchmark output.
    _RESULT_HEADER = "#   MsgSize        Time             KMsgs             MB           KMsg/S           MB/S"

    def __init__(self, lines:list[str]):
        """Scan ``lines`` and record the values found.

        Args:
            lines: output lines of a single benchmark run.
        """
        self.vars = 0
        self.iteration = 0
        self.Time = 0
        self.KBytesXchng_per_rank_max = 0.0
        self.MB_per_sec_per_rank = 0.0

        self.Msgs_per_sec = 0.0
        self.MB_per_sec = 0.0
        # Defined up front so the attributes exist even without a result row.
        self.KMsgs = 0.0
        self.MB = 0.0
        self.KMsg_per_sec = 0.0

        for i, line in enumerate(lines):
            if "Message Size" in line:
                fields = line.split()
                # The size is either the last token or is followed by a unit.
                for candidate in (fields[-1], fields[-2]):
                    if candidate.isdigit():
                        self.vars = int(candidate)
                        break
            elif "Repeats" in line:
                last = line.split()[-1]
                if last.isdigit():
                    self.iteration = int(last)
            elif line.startswith(self._RESULT_HEADER) and i + 1 < len(lines):
                # pingpong: the numbers sit on the line right after the header.
                self._read_result_row(lines[i + 1])

    def _read_result_row(self, row):
        """Store columns 2-6 of ``row``; leave the defaults on a bad row."""
        try:
            values = [float(token) for token in row.split()[1:6]]
        except ValueError:
            return
        if len(values) < 5:
            return
        (self.Time, self.KMsgs, self.MB,
         self.KMsg_per_sec, self.MB_per_sec) = values

    def get_speed(self):
        """Return the per-rank rate if non-zero, else the total rate, else 0.0."""
        # Always numeric: callers compare the result with ``> 0``.
        return self.MB_per_sec_per_rank or self.MB_per_sec or 0.0

    def get_time(self):
        """Return the elapsed time read from the result row (0 if none)."""
        return self.Time


class Task:
    """One benchmark run: its environment plus its measured figures.

    The elapsed time and the speed are copied out of ``program_para`` at
    construction time, and the date label is copied out of ``env_para``.
    """

    def __init__(self, env_para: EnvironmentPara, program_para: ProgramPara):
        self.kernel_name = 'pingpong'
        self.env_para, self.program_para = env_para, program_para
        # Snapshot the derived values so callers can read them directly.
        self.date_time = env_para.date_time
        self.Time = program_para.get_time()
        self.speed = program_para.get_speed()


                    
                            
                            

class Tasks:
    """All benchmark runs found in the ``.txt`` logs of one directory.

    The parsed runs are held as a DataFrame in ``self.data`` (columns listed
    in ``headers``) and cached in ``data-<task_type>.csv`` in the current
    working directory.

    Attributes:
        tasks: the ``Task`` objects that passed the validity filter; only
            filled when the logs are parsed, not when the cache is loaded.
        task_type: label used in the cache file name and in plot titles.
        data: one row per valid run.
    """

    speed_map = {"incast":"MB_per_sec", "pingpong":"MB_per_sec"}
    headers = ['variables', 'node_num', 'node_list', 'date_time', 'task_type', 'total_time', 'MB/s']
    types = [int, int, str, str, str, float, float]

    def __init__(self, log_path:str, task_type:str, update_tasks=True):
        """Parse the logs under ``log_path`` or load the cached CSV.

        Args:
            log_path: directory holding the ``.txt`` log files.
            task_type: label for this set of runs.
            update_tasks: when true, or when no cache file exists, parse the
                logs again and rewrite the cache.
        """
        self.tasks = []
        self.task_type = task_type
        self.data = pd.DataFrame(columns=Tasks.headers)

        cache_file = f'data-{task_type}.csv'
        if update_tasks or not os.path.exists(cache_file):
            # The directory is only listed when it is really needed, so the
            # cached path keeps working without the raw logs.
            self.update_tasks_datas(os.listdir(log_path), log_path)
        else:
            self.data = pd.read_csv(cache_file, dtype=dict(zip(Tasks.headers, Tasks.types)))

    def update_tasks_datas(self, file_name_list, log_path):
        """Parse every ``.txt`` file, rebuild ``self.data`` and write the cache.

        Args:
            file_name_list: file names (not paths) inside ``log_path``.
            log_path: directory the file names are relative to.
        """
        data_row = []
        for file_name in file_name_list:
            if not file_name.endswith('.txt'):
                continue
            with open(os.path.join(log_path, file_name), 'r') as f:
                lines = f.readlines()
            data_row.extend(self._parse_log(lines))

        self.data = pd.DataFrame(data_row, columns=Tasks.headers)
        self.data.sort_values(['node_list', 'date_time', 'task_type', 'variables'], inplace=True)
        self.data.to_csv(f'data-{self.task_type}.csv', index=False)

    def _parse_log(self, lines):
        """Return the data rows of one log file and record its valid tasks."""
        rows = []
        node_num = 0
        node_list = []
        task_type = ''
        start_time = ''
        start_line = 0
        for i, line in enumerate(lines):
            if line.startswith("Node Number:"):
                node_num = int(line.split()[-1])
            elif line.startswith("Node List:"):
                node_list = line.split()[-1]
            elif line.startswith("Task Type:"):
                task_type = line.split()[-1]
            elif line.startswith("Start time:"):
                start_time = line.split()[-1]
                start_line = i + 1
            elif line.startswith("End time:"):
                if not start_time:
                    # An end marker without a start marker has no run body.
                    continue
                end_time = line.split()[-1]
                env_para = EnvironmentPara(node_num, node_list, start_time, end_time)
                # The benchmark output sits between the two time markers.
                program_para = ProgramPara(lines[start_line:i])
                task = Task(env_para, program_para)
                speed = task.speed
                # The speed may be missing (None); compare only real numbers.
                if (program_para.vars > 0 and task.Time > 0
                        and speed is not None and speed > 0):
                    rows.append([program_para.vars, node_num, node_list, env_para.date_time,
                                 task_type, task.Time, speed])
                    self.tasks.append(task)
                start_time = ''
                start_line = 0
        return rows

    def plot_histogram(self, save_path:str):
        """Plot time and speed histograms per message size and node list.

        Args:
            save_path: directory handed to ``plot.plot_histogram``.
        """
        for variables in self.data['variables'].unique():
            data_var = self.data[self.data['variables'] == variables]
            title_base = f"Histogram of {self.task_type} with {variables} Bytes"
            for node_list in data_var['node_list'].unique():
                data_node = data_var[data_var['node_list'] == node_list]
                title = f"{title_base}\n{node_list}"
                file_name = f"Histogram-{variables}-{node_list}-{self.task_type}"
                # NOTE(review): both calls pass the same file_name; confirm
                # that plot.plot_histogram tells the two metrics apart.
                plot.plot_histogram(data=data_node, title=title, file_name=file_name, save_path=save_path, x='total_time')
                plot.plot_histogram(data=data_node, title=title, file_name=file_name, save_path=save_path, x='MB/s')

    def compare_histogram(self, task2:"Tasks", save_path:str, title:str):
        """Plot time histograms of this set against ``task2``.

        Only message sizes and node lists present in both sets are plotted.

        Args:
            task2: the other set of runs; its ``data`` frame is read.
            save_path: directory handed to ``plot.plot_histogram``.
            title: accepted for the caller's convenience but currently not
                used; each plot title is built from the task type, message
                size and node list.
        """
        shared_sizes = list(set(self.data['variables'].unique()) & set(task2.data['variables'].unique()))
        for variable in shared_sizes:
            data_var_1 = self.data[self.data['variables'] == variable]
            data_var_2 = task2.data[task2.data['variables'] == variable]
            title_base = f"Histogram of {self.task_type} with {variable} Bytes"
            shared_nodes = list(set(data_var_1['node_list'].unique()) & set(data_var_2['node_list'].unique()))
            for node in shared_nodes:
                data_node_1 = data_var_1[data_var_1['node_list'] == node]
                data_node_2 = data_var_2[data_var_2['node_list'] == node]
                plot_title = f"{title_base}\n{node}"
                # NOTE(review): the file name does not contain the message
                # size; confirm that plots of different sizes do not collide.
                file_name = f"{self.task_type}-{node}"
                plot.plot_histogram(data1=data_node_1, data2=data_node_2, title=plot_title, file_name=file_name, save_path=save_path, x='total_time')
                



# Root directory of the congestion test; the two log sets live below it.
workspace_path="/home/hpclqz/share/project/04_TPBench/congestion_test/workspace"
interleaved_path = os.path.join(workspace_path, "logs-interleaved")
single_path = os.path.join(workspace_path, "logs-single")
# update_tasks=True parses the logs again and rewrites data-<task_type>.csv.
tasks_inter = Tasks(interleaved_path, task_type="interleaved", update_tasks=True)
tasks_single = Tasks(single_path, task_type="single", update_tasks=True)


In [None]:
import os

# Same workspace root as the parsing cell; the figures go into a sub-directory.
workspace_path="/home/hpclqz/share/project/04_TPBench/congestion_test/workspace"
figure_path = os.path.join(workspace_path, "figures")
# Create the output directory if needed; an existing one is not an error.
os.makedirs(figure_path, exist_ok=True)
tasks_inter.plot_histogram(figure_path)
tasks_single.plot_histogram(figure_path)
# Disabled. Note that Tasks.compare_histogram also requires a `title` argument.
# tasks_inter.compare_histogram(tasks_single, figure_path)

In [6]:
class Tasks_Stastic():
    """Summary statistics of the interleaved and the single benchmark runs.

    ``self.data`` holds one row per (node list, message size, task type,
    metric) with the columns listed in ``headers``. The table is cached in
    ``data-stastic.csv`` in the current working directory.

    ``hostfile_pair_list`` and ``nodes_pair_list`` are not referenced by the
    methods of this class.
    """

    hostfile_pair_list:list[tuple] = [("hostfile1", "hostfile2"), ("hostfile3", "hostfile4"), ("hostfile5", "hostfile6")]
    nodes_pair_list:list[tuple] = [("pi1,pi2", "pi3,pi4"), ("pi1,pi3", "pi2,pi4"), ("pi1,pi4", "pi2,pi3"), ("pi2,pi3", "pi1,pi4"), ("pi2,pi4", "pi1,pi3"), ("pi3,pi4", "pi1,pi2")]
    headers = ['nodes', 'variables', 'task_type', 'perf_type', 'average', 'std', 'max', 'min', '1%', '5%', '1%_avg', '5%_avg']

    def __init__(self, tasks_inter:Tasks, tasks_single:Tasks, update_data=True):
        """Compute the statistics, or load them from the cache file.

        Args:
            tasks_inter: runs whose statistics are added first.
            tasks_single: runs whose statistics are added second.
            update_data: when true, or when no cache file exists, recompute
                the table and rewrite the cache.
        """
        self.data:pd.DataFrame = None
        if not update_data and os.path.exists('data-stastic.csv'):
            self.data = pd.read_csv('data-stastic.csv')
            return

        for task_set in (tasks_inter, tasks_single):
            self.nodes_perf_stastic(task_set)
        self.data.sort_values(['perf_type', 'nodes', 'variables', 'task_type'], inplace=True)
        self.data.to_csv('data-stastic.csv', index=False)

    def nodes_perf_stastic(self, tasks:Tasks):
        """Append the statistics of ``tasks.data`` to ``self.data``.

        For the time metric the 1% / 5% columns are the 0.99 / 0.95 quantiles
        and the tail averages cover the values above them. For the speed
        metric they are the 0.01 / 0.05 quantiles and the tail averages cover
        the values below them.
        """
        frame = tasks.data
        rows = []
        for node in frame['node_list'].unique():
            per_node = frame[frame['node_list'] == node]
            for size in per_node['variables'].unique():
                per_size = per_node[per_node['variables'] == size]
                for kind in per_size['task_type'].unique():
                    subset = per_size[per_size['task_type'] == kind]
                    for metric in ('total_time', 'MB/s'):
                        series = subset[metric]
                        if 'time' in metric:
                            # Large values are the bad ones: use the upper tail.
                            q_one, q_five = series.quantile(0.99), series.quantile(0.95)
                            tail_one, tail_five = series[series > q_one], series[series > q_five]
                        else:
                            # Small values are the bad ones: use the lower tail.
                            q_one, q_five = series.quantile(0.01), series.quantile(0.05)
                            tail_one, tail_five = series[series < q_one], series[series < q_five]
                        rows.append([
                            node, size, kind, metric,
                            series.mean(), series.std(), series.max(), series.min(),
                            q_one, q_five, tail_one.mean(), tail_five.mean(),
                        ])

        stats = pd.DataFrame(rows, columns=Tasks_Stastic.headers)
        self.data = stats if self.data is None else pd.concat([self.data, stats])

    def plot_stastic(self, save_path:str):
        """Placeholder for plotting the statistics; does nothing yet."""
        pass
                
# update_data=True recomputes the statistics and rewrites data-stastic.csv.
tasks_stastic = Tasks_Stastic(tasks_inter, tasks_single, update_data=True)

In [None]:
class Task_Paired():
    """Record that groups the four runs of two node pairs.

    Holds one slot per node pair for the interleaved run and for the single
    run, plus the speed and time figures of each mode. Every field starts
    empty (``None`` or ``0.0``) and is meant to be filled in by the caller.
    """

    def __init__(self):
        self.kernel_name, self.date_time = 'pingpong', ''
        # Task slots of the two node pairs in interleaved mode.
        self.nodepair1_inter, self.nodepair2_inter = None, None
        # Task slots of the two node pairs in single mode.
        self.nodepair1_single, self.nodepair2_single = None, None
        # Average speed of the interleaved / single node pair.
        self.speed_inter, self.speed_single = 0.0, 0.0
        self.time_inter, self.time_single = 0.0, 0.0

    
class Task_Paired_List():
    hostfile_pair_list:list[tuple] = [("hostfile1", "hostfile2"), ("hostfile3", "hostfile4"), ("hostfile5", "hostfile6")]
    nodes_pair_list:list[tuple] = [("pi1,pi2", "pi3,pi4"), ("pi1,pi3", "pi2,pi4"), ("pi1,pi4", "pi2,pi3"), ("pi2,pi3", "pi1,pi4"), ("pi2,pi4", "pi1,pi3"), ("pi3,pi4", "pi1,pi2")]
    headers=['variables', 'node_list', 'date_time', 'task_type', 'total_time', 'MB/s']
    def __init__(self, log_path:str, update_tasks=True):
        self.task_paired_list = []
        self.data = pd.DataFrame(columns=Tasks.headers)
    
    def update_tasks_datas(self, file_name_list, log_path):
        log_interleaved_path = os.path.join(log_path, "logs-interleaved")   
        log_single_path = os.path.join(log_path, "logs-single")
        file_inter_list = os.listdir(log_interleaved_path)
        file_single_list = os.listdir(log_single_path)
        time_list_dict:dict[str, list] = {}
        for file_name in file_inter_list:
            if file_name.endswith('.txt'):
                hostfile= file_name.split('-')[1]
                date=file_name.split('-')[2]
                time=file_name.split('-')[3].split('.')[0]
                data_time=f"{date}-{time}"
                if hostfile not in time_list:
                    time_list[hostfile] = []
                if data_time not in time_list[hostfile]:
                    time_list[hostfile].append(data_time)
        for hostfile_pair in Task_Paired_List.hostfile_pair_list:
            for nodes_pair in Task_Paired_List.nodes_pair_list:
                time_list = time_list_dict[hostfile_pair[0]]
                for time in time_list:
                    file1_inter_path=os.path.join(log_interleaved_path,f"logs-{hostfile_pair[0]}-{time}.txt")
                    file2_inter_path=os.path.join(log_interleaved_path,f"logs-{hostfile_pair[1]}-{time}.txt")
                    file1_single_path=os.path.join(log_single_path,f"logs-{hostfile_pair[0]}-{time}.txt")
                    file2_single_path=os.path.join(log_single_path,f"logs-{hostfile_pair[1]}-{time}.txt")
                    if os.path.exists(file1_inter_path) == False or os.path.exists(file2_inter_path) == False or os.path.exists(file1_single_path) == False or os.path.exists(file2_single_path) == False:
                        continue
                    file_list = [file1_inter_path, file2_inter_path, file1_single_path, file2_single_path]
