In [53]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# from utils import *

# Notebook-wide display settings: allow up to 1000 characters per cell and
# 100 columns when rendering DataFrames, and use a larger font and thicker
# lines in matplotlib figures.
pd.options.display.max_colwidth = 1000
pd.options.display.max_columns = 100
matplotlib.rcParams["font.size"] = 16
matplotlib.rcParams["lines.linewidth"] = 2.5

In [54]:
# Read a CSV file into a DataFrame.
def get_df(file, header=None):
    """Read a headerless CSV file and attach column names to it.

    Parameters
    ----------
    file : str or path-like
        Path of the CSV data file. The file is parsed with ``header=None``,
        i.e. its first line is treated as data, not as column names.
    header : list-like, optional
        Column names to assign. When None, the names are taken from the
        header row of the companion file obtained by replacing the last
        ".csv" in ``file`` with ".header".

    Returns
    -------
    pandas.DataFrame
        The CSV contents with the resolved column names.
    """
    df = pd.read_csv(file, header=None)  # header=None: the data file has no header row
    if header is None:
        # Cut at the LAST ".csv" only. Splitting at the first occurrence
        # truncates paths whose directories contain ".csv"
        # (e.g. "dump.csv.d/table.csv" must map to "dump.csv.d/table.header").
        stem = str(file).rsplit('.csv', 1)[0]
        header = pd.read_csv("{}.header".format(stem)).columns
    df.columns = header
    return df

# Directory holding the trace tables; get_df looks for a matching
# "<name>.header" file next to every "<name>.csv".
DATA_DIR = './data/'
dfj = get_df(f'{DATA_DIR}pai_job_table.csv')        # job table
dft = get_df(f'{DATA_DIR}pai_task_table.csv')       # task table
dfi = get_df(f'{DATA_DIR}pai_instance_table.csv')   # instance table
dfs = get_df(f'{DATA_DIR}pai_sensor_table.csv')     # sensor table
dfg = get_df(f'{DATA_DIR}pai_group_tag_table.csv')  # group tag table
dfp = get_df(f'{DATA_DIR}pai_machine_spec.csv')     # machine spec table
dfm = get_df(f'{DATA_DIR}pai_machine_metric.csv')   # machine metric table

In [55]:
def get_dfia(dfi):
    """Aggregate instance records to one row per (job_name, task_name).

    Timestamps that are not strictly positive are treated as missing and are
    excluded from the corresponding aggregate.

    Parameters
    ----------
    dfi : pandas.DataFrame
        Instance table with at least the columns ``job_name``, ``task_name``,
        ``status``, ``start_time`` and ``end_time``.

    Returns
    -------
    pandas.DataFrame
        For every (job_name, task_name) pair present in ``dfi``:
        ``status`` (maximum of the distinct status values, i.e. the
        lexicographically largest one for strings), ``start_time`` (earliest
        positive start), ``end_time`` (latest positive end) and ``runtime``
        (mean of ``end_time - start_time`` over instances where both are
        positive). Pairs lacking a valid value get NaN in that column.
    """
    keys = ['job_name', 'task_name']
    has_start = dfi.start_time > 0
    has_end = dfi.end_time > 0

    dfi_s = dfi.loc[has_start, keys + ['start_time']].groupby(keys).min()  # earliest start_time
    dfi_e = dfi.loc[has_end, keys + ['end_time']].groupby(keys).max()      # latest end_time

    # Build the runtime column with assign() on a fresh frame rather than by
    # writing into a filtered slice (which triggers SettingWithCopyWarning),
    # and average only the column that is actually kept.
    dfi_m = (
        dfi.loc[has_start & has_end, keys + ['end_time', 'start_time']]
        .assign(runtime=lambda d: d.end_time - d.start_time)
        .groupby(keys)[['runtime']]
        .mean()
        .reset_index()
    )  # mean instance runtime

    dfi_u = dfi[keys + ['status']].drop_duplicates().groupby(keys).max()  # status

    # Left-join so that every (job, task) pair with a status is retained even
    # when it has no valid start/end/runtime.
    dfia = dfi_u
    for df in [dfi_s, dfi_e, dfi_m]:
        dfia = dfia.merge(df, on=keys, how='left')
    return dfia

def get_dfa(dft, dfj, dfi, dfg):
    """Join task, job, instance and group-tag data into one task-level table.

    Parameters
    ----------
    dft : pandas.DataFrame
        Task table; needs ``job_name``, ``task_name``, ``start_time`` and
        ``end_time``.
    dfj : pandas.DataFrame
        Job table, joined on ``job_name``. Its columns that clash with task
        columns receive the suffix ``_j``.
    dfi : pandas.DataFrame
        Instance table, aggregated per task by ``get_dfia``; the aggregated
        columns that clash receive the suffix ``_i``.
    dfg : pandas.DataFrame
        Group-tag table, joined on ``inst_id`` (its ``user`` column is not
        carried over).

    Returns
    -------
    pandas.DataFrame
        The joined table with the derived columns ``runtime`` (task end minus
        start), ``duration_min`` (mean instance runtime in minutes),
        ``wait_time`` (first instance start minus task start) and
        ``start_date`` (task start as an Asia/Shanghai timestamp).

    Notes
    -----
    The merged frame must contain ``inst_id`` and ``user`` columns (in the
    data shown in this notebook they appear to come from the job table).
    """
    print('dft + dfj ...')
    dfa = dft.merge(dfj, on=['job_name'], suffixes = ['','_j'])
    # A start_time of 0 marks a task that never started: blank out both
    # timestamps. The mask has to be evaluated once, up front; re-testing
    # start_time==0 after it has been overwritten with NaN matches nothing
    # and would leave end_time untouched.
    not_started = dfa.start_time == 0
    dfa.loc[not_started, ['start_time', 'end_time']] = np.nan
    dfa['runtime'] = dfa.end_time - dfa.start_time
    print('dft + dfj + dfi ...')
    dfia = get_dfia(dfi)
    # Default inner join: tasks without any instance record are dropped here.
    dfa = dfa.merge(dfia, on=['job_name','task_name'], suffixes=['','_i'])
    dfa['duration_min'] = dfa.runtime_i / 60  # duration of instances
    dfa['wait_time'] = dfa.start_time_i - dfa.start_time # task wait time
    dfa['start_date']=dfa.start_time.apply(pd.Timestamp, unit='s', tz='Asia/Shanghai') # task start time
    # dfa = dfa[dfa.status=='Terminated']
    print('dft + dfj + dfi + dfg ...')
    # how='left' keeps tasks whose inst_id has no group tag (group stays NaN).
    dfa = dfa.merge(dfg[[x for x in dfg.columns if x != 'user']], on='inst_id', how='left')
    # Fall back to the user as the group for tasks without a group tag.
    no_group = dfa['group'].isnull()
    dfa.loc[no_group, 'group'] = dfa.loc[no_group, 'user']
    return dfa

# Build the task-level table by joining the task, job, instance and group-tag tables.
dfa = get_dfa(dft, dfj, dfi, dfg)       # dfa: dataframe of task

dft + dfj ...
dft + dfj + dfi ...
dft + dfj + dfi + dfg ...


In [56]:
# Preview the first 10 rows of the joined task table.
dfa.head(10)

Unnamed: 0,job_name,task_name,inst_num,status,start_time,end_time,plan_cpu,plan_mem,plan_gpu,gpu_type,inst_id,user,status_j,start_time_j,end_time_j,runtime,status_i,start_time_i,end_time_i,runtime_i,duration_min,wait_time,start_date,gpu_type_spec,group,workload
0,c936346f45eccd34bf748541,tensorflow,1.0,Terminated,2693235.0,2695847.0,600.0,29.296875,50.0,MISC,990f1799a5093b62142b101c0227875b81c7f0329301df98af1148ec8724,58540f191766,Terminated,2693235.0,2695847.0,2612.0,Terminated,2693240.0,2695847.0,2607.0,43.45,5.0,1970-02-01 12:07:15+08:00,,31c5b7fa71e1224f9dff5c5a4d24b571,
1,455c3dec270f4777ad67721c,tensorflow,1.0,Terminated,3399583.0,3399732.0,600.0,29.296875,100.0,MISC,d94b4b01a6dd6c1865972cd628abe206422143ef37e6d449d4b2e862e574,ebba56ec23b9,Terminated,3399583.0,3399732.0,149.0,Terminated,3399681.0,3399732.0,51.0,0.85,98.0,1970-02-09 16:19:43+08:00,,5a3163d07db392e47b6f0d063906592f,
2,ba64aa2f0feff18428923e92,tensorflow,1.0,Terminated,2152271.0,2158213.0,600.0,29.296875,50.0,MISC,af71dfe31db0378561e1bea2c26605f67cb7611f0d38661b43881a1c6006,de69ddc1064e,Terminated,2152271.0,2158213.0,5942.0,Terminated,2152280.0,2158213.0,5933.0,98.883333,9.0,1970-01-26 05:51:11+08:00,,f94dcf2c0efdf3fd36f75dbe1709c43f,
3,704783be2a4b7f88b8d2e4ee,worker,100.0,Failed,2172980.0,,600.0,9.765625,10.0,MISC,a63da076c1210109040acf759e2c1d00b8abc2d84305990a5941d0142690,cfcb2e0deb93,Failed,2172974.0,2173216.0,,Interrupted,2172995.0,,,,15.0,1970-01-26 11:36:20+08:00,,e933ce21ede6be61f206b69b225f9a4a,
4,704783be2a4b7f88b8d2e4ee,ps,10.0,Running,2172974.0,2173216.0,600.0,19.53125,,,a63da076c1210109040acf759e2c1d00b8abc2d84305990a5941d0142690,cfcb2e0deb93,Failed,2172974.0,2173216.0,242.0,Running,,,,,,1970-01-26 11:36:14+08:00,,e933ce21ede6be61f206b69b225f9a4a,
5,bce3b9c55772f9d1e85fe796,PyTorchWorker,4.0,Failed,2374681.0,,400.0,10.0,100.0,V100,8dd676ab7fef90186d5813c5255ce4e89b50bdd9854225e7fa082289c88f,11b3ab54fc9f,Failed,2374681.0,,,Interrupted,2374699.0,2378674.0,,,18.0,1970-01-28 19:38:01+08:00,V100,a2e43ea2be689ce60787b3174f3ffbbf,
6,d45e51734a9bd73fb94fa849,xComputeWorker,1.0,Terminated,2359994.0,2360345.0,1000.0,19.550781,50.0,T4,b41f0a87eba4a15f63ca1e0a881921036522a70760a9936281e4cf026eb0,c4cbaac9966d,Terminated,2359994.0,2360345.0,351.0,Terminated,2359997.0,2360345.0,348.0,5.8,3.0,1970-01-28 15:33:14+08:00,,7755b21bfabe33c2399770fd4f3be0f9,
7,b609d0b21c1702836d822f03,tensorflow,1.0,Terminated,2362291.0,2362331.0,600.0,29.296875,25.0,MISC,cb177eaca0c98d42513f19dc593586f5768569b059e7a91d0b57605da35d,a7c4e1d6c98c,Terminated,2362291.0,2362331.0,40.0,Terminated,2362298.0,2362331.0,33.0,0.55,7.0,1970-01-28 16:11:31+08:00,,924cd0eabdc82669508175677273bcbc,
8,547ad3599b438db832ed11e3,ps,5.0,Terminated,1258178.0,1259908.0,600.0,39.0625,,,6b2a4936ece386a95ce45581009bb9498f79a7c595d2413f0448517f2515,3a7b31330ae3,Terminated,1258178.0,1259908.0,1730.0,Terminated,1258181.0,1259908.0,1725.2,28.753333,3.0,1970-01-15 21:29:38+08:00,,8e816282d753167dc900b997c717cdb5,
9,547ad3599b438db832ed11e3,worker,20.0,Terminated,1258181.0,1259850.0,600.0,19.53125,50.0,MISC,6b2a4936ece386a95ce45581009bb9498f79a7c595d2413f0448517f2515,3a7b31330ae3,Terminated,1258178.0,1259908.0,1669.0,Terminated,1258225.0,1259850.0,1123.7,18.728333,44.0,1970-01-15 21:29:41+08:00,,8e816282d753167dc900b997c717cdb5,


In [57]:
# Remove the identifier / user / tag columns from the task table.
# `columns=` already selects the axis, so no `axis` argument is needed;
# errors='ignore' lets the cell be re-run without a KeyError once the
# columns are gone.
dfa.drop(
    columns=['job_name', 'inst_id', 'user', 'group', 'gpu_type_spec', 'workload'],
    errors='ignore',
    inplace=True,
)

In [58]:
# Summarise dtypes and non-null counts to see which columns contain missing values.
dfa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1260920 entries, 0 to 1260919
Data columns (total 20 columns):
 #   Column        Non-Null Count    Dtype                        
---  ------        --------------    -----                        
 0   task_name     1260920 non-null  object                       
 1   inst_num      1260920 non-null  float64                      
 2   status        1260920 non-null  object                       
 3   start_time    1257336 non-null  float64                      
 4   end_time      911489 non-null   float64                      
 5   plan_cpu      1242596 non-null  float64                      
 6   plan_mem      1242596 non-null  float64                      
 7   plan_gpu      1037085 non-null  float64                      
 8   gpu_type      1043312 non-null  object                       
 9   status_j      1260920 non-null  object                       
 10  start_time_j  1254108 non-null  float64                      
 11  end_time_j 

In [59]:
# Drop every row that has a missing value in any of the listed columns.
dfa.dropna(
    axis=0,
    subset=[
        "start_time", "end_time",
        "plan_cpu", "plan_mem", "plan_gpu", "gpu_type",
        "start_time_j", "end_time_j", "runtime",
        "start_time_i", "end_time_i", "runtime_i",
        "duration_min", "wait_time", "start_date",
    ],
    inplace=True,
)

In [60]:
# Re-check dtypes and non-null counts after the rows with missing values were dropped.
dfa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 733617 entries, 0 to 1260918
Data columns (total 20 columns):
 #   Column        Non-Null Count   Dtype                        
---  ------        --------------   -----                        
 0   task_name     733617 non-null  object                       
 1   inst_num      733617 non-null  float64                      
 2   status        733617 non-null  object                       
 3   start_time    733617 non-null  float64                      
 4   end_time      733617 non-null  float64                      
 5   plan_cpu      733617 non-null  float64                      
 6   plan_mem      733617 non-null  float64                      
 7   plan_gpu      733617 non-null  float64                      
 8   gpu_type      733617 non-null  object                       
 9   status_j      733617 non-null  object                       
 10  start_time_j  733617 non-null  float64                      
 11  end_time_j    733617 non-

In [61]:
# Preview the first 10 remaining rows (the original row labels are kept, so the index has gaps).
dfa.head(10)

Unnamed: 0,task_name,inst_num,status,start_time,end_time,plan_cpu,plan_mem,plan_gpu,gpu_type,status_j,start_time_j,end_time_j,runtime,status_i,start_time_i,end_time_i,runtime_i,duration_min,wait_time,start_date
0,tensorflow,1.0,Terminated,2693235.0,2695847.0,600.0,29.296875,50.0,MISC,Terminated,2693235.0,2695847.0,2612.0,Terminated,2693240.0,2695847.0,2607.0,43.45,5.0,1970-02-01 12:07:15+08:00
1,tensorflow,1.0,Terminated,3399583.0,3399732.0,600.0,29.296875,100.0,MISC,Terminated,3399583.0,3399732.0,149.0,Terminated,3399681.0,3399732.0,51.0,0.85,98.0,1970-02-09 16:19:43+08:00
2,tensorflow,1.0,Terminated,2152271.0,2158213.0,600.0,29.296875,50.0,MISC,Terminated,2152271.0,2158213.0,5942.0,Terminated,2152280.0,2158213.0,5933.0,98.883333,9.0,1970-01-26 05:51:11+08:00
6,xComputeWorker,1.0,Terminated,2359994.0,2360345.0,1000.0,19.550781,50.0,T4,Terminated,2359994.0,2360345.0,351.0,Terminated,2359997.0,2360345.0,348.0,5.8,3.0,1970-01-28 15:33:14+08:00
7,tensorflow,1.0,Terminated,2362291.0,2362331.0,600.0,29.296875,25.0,MISC,Terminated,2362291.0,2362331.0,40.0,Terminated,2362298.0,2362331.0,33.0,0.55,7.0,1970-01-28 16:11:31+08:00
9,worker,20.0,Terminated,1258181.0,1259850.0,600.0,19.53125,50.0,MISC,Terminated,1258178.0,1259908.0,1669.0,Terminated,1258225.0,1259850.0,1123.7,18.728333,44.0,1970-01-15 21:29:41+08:00
10,tensorflow,1.0,Terminated,1208483.0,1208543.0,600.0,29.296875,25.0,MISC,Terminated,1208483.0,1208543.0,60.0,Terminated,1208521.0,1208543.0,22.0,0.366667,38.0,1970-01-15 07:41:23+08:00
12,worker,80.0,Terminated,3028751.0,3093250.0,300.0,6.835938,25.0,P100,Terminated,3028709.0,3093306.0,64499.0,Terminated,3028753.0,3093250.0,64320.025,1072.000417,2.0,1970-02-05 09:19:11+08:00
14,worker,20.0,Terminated,2414380.0,2415569.0,600.0,29.296875,20.0,MISC,Terminated,2409576.0,2415625.0,1189.0,Terminated,2414421.0,2415569.0,988.7,16.478333,41.0,1970-01-29 06:39:40+08:00
15,PyTorchWorker,8.0,Terminated,2087083.0,2087221.0,1800.0,58.59375,100.0,MISC,Terminated,2087083.0,2087221.0,138.0,Terminated,2087091.0,2087221.0,127.875,2.13125,8.0,1970-01-25 11:44:43+08:00


In [62]:
# Collect the distinct values of each categorical column, then print them
# one array per column.
task_name_types = dfa['task_name'].unique()
status_types = dfa['status'].unique()
gpu_type_types = dfa['gpu_type'].unique()
status_j_types = dfa['status_j'].unique()
status_i_types = dfa['status_i'].unique()

for distinct_values in (
    task_name_types,
    status_types,
    gpu_type_types,
    status_j_types,
    status_i_types,
):
    print(distinct_values)


['tensorflow' 'xComputeWorker' 'worker' 'PyTorchWorker' 'ps' 'evaluator'
 'TVMTuneMain' 'OpenmpiWorker' 'OssToVolumeWorker' 'JupyterTask'
 'BladeMain' 'chief']
['Terminated' 'Failed' 'Running']
['MISC' 'T4' 'P100' 'V100' 'V100M32']
['Terminated' 'Failed' 'Running']
['Terminated' 'Failed' 'Running' 'Ready']


In [63]:
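# Convert data types: encode the non-numeric (categorical) columns as numbers.
# from sklearn.preprocessing import LabelEncoder
# encoder = LabelEncoder().fit(dfa['status'])
# dfa['status'] = encoder.transform(dfa['status'])
# status_types = dfa['status'].unique()
# print(status_types)

# The encoders below are written by hand so that the label-to-code mapping is explicit.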
# Task-name labels; the position of a label in this tuple is its integer code.
_TASK_NAME_LABELS = (
    'tensorflow',         # 0
    'xComputeWorker',     # 1
    'worker',             # 2
    'chief',              # 3
    'PyTorchWorker',      # 4
    'ps',                 # 5
    'evaluator',          # 6
    'TVMTuneMain',        # 7
    'OpenmpiWorker',      # 8
    'OssToVolumeWorker',  # 9
    'JupyterTask',        # 10
    'BladeMain',          # 11
)


def trans_task_name_types(e):
    """Return the integer code (0-11) of task name ``e``.

    Values that match none of the known task names yield None.
    """
    for code, label in enumerate(_TASK_NAME_LABELS):
        if e == label:
            return code
    return None

# (label, code) pairs for the status columns.
# NOTE(review): code 3 is not assigned to any label; 'Ready' maps to 4.
# Confirm whether 3 is intentionally reserved for another status.
_STATUS_CODES = (
    ('Terminated', 0),
    ('Failed', 1),
    ('Running', 2),
    ('Ready', 4),
)


def trans_status_types(e):
    """Return the integer code of status label ``e``.

    Values that match none of the known status labels yield None.
    """
    for label, code in _STATUS_CODES:
        if e == label:
            return code
    return None

# GPU-type labels; the position of a label in this tuple is its integer code.
_GPU_TYPE_LABELS = ('MISC', 'T4', 'P100', 'V100', 'V100M32')


def trans_gpu_type_types(e):
    """Return the integer code (0-4) of GPU type ``e``.

    Values that match none of the known GPU types yield None.
    """
    for code, label in enumerate(_GPU_TYPE_LABELS):
        if e == label:
            return code
    return None

# Replace each categorical column with its integer encoding.
# Run this once per freshly built dfa: the encoders return None for values
# they do not recognise, so applying them to already-encoded integers would
# wipe the columns.
for column, encoder in (
    ('task_name', trans_task_name_types),
    ('status', trans_status_types),
    ('gpu_type', trans_gpu_type_types),
    ('status_j', trans_status_types),
    ('status_i', trans_status_types),
):
    dfa[column] = dfa[column].apply(encoder)


In [64]:
# Confirm that the encoded columns now have a numeric dtype and no missing values.
dfa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 733617 entries, 0 to 1260918
Data columns (total 20 columns):
 #   Column        Non-Null Count   Dtype                        
---  ------        --------------   -----                        
 0   task_name     733617 non-null  int64                        
 1   inst_num      733617 non-null  float64                      
 2   status        733617 non-null  int64                        
 3   start_time    733617 non-null  float64                      
 4   end_time      733617 non-null  float64                      
 5   plan_cpu      733617 non-null  float64                      
 6   plan_mem      733617 non-null  float64                      
 7   plan_gpu      733617 non-null  float64                      
 8   gpu_type      733617 non-null  int64                        
 9   status_j      733617 non-null  int64                        
 10  start_time_j  733617 non-null  float64                      
 11  end_time_j    733617 non-

In [65]:
# Collect the distinct codes of each encoded column, then print them one
# array per column to check the mapping.
task_name_types = dfa['task_name'].unique()
status_types = dfa['status'].unique()
gpu_type_types = dfa['gpu_type'].unique()
status_j_types = dfa['status_j'].unique()
status_i_types = dfa['status_i'].unique()

for distinct_codes in (
    task_name_types,
    status_types,
    gpu_type_types,
    status_j_types,
    status_i_types,
):
    print(distinct_codes)


[ 0  1  2  4  5  6  7  8  9 10 11  3]
[0 1 2]
[0 1 2 3 4]
[0 1 2]
[0 1 2 4]
