In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# from utils import *

pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_columns', 100)
matplotlib.rcParams.update({"font.size": 16, 'lines.linewidth': 2.5})

In [2]:
# 将 csv 读到 DataFrame
def get_df(file, header=None):
    df = pd.read_csv(file, header=None) # header=None可明确使用传入的数据为header
    # df.columns = DF_HEADER.get(key, df.columns)
    # 从header文件中拿header并设置
    df.columns = pd.read_csv("{}.header".format(file.split('.csv')[0])).columns if header is None else header
    return df

DATA_DIR = './data/'
dfj = get_df(DATA_DIR + 'pai_job_table.csv')
dft = get_df(DATA_DIR + 'pai_task_table.csv')
dfi = get_df(DATA_DIR + 'pai_instance_table.csv')
dfs = get_df(DATA_DIR + 'pai_sensor_table.csv')
dfg = get_df(DATA_DIR + 'pai_group_tag_table.csv')
dfp = get_df(DATA_DIR + 'pai_machine_spec.csv')
dfm = get_df(DATA_DIR + 'pai_machine_metric.csv')

In [3]:
def get_dfia(dfi):
    dfi_s = dfi[dfi.start_time > 0][['job_name','task_name','start_time']].groupby(['job_name','task_name']).min()  # start_time
    dfi_e = dfi[dfi.end_time > 0][['job_name','task_name','end_time']].groupby(['job_name','task_name']).max()  # end_time
    dfi_m = dfi[(dfi.start_time > 0) & (dfi.end_time > 0)][['job_name','task_name','end_time','start_time']]
    dfi_m['runtime'] = dfi_m.end_time-dfi_m.start_time
    dfi_m = dfi_m.groupby(['job_name','task_name']).mean()[['runtime']].reset_index() # runtime
    dfi_u = dfi[['job_name','task_name','status']].drop_duplicates().groupby(['job_name','task_name']).max() # status
    dfia = dfi_u
    for df in [dfi_s, dfi_e, dfi_m]:
        dfia = dfia.merge(df, on=['job_name','task_name'], how='left')
    return dfia

def get_dfa(dft, dfj, dfi, dfg):
    print('dft + dfj ...')
    dfa = dft.merge(dfj, on=['job_name'], suffixes = ['','_j'])
    dfa.loc[dfa.start_time==0, 'start_time'] = np.nan
    dfa.loc[dfa.start_time==0, 'end_time'] = np.nan
    dfa['runtime'] = dfa.end_time - dfa.start_time
    print('dft + dfj + dfi ...')
    dfia = get_dfia(dfi)
    dfa = dfa.merge(dfia, on=['job_name','task_name'], suffixes=['','_i'])
    dfa['duration_min'] = dfa.runtime_i / 60  # duration of instances
    dfa['wait_time'] = dfa.start_time_i - dfa.start_time # task wait time
    dfa['start_date']=dfa.start_time.apply(pd.Timestamp, unit='s', tz='Asia/Shanghai') # task start time
    # dfa = dfa[dfa.status=='Terminated']
    print('dft + dfj + dfi + dfg ...')
    dfa = dfa.merge(dfg[[x for x in dfg.columns if x != 'user']], on='inst_id', how='left')  # reserve NaN ones by how='left'
    dfa.loc[dfa.group.isnull(),'group'] = dfa.loc[dfa.group.isnull(), 'user']  # fill group==NaN ones with user
    return dfa

dfa_all = get_dfa(dft, dfj, dfi, dfg)       # dfa: dataframe of task


dft + dfj ...
dft + dfj + dfi ...
dft + dfj + dfi + dfg ...


In [5]:
dfa_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1260920 entries, 0 to 1260919
Data columns (total 26 columns):
 #   Column         Non-Null Count    Dtype                        
---  ------         --------------    -----                        
 0   job_name       1260920 non-null  object                       
 1   task_name      1260920 non-null  object                       
 2   inst_num       1260920 non-null  float64                      
 3   status         1260920 non-null  object                       
 4   start_time     1257336 non-null  float64                      
 5   end_time       911489 non-null   float64                      
 6   plan_cpu       1242596 non-null  float64                      
 7   plan_mem       1242596 non-null  float64                      
 8   plan_gpu       1037085 non-null  float64                      
 9   gpu_type       1043312 non-null  object                       
 10  inst_id        1260920 non-null  object                       
 11

In [4]:
def get_dfiw(dfi):
    dfiw = dfi.sort_values(['status','start_time','end_time'])
    dfiw.drop_duplicates(subset=['worker_name'], keep='last', inplace=True)
    dfiw.dropna(subset=['worker_name'], inplace=True)
    dfiw['runtime'] = dfiw[(dfiw.start_time>0)&(dfiw.end_time>0)]['end_time'] \
                    - dfiw[(dfiw.start_time>0)&(dfiw.end_time>0)]['start_time']
    dfiw.loc[dfiw.start_time==0, 'start_time'] = np.nan
    dfiw.loc[dfiw.start_time==0, 'end_time'] = np.nan
    return dfiw

def get_dfw(dfi, dft, dfg):
    dfw = get_dfiw(dfi)
    dfw['start_date']=dfw.start_time.apply(pd.Timestamp, unit='s', tz='Asia/Shanghai')
    print('dfi + dft ...')
    dfw = dfw.merge(dft, on=['job_name','task_name'], how='left', suffixes=['', '_t'])
    print('dfi + dft + dfg ...')
    dfw = dfw.merge(dfg, on='inst_id', how='left')  # reserve NaN ones by how='left'
    dfw.loc[dfw.group.isnull(),'group'] = dfw.loc[dfw.group.isnull(), 'user']  # fill group==NaN ones with user
    return dfw
dfw_all = get_dfw(dfi, dft, dfg)

dfi + dft ...
dfi + dft + dfg ...


In [6]:
dfw_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7164358 entries, 0 to 7164357
Data columns (total 23 columns):
 #   Column         Dtype                        
---  ------         -----                        
 0   job_name       object                       
 1   task_name      object                       
 2   inst_name      object                       
 3   worker_name    object                       
 4   inst_id        object                       
 5   status         object                       
 6   start_time     float64                      
 7   end_time       float64                      
 8   machine        object                       
 9   runtime        float64                      
 10  start_date     datetime64[ns, Asia/Shanghai]
 11  inst_num       float64                      
 12  status_t       object                       
 13  start_time_t   float64                      
 14  end_time_t     float64                      
 15  plan_cpu       float64          

In [7]:
dfws = dfw_all.merge(dfp.drop(columns={'gpu_type'}), on='machine', how='left')
dfws = dfws.merge(dfs.drop(columns=['job_name','task_name','inst_id','machine']), on='worker_name')
# dfws.to_csv('dfws.csv') # optional

In [8]:
dfws.head(5)

Unnamed: 0,job_name,task_name,inst_name,worker_name,inst_id,status,start_time,end_time,machine,runtime,start_date,inst_num,status_t,start_time_t,end_time_t,plan_cpu,plan_mem,plan_gpu,gpu_type,user,gpu_type_spec,group,workload,cap_cpu,cap_mem,cap_gpu,gpu_name,cpu_usage,gpu_wrk_util,avg_mem,max_mem,avg_gpu_wrk_mem,max_gpu_wrk_mem,read,write,read_count,write_count
0,337bc6f19baecbc9706b6492,worker,41e7eb6c7170b2cb514be986397729655b97dd80fd2106a0426b2071df02,998b334783ba5cfcd6f5492fecc497e4288e1aafbe07b3ab95e8481707af,cc157955ec6fe49c15ed9b033b5e1416a6d8a4315d46693be99b40ab16e3,Failed,1588823.0,,fb459ec921e192e9dfc7963b,,1970-01-19 17:20:23+08:00,2.0,Failed,1588821.0,,600.0,29.296875,25.0,T4,38402cb323c8,,e3435144ef63373c77afbfc66c44b7d9,,96,512,2,/dev/nvidia0,1.620546,0.0,0.777763,0.817383,0.098633,0.098633,29754.792731,6268.233528,104.853414,63.715075
1,e4df78c45628acfc1b532387,tensorflow,b8e9a8e3cb359d2496a5680a0d5f48741b9d78b72f33fb7ca12845df07ab,65256dfde4f75b2431a024f28c537459b0b1b683035e41d5e747fa2f3517,fc77169f3db74ed52ab2d17b5d21b2d6e8c3b82d1201fb32a5b85412b4ea,Failed,1636203.0,,2500a3f1f824bf611a9ebe04,,1970-01-20 06:30:03+08:00,1.0,Failed,1636200.0,,3200.0,29.296875,200.0,T4,47baba33cffe,,3c9b50ad2499ca2a996e0bd17916272d,,96,512,2,/dev/nvidia0,149.32784,24.976007,27.082815,56.009766,4.332031,4.332031,884828.028936,73527.89342,242.304809,124.360725
2,db21abf3954bd9418ba54df5,worker,dc16f23774672dbe395f94685591966987f48a5c92264d62967aa9b92809,048c226ed14575943aa3cb459bff74af97dd7ae271cf68cdb5601b1f69bd,d2c2d0081eb2d3718977c0bbbd6addb80951aae277e37972e0bc7da8fa1a,Failed,1682551.0,,72589c833bbdadca0cf899e7,,1970-01-20 19:22:31+08:00,2.0,Failed,1682543.0,,600.0,29.765625,25.0,MISC,b7b576763f61,,ec795302ba4880a98fa159f755e53278,,96,512,8,/dev/nvidia7,4.971858,0.0,12.0653,12.088867,0.594727,0.594727,309726.32661,37874.947721,197.371206,64.184082
3,272c73f1eb7daf9f7cd88d91,worker,4fcd9c09cda154279cc3e2d17ef7ad1cfd5227ef02b0dcc3c53a9dca54f5,53458ea1bffb4eacfd038cbab1bf24fc15ea1989ec25867a6c8dd2389cdc,b530424de414395397f1f73fb30c9036cd98ce8bbb431fcd54e428755aba,Failed,1693841.0,,2592210a2334355c236cd1a7,,1970-01-20 22:30:41+08:00,10.0,Running,1677934.0,,600.0,29.296875,25.0,MISC,7203aa95a9af,,b2392c2ea9ad01bbcc03075082211005,,96,512,8,/dev/nvidia5,0.195648,0.0,0.414499,0.423828,0.0,0.0,5506.436629,6995.871779,53.427545,49.575201
4,93f4887f4e4e8c478bc679c5,worker,7af21e4b6fdf1315675fb6aedcc701ee4d225c8bc9fc4c4d8dc901713374,fa77c688f49e6ab84cc8f26dc7010c6a583a73d31ce4fe5edf72d2f15389,f2bb5ad2d3fcaad9ccbb08912c5bcffdc43b93558505b41b9c90f4161435,Failed,1721428.0,,9d2583e4db3eae548d6ca659,,1970-01-21 06:10:28+08:00,15.0,Failed,1721172.0,,100.0,29.296875,100.0,MISC,5f4cb64dc693,,dca06ec72ecc7a841dcde9bf564f355e,,96,512,8,/dev/nvidia5,11.151594,0.0,28.265042,28.285156,10.581055,10.581055,58137.126271,8069.516249,156.56326,86.425577


In [9]:
dfws.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3019349 entries, 0 to 3019348
Data columns (total 37 columns):
 #   Column           Dtype                        
---  ------           -----                        
 0   job_name         object                       
 1   task_name        object                       
 2   inst_name        object                       
 3   worker_name      object                       
 4   inst_id          object                       
 5   status           object                       
 6   start_time       float64                      
 7   end_time         float64                      
 8   machine          object                       
 9   runtime          float64                      
 10  start_date       datetime64[ns, Asia/Shanghai]
 11  inst_num         float64                      
 12  status_t         object                       
 13  start_time_t     float64                      
 14  end_time_t       float64                      
 15

In [10]:
dfas = dfws.groupby(['job_name','task_name'])[['cpu_usage','gpu_wrk_util','avg_mem','avg_gpu_wrk_mem','plan_cpu','plan_gpu','plan_mem','cap_cpu','cap_gpu','cap_mem']].sum()
dfas = dfa_all.drop(columns=['plan_cpu','plan_mem','plan_gpu']).merge(dfas, on=['job_name','task_name'])
# dfas.to_csv('dfas.csv') # optional

In [11]:
dfas.head(5)
dfas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 850033 entries, 0 to 850032
Data columns (total 33 columns):
 #   Column           Non-Null Count   Dtype                        
---  ------           --------------   -----                        
 0   job_name         850033 non-null  object                       
 1   task_name        850033 non-null  object                       
 2   inst_num         850033 non-null  float64                      
 3   status           850033 non-null  object                       
 4   start_time       850033 non-null  float64                      
 5   end_time         591075 non-null  float64                      
 6   gpu_type         850032 non-null  object                       
 7   inst_id          850033 non-null  object                       
 8   user             850033 non-null  object                       
 9   status_j         850033 non-null  object                       
 10  start_time_j     850020 non-null  float64               

In [12]:
dfas.head(5)

Unnamed: 0,job_name,task_name,inst_num,status,start_time,end_time,gpu_type,inst_id,user,status_j,start_time_j,end_time_j,runtime,status_i,start_time_i,end_time_i,runtime_i,duration_min,wait_time,start_date,gpu_type_spec,group,workload,cpu_usage,gpu_wrk_util,avg_mem,avg_gpu_wrk_mem,plan_cpu,plan_gpu,plan_mem,cap_cpu,cap_gpu,cap_mem
0,c936346f45eccd34bf748541,tensorflow,1.0,Terminated,2693235.0,2695847.0,MISC,990f1799a5093b62142b101c0227875b81c7f0329301df98af1148ec8724,58540f191766,Terminated,2693235.0,2695847.0,2612.0,Terminated,2693240.0,2695847.0,2607.0,43.45,5.0,1970-02-01 12:07:15+08:00,,31c5b7fa71e1224f9dff5c5a4d24b571,,350.353167,77.468451,24.306558,4.184025,600.0,50.0,29.296875,96,8,512
1,455c3dec270f4777ad67721c,tensorflow,1.0,Terminated,3399583.0,3399732.0,MISC,d94b4b01a6dd6c1865972cd628abe206422143ef37e6d449d4b2e862e574,ebba56ec23b9,Terminated,3399583.0,3399732.0,149.0,Terminated,3399681.0,3399732.0,51.0,0.85,98.0,1970-02-09 16:19:43+08:00,,5a3163d07db392e47b6f0d063906592f,,117.888889,0.333333,1.870295,0.069743,600.0,100.0,29.296875,96,8,512
2,ba64aa2f0feff18428923e92,tensorflow,1.0,Terminated,2152271.0,2158213.0,MISC,af71dfe31db0378561e1bea2c26605f67cb7611f0d38661b43881a1c6006,de69ddc1064e,Terminated,2152271.0,2158213.0,5942.0,Terminated,2152280.0,2158213.0,5933.0,98.883333,9.0,1970-01-26 05:51:11+08:00,,f94dcf2c0efdf3fd36f75dbe1709c43f,,139.502103,4.408403,12.811959,4.21284,600.0,50.0,29.296875,96,8,512
3,704783be2a4b7f88b8d2e4ee,worker,100.0,Failed,2172980.0,,MISC,a63da076c1210109040acf759e2c1d00b8abc2d84305990a5941d0142690,cfcb2e0deb93,Failed,2172974.0,2173216.0,,Interrupted,2172995.0,,,,15.0,1970-01-26 11:36:20+08:00,,e933ce21ede6be61f206b69b225f9a4a,,4152.975415,582.481221,821.026606,20.235918,27600.0,460.0,449.21875,4416,368,23552
4,bce3b9c55772f9d1e85fe796,PyTorchWorker,4.0,Failed,2374681.0,,V100,8dd676ab7fef90186d5813c5255ce4e89b50bdd9854225e7fa082289c88f,11b3ab54fc9f,Failed,2374681.0,,,Interrupted,2374699.0,2378674.0,,,18.0,1970-01-28 19:38:01+08:00,V100,a2e43ea2be689ce60787b3174f3ffbbf,,0.0,0.0,0.009766,0.0,800.0,200.0,20.0,192,16,1024


In [13]:
# 把最初的 dfas dfws保存
dfws.to_csv("dfws.csv",header=None)
dfas.to_csv("dfas.csv",header=None)

In [16]:
dfws.columns.values

array(['job_name', 'task_name', 'inst_name', 'worker_name', 'inst_id',
       'status', 'start_time', 'end_time', 'machine', 'runtime',
       'start_date', 'inst_num', 'status_t', 'start_time_t', 'end_time_t',
       'plan_cpu', 'plan_mem', 'plan_gpu', 'gpu_type', 'user',
       'gpu_type_spec', 'group', 'workload', 'cap_cpu', 'cap_mem',
       'cap_gpu', 'gpu_name', 'cpu_usage', 'gpu_wrk_util', 'avg_mem',
       'max_mem', 'avg_gpu_wrk_mem', 'max_gpu_wrk_mem', 'read', 'write',
       'read_count', 'write_count'], dtype=object)

In [17]:
dfas.columns.values

array(['job_name', 'task_name', 'inst_num', 'status', 'start_time',
       'end_time', 'gpu_type', 'inst_id', 'user', 'status_j',
       'start_time_j', 'end_time_j', 'runtime', 'status_i',
       'start_time_i', 'end_time_i', 'runtime_i', 'duration_min',
       'wait_time', 'start_date', 'gpu_type_spec', 'group', 'workload',
       'cpu_usage', 'gpu_wrk_util', 'avg_mem', 'avg_gpu_wrk_mem',
       'plan_cpu', 'plan_gpu', 'plan_mem', 'cap_cpu', 'cap_gpu',
       'cap_mem'], dtype=object)

In [18]:
dfws.head(10)

Unnamed: 0,job_name,task_name,inst_name,worker_name,inst_id,status,start_time,end_time,machine,runtime,start_date,inst_num,status_t,start_time_t,end_time_t,plan_cpu,plan_mem,plan_gpu,gpu_type,user,gpu_type_spec,group,workload,cap_cpu,cap_mem,cap_gpu,gpu_name,cpu_usage,gpu_wrk_util,avg_mem,max_mem,avg_gpu_wrk_mem,max_gpu_wrk_mem,read,write,read_count,write_count
0,337bc6f19baecbc9706b6492,worker,41e7eb6c7170b2cb514be986397729655b97dd80fd2106a0426b2071df02,998b334783ba5cfcd6f5492fecc497e4288e1aafbe07b3ab95e8481707af,cc157955ec6fe49c15ed9b033b5e1416a6d8a4315d46693be99b40ab16e3,Failed,1588823.0,,fb459ec921e192e9dfc7963b,,1970-01-19 17:20:23+08:00,2.0,Failed,1588821.0,,600.0,29.296875,25.0,T4,38402cb323c8,,e3435144ef63373c77afbfc66c44b7d9,,96,512,2,/dev/nvidia0,1.620546,0.0,0.777763,0.817383,0.098633,0.098633,29754.79,6268.234,104.853414,63.715075
1,e4df78c45628acfc1b532387,tensorflow,b8e9a8e3cb359d2496a5680a0d5f48741b9d78b72f33fb7ca12845df07ab,65256dfde4f75b2431a024f28c537459b0b1b683035e41d5e747fa2f3517,fc77169f3db74ed52ab2d17b5d21b2d6e8c3b82d1201fb32a5b85412b4ea,Failed,1636203.0,,2500a3f1f824bf611a9ebe04,,1970-01-20 06:30:03+08:00,1.0,Failed,1636200.0,,3200.0,29.296875,200.0,T4,47baba33cffe,,3c9b50ad2499ca2a996e0bd17916272d,,96,512,2,/dev/nvidia0,149.32784,24.976007,27.082815,56.009766,4.332031,4.332031,884828.0,73527.89,242.304809,124.360725
2,db21abf3954bd9418ba54df5,worker,dc16f23774672dbe395f94685591966987f48a5c92264d62967aa9b92809,048c226ed14575943aa3cb459bff74af97dd7ae271cf68cdb5601b1f69bd,d2c2d0081eb2d3718977c0bbbd6addb80951aae277e37972e0bc7da8fa1a,Failed,1682551.0,,72589c833bbdadca0cf899e7,,1970-01-20 19:22:31+08:00,2.0,Failed,1682543.0,,600.0,29.765625,25.0,MISC,b7b576763f61,,ec795302ba4880a98fa159f755e53278,,96,512,8,/dev/nvidia7,4.971858,0.0,12.0653,12.088867,0.594727,0.594727,309726.3,37874.95,197.371206,64.184082
3,272c73f1eb7daf9f7cd88d91,worker,4fcd9c09cda154279cc3e2d17ef7ad1cfd5227ef02b0dcc3c53a9dca54f5,53458ea1bffb4eacfd038cbab1bf24fc15ea1989ec25867a6c8dd2389cdc,b530424de414395397f1f73fb30c9036cd98ce8bbb431fcd54e428755aba,Failed,1693841.0,,2592210a2334355c236cd1a7,,1970-01-20 22:30:41+08:00,10.0,Running,1677934.0,,600.0,29.296875,25.0,MISC,7203aa95a9af,,b2392c2ea9ad01bbcc03075082211005,,96,512,8,/dev/nvidia5,0.195648,0.0,0.414499,0.423828,0.0,0.0,5506.437,6995.872,53.427545,49.575201
4,93f4887f4e4e8c478bc679c5,worker,7af21e4b6fdf1315675fb6aedcc701ee4d225c8bc9fc4c4d8dc901713374,fa77c688f49e6ab84cc8f26dc7010c6a583a73d31ce4fe5edf72d2f15389,f2bb5ad2d3fcaad9ccbb08912c5bcffdc43b93558505b41b9c90f4161435,Failed,1721428.0,,9d2583e4db3eae548d6ca659,,1970-01-21 06:10:28+08:00,15.0,Failed,1721172.0,,100.0,29.296875,100.0,MISC,5f4cb64dc693,,dca06ec72ecc7a841dcde9bf564f355e,,96,512,8,/dev/nvidia5,11.151594,0.0,28.265042,28.285156,10.581055,10.581055,58137.13,8069.516,156.56326,86.425577
5,e5d6d5b546bff61f93b47ebf,worker,cb0fde83862eb74d5adc106a6abb6fec3b436914e835a1162e29f2eadeb0,87309bfc7653f0da7649c61bd040fc746f608dc70dec8702f56bd8f600c4,394df9f9ab826af09f66d798a946bd5c6abfeeb353e421d1ceb9243a8171,Failed,1742858.0,,0018d759f400ef5c0457ab9c,,1970-01-21 12:07:38+08:00,2.0,Failed,1742855.0,,4000.0,48.828125,400.0,V100,67f7bfa4e837,,56bc4999e13aeb143015cae64f2636c2,,96,384,8,/dev/nvidia1,1.321286,0.0,77.447164,311.296875,44.289062,44.289062,648727.1,149727.1,324.73925,84.012447
6,108310c4b751b1855e8acc47,worker,c40fcf25a6a3e2cb00dc21fd95d2a25b291ca9328e26e4409ae473b57893,af26a8282594fb636573d56a6373010220bc74287ce36901dc9e1cea5471,445fda54e17d04e69c9f4bf5363e9d7c765916d1dc5695f5a31a376f3639,Failed,1813192.0,,fd3b839a7905aa88e6bc6ae5,,1970-01-22 07:39:52+08:00,15.0,Failed,1811592.0,,100.0,29.296875,100.0,MISC,5f4cb64dc693,,dca06ec72ecc7a841dcde9bf564f355e,,96,512,8,/dev/nvidia4,145.29228,0.0,14.471995,14.704102,8.342773,8.342773,326990.5,39752.21,229.006398,77.35538
7,2074763b6ab0969181e341f1,worker,04a04748dce573961c075540636fe756234c94b5d5a86ebae53c49bbcbe4,d55cceb38da8b1328f16f9df2338bdf3f900304f4797c5d8966c08434855,203f7bfac0327fe2c58675f0d8e1a5bc59f7d4fef93ab02067e1ec80f7c6,Failed,1818195.0,,d73dbda0dd25f3b3ee3e1bba,,1970-01-22 09:03:15+08:00,3.0,Failed,1818186.0,,600.0,19.53125,25.0,MISC,3a7b31330ae3,,a3441ed169d9956d01451fc551630821,,96,512,8,/dev/nvidia0,202.754545,0.0,9.936354,9.962891,2.780273,2.780273,60605.72,6274.884,155.974985,70.846794
8,a7daa3931b9188dfe7a3040d,worker,3f1faaf9041ce754f5d255a13cff418b2450e551f89b3ee2a3b7d7dce682,8621acd4228c8ee5876da6c380f0174e10ccb1c0e7c6e510f6e12ba6482e,5e84c83a8dcb25a786916655a83c0d5babb9e6e6580d31b0b47f6903c000,Failed,1827331.0,,a052d5eedce297d15d8bc041,,1970-01-22 11:35:31+08:00,5.0,Failed,1827327.0,,1000.0,29.296875,25.0,MISC,b3bfe9b79bb5,,f2248cf82e8d7cb1a156fdeb51ed7092,,96,512,8,/dev/nvidia2,29.289396,0.0,1.445299,1.49707,0.250977,0.250977,731779.2,617041.4,15651.41025,15818.834281
9,6b147d05f96bef1bfc4e5c84,worker,7fca2b5791aec59bf26661906f961b9d1e770cbb5d37ee82ff87920a006c,9eae7fd438cba4021b440cbf088edbc805f2f54f66ff37dcd24698f317f6,91e86dabeffcb7d2b969d3216d27052f3f246f78114978eb29a2ae04cbfb,Failed,1839585.0,,d674e1004bd5b0f4573eb927,,1970-01-22 14:59:45+08:00,1.0,Failed,1839571.0,,300.0,9.765625,50.0,MISC,e97f950dabda,,b4e3ccc352d944b1235a5127f39fcb31,,96,512,8,/dev/nvidia6,117.035289,85.208105,5.933949,6.863281,5.379883,5.379883,7990768.0,12488620.0,716.416085,716.758666


In [23]:
dfws_without_id_col = dfws.drop(columns=['job_name', 'inst_name','worker_name','machine','inst_id', 'user', 'group', 'gpu_type_spec', 'workload', 'gpu_name', 'start_date'], axis=1)

In [24]:
dfws_without_id_col.head(5)

Unnamed: 0,task_name,status,start_time,end_time,runtime,inst_num,status_t,start_time_t,end_time_t,plan_cpu,plan_mem,plan_gpu,gpu_type,cap_cpu,cap_mem,cap_gpu,cpu_usage,gpu_wrk_util,avg_mem,max_mem,avg_gpu_wrk_mem,max_gpu_wrk_mem,read,write,read_count,write_count
0,worker,Failed,1588823.0,,,2.0,Failed,1588821.0,,600.0,29.296875,25.0,T4,96,512,2,1.620546,0.0,0.777763,0.817383,0.098633,0.098633,29754.792731,6268.233528,104.853414,63.715075
1,tensorflow,Failed,1636203.0,,,1.0,Failed,1636200.0,,3200.0,29.296875,200.0,T4,96,512,2,149.32784,24.976007,27.082815,56.009766,4.332031,4.332031,884828.028936,73527.89342,242.304809,124.360725
2,worker,Failed,1682551.0,,,2.0,Failed,1682543.0,,600.0,29.765625,25.0,MISC,96,512,8,4.971858,0.0,12.0653,12.088867,0.594727,0.594727,309726.32661,37874.947721,197.371206,64.184082
3,worker,Failed,1693841.0,,,10.0,Running,1677934.0,,600.0,29.296875,25.0,MISC,96,512,8,0.195648,0.0,0.414499,0.423828,0.0,0.0,5506.436629,6995.871779,53.427545,49.575201
4,worker,Failed,1721428.0,,,15.0,Failed,1721172.0,,100.0,29.296875,100.0,MISC,96,512,8,11.151594,0.0,28.265042,28.285156,10.581055,10.581055,58137.126271,8069.516249,156.56326,86.425577


In [28]:
# dfws_without_id_col.info()
dfws_without_id_col.columns.values

array(['task_name', 'status', 'start_time', 'end_time', 'runtime',
       'inst_num', 'status_t', 'start_time_t', 'end_time_t', 'plan_cpu',
       'plan_mem', 'plan_gpu', 'gpu_type', 'cap_cpu', 'cap_mem',
       'cap_gpu', 'cpu_usage', 'gpu_wrk_util', 'avg_mem', 'max_mem',
       'avg_gpu_wrk_mem', 'max_gpu_wrk_mem', 'read', 'write',
       'read_count', 'write_count'], dtype=object)

In [29]:
# 删除含有缺失值的样本
dfws_without_id_col.dropna(axis=0, subset=['task_name', 'status', 'start_time', 'end_time', 'runtime',
       'inst_num', 'status_t', 'start_time_t', 'end_time_t', 'plan_cpu',
       'plan_mem', 'plan_gpu', 'gpu_type', 'cap_cpu', 'cap_mem',
       'cap_gpu', 'cpu_usage', 'gpu_wrk_util', 'avg_mem', 'max_mem',
       'avg_gpu_wrk_mem', 'max_gpu_wrk_mem', 'read', 'write',
       'read_count', 'write_count'], inplace=True)

In [30]:
dfws_without_id_col.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1956708 entries, 5169 to 3019348
Data columns (total 26 columns):
 #   Column           Dtype  
---  ------           -----  
 0   task_name        object 
 1   status           object 
 2   start_time       float64
 3   end_time         float64
 4   runtime          float64
 5   inst_num         float64
 6   status_t         object 
 7   start_time_t     float64
 8   end_time_t       float64
 9   plan_cpu         float64
 10  plan_mem         float64
 11  plan_gpu         float64
 12  gpu_type         object 
 13  cap_cpu          int64  
 14  cap_mem          int64  
 15  cap_gpu          int64  
 16  cpu_usage        float64
 17  gpu_wrk_util     float64
 18  avg_mem          float64
 19  max_mem          float64
 20  avg_gpu_wrk_mem  float64
 21  max_gpu_wrk_mem  float64
 22  read             float64
 23  write            float64
 24  read_count       float64
 25  write_count      float64
dtypes: float64(19), int64(3), object(4)
mem

In [36]:
dfws_without_id_col.head(5)

Unnamed: 0,task_name,status,start_time,end_time,runtime,inst_num,status_t,start_time_t,end_time_t,plan_cpu,plan_mem,plan_gpu,gpu_type,cap_cpu,cap_mem,cap_gpu,cpu_usage,gpu_wrk_util,avg_mem,max_mem,avg_gpu_wrk_mem,max_gpu_wrk_mem,read,write,read_count,write_count
5169,,,2193114.0,2193114.0,0.0,5.0,1,2193105.0,2193114.0,100.0,9.785156,1.0,,96,512,8,37.0,0.0,0.003906,0.003906,0.0,0.0,123130.5,130798.0,166.5,111.0
6859,,,2214130.0,2214157.0,27.0,1.0,1,2214123.0,2214157.0,6400.0,78.144531,200.0,,96,512,2,87.5,0.0,0.91569,2.449219,0.0,0.0,340792600.0,344064900.0,57815.0,19027.75
6960,,,2214284.0,2214316.0,32.0,1.0,1,2214281.0,2214316.0,6400.0,78.144531,200.0,,96,512,2,71.6,0.0,0.891741,2.308594,0.0,0.0,303621200.0,307155200.0,51578.222222,17310.888889
6989,,,2214459.0,2214558.0,99.0,1.0,1,2214456.0,2214558.0,6400.0,78.144531,200.0,,96,512,2,51.714286,0.0,1.864597,5.722656,0.0,0.0,185324300.0,137478600.0,35872.92,8229.68
7010,,,2214700.0,2214799.0,99.0,1.0,1,2214697.0,2214799.0,6400.0,78.144531,200.0,,96,512,2,64.05,0.0,1.970746,6.019531,0.0,0.0,165614700.0,114112300.0,30130.76,7721.24


In [37]:
dfws_without_id_col = dfws_without_id_col.drop(columns=['task_name', 'status', 'gpu_type'], axis=1)

In [41]:
# 去掉开始时间和结束时间
dfws_without_id_col = dfws_without_id_col.drop(columns=['start_time', 'end_time', 'start_time_t', 'end_time_t'], axis=1)

In [43]:
dfws_without_id_col.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1956708 entries, 5169 to 3019348
Data columns (total 19 columns):
 #   Column           Dtype  
---  ------           -----  
 0   runtime          float64
 1   inst_num         float64
 2   status_t         int64  
 3   plan_cpu         float64
 4   plan_mem         float64
 5   plan_gpu         float64
 6   cap_cpu          int64  
 7   cap_mem          int64  
 8   cap_gpu          int64  
 9   cpu_usage        float64
 10  gpu_wrk_util     float64
 11  avg_mem          float64
 12  max_mem          float64
 13  avg_gpu_wrk_mem  float64
 14  max_gpu_wrk_mem  float64
 15  read             float64
 16  write            float64
 17  read_count       float64
 18  write_count      float64
dtypes: float64(15), int64(4)
memory usage: 298.6 MB


In [44]:
dfws_without_id_col.describe()

Unnamed: 0,runtime,inst_num,status_t,plan_cpu,plan_mem,plan_gpu,cap_cpu,cap_mem,cap_gpu,cpu_usage,gpu_wrk_util,avg_mem,max_mem,avg_gpu_wrk_mem,max_gpu_wrk_mem,read,write,read_count,write_count
count,1956708.0,1956708.0,1956708.0,1956708.0,1956708.0,1956708.0,1956708.0,1956708.0,1956708.0,1956708.0,1956708.0,1956708.0,1956708.0,1956708.0,1956708.0,1956708.0,1956708.0,1956708.0,1956708.0
mean,6424.305,36.7237,0.003039288,572.202,24.94792,55.62539,91.44561,510.1345,5.787945,322.7489,12.43909,6.132792,9.608281,1.881005,2.506543,334346900.0,62962480.0,12536.8,10506.54
std,18213.39,57.74925,0.07616452,407.8902,16.97499,47.02193,11.18026,15.33949,2.894676,634.0961,24.52304,8.634366,25.77764,3.986787,4.496331,771882100.0,287547200.0,29602.1,96277.95
min,0.0,1.0,0.0,1.0,0.390625,1.0,64.0,384.0,2.0,0.0,0.0,0.0009765625,0.0009765625,0.0,0.0,0.0,0.0,0.0,0.0
25%,236.0,4.0,0.0,400.0,14.64844,25.0,96.0,512.0,2.0,80.92593,0.4225352,1.580741,2.292969,0.255205,0.3466797,14999690.0,1724527.0,1680.411,856.2188
50%,1216.0,20.0,0.0,600.0,29.29688,50.0,96.0,512.0,8.0,137.4929,4.285714,3.463119,5.443359,0.7358025,1.223633,52380290.0,5911242.0,4260.561,2049.744
75%,4569.0,50.0,0.0,600.0,29.29688,100.0,96.0,512.0,8.0,309.4459,14.1578,7.564911,11.9502,2.26786,4.311523,225791800.0,25556200.0,11119.54,5873.196
max,626371.0,1050.0,2.0,9000.0,300.0,800.0,96.0,512.0,8.0,9206.331,784.1853,294.9445,2757.586,253.2296,253.5312,61898350000.0,49285000000.0,8557099.0,13392440.0


In [45]:
# 先存一份
dfws_without_id_col.to_csv("dfws_all_number_type.csv", header=None)

In [47]:
dfws_without_id_col.columns.values

array(['runtime', 'inst_num', 'status_t', 'plan_cpu', 'plan_mem',
       'plan_gpu', 'cap_cpu', 'cap_mem', 'cap_gpu', 'cpu_usage',
       'gpu_wrk_util', 'avg_mem', 'max_mem', 'avg_gpu_wrk_mem',
       'max_gpu_wrk_mem', 'read', 'write', 'read_count', 'write_count'],
      dtype=object)

In [48]:
# 对跨度大的数据进行归一化
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(dfws_without_id_col)

dfws_scaler = scaler.transform(dfws_without_id_col)

In [55]:
# 随机抽取若干行先测试
dfws_without_id_col.head(5)
dfws_sample_10000 = dfws_without_id_col.sample(n=10000, random_state=2022, axis=0, replace=True)

In [57]:
dfws_sample_10000.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1364624 to 1627262
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   runtime          10000 non-null  float64
 1   inst_num         10000 non-null  float64
 2   status_t         10000 non-null  int64  
 3   plan_cpu         10000 non-null  float64
 4   plan_mem         10000 non-null  float64
 5   plan_gpu         10000 non-null  float64
 6   cap_cpu          10000 non-null  int64  
 7   cap_mem          10000 non-null  int64  
 8   cap_gpu          10000 non-null  int64  
 9   cpu_usage        10000 non-null  float64
 10  gpu_wrk_util     10000 non-null  float64
 11  avg_mem          10000 non-null  float64
 12  max_mem          10000 non-null  float64
 13  avg_gpu_wrk_mem  10000 non-null  float64
 14  max_gpu_wrk_mem  10000 non-null  float64
 15  read             10000 non-null  float64
 16  write            10000 non-null  float64
 17  read

In [58]:
dfws_sample_100 = dfws_without_id_col.sample(n=100, random_state=2022, axis=0, replace=True)

In [59]:
dfws_sample_100.head()

Unnamed: 0,runtime,inst_num,status_t,plan_cpu,plan_mem,plan_gpu,cap_cpu,cap_mem,cap_gpu,cpu_usage,gpu_wrk_util,avg_mem,max_mem,avg_gpu_wrk_mem,max_gpu_wrk_mem,read,write,read_count,write_count
1364624,110.0,10.0,0,1800.0,58.59375,100.0,96,512,8,36.045455,0.0,0.982015,2.117188,0.092148,2.303711,49449550.0,70599010.0,5589.730769,4150.538462
1815756,84.0,5.0,0,800.0,29.296875,50.0,96,512,8,122.466667,2.0,2.756319,4.916992,2.848145,5.532227,168287800.0,4779825.0,7240.789474,2730.105263
1156407,207.0,16.0,0,600.0,4.0,50.0,96,512,8,50.481481,44.321429,1.773507,2.884766,0.323696,0.364258,35505680.0,28590180.0,3357.5,53348.637931
1110693,1291.0,1.0,0,600.0,29.296875,25.0,96,512,8,123.023077,6.310345,2.279106,2.400391,1.418392,1.467773,153305200.0,4500749.0,3399.292776,1607.038023
1460446,206.0,20.0,0,800.0,19.53125,50.0,64,512,2,127.970588,9.684211,11.473687,13.772461,6.067691,7.499023,155215400.0,2413762.0,3748.868421,1603.052632


In [60]:
# MinMax
from sklearn.preprocessing import MinMaxScaler

scaler_features = MinMaxScaler().fit_transform(dfws_sample_100.values)

# 将归一化的数据写入df
scaler_features_df = pd.DataFrame(scaler_features, index=dfws_sample_100.index, columns=dfws_sample_100.columns)

scaler_features_df.head()

Unnamed: 0,runtime,inst_num,status_t,plan_cpu,plan_mem,plan_gpu,cap_cpu,cap_mem,cap_gpu,cpu_usage,gpu_wrk_util,avg_mem,max_mem,avg_gpu_wrk_mem,max_gpu_wrk_mem,read,write,read_count,write_count
1364624,0.002179,0.060403,0.0,1.0,0.391695,0.487179,1.0,1.0,1.0,0.00821,0.0,0.015301,0.029689,0.005693,0.126992,0.009165,0.031341,0.041515,0.020613
1815756,0.001595,0.026846,0.0,0.438202,0.188926,0.230769,1.0,1.0,1.0,0.033012,0.027151,0.046458,0.075329,0.17595,0.304963,0.031681,0.002026,0.054285,0.013334
1156407,0.004359,0.100671,0.0,0.325843,0.013842,0.230769,1.0,1.0,1.0,0.012353,0.601676,0.0292,0.042201,0.019997,0.02008,0.006523,0.012631,0.024251,0.272742
1110693,0.028715,0.0,0.0,0.325843,0.188926,0.102564,1.0,1.0,1.0,0.033172,0.085665,0.038078,0.034305,0.087624,0.080911,0.028842,0.001902,0.024574,0.007579
1460446,0.004336,0.127517,0.0,0.438202,0.121337,0.230769,0.0,1.0,0.0,0.034592,0.131466,0.199538,0.219682,0.374844,0.413383,0.029204,0.000973,0.027277,0.007558


In [67]:
from pyod.models.lof import LOF
from pyod.models.iforest import IForest

clf=LOF()
ifor = IForest()

clf.fit(scaler_features)
print(clf.labels_)


ifor.fit(scaler_features)
print(ifor.labels_)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [72]:
for i in range(len(ifor.labels_)):
    if ifor.labels_[i] == 1:
        print(dfws_sample_100.iloc[[i]])    # 打印异常值

         runtime  inst_num  status_t  plan_cpu   plan_mem  plan_gpu  cap_cpu  \
1542812  15915.0      40.0         0     800.0  29.296875     100.0       96   

         cap_mem  cap_gpu   cpu_usage  gpu_wrk_util   avg_mem   max_mem  \
1542812      512        2  173.374921     68.949765  6.775889  7.410156   

         avg_gpu_wrk_mem  max_gpu_wrk_mem          read         write  \
1542812        14.117829        14.317383  1.079508e+06  1.314700e+06   

         read_count  write_count  
1542812  7735.89548  7631.539862  
         runtime  inst_num  status_t  plan_cpu  plan_mem  plan_gpu  cap_cpu  \
1141445   1258.0      20.0         0     800.0      62.5     200.0       64   

         cap_mem  cap_gpu   cpu_usage  gpu_wrk_util   avg_mem    max_mem  \
1141445      512        2  295.541833     17.614173  5.392651  14.751953   

         avg_gpu_wrk_mem  max_gpu_wrk_mem          read         write  \
1141445         0.928869         1.547852  1.115247e+09  1.469408e+07   

           r

In [73]:
dfws_sample_100.describe()

Unnamed: 0,runtime,inst_num,status_t,plan_cpu,plan_mem,plan_gpu,cap_cpu,cap_mem,cap_gpu,cpu_usage,gpu_wrk_util,avg_mem,max_mem,avg_gpu_wrk_mem,max_gpu_wrk_mem,read,write,read_count,write_count
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,3081.0,26.77,0.0,561.4,25.949805,60.2,91.52,509.44,5.48,249.088927,12.581666,5.635142,8.729121,2.175745,2.91666,306475900.0,47840870.0,9209.908662,8708.756495
std,6679.184675,28.848211,0.0,346.392724,16.923342,40.762431,11.159523,18.010278,2.97627,452.13685,17.725045,7.405383,10.551917,3.052767,3.470216,739993500.0,228047100.0,16608.856977,27985.092017
min,13.0,1.0,0.0,20.0,2.0,5.0,64.0,384.0,2.0,7.440678,0.0,0.110677,0.295898,0.0,0.0,1079508.0,230051.7,221.989601,128.225293
25%,165.25,1.75,0.0,400.0,19.53125,25.0,96.0,512.0,2.0,85.994449,0.313765,1.4654,1.93335,0.243323,0.374023,14429800.0,2388275.0,2017.645759,1064.530107
50%,552.0,18.0,0.0,600.0,29.296875,50.0,96.0,512.0,8.0,112.917582,4.598604,3.422859,4.804199,0.745295,1.300781,42446980.0,7556144.0,4792.002005,2331.278788
75%,2618.75,50.0,0.0,600.0,29.296875,100.0,96.0,512.0,8.0,294.05619,17.706933,6.499987,12.693115,2.706939,4.347168,163307000.0,27423880.0,8922.035985,5157.713606
max,44519.0,150.0,0.0,1800.0,146.484375,200.0,96.0,512.0,8.0,3491.778068,73.6633,57.057151,61.641602,16.187233,18.140625,5279021000.0,2245530000.0,129518.38233,195259.15559
