In [1]:
import numpy as np
import pandas as pd

In [None]:
'''
### batch task
+----------------------------------------------------------------------------------------+
| task_name       | string     |       | task name. unique within a job                  |
| instance_num    | bigint     |       | number of instances                             |
| job_name        | string     |       | job name                                        |
| task_type       | string     |       | task type                                       |
| status          | string     |       | task status                                     |
| start_time      | bigint     |       | start time of the task                          |
| end_time        | bigint     |       | end of time the task                            |
| plan_cpu        | double     |       | number of cpu needed by the task, 100 is 1 core |
| plan_mem        | double     |       | normalized memorty size, [0, 100]               |
+----------------------------------------------------------------------------------------+
'''

In [2]:
data=pd.read_csv(r'data/batch_task.csv', header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M1,1.0,j_1,1,Terminated,419912,419912,100.0,0.2
1,R2_1,1.0,j_2,1,Terminated,87076,87086,50.0,0.2
2,M1,1.0,j_2,1,Terminated,87076,87083,50.0,0.2
3,R6_3,371.0,j_3,1,Terminated,157297,157325,100.0,0.49
4,J4_2_3,1111.0,j_3,1,Terminated,157329,157376,100.0,0.59


In [3]:
len(data)

14295731

In [4]:
data.columns=['task_name', 'instance_name', 'job_name', 'task_type', 'status', 'start_time', 'end_time', 'plan_cpu', 'plan_mem']
data.head()

Unnamed: 0,task_name,instance_name,job_name,task_type,status,start_time,end_time,plan_cpu,plan_mem
0,M1,1.0,j_1,1,Terminated,419912,419912,100.0,0.2
1,R2_1,1.0,j_2,1,Terminated,87076,87086,50.0,0.2
2,M1,1.0,j_2,1,Terminated,87076,87083,50.0,0.2
3,R6_3,371.0,j_3,1,Terminated,157297,157325,100.0,0.49
4,J4_2_3,1111.0,j_3,1,Terminated,157329,157376,100.0,0.59


In [8]:
'''
In this version of cluster data, we include many types of batch workloads.
Most of them are DAGs while some of them are not.
For those tasks that are not DAGs, we name them using random characters, such as task_Nzg3ODAwNDgzMTAwNTc2NTQ2Mw== or task_Nzg3ODAwNDgzMTAwODc2NTQ3MQ==.
These tasks can be treated as independent tasks.
'''
#1.筛选出所有的简单独立任务
data=data[data['task_name'].str.contains("=")]
data.head()

Unnamed: 0,task_name,instance_name,job_name,task_type,status,start_time,end_time,plan_cpu,plan_mem
37,task_LTg0MTUwNTA5Mjg4MDkwNjIzMA==,1.0,j_17,10,Terminated,580371,580515,75.0,0.1
57,task_MTM0ODUxMTY0NjQzMTI1NTc1MQ==,1.0,j_39,6,Terminated,675912,675918,30.0,0.05
58,task_LTE4NjUxMjg5NDY5MDI4NjAzNzU=,1.0,j_39,6,Terminated,675912,675920,5.0,0.03
59,task_ODk2MzU0ODg1MTY5MTExNTUwMg==,1.0,j_39,6,Terminated,675912,675915,10.0,0.05
76,task_LTg0MTUwNTA5Mjg4MDkwNjIzMA==,1.0,j_69,10,Failed,243225,243228,100.0,0.1


In [9]:
len(data)

2058697

In [13]:
data=data.dropna()

In [14]:
len(data)

2013633

In [15]:
data.task_type.value_counts()

task_type
6     1181562
5      378032
10     322928
8       53811
3       34970
11      20885
12       8883
9        6543
4        5519
7         488
2          12
Name: count, dtype: int64

In [16]:
data.status.value_counts()

status
Terminated    1940994
Failed          58875
Running         13764
Name: count, dtype: int64

In [17]:
#2.筛选出所有顺利执行完毕的任务
data=data[data['status']=='Terminated']
data.head()

Unnamed: 0,task_name,instance_name,job_name,task_type,status,start_time,end_time,plan_cpu,plan_mem
37,task_LTg0MTUwNTA5Mjg4MDkwNjIzMA==,1.0,j_17,10,Terminated,580371,580515,75.0,0.1
57,task_MTM0ODUxMTY0NjQzMTI1NTc1MQ==,1.0,j_39,6,Terminated,675912,675918,30.0,0.05
58,task_LTE4NjUxMjg5NDY5MDI4NjAzNzU=,1.0,j_39,6,Terminated,675912,675920,5.0,0.03
59,task_ODk2MzU0ODg1MTY5MTExNTUwMg==,1.0,j_39,6,Terminated,675912,675915,10.0,0.05
104,task_ODk2MzU0ODg1MTY5MTExNTUwMg==,1.0,j_85,6,Terminated,474800,475186,10.0,0.05


In [18]:
len(data)

1940994

In [19]:
data.status.value_counts()

status
Terminated    1940994
Name: count, dtype: int64

In [20]:
#3.计算所有任务的执行时间
data=data[data['end_time']-data['start_time']>0]
data['exec_time']=data['end_time']-data['start_time']
data.head()

Unnamed: 0,task_name,instance_name,job_name,task_type,status,start_time,end_time,plan_cpu,plan_mem,exec_time
37,task_LTg0MTUwNTA5Mjg4MDkwNjIzMA==,1.0,j_17,10,Terminated,580371,580515,75.0,0.1,144
57,task_MTM0ODUxMTY0NjQzMTI1NTc1MQ==,1.0,j_39,6,Terminated,675912,675918,30.0,0.05,6
58,task_LTE4NjUxMjg5NDY5MDI4NjAzNzU=,1.0,j_39,6,Terminated,675912,675920,5.0,0.03,8
59,task_ODk2MzU0ODg1MTY5MTExNTUwMg==,1.0,j_39,6,Terminated,675912,675915,10.0,0.05,3
104,task_ODk2MzU0ODg1MTY5MTExNTUwMg==,1.0,j_85,6,Terminated,474800,475186,10.0,0.05,386


In [28]:
len(data)

1939135

In [29]:
data['plan_cpu']=data['plan_cpu']/100 #单位/个->0.5个cpu核心
data.head()

Unnamed: 0,task_name,instance_name,job_name,task_type,status,start_time,end_time,plan_cpu,plan_mem,exec_time
37,task_LTg0MTUwNTA5Mjg4MDkwNjIzMA==,1.0,j_17,10,Terminated,580371,580515,0.75,0.1,144
57,task_MTM0ODUxMTY0NjQzMTI1NTc1MQ==,1.0,j_39,6,Terminated,675912,675918,0.3,0.05,6
58,task_LTE4NjUxMjg5NDY5MDI4NjAzNzU=,1.0,j_39,6,Terminated,675912,675920,0.05,0.03,8
59,task_ODk2MzU0ODg1MTY5MTExNTUwMg==,1.0,j_39,6,Terminated,675912,675915,0.1,0.05,3
104,task_ODk2MzU0ODg1MTY5MTExNTUwMg==,1.0,j_85,6,Terminated,474800,475186,0.1,0.05,386


In [30]:
data['plan_mem']=data['plan_mem']*1000 #单位Mb
data.head()

Unnamed: 0,task_name,instance_name,job_name,task_type,status,start_time,end_time,plan_cpu,plan_mem,exec_time
37,task_LTg0MTUwNTA5Mjg4MDkwNjIzMA==,1.0,j_17,10,Terminated,580371,580515,0.75,100.0,144
57,task_MTM0ODUxMTY0NjQzMTI1NTc1MQ==,1.0,j_39,6,Terminated,675912,675918,0.3,50.0,6
58,task_LTE4NjUxMjg5NDY5MDI4NjAzNzU=,1.0,j_39,6,Terminated,675912,675920,0.05,30.0,8
59,task_ODk2MzU0ODg1MTY5MTExNTUwMg==,1.0,j_39,6,Terminated,675912,675915,0.1,50.0,3
104,task_ODk2MzU0ODg1MTY5MTExNTUwMg==,1.0,j_85,6,Terminated,474800,475186,0.1,50.0,386


In [35]:
data=data[['exec_time', 'plan_mem', 'instance_name', 'plan_cpu']]
data.head()

Unnamed: 0,exec_time,plan_mem,instance_name,plan_cpu
37,144,100.0,1.0,0.75
57,6,50.0,1.0,0.3
58,8,30.0,1.0,0.05
59,3,50.0,1.0,0.1
104,386,50.0,1.0,0.1


In [37]:
data.rename(columns={'instance_name':'instance_num'}, inplace=True)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.rename(columns={'instance_name':'instance_num'}, inplace=True)


Unnamed: 0,exec_time,plan_mem,instance_num,plan_cpu
37,144,100.0,1.0,0.75
57,6,50.0,1.0,0.3
58,8,30.0,1.0,0.05
59,3,50.0,1.0,0.1
104,386,50.0,1.0,0.1


In [40]:
data.reset_index(drop=True, inplace=True)
data.head()

Unnamed: 0,exec_time,plan_mem,instance_num,plan_cpu
0,144,100.0,1.0,0.75
1,6,50.0,1.0,0.3
2,8,30.0,1.0,0.05
3,3,50.0,1.0,0.1
4,386,50.0,1.0,0.1


In [52]:
len(data)

1939135

In [43]:
data.exec_time.value_counts().sort_index().to_csv("temp/task_exec_time.csv")
data.plan_mem.value_counts().sort_index().to_csv("temp/task_plan_mem.csv")
data.plan_cpu.value_counts().sort_index().to_csv("temp/task_plan_cpu.csv")
data.instance_num.value_counts().sort_index().to_csv("temp/task_instance_num.csv")

In [44]:
data.to_pickle("temp/data-v1.0.pkl")

In [45]:
data.to_csv("temp/data-v1.0.csv")

In [51]:
data=pd.read_pickle(r"temp/data-v1.0.pkl")
data.head()

Unnamed: 0,exec_time,plan_mem,instance_num,plan_cpu
0,144,100.0,1.0,0.75
1,6,50.0,1.0,0.3
2,8,30.0,1.0,0.05
3,3,50.0,1.0,0.1
4,386,50.0,1.0,0.1


In [17]:
data.exec_time.mean()

233.25271113150967

In [18]:
data.exec_time.median()

39.0

In [19]:
data.instance_num.mean()

20.442107950194288

In [20]:
data.instance_num.median()

1.0

In [21]:
data.instance_num.quantile(0.9)

6.0

In [23]:
data.plan_mem.value_counts().head(20)

plan_mem
50.0      792315
30.0      378385
1720.0    363815
100.0     276758
590.0      42159
390.0      30259
190.0      10371
3440.0      5969
1530.0      5949
2580.0      4510
200.0       2927
960.0       2913
790.0       2477
210.0       2420
370.0       2233
160.0       2141
770.0       1753
1340.0      1169
780.0       1048
80.0         772
Name: count, dtype: int64

In [53]:
#4.筛选出执行时间<40的所有任务(依据中位数)
data=data[data['exec_time']<40]
#5.筛选出实例个数<=5的所有任务
data=data[data['instance_num']<=5]
#6.筛选出CPU核数<=1.0的所有任务
data=data[data['plan_cpu']<=1.0]
#7.筛选出数据量<=100M的所有任务
data=data[data['plan_mem']<=2000.0]
data.head()

Unnamed: 0,exec_time,plan_mem,instance_num,plan_cpu
1,6,50.0,1.0,0.3
2,8,30.0,1.0,0.05
3,3,50.0,1.0,0.1
6,4,390.0,1.0,1.0
7,7,50.0,1.0,0.1


In [54]:
len(data)

783249

In [86]:
data.exec_time.value_counts().sort_index()

exec_time
1.0        146
2.0      27014
3.0     143561
4.0      81141
5.0      45184
6.0      55369
7.0      45081
8.0      38526
9.0      42015
10.0     35394
11.0     28774
12.0     24471
13.0     20162
14.0     17238
15.0     15179
16.0     13874
17.0     12714
18.0     11492
19.0     10680
20.0      9755
21.0      9034
22.0      8289
23.0      7880
24.0      7188
25.0      6670
26.0      6241
27.0      5991
28.0      5659
29.0      5327
30.0      5013
31.0      5075
32.0      4731
33.0      4572
34.0      4424
35.0      4134
36.0      4024
37.0      3911
38.0      3750
39.0      3566
Name: count, dtype: int64

In [88]:
data.inst_num.value_counts().sort_index()

inst_num
1.0    752316
2.0     13520
3.0      7876
4.0      4890
5.0      4647
Name: count, dtype: int64

In [89]:
data.plan_cpu.value_counts().sort_index()

plan_cpu
0.05    144728
0.10    317279
0.30    182947
0.50      4558
0.60       297
0.75      1018
1.00    132422
Name: count, dtype: int64

In [90]:
data.data_vol.value_counts().sort_index()

data_vol
20.0         352
30.0      144749
40.0          18
50.0      500330
60.0          13
70.0          10
80.0           5
90.0           2
100.0     106440
160.0        210
180.0          6
190.0       6518
200.0       1843
210.0          3
240.0          1
290.0          8
310.0          2
350.0          1
360.0          6
370.0        454
390.0      10183
400.0          1
480.0         18
490.0          8
580.0          2
590.0       8294
770.0        817
780.0        174
790.0        660
960.0        349
1150.0         7
1340.0        91
1530.0      1667
1910.0         7
Name: count, dtype: int64

In [80]:
#8.添加每个任务的软截止期->[0.25,0.5]
data['soft_ddl']=data['exec_time'].apply(lambda x: np.ceil(max(1.0, np.random.uniform(0.25, 0.5)*x)))
data.head()

Unnamed: 0,exec_time,plan_mem,instance_num,plan_cpu,soft_ddl
1,6,50.0,1.0,0.3,3.0
2,8,30.0,1.0,0.05,3.0
3,3,50.0,1.0,0.1,2.0
6,4,390.0,1.0,1.0,2.0
7,7,50.0,1.0,0.1,3.0


In [95]:
data.soft_ddl.value_counts()

soft_ddl
2.0     228869
3.0     124074
4.0      91158
1.0      74938
5.0      64815
6.0      44559
7.0      32343
8.0      26023
9.0      21441
10.0     17880
11.0     14343
12.0     11436
13.0      9063
14.0      7051
15.0      5295
16.0      4062
17.0      2879
18.0      1868
19.0       957
20.0       195
Name: count, dtype: int64

In [93]:
data.dtypes

exec_time    float64
data_vol     float64
inst_num     float64
plan_cpu     float64
soft_ddl     float64
dtype: object

In [84]:
data.rename(columns={'instance_num': 'inst_num', 'plan_mem': 'data_vol'}, inplace=True)
data['exec_time']=data['exec_time'].astype(np.float64)
data.head()

Unnamed: 0,exec_time,data_vol,inst_num,plan_cpu,soft_ddl
1,6.0,50.0,1.0,0.3,3.0
2,8.0,30.0,1.0,0.05,3.0
3,3.0,50.0,1.0,0.1,2.0
6,4.0,390.0,1.0,1.0,2.0
7,7.0,50.0,1.0,0.1,3.0


In [94]:
len(data)

783249

In [97]:
data.to_csv("temp/data-v2.0.csv")
data.to_pickle("temp/data-v2.0.pkl")

In [2]:
data=pd.read_pickle('temp/data-v2.0.pkl')
data.head(10)

Unnamed: 0,exec_time,data_vol,inst_num,plan_cpu,soft_ddl
1,6.0,50.0,1.0,0.3,3.0
2,8.0,30.0,1.0,0.05,3.0
3,3.0,50.0,1.0,0.1,2.0
6,4.0,390.0,1.0,1.0,2.0
7,7.0,50.0,1.0,0.1,3.0
10,10.0,100.0,1.0,1.0,4.0
11,5.0,100.0,1.0,1.0,2.0
12,4.0,50.0,1.0,0.1,2.0
16,3.0,50.0,1.0,0.1,2.0
18,4.0,50.0,1.0,0.1,2.0


In [3]:
len(data)

783249

In [4]:
data.reset_index(drop=True, inplace=True)
data.head(10)

Unnamed: 0,exec_time,data_vol,inst_num,plan_cpu,soft_ddl
0,6.0,50.0,1.0,0.3,3.0
1,8.0,30.0,1.0,0.05,3.0
2,3.0,50.0,1.0,0.1,2.0
3,4.0,390.0,1.0,1.0,2.0
4,7.0,50.0,1.0,0.1,3.0
5,10.0,100.0,1.0,1.0,4.0
6,5.0,100.0,1.0,1.0,2.0
7,4.0,50.0,1.0,0.1,2.0
8,3.0,50.0,1.0,0.1,2.0
9,4.0,50.0,1.0,0.1,2.0


In [8]:
data.tail(10)

Unnamed: 0,exec_time,data_vol,inst_num,plan_cpu,soft_ddl
783239,19.0,50.0,1.0,0.3,5.0
783240,18.0,50.0,1.0,0.3,7.0
783241,5.0,50.0,1.0,0.1,2.0
783242,20.0,30.0,1.0,0.05,7.0
783243,7.0,50.0,1.0,0.1,3.0
783244,5.0,50.0,1.0,0.1,2.0
783245,12.0,30.0,1.0,0.05,5.0
783246,3.0,50.0,1.0,0.1,2.0
783247,9.0,50.0,1.0,0.3,3.0
783248,20.0,100.0,4.0,1.0,7.0


In [22]:
data.exec_time.value_counts().sort_index()

exec_time
1.0        145
2.0      26909
3.0     140779
4.0      75527
5.0      41229
6.0      53276
7.0      43324
8.0      37037
9.0      40816
10.0     34347
11.0     27804
12.0     23623
13.0     19452
14.0     16600
15.0     14632
16.0     13354
17.0     12222
18.0     11041
19.0     10248
20.0      9283
21.0      8589
22.0      7945
23.0      7578
24.0      6888
25.0      6383
26.0      5961
27.0      5718
28.0      5377
29.0      5030
30.0      4715
31.0      4783
32.0      4455
33.0      4283
34.0      4156
35.0      3897
36.0      3796
37.0      3737
38.0      3568
39.0      3412
Name: count, dtype: int64

In [21]:
data.data_vol.value_counts().sort_index()

data_vol
20.0        352
30.0     144749
40.0         18
50.0     500330
60.0         13
70.0         10
80.0          5
90.0          2
100.0    106440
Name: count, dtype: int64

In [20]:
data.inst_num.value_counts().sort_index()

inst_num
1.0    726049
2.0     10989
3.0      7283
4.0      4279
5.0      3319
Name: count, dtype: int64

In [19]:
data.plan_cpu.value_counts()

plan_cpu
0.10    317279
0.30    182947
0.05    144728
1.00    105947
0.75      1018
Name: count, dtype: int64

In [18]:
data.soft_ddl.value_counts().sort_index()

soft_ddl
1.0      73873
2.0     218139
3.0     118634
4.0      88037
5.0      62705
6.0      42852
7.0      31057
8.0      24929
9.0      20485
10.0     16992
11.0     13640
12.0     10872
13.0      8600
14.0      6658
15.0      5015
16.0      3807
17.0      2741
18.0      1792
19.0       906
20.0       185
Name: count, dtype: int64

In [17]:
data.head(15)

Unnamed: 0,exec_time,data_vol,inst_num,plan_cpu,soft_ddl
0,6.0,50.0,1.0,0.3,3.0
1,8.0,30.0,1.0,0.05,3.0
2,3.0,50.0,1.0,0.1,2.0
4,7.0,50.0,1.0,0.1,3.0
5,10.0,100.0,1.0,1.0,4.0
6,5.0,100.0,1.0,1.0,2.0
7,4.0,50.0,1.0,0.1,2.0
8,3.0,50.0,1.0,0.1,2.0
9,4.0,50.0,1.0,0.1,2.0
10,4.0,50.0,1.0,0.1,2.0


In [14]:
data.to_pickle(r"outData/task.pkl")

In [15]:
data=pd.read_pickle(r"outData/task.pkl")
data.head()

Unnamed: 0,exec_time,data_vol,inst_num,plan_cpu,soft_ddl
0,6.0,50.0,1.0,0.3,3.0
1,8.0,30.0,1.0,0.05,3.0
2,3.0,50.0,1.0,0.1,2.0
4,7.0,50.0,1.0,0.1,3.0
5,10.0,100.0,1.0,1.0,4.0


In [16]:
len(data)

751919