In [29]:
import time
import numpy as np
import pandas as pd
# Use the fully-qualified option names: in newer pandas the bare patterns
# 'max_columns' / 'max_rows' also match the 'styler.render.*' options, which
# makes set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import gc
import warnings
warnings.simplefilter('ignore')

In [2]:
def arithmetic(df, cross_features):
    """
    Add pairwise add / subtract / multiply / ratio features between numeric columns.

    For every unordered pair (a, b) taken in list order, three columns are
    created: ``a_b_add``, ``a_b_subtract`` and ``a_b_multiply``. For every
    ordered pair of different names a ``a_b_ratio`` column is created as
    ``a / (b + 0.001)``.

    @param df: DataFrame containing every column in ``cross_features``;
               the new columns are written into this object in place
    @param cross_features: list of numeric column names to cross
    @return: tuple ``(df, cols)`` - the same DataFrame object and the list of
             newly created column names, in creation order
    """
    cols = []
    for i in tqdm(range(len(cross_features))):
        left = cross_features[i]
        for j in range(i + 1, len(cross_features)):
            right = cross_features[j]

            colname_add = '{}_{}_add'.format(left, right)
            colname_subtract = '{}_{}_subtract'.format(left, right)
            # The original template was '{}_{}c_multiply' (stray 'c'), which
            # produced names inconsistent with the other generated columns.
            colname_multiply = '{}_{}_multiply'.format(left, right)
            cols.extend([colname_add, colname_subtract, colname_multiply])

            df[colname_add] = df[left] + df[right]
            df[colname_subtract] = df[left] - df[right]
            df[colname_multiply] = df[left] * df[right]

    for f1 in tqdm(cross_features):
        for f2 in cross_features:
            if f1 != f2:
                colname_ratio = '{}_{}_ratio'.format(f1, f2)
                cols.append(colname_ratio)
                # The 0.001 offset keeps a zero denominator from dividing by zero.
                df[colname_ratio] = df[f1].values / (df[f2].values + 0.001)
    return df, cols

In [3]:
# Training data, ordered by DOTTING_TIME within each queue.
train = (
    pd.read_csv('../input/train.csv')
    .sort_values(by=['QUEUE_ID', 'DOTTING_TIME'])
    .reset_index(drop=True)
)

# Evaluation data, ordered by DOTTING_TIME within each sample ID.
test = (
    pd.read_csv('../input/evaluation_public.csv')
    .sort_values(by=['ID', 'DOTTING_TIME'])
    .reset_index(drop=True)
)

# Example submission; its column layout is reused for the final output.
sub_sample = pd.read_csv('../input/submit_example.csv')

In [4]:
# Preview the first rows of the sorted training data.
train.head()

Unnamed: 0,QUEUE_ID,CU,STATUS,QUEUE_TYPE,PLATFORM,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DOTTING_TIME,RESOURCE_TYPE,DISK_USAGE
0,2,16,available,sql,x86_64,3,54,0,0,0,0,0,1590683100000,vm,20.0
1,2,16,available,sql,x86_64,2,54,0,0,0,0,0,1590683400000,vm,20.0
2,2,16,available,sql,x86_64,7,54,0,0,0,0,0,1590683700000,vm,20.0
3,2,16,available,sql,x86_64,4,54,0,0,0,0,0,1590684000000,vm,20.0
4,2,16,available,sql,x86_64,5,54,0,0,0,0,0,1590684120000,vm,20.0


In [5]:
# Preview the first rows of the sorted evaluation data.
test.head()

Unnamed: 0,ID,QUEUE_ID,CU,STATUS,QUEUE_TYPE,PLATFORM,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DOTTING_TIME,RESOURCE_TYPE,DISK_USAGE
0,1,297,16,available,sql,x86_64,60,69,0,5,5,0,0,1662213420000,vm,9
1,1,297,16,available,sql,x86_64,58,69,0,9,4,0,0,1662213720000,vm,9
2,1,297,16,available,sql,x86_64,80,67,0,9,1,0,0,1662214020000,vm,9
3,1,297,16,available,sql,x86_64,100,65,0,7,2,0,1,1662214320000,vm,9
4,1,297,16,available,sql,x86_64,98,67,0,10,3,0,1,1662214620000,vm,9


In [6]:
# Preview the expected submission layout.
sub_sample.head()

Unnamed: 0,ID,CPU_USAGE_1,LAUNCHING_JOB_NUMS_1,CPU_USAGE_2,LAUNCHING_JOB_NUMS_2,CPU_USAGE_3,LAUNCHING_JOB_NUMS_3,CPU_USAGE_4,LAUNCHING_JOB_NUMS_4,CPU_USAGE_5,LAUNCHING_JOB_NUMS_5
0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,0


In [7]:
# (rows, columns) of the three loaded frames.
train.shape, test.shape, sub_sample.shape

((501730, 15), (14980, 16), (2996, 11))

In [8]:
# Columns removed from both frames before feature engineering.
to_drop = ['STATUS', 'PLATFORM', 'RESOURCE_TYPE']
for frame in (train, test):
    frame.drop(columns=to_drop, inplace=True)
gc.collect()

80

In [9]:
# The rows were already ordered by DOTTING_TIME when the data was loaded;
# remove the column from both frames.
for frame in (train, test):
    frame.drop(columns='DOTTING_TIME', inplace=True)
gc.collect()

20

In [10]:
# Numeric columns used as the basis for feature engineering.
num_cols = [
    'CU',
    'CPU_USAGE',
    'MEM_USAGE',
    'LAUNCHING_JOB_NUMS',
    'RUNNING_JOB_NUMS',
    'SUCCEED_JOB_NUMS',
    'CANCELLED_JOB_NUMS',
    'FAILED_JOB_NUMS',
    'DISK_USAGE',
]

In [11]:
# Label-encode QUEUE_TYPE with an encoder fitted on the training values,
# then store the codes as a pandas categorical in both frames.
# NOTE(review): transform() raises on labels absent from train - confirm test
# never contains an unseen QUEUE_TYPE.
le = LabelEncoder()
le.fit(train['QUEUE_TYPE'].astype(str))
for frame in (train, test):
    encoded = le.transform(frame['QUEUE_TYPE'].astype(str))
    frame['QUEUE_TYPE'] = encoded
    frame['QUEUE_TYPE'] = frame['QUEUE_TYPE'].astype('category')

In [12]:
# 1 CU = 1 CPU + 4G MEM, so scale the usage columns by the queue's CU count.
for frame in (train, test):
    frame['used_cpu'] = frame['CU'] * frame['CPU_USAGE']
    frame['used_mem'] = frame['CU'] * 4 * frame['MEM_USAGE']

In [13]:
# Register the derived columns as numeric features as well.
# Note: re-running this cell appends the names again.
num_cols.extend(['used_cpu', 'used_mem'])

In [14]:
# train, cols = arithmetic(train, num_cols)
# test, _ = arithmetic(test, num_cols)
# num_cols += cols

In [15]:
# Per-queue differences of every numeric column against neighbouring rows.
# Positive periods compare with earlier rows, negative periods with later rows.
# NOTE(review): the test frame is sorted by ID but grouped by QUEUE_ID here, so
# a diff can span rows belonging to different IDs - confirm this is intended.
diff_periods = [1, 2, 3, -1, -2, -3]
for data in [train, test]:
    for col in num_cols:
        col_by_queue = data.groupby('QUEUE_ID')[col]
        for n in diff_periods:
            data[col + '_diff{}'.format(n)] = col_by_queue.diff(n)
# (A matching '_shift{}' feature via .shift(n) was tried and left disabled.)

In [16]:
# Add a row id (the current index) to make later sorting and merging easier.
for frame in (train, test):
    frame['myid'] = frame.index

In [17]:
# Build the target columns: for each row, the value found 5 rows later within
# the same QUEUE_ID (rows near the end of a queue get NaN).
# The original appended one per-queue slice at a time with DataFrame.append,
# which is quadratic, was removed in pandas 2.0, and wrote into a filtered
# slice. A grouped shift gives the same rows in the same order because `train`
# is already sorted by QUEUE_ID.
df_train = train.copy()
for source_col in ['CPU_USAGE', 'LAUNCHING_JOB_NUMS']:
    df_train[source_col + '_next25mins'] = df_train.groupby('QUEUE_ID')[source_col].shift(-5)

HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))




In [18]:
# Keep only rows that have a CPU target (the shift leaves NaN at each queue's tail).
has_cpu_target = df_train['CPU_USAGE_next25mins'].notna()
df_train = df_train[has_cpu_target]
print(df_train.shape)
df_train.head()

(501515, 82)


Unnamed: 0,QUEUE_ID,CU,QUEUE_TYPE,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DISK_USAGE,used_cpu,used_mem,CU_diff1,CU_diff2,CU_diff3,CU_diff-1,CU_diff-2,CU_diff-3,CPU_USAGE_diff1,CPU_USAGE_diff2,CPU_USAGE_diff3,CPU_USAGE_diff-1,CPU_USAGE_diff-2,CPU_USAGE_diff-3,MEM_USAGE_diff1,MEM_USAGE_diff2,MEM_USAGE_diff3,MEM_USAGE_diff-1,MEM_USAGE_diff-2,MEM_USAGE_diff-3,LAUNCHING_JOB_NUMS_diff1,LAUNCHING_JOB_NUMS_diff2,LAUNCHING_JOB_NUMS_diff3,LAUNCHING_JOB_NUMS_diff-1,LAUNCHING_JOB_NUMS_diff-2,LAUNCHING_JOB_NUMS_diff-3,RUNNING_JOB_NUMS_diff1,RUNNING_JOB_NUMS_diff2,RUNNING_JOB_NUMS_diff3,RUNNING_JOB_NUMS_diff-1,RUNNING_JOB_NUMS_diff-2,RUNNING_JOB_NUMS_diff-3,SUCCEED_JOB_NUMS_diff1,SUCCEED_JOB_NUMS_diff2,SUCCEED_JOB_NUMS_diff3,SUCCEED_JOB_NUMS_diff-1,SUCCEED_JOB_NUMS_diff-2,SUCCEED_JOB_NUMS_diff-3,CANCELLED_JOB_NUMS_diff1,CANCELLED_JOB_NUMS_diff2,CANCELLED_JOB_NUMS_diff3,CANCELLED_JOB_NUMS_diff-1,CANCELLED_JOB_NUMS_diff-2,CANCELLED_JOB_NUMS_diff-3,FAILED_JOB_NUMS_diff1,FAILED_JOB_NUMS_diff2,FAILED_JOB_NUMS_diff3,FAILED_JOB_NUMS_diff-1,FAILED_JOB_NUMS_diff-2,FAILED_JOB_NUMS_diff-3,DISK_USAGE_diff1,DISK_USAGE_diff2,DISK_USAGE_diff3,DISK_USAGE_diff-1,DISK_USAGE_diff-2,DISK_USAGE_diff-3,used_cpu_diff1,used_cpu_diff2,used_cpu_diff3,used_cpu_diff-1,used_cpu_diff-2,used_cpu_diff-3,used_mem_diff1,used_mem_diff2,used_mem_diff3,used_mem_diff-1,used_mem_diff-2,used_mem_diff-3,myid,CPU_USAGE_next25mins,LAUNCHING_JOB_NUMS_next25mins
0,2,16,2,3,54,0,0,0,0,0,20.0,48,3456,,,,0.0,0.0,0.0,,,,1.0,-4.0,-1.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,,16.0,-64.0,-16.0,,,,0.0,0.0,0.0,0,3.0,0.0
1,2,16,2,2,54,0,0,0,0,0,20.0,32,3456,0.0,,,0.0,0.0,0.0,-1.0,,,-5.0,-2.0,-3.0,0.0,,,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,-16.0,,,-80.0,-32.0,-48.0,0.0,,,0.0,0.0,0.0,1,2.0,0.0
2,2,16,2,7,54,0,0,0,0,0,20.0,112,3456,0.0,0.0,,0.0,0.0,0.0,5.0,4.0,,3.0,2.0,4.0,0.0,0.0,,0.0,0.0,-1.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,80.0,64.0,,48.0,32.0,64.0,0.0,0.0,,0.0,0.0,-64.0,2,2.0,0.0
3,2,16,2,4,54,0,0,0,0,0,20.0,64,3456,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,2.0,1.0,-1.0,1.0,2.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-48.0,32.0,16.0,-16.0,16.0,32.0,0.0,0.0,0.0,0.0,-64.0,0.0,3,5.0,0.0
4,2,16,2,5,54,0,0,0,0,0,20.0,80,3456,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-2.0,3.0,2.0,3.0,3.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,-32.0,48.0,32.0,48.0,48.0,0.0,0.0,0.0,-64.0,0.0,0.0,4,6.0,0.0


In [19]:
# Every column except the two targets and the identifier columns.
used_cols = list(
    filter(
        lambda name: name not in {'CPU_USAGE_next25mins',
                                  'LAUNCHING_JOB_NUMS_next25mins',
                                  'QUEUE_ID',
                                  'myid'},
        df_train.columns,
    )
)

In [20]:
def run_lgb_qid(df_train, df_test, target, qid):
    """
    Fit a 5-fold LightGBM regressor for one queue and predict that queue's test rows.

    @param df_train: training frame holding the feature columns, both
                     ``*_next25mins`` target columns, ``QUEUE_ID`` and ``myid``
    @param df_test: test frame holding the same feature columns plus
                    ``ID``, ``QUEUE_ID`` and ``myid``
    @param target: name of the ``df_train`` column to regress on
    @param qid: QUEUE_ID value selecting the rows used for training and prediction
    @return: tuple ``(prediction, score)``; ``prediction`` has the columns ``ID``,
             ``QUEUE_ID``, ``myid`` and ``target`` (mean of the fold models' test
             predictions), ``score`` is the out-of-fold MSE on the queue's
             training rows
    """
    # Targets and identifiers are never features. 'CU' and 'QUEUE_TYPE' are
    # excluded too - presumably because they are constant within one queue
    # (TODO confirm).
    excluded = ['CPU_USAGE_next25mins',
                'LAUNCHING_JOB_NUMS_next25mins',
                'QUEUE_ID',
                'myid',
                'CU',
                'QUEUE_TYPE']
    feature_names = [col for col in df_train.columns if col not in excluded]

    # Restrict both frames to the requested QUEUE_ID.
    df_train = df_train[df_train.QUEUE_ID == qid]
    df_test = df_test[df_test.QUEUE_ID == qid]

    print(f"QUEUE_ID:{qid}, target:{target}, train:{len(df_train)}, test:{len(df_test)}")

    model = lgb.LGBMRegressor(num_leaves=32,
                              max_depth=6,
                              learning_rate=0.08,
                              n_estimators=10000,
                              subsample=0.9,
                              feature_fraction=0.8,
                              reg_alpha=0.5,
                              reg_lambda=0.8,
                              random_state=2020)
    oof = []
    # Copy so the prediction column is written to an independent frame, not to
    # a slice of df_test (avoids SettingWithCopy and silent no-op writes).
    prediction = df_test[['ID', 'QUEUE_ID', 'myid']].copy()
    prediction[target] = 0.0

    # No random_state: without shuffle=True it has no effect, and
    # scikit-learn >= 0.24 raises ValueError when it is passed. Folds stay
    # contiguous and unshuffled, as in the original run.
    kfold = KFold(n_splits=5)
    for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train, df_train[target])):

        X_train = df_train.iloc[trn_idx][feature_names]
        Y_train = df_train.iloc[trn_idx][target]
        X_val = df_train.iloc[val_idx][feature_names]
        Y_val = df_train.iloc[val_idx][target]

        # NOTE(review): the `verbose` and `early_stopping_rounds` fit keywords
        # were removed in LightGBM 4.0; switch to `callbacks=[...]` when upgrading.
        lgb_model = model.fit(X_train,
                              Y_train,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train, Y_train), (X_val, Y_val)],
                              verbose=0,
                              eval_metric='mse',
                              early_stopping_rounds=20)

        # Out-of-fold predictions for this fold's validation rows.
        pred_val = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration_)
        df_oof = df_train.iloc[val_idx][[target, 'myid', 'QUEUE_ID']].copy()
        df_oof['pred'] = pred_val
        oof.append(df_oof)

        # Accumulate the mean of the fold models' test predictions.
        pred_test = lgb_model.predict(df_test[feature_names], num_iteration=lgb_model.best_iteration_)
        prediction[target] += pred_test / kfold.n_splits

        del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
        gc.collect()

    df_oof = pd.concat(oof)
    score = mean_squared_error(df_oof[target], df_oof['pred'])
    print('MSE:', score)

    return prediction, score

In [21]:
# Result holders: suffix 1 = CPU_USAGE target, suffix 2 = LAUNCHING_JOB_NUMS target.
# oofs1 / oofs2 are never filled because run_lgb_qid does not return its OOF frames.
oofs1, oofs2 = [], []
predictions1, predictions2 = [], []
scores1, scores2 = [], []

# Train one pair of models for every queue that appears in the test set.
for qid in tqdm(test.QUEUE_ID.unique()):
    prediction1, score1 = run_lgb_qid(df_train, test, qid=qid, target='CPU_USAGE_next25mins')
    predictions1.append(prediction1)
    scores1.append(score1)

    prediction2, score2 = run_lgb_qid(df_train, test, qid=qid, target='LAUNCHING_JOB_NUMS_next25mins')
    predictions2.append(prediction2)
    scores2.append(score2)

HBox(children=(FloatProgress(value=0.0, max=23.0), HTML(value='')))

QUEUE_ID:297, target:CPU_USAGE_next25mins, train:21179, test:5710
MSE: 157.33399466256634
QUEUE_ID:297, target:LAUNCHING_JOB_NUMS_next25mins, train:21179, test:5710
MSE: 5.903886280599869
QUEUE_ID:85153, target:CPU_USAGE_next25mins, train:14348, test:1950
MSE: 195.53098027907492
QUEUE_ID:85153, target:LAUNCHING_JOB_NUMS_next25mins, train:14348, test:1950
MSE: 14.197868055814832
QUEUE_ID:291, target:CPU_USAGE_next25mins, train:8879, test:285
MSE: 139.98987006843294
QUEUE_ID:291, target:LAUNCHING_JOB_NUMS_next25mins, train:8879, test:285
MSE: 0.048246527110459916
QUEUE_ID:21487, target:CPU_USAGE_next25mins, train:28926, test:2235
MSE: 67.08665959071062
QUEUE_ID:21487, target:LAUNCHING_JOB_NUMS_next25mins, train:28926, test:2235
MSE: 11.995019059739265
QUEUE_ID:85265, target:CPU_USAGE_next25mins, train:13511, test:95
MSE: 1.0121136871866578
QUEUE_ID:85265, target:LAUNCHING_JOB_NUMS_next25mins, train:13511, test:95
MSE: 21.520918897127494
QUEUE_ID:4, target:CPU_USAGE_next25mins, train:1925

In [22]:
# Unweighted mean of the per-queue out-of-fold MSEs (CPU target, launching-jobs target).
print(np.mean(scores1), np.mean(scores2))

46.89852581694235 5.13971907483116


In [23]:
# Stack the per-queue prediction frames and restore the original test row order.
# Note: this rebinds the lists to DataFrames, so the cell cannot be re-run on its own.
predictions1 = pd.concat(predictions1).sort_values(by='myid').reset_index(drop=True)
predictions2 = pd.concat(predictions2).sort_values(by='myid').reset_index(drop=True)

In [24]:
# Attach the launching-jobs prediction to the CPU prediction, matching rows on myid.
launching_pred = predictions2[['myid', 'LAUNCHING_JOB_NUMS_next25mins']]
prediction = predictions1.copy().merge(launching_pred, on='myid')

prediction.head()

Unnamed: 0,ID,QUEUE_ID,myid,CPU_USAGE_next25mins,LAUNCHING_JOB_NUMS_next25mins
0,1,297,0,37.07821,0.166071
1,1,297,1,40.870592,0.171553
2,1,297,2,6.174733,0.026735
3,1,297,3,75.37215,0.525696
4,1,297,4,69.31049,0.026656


In [25]:
# Note: the submission requires predictions to be non-negative integers,
# so round down, replace negatives with 0, then cast to int.
for target_col in ['CPU_USAGE_next25mins', 'LAUNCHING_JOB_NUMS_next25mins']:
    floored = np.floor(prediction[target_col])
    prediction[target_col] = floored.clip(lower=0).astype(int)

In [26]:
# One output row per test ID: [ID, cpu_1, launching_1, cpu_2, launching_2, ...],
# with that ID's rows taken in myid order.
preds = []
value_cols = ['CPU_USAGE_next25mins', 'LAUNCHING_JOB_NUMS_next25mins']

# sort=False keeps the IDs in order of first appearance, like ID.unique().
for id_, rows_of_id in tqdm(prediction.groupby('ID', sort=False)):
    ordered = rows_of_id.sort_values(by='myid')
    items = [id_]
    # Row-major flattening interleaves the two predictions for each row.
    items.extend(ordered[value_cols].to_numpy().ravel())
    preds.append(items)

HBox(children=(FloatProgress(value=0.0, max=2996.0), HTML(value='')))




In [27]:
# Assemble the submission with the example file's column names.
# This needs every row of `preds` to have as many items as sub_sample has columns.
sub = pd.DataFrame(preds, columns=sub_sample.columns)

sub.head(10)

Unnamed: 0,ID,CPU_USAGE_1,LAUNCHING_JOB_NUMS_1,CPU_USAGE_2,LAUNCHING_JOB_NUMS_2,CPU_USAGE_3,LAUNCHING_JOB_NUMS_3,CPU_USAGE_4,LAUNCHING_JOB_NUMS_4,CPU_USAGE_5,LAUNCHING_JOB_NUMS_5
0,1,37,0,40,0,6,0,75,0,69,0
1,2,15,0,27,0,23,0,22,0,16,0
2,3,17,0,40,0,29,0,14,0,29,0
3,4,22,0,24,0,11,0,28,0,15,0
4,5,3,0,5,0,50,0,44,0,51,0
5,6,13,0,12,0,14,0,14,0,15,0
6,7,8,0,6,0,23,0,38,0,12,0
7,8,0,37,1,37,1,18,0,6,0,6
8,9,4,0,3,0,3,0,3,0,3,0
9,10,25,0,17,0,7,0,7,0,6,0


In [30]:
# Write the submission to a file stamped with today's date (YYYYMMDD).
date_stamp = time.strftime('%Y%m%d')
submission_path = '../sub/baseline_{}.csv'.format(date_stamp)
sub.to_csv(submission_path, index=False)