In [1]:
import numpy as np
import pandas as pd
# Use the fully-qualified option names: the short forms 'max_columns' /
# 'max_rows' match more than one option on recent pandas releases
# (display.* and styler.render.*) and raise OptionError there.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import gc
import warnings
warnings.simplefilter('ignore')

In [2]:
# Training data, ordered by QUEUE_ID and then DOTTING_TIME, with a fresh 0..n-1 index.
train = (
    pd.read_csv('../input/train.csv')
    .sort_values(by=['QUEUE_ID', 'DOTTING_TIME'])
    .reset_index(drop=True)
)

# Evaluation data, ordered by ID and then DOTTING_TIME, with a fresh 0..n-1 index.
test = (
    pd.read_csv('../input/evaluation_public.csv')
    .sort_values(by=['ID', 'DOTTING_TIME'])
    .reset_index(drop=True)
)

# Example submission file, loaded as-is.
sub_sample = pd.read_csv('../input/submit_example.csv')

In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,QUEUE_ID,CU,STATUS,QUEUE_TYPE,PLATFORM,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DOTTING_TIME,RESOURCE_TYPE,DISK_USAGE
0,2,16,available,sql,x86_64,3,54,0,0,0,0,0,1590683100000,vm,20.0
1,2,16,available,sql,x86_64,2,54,0,0,0,0,0,1590683400000,vm,20.0
2,2,16,available,sql,x86_64,7,54,0,0,0,0,0,1590683700000,vm,20.0
3,2,16,available,sql,x86_64,4,54,0,0,0,0,0,1590684000000,vm,20.0
4,2,16,available,sql,x86_64,5,54,0,0,0,0,0,1590684120000,vm,20.0


In [4]:
# Preview the first rows of the evaluation frame.
test.head()

Unnamed: 0,ID,QUEUE_ID,CU,STATUS,QUEUE_TYPE,PLATFORM,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DOTTING_TIME,RESOURCE_TYPE,DISK_USAGE
0,1,297,16,available,sql,x86_64,60,69,0,5,5,0,0,1662213420000,vm,9
1,1,297,16,available,sql,x86_64,58,69,0,9,4,0,0,1662213720000,vm,9
2,1,297,16,available,sql,x86_64,80,67,0,9,1,0,0,1662214020000,vm,9
3,1,297,16,available,sql,x86_64,100,65,0,7,2,0,1,1662214320000,vm,9
4,1,297,16,available,sql,x86_64,98,67,0,10,3,0,1,1662214620000,vm,9


In [5]:
# Preview the first rows of the example submission.
sub_sample.head()

Unnamed: 0,ID,CPU_USAGE_1,LAUNCHING_JOB_NUMS_1,CPU_USAGE_2,LAUNCHING_JOB_NUMS_2,CPU_USAGE_3,LAUNCHING_JOB_NUMS_3,CPU_USAGE_4,LAUNCHING_JOB_NUMS_4,CPU_USAGE_5,LAUNCHING_JOB_NUMS_5
0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,0


In [6]:
# (rows, columns) of the train, test and example-submission frames.
train.shape, test.shape, sub_sample.shape

((501730, 15), (14980, 16), (2996, 11))

In [7]:
# Columns removed from both frames before any features are built.
to_drop = ['STATUS', 'PLATFORM', 'RESOURCE_TYPE']
train.drop(columns=to_drop, inplace=True)
test.drop(columns=to_drop, inplace=True)
# Run a garbage-collection pass after the drop.
gc.collect()

20

In [8]:
# The frames were already sorted by DOTTING_TIME when loaded; the raw
# timestamp column itself is removed from both frames here.
train.drop(columns='DOTTING_TIME', inplace=True)
test.drop(columns='DOTTING_TIME', inplace=True)
gc.collect()

20

In [None]:
# Numeric columns used as the base for the derived features below.
num_cols = [
    'CU',
    'CPU_USAGE',
    'MEM_USAGE',
    'LAUNCHING_JOB_NUMS',
    'RUNNING_JOB_NUMS',
    'SUCCEED_JOB_NUMS',
    'CANCELLED_JOB_NUMS',
    'FAILED_JOB_NUMS',
    'DISK_USAGE',
]

In [9]:
# Label-encode QUEUE_TYPE with a single encoder shared by train and test.
# The encoder is fitted on the labels of both frames: LabelEncoder.transform
# raises ValueError on a label it was not fitted on, so fitting on train only
# fails as soon as test contains an extra queue type. When every test label
# also occurs in train, the resulting codes are the same as a train-only fit.
le = LabelEncoder()
le.fit(pd.concat([train['QUEUE_TYPE'], test['QUEUE_TYPE']]).astype(str))
train['QUEUE_TYPE'] = le.transform(train['QUEUE_TYPE'].astype(str))
test['QUEUE_TYPE'] = le.transform(test['QUEUE_TYPE'].astype(str))

# Give both frames the same categorical dtype (one category per encoded class)
# so a given code refers to the same category in train and in test, even when
# one frame does not contain every queue type.
queue_type_dtype = pd.CategoricalDtype(categories=list(range(len(le.classes_))))
train['QUEUE_TYPE'] = train['QUEUE_TYPE'].astype(queue_type_dtype)
test['QUEUE_TYPE'] = test['QUEUE_TYPE'].astype(queue_type_dtype)

In [10]:
# Scale the usage columns by the queue size: 1 CU = 1 CPU and 4G of memory,
# so used_cpu = CU * CPU_USAGE and used_mem = CU * 4 * MEM_USAGE.
train['used_cpu'] = train['CU'].mul(train['CPU_USAGE'])
test['used_cpu'] = test['CU'].mul(test['CPU_USAGE'])

train['used_mem'] = train['CU'].mul(4).mul(train['MEM_USAGE'])
test['used_mem'] = test['CU'].mul(4).mul(test['MEM_USAGE'])

In [11]:
# Register the two derived columns as numeric features. Only names that are
# not listed yet are appended, so re-running this cell does not put duplicate
# entries into num_cols (duplicates would be crossed and lagged twice later).
num_cols += [col for col in ('used_cpu', 'used_mem') if col not in num_cols]

Index(['QUEUE_ID', 'CU', 'QUEUE_TYPE', 'CPU_USAGE', 'MEM_USAGE',
       'LAUNCHING_JOB_NUMS', 'RUNNING_JOB_NUMS', 'SUCCEED_JOB_NUMS',
       'CANCELLED_JOB_NUMS', 'FAILED_JOB_NUMS', 'DISK_USAGE', 'used_cpu',
       'used_mem'],
      dtype='object')

In [None]:
def arithmetic(df, cross_features):
    """
    Add pairwise arithmetic combinations of numeric columns to ``df``.

    For each pair (a, b) where a comes before b in ``cross_features`` the
    columns ``a_b_add``, ``a_b_subtract`` and ``a_bc_multiply`` are written.
    For each ordered pair of different names, ``a_b_ratio`` is written as
    a / (b + 0.001).

    @param df: DataFrame containing the listed columns; new columns are added to it in place
    @param cross_features: names of the numeric columns to combine
    @return: tuple (df, cols) - the same DataFrame and the list of generated column names
    """
    cols = []
    n_features = len(cross_features)

    # Sum, difference and product: each unordered pair is visited once.
    for i in tqdm(range(n_features)):
        left = cross_features[i]
        for right in cross_features[i + 1:]:
            prefix = '{}_{}'.format(left, right)
            add_name = prefix + '_add'
            sub_name = prefix + '_subtract'
            mul_name = prefix + 'c_multiply'
            cols.extend([add_name, sub_name, mul_name])
            df[add_name] = df[left] + df[right]
            df[sub_name] = df[left] - df[right]
            df[mul_name] = df[left] * df[right]

    # Ratios are not symmetric, so both orders of every pair are generated.
    for numerator in tqdm(cross_features):
        for denominator in cross_features:
            if numerator == denominator:
                continue
            ratio_name = '{}_{}_ratio'.format(numerator, denominator)
            cols.append(ratio_name)
            # The 0.001 offset keeps a zero denominator from dividing by zero.
            df[ratio_name] = df[numerator].values / (df[denominator].values + 0.001)

    return df, cols

In [None]:
# Build the pairwise crosses on both frames. The generated names depend only
# on num_cols, so the list returned for test is discarded and the one from
# train is registered as additional numeric features.
train, cols = arithmetic(train, num_cols)
test, _ = arithmetic(test, num_cols)
num_cols.extend(cols)

In [12]:
# For every numeric column add, per QUEUE_ID group, the row difference and the
# shifted value at offsets of 1, 2 and 3 rows in both directions.
# NOTE(review): negative offsets read later rows of the same queue - confirm
# this is intended, in particular for the test frame.
for data in [train, test]:
    for col in num_cols:
        # Select the grouped column once and reuse it for all six offsets.
        grouped = data.groupby('QUEUE_ID')[col]
        for n in [1, 2, 3, -1, -2, -3]:
            data[col + '_diff{}'.format(n)] = grouped.diff(n)
            data[col + '_shift{}'.format(n)] = grouped.shift(n)

In [None]:
# Add an explicit id column (a copy of the current row index) to make later processing easier.
train['myid'] = train.index
test['myid'] = test.index

In [None]:
# Build the target columns: for each row, the CPU_USAGE and LAUNCHING_JOB_NUMS
# found 5 rows later within the same QUEUE_ID (NaN for the last 5 rows of each queue).
# A single grouped shift replaces the per-queue loop that grew the frame with
# DataFrame.append, which was removed in pandas 2.0 and copied the accumulated
# frame on every iteration. Rows, row order and index are the same as train.
future = train.groupby('QUEUE_ID')[['CPU_USAGE', 'LAUNCHING_JOB_NUMS']].shift(-5)
df_train = train.assign(
    CPU_USAGE_next25mins=future['CPU_USAGE'],
    LAUNCHING_JOB_NUMS_next25mins=future['LAUNCHING_JOB_NUMS'],
)

In [None]:
# Keep only the rows whose CPU_USAGE_next25mins value is present.
df_train = df_train.loc[df_train['CPU_USAGE_next25mins'].notna()]
print(df_train.shape)
df_train.head()

In [None]:
# Feature columns: everything in df_train except the two target columns,
# QUEUE_ID and the helper id column.
used_cols = [
    col
    for col in df_train.columns
    if col not in ('CPU_USAGE_next25mins', 'LAUNCHING_JOB_NUMS_next25mins', 'QUEUE_ID', 'myid')
]
