In [1]:
import numpy as np
import pandas as pd
import re
import gc
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
def count_encode(df, cols=None):
    """Add a relative-frequency feature for each listed column, in place.

    For every column ``c`` in ``cols`` a new column ``c + '_count'`` is written
    to ``df``. Despite the ``_count`` suffix, the stored value is the *share*
    of non-null rows holding that category (``value_counts(normalize=True)``),
    cast to float32. Rows whose value is missing receive NaN because missing
    values are excluded from the frequency table.

    Args:
        df: DataFrame to extend. It is modified in place.
        cols: Iterable of column names to encode. ``None`` (the default)
            encodes nothing.

    Returns:
        None. The result is the new columns added to ``df``.
    """
    # A None sentinel replaces the former mutable ``[]`` default argument.
    if cols is None:
        cols = ()
    for col in cols:
        print(col)  # progress trace: one line per encoded column
        freq = df[col].value_counts(dropna=True, normalize=True)
        df[col + '_count'] = df[col].map(freq).astype('float32')


# Cross features: per-category statistics of the numeric columns
def cross_cat_num(df, cat_col, num_col):
    """Attach per-category summary statistics of numeric columns to ``df``.

    For every pair ``(f1, f2)`` with ``f1`` in ``cat_col`` and ``f2`` in
    ``num_col``, the rows are grouped by ``f1`` and seven statistics of ``f2``
    are computed and left-merged back onto ``df``. The new columns are named
    ``'{f1}_{f2}_{stat}'`` and appear in the order max, min, median, mean,
    sum, skew, nunique.

    Args:
        df: Input DataFrame containing all columns in ``cat_col`` and ``num_col``.
        cat_col: Iterable of column names to group by.
        num_col: Iterable of column names to summarise within each group.

    Returns:
        A new DataFrame with the statistic columns appended. Because the
        result comes from ``DataFrame.merge``, the original index is not kept.
    """
    stats = ['max', 'min', 'median', 'mean', 'sum', 'skew', 'nunique']
    for f1 in tqdm(cat_col):
        # Group once per categorical column; the numeric source columns are
        # never modified by the merges below, so the grouping stays valid.
        grouped = df.groupby(f1)
        for f2 in tqdm(num_col):
            # Passing a {new_name: func} dict to SeriesGroupBy.agg raises
            # SpecificationError on pandas >= 1.0, so aggregate with a plain
            # list of functions and rename the resulting columns explicitly.
            df_new = grouped[f2].agg(stats)
            df_new.columns = ['{}_{}_{}'.format(f1, f2, s) for s in stats]
            df_new = df_new.reset_index()  # bring the group key back as a column
            df = df.merge(df_new, on=f1, how='left')
            del df_new
            gc.collect()
    return df

In [3]:
def _add_group_stats(data, key, target, prefix):
    """Left-merge per-``key`` ``count``/``nunique`` of ``target`` into ``data``.

    Adds ``prefix + '_count'`` (number of non-null ``target`` values in the
    group) and ``prefix + '_nunique'`` (number of distinct ``target`` values
    in the group) and returns the merged DataFrame.
    """
    # List aggregation plus an explicit rename is used because the
    # {new_name: func} dict form of SeriesGroupBy.agg raises
    # SpecificationError on pandas >= 1.0.
    stats = data.groupby(key)[target].agg(['count', 'nunique'])
    stats.columns = [prefix + '_count', prefix + '_nunique']
    return data.merge(stats.reset_index(), on=key, how='left')


def train_func(train_path, test_path, save_path):
    """Train a LightGBM binary classifier and write test predictions to CSV.

    Pipeline:
      1. Read both CSVs and stack them so features are computed over the
         combined rows (test rows therefore contribute to every statistic).
      2. Build address/port connectivity counts, byte/packet differences,
         a coarse TLS-version code, frequency encodings and per-category
         numeric statistics.
      3. Hold out 25% of the labelled rows, train with early stopping on AUC
         and pick the probability threshold that maximises F1 on the hold-out.
      4. Retrain on all labelled rows for ``best_iteration + 20`` rounds and
         label the unlabelled rows using that threshold.

    Args:
        train_path: Path of the labelled CSV (must contain ``label``).
        test_path: Path of the CSV to predict. After stacking, rows whose
            ``label`` is null are treated as the test set.
        save_path: String prepended verbatim to the fixed output file name,
            so a directory must include its trailing separator.

    Returns:
        None. Progress is printed and the submission CSV is written.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    data = pd.concat([train, test], ignore_index=True)
    del train, test
    gc.collect()

    # Dropped before any feature engineering.
    single_cols = ['appProtocol']
    data = data.drop(single_cols, axis=1)
    gc.collect()

    cat_cols = ['srcAddress', 'destAddress',
                'tlsVersion', 'tlsSubject', 'tlsIssuerDn', 'tlsSni']

    # Join address and port with ':' so distinct endpoints cannot collide;
    # without a separator '10.0.0.1' + '180' equals '10.0.0.11' + '80'.
    data['srcAddressPort'] = data['srcAddress'].astype(str) + ':' + data['srcPort'].astype(str)
    data['destAddressPort'] = data['destAddress'].astype(str) + ':' + data['destPort'].astype(str)

    # (group key, counted column, new-column prefix). The order fixes the
    # order of the generated feature columns.
    link_specs = [
        ('srcAddress', 'destAddress', 's2d'),
        ('srcAddressPort', 'destAddressPort', 'sp2dp'),
        ('srcAddress', 'destAddressPort', 's2dp'),
        ('srcAddressPort', 'destAddress', 'sp2d'),
        ('destAddress', 'srcAddress', 'd2s'),
        ('destAddressPort', 'srcAddressPort', 'dp2sp'),
        ('destAddressPort', 'srcAddress', 'dp2s'),
        ('destAddress', 'srcAddressPort', 'd2sp'),
    ]
    for key, target, prefix in link_specs:
        data = _add_group_stats(data, key, target, prefix)
        gc.collect()

    cat_cols += ['srcAddressPort', 'destAddressPort']
    num_cols = ['bytesOut', 'bytesIn', 'pktsIn', 'pktsOut']
    data['bytesOut-bytesIn'] = data['bytesOut'] - data['bytesIn']
    data['pktsOut-pktsIn'] = data['pktsOut'] - data['pktsIn']

    # Collapse TLS version strings into a few integer codes; values missing
    # from the map become NaN.
    tlsVersion_map = {
        'TLSv1': 1,
        'TLS 1.2': 1,
        'TLS 1.3': 1,
        'SSLv2': 2,
        'SSLv3': 3,
        '0x4854': 4,
        '0x4752': 4,
        'UNDETERMINED': 5
    }
    data['tlsVersion_map'] = data['tlsVersion'].map(tlsVersion_map)
    cat_cols.append('tlsVersion_map')

    count_encode(data, cat_cols)
    data = cross_cat_num(data, cat_cols, num_cols)

    # Integer-encode each categorical column (values are stringified first,
    # so missing values become their own level) and mark it as 'category'.
    for col in cat_cols:
        encoder = LabelEncoder()
        data[col] = encoder.fit_transform(data[col].astype(str))
        data[col] = data[col].astype('category')

    used_cols = [c for c in data.columns if c not in ['eventId', 'label']]
    train = data.loc[data['label'].notnull(), :]
    test = data.loc[data['label'].isnull(), :]
    # Copy so adding the prediction column below does not write into a slice
    # of ``test``.
    sub = test[['eventId']].copy()

    y = train['label']
    X = train[used_cols]
    X_test = test[used_cols]
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=2020)

    print('y_train mean: ', y_train.mean())
    print('y_valid mean: ', y_valid.mean())

    train_dataset = lgb.Dataset(X_train, y_train)
    valid_dataset = lgb.Dataset(X_valid, y_valid, reference=train_dataset)
    all_dataset = lgb.Dataset(train[used_cols], train['label'], reference=train_dataset)

    # Shared by the validation run and the final fit; only the number of
    # boosting rounds differs between the two.
    params = {'objective': 'binary',
              'boosting': 'gbdt',
              'metric': 'auc',
              'learning_rate': 0.1,
              'num_leaves': 31,
              'lambda_l1': 0,
              'lambda_l2': 1,
              'num_threads': 23,
              'min_data_in_leaf': 20,
              'first_metric_only': True,
              'is_unbalance': True,
              'max_depth': -1,
              'seed': 2020}

    # LightGBM 4.0 removed the ``early_stopping_rounds`` / ``verbose_eval``
    # keyword arguments of ``lgb.train`` in favour of callbacks. Use the
    # callbacks when this LightGBM provides them, otherwise keep the
    # legacy keywords.
    if hasattr(lgb, 'log_evaluation'):
        fit_kwargs = {'callbacks': [lgb.early_stopping(200, first_metric_only=True),
                                    lgb.log_evaluation(300)]}
    else:
        fit_kwargs = {'early_stopping_rounds': 200, 'verbose_eval': 300}

    # Effectively unbounded round budget; early stopping ends training.
    valid_model = lgb.train(params,
                            train_dataset,
                            num_boost_round=1000000,
                            valid_sets=[train_dataset, valid_dataset],
                            **fit_kwargs)
    pred = valid_model.predict(X_valid)

    # Scan thresholds 0.10 .. 0.99 for the best hold-out F1. The 0.5 fallback
    # keeps ``threshold`` defined when no candidate yields F1 > 0.
    threshold = 0.5
    f1_best = 0
    for cut in np.arange(0.1, 1, 0.01):
        y_valid_pred = np.where(pred > cut, 1, 0)
        f1 = np.round(f1_score(y_valid, y_valid_pred), 5)
        if f1 > f1_best:
            threshold = cut
            f1_best = f1

    print('threshold: ', threshold)
    y_valid_pred = np.where(pred > threshold, 1, 0)
    print('Valid F1: ', np.round(f1_score(y_valid, y_valid_pred), 5))
    print('Valid mean label: ', np.mean(y_valid_pred))

    # Final fit on every labelled row, for slightly more rounds than the
    # early-stopped validation model used.
    train_model = lgb.train(params,
                            all_dataset,
                            num_boost_round=valid_model.best_iteration+20)
    y_test_pred = np.where(train_model.predict(X_test) > threshold, 1, 0)

    print('Test mean label: ', np.mean(y_test_pred))
    sub['label'] = y_test_pred
    sub.to_csv(save_path + '机器不学习原子弹也不学习_eta_submission_1014.csv', index=False)

In [4]:
if __name__ == '__main__':
    # Input CSVs and the output location, given as relative paths;
    # save_path keeps its trailing slash because train_func prepends it
    # verbatim to the output file name.
    train_path, test_path, save_path = (
        '../大数据队_eta_submission_1011/data/train.csv',
        '../大数据队_eta_submission_1011/data/test_1.csv',
        '../大数据队_eta_submission_1011/result/',
    )
    train_func(train_path=train_path, test_path=test_path, save_path=save_path)

  0%|                                                                                            | 0/9 [00:00<?, ?it/s]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A

srcAddress
destAddress
tlsVersion
tlsSubject
tlsIssuerDn
tlsSni
srcAddressPort
destAddressPort
tlsVersion_map



 25%|█████████████████████                                                               | 1/4 [00:01<00:05,  1.69s/it][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:03<00:03,  1.68s/it][A
 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:04<00:01,  1.65s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.65s/it][A
 11%|█████████▎                                                                          | 1/9 [00:06<00:52,  6.62s/it]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
 25%|█████████████████████                                                               | 1/4 [00:02<00:06,  2.20s/it][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:04<00:04,  2.20s/it][A
 75%|█████████████

y_train mean:  0.09
y_valid mean:  0.09363636363636364
Training until validation scores don't improve for 200 rounds
[300]	training's auc: 1	valid_1's auc: 0.999908
Early stopping, best iteration is:
[380]	training's auc: 1	valid_1's auc: 0.99991
Evaluated only: auc
threshold:  0.17999999999999997
Valid F1:  0.99225
Valid mean label:  0.094
Test mean label:  0.09163636363636364
