In [1]:
import numpy as np
import pandas as pd
import re
import gc
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
def binary_classification_report(y_true, y_pred, qlist=None, cum=True):
    """
    计算各个深度的命中率P和召回率R
    @param y_true:
    @param y_pred:
    @param qlist:
    @param cum:
    @return:
    """
    if isinstance(y_true, pd.Series):
        y_true = y_true.values
    if isinstance(y_pred, pd.Series):
        y_pred = y_pred.values

    assert len(y_pred) == len(y_true)
    df = pd.DataFrame({'pred': np.ravel(y_pred),
                       'true': np.ravel(y_true)})
    df.sort_values('pred', ascending=False, inplace=True)
    if qlist is None:
        qlist = [0.01, 0.05, 0.06, 0.07, 0.08, 0.09, 0.091, 0.092, 0.093, 0.094, 0.095, 0.096, 0.097, 0.098, 0.099,
                 0.1, 0.101, 0.102, 0.103,
                 0.12, 0.13, 0.14, 0.15, 0.2, 0.3, 0.4, 0.5,
                 0.6, 0.7, 0.8, 0.9, 1]

    qlist = np.sort(qlist)
    if qlist[0] < 0 or qlist[-1] > 1:
        print('qlist should be in range [0, 1]')
        return None

    total_cnt = df.shape[0]
    total_pos_cnt = df['true'].sum()
    total_neg_pct = total_pos_cnt / total_cnt

    df['depth'] = qlist[np.searchsorted(qlist, np.arange(1, total_cnt + 1) / total_cnt)]

    def agg_func(grouped):
        pos_count = grouped['true'].sum()
        neg_count = grouped.shape[0] - pos_count
        all_count = pos_count + neg_count

        s = pd.Series({'pos_count': pos_count,
                       'neg_count': neg_count,
                       'all_count': all_count})
        return s

    report = df.groupby('depth').apply(agg_func)
    if cum:
        report = report.apply(np.cumsum)
    else:
        report = report.apply(np.sum)

    report.reset_index(inplace=True)
    report['hit_rate'] = report['pos_count'] / report['all_count']
    report['coverage'] = report['pos_count'] / total_pos_cnt
    report['lift_rate'] = report['hit_rate'] / total_neg_pct
    proba_idx = np.floor(report['depth'] * total_cnt).astype('int') - 1
    proba_idx[proba_idx < 0] = 0
    report['prob'] = df['pred'].values[proba_idx]
    print('*' * 10)
    print(report)
    print('*' * 10)

In [3]:
def count_encode(df, cols=[]):
    for col in cols:
        print(col)
        vc = df[col].value_counts(dropna=True, normalize=True)
        df[col + '_count'] = df[col].map(vc).astype('float32')


# 交叉特征
def cross_cat_num(df, cat_col, num_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            df_new = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max',
                '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
                '{}_{}_mean'.format(f1, f2): 'mean',
                '{}_{}_sum'.format(f1, f2): 'sum',
                '{}_{}_skew'.format(f1, f2): 'skew',
                '{}_{}_nunique'.format(f1, f2): 'nunique'
            })
            df = df.merge(df_new, on=f1, how='left')
            del df_new
            gc.collect()
    return df

In [4]:
def train_func(train_path, test_path, save_path):
    # 请填写训练代码
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    data = pd.concat([train, test])
    del train, test
    gc.collect()

    single_cols = ['appProtocol']
    data.drop(single_cols, axis=1, inplace=True)
    gc.collect()

    cat_cols = ['srcAddress', 'destAddress',
                'tlsVersion', 'tlsSubject', 'tlsIssuerDn', 'tlsSni']

    data['srcAddressPort'] = data['srcAddress'].astype(str) + data['srcPort'].astype(str)
    data['destAddressPort'] = data['destAddress'].astype(str) + data['destPort'].astype(str)
    cat_cols += ['srcAddressPort', 'destAddressPort']
    num_cols = ['bytesOut', 'bytesIn', 'pktsIn', 'pktsOut']
    data['bytesOut-bytesIn'] = data['bytesOut'] - data['bytesIn']
    data['pktsOut-pktsIn'] = data['pktsOut'] - data['pktsIn']
    
#     num_cols += ['bytesOut-bytesIn', 'pktsOut-pktsIn']
    
    tlsVersion_map = {
        'TLSv1': 1,
        'TLS 1.2': 1,
        'TLS 1.3': 1,
        'SSLv2': 2,
        'SSLv3': 3,
        '0x4854': 4,
        '0x4752': 4,
        'UNDETERMINED': 5
    }
    data['tlsVersion_map'] = data['tlsVersion'].map(tlsVersion_map)
    cat_cols.append('tlsVersion_map')

    count_encode(data, cat_cols)
    data = cross_cat_num(data, cat_cols, num_cols)

    for i in cat_cols:
        lbl = LabelEncoder()
        data[i] = lbl.fit_transform(data[i].astype(str))
        data[i] = data[i].astype('category')

#     for i in ['srcPort', 'destPort']:
#         data[i] = data[i].astype('category')
    
    used_cols = [i for i in data.columns if i not in ['eventId', 'label']]
    train = data.loc[data['label'].notnull(), :]
    test = data.loc[data['label'].isnull(), :]
    sub = test[['eventId']]

    y = train['label']
    X = train[used_cols]
    X_test = test[used_cols]
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=2020)

    train_dataset = lgb.Dataset(X_train, y_train)
    valid_dataset = lgb.Dataset(X_valid, y_valid, reference=train_dataset)
    all_dataset = lgb.Dataset(train[used_cols], train['label'], reference=train_dataset)

    params = {'objective': 'binary',
              'boosting': 'gbdt',
              'metric': 'auc',
              # 'metric': 'None',  # 用自定义评估函数是将metric设置为'None'
              'num_boost_round': 1000000,
              'learning_rate': 0.1,
              'num_leaves': 31,
              'lambda_l1': 0,
              'lambda_l2': 1,
              'num_threads': 23,
              'min_data_in_leaf': 20,
              'first_metric_only': True,
              'is_unbalance': True,
              'max_depth': -1,
              'seed': 2020}
    valid_model = lgb.train(params,
                            train_dataset,
                            valid_sets=[train_dataset, valid_dataset],
                            early_stopping_rounds=200,
                            verbose_eval=300)
    pred = valid_model.predict(X_valid)
    
    f1_best = 0
    for i in np.arange(0.1, 1, 0.01):
        y_valid_pred = np.where(pred > i, 1, 0)
        f1 = np.round(f1_score(y_valid, y_valid_pred), 5)
        if f1 > f1_best:
            threshold = i
            f1_best = f1
        
    print('threshold: ', threshold)
    y_valid_pred = np.where(pred > threshold, 1, 0)
    print('Valid F1: ', np.round(f1_score(y_valid, y_valid_pred), 5))
    print('Valid mean label: ', np.mean(y_valid_pred))
#     binary_classification_report(y_valid, pred)
    
#     plt.figure(figsize=(8, 4))
#     ax = sns.kdeplot(pred, color='Red', shade=True)
#     ax.set_xlabel('pred')
#     ax.set_ylabel('Frequency')

    params = {'objective': 'binary',
              'boosting': 'gbdt',
              'metric': 'auc',
              # 'metric': 'None',  # 用自定义评估函数是将metric设置为'None'
              # 'num_boost_round': 1000000,
              'learning_rate': 0.1,
              'num_leaves': 31,
              'lambda_l1': 0,
              'lambda_l2': 1,
              'num_threads': 23,
              'min_data_in_leaf': 20,
              'first_metric_only': True,
              'is_unbalance': True,
              'max_depth': -1,
              'seed': 2020}
    train_model = lgb.train(params,
                            all_dataset,
                            num_boost_round=valid_model.best_iteration+100)
    y_test_pred = np.where(train_model.predict(X_test) > threshold, 1, 0)

    print('Test mean label: ', np.mean(y_test_pred))
    sub['label'] = y_test_pred
    sub.to_csv(save_path + '机器不学习原子弹也不学习_eta_submission_1014.csv', index=False)

In [5]:
if __name__ == '__main__':
    train_path = '../大数据队_eta_submission_1011/data/train.csv'
    test_path = '../大数据队_eta_submission_1011/data/test_1.csv'
    save_path = '../大数据队_eta_submission_1011/result/'
    train_func(train_path, test_path, save_path)

  0%|                                                                                            | 0/9 [00:00<?, ?it/s]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A

srcAddress
destAddress
tlsVersion
tlsSubject
tlsIssuerDn
tlsSni
srcAddressPort
destAddressPort
tlsVersion_map



 25%|█████████████████████                                                               | 1/4 [00:01<00:04,  1.64s/it][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:03<00:03,  1.64s/it][A
 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:04<00:01,  1.61s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.62s/it][A
 11%|█████████▎                                                                          | 1/9 [00:06<00:51,  6.49s/it]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
 25%|█████████████████████                                                               | 1/4 [00:02<00:06,  2.26s/it][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:04<00:04,  2.20s/it][A
 75%|█████████████

Training until validation scores don't improve for 200 rounds
[300]	training's auc: 1	valid_1's auc: 0.999912
[600]	training's auc: 1	valid_1's auc: 0.99992
[900]	training's auc: 1	valid_1's auc: 0.999921
Early stopping, best iteration is:
[793]	training's auc: 1	valid_1's auc: 0.999921
Evaluated only: auc
threshold:  0.17999999999999997
Valid F1:  0.99127
Valid mean label:  0.09381818181818181
Test mean label:  0.09127272727272727
