In [1]:
import numpy as np
import pandas as pd
import re
import gc
from tqdm import tqdm
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
def count_encode(df, cols=[]):
    """
    Frequency-encode the given columns.

    For each column a new ``<col>_count`` column is written that holds, for
    every row, the relative frequency of that row's value among the non-null
    values of the column (float32). Despite the name, the value is a
    normalised share rather than a raw count.

    @param df: DataFrame to encode; new columns are added to it in place.
    @param cols: names of the columns to encode.
    @return: the same DataFrame object, with the new columns appended.
    """
    for name in cols:
        print(name)
        # Share of each distinct value, NaN excluded from the denominator.
        freq = df[name].value_counts(normalize=True, dropna=True)
        encoded = df[name].map(freq)
        df[name + '_count'] = encoded.astype('float32')
    return df


def cross_cat_num(df, cat_col, num_col):
    """
    Group-by statistics of numeric features per categorical feature.

    For every (categorical, numeric) pair the numeric column is aggregated
    per category value and the results are merged back onto ``df`` as columns
    named ``<cat>_<num>_<stat>``, where ``<stat>`` is one of: count, max, min,
    median, mean, sum, skew, std, nunique, max_min (max - min), quantile_25,
    quantile_75.

    @param df: input DataFrame.
    @param cat_col: iterable of categorical column names (group keys).
    @param num_col: iterable of numeric column names to aggregate.
    @return: a new DataFrame (result of the successive left merges) holding
             the original columns plus the statistics columns.
    """
    def max_min(s):
        return s.max() - s.min()

    # Named functions (rather than two lambdas) so every aggregation has a
    # distinct name inside the list passed to ``agg``.
    def quantile_25(s):
        return s.quantile(0.25)

    def quantile_75(s):
        return s.quantile(0.75)

    # (column suffix, aggregation) in the order the columns are emitted.
    aggregations = [
        ('count', 'count'),
        ('max', 'max'),
        ('min', 'min'),
        ('median', 'median'),
        ('mean', 'mean'),
        ('sum', 'sum'),
        ('skew', 'skew'),
        ('std', 'std'),
        ('nunique', 'nunique'),
        ('max_min', max_min),
        ('quantile_25', quantile_25),
        ('quantile_75', quantile_75),
    ]
    suffixes = [suffix for suffix, _ in aggregations]
    funcs = [func for _, func in aggregations]

    for f1 in tqdm(cat_col):
        grouped = df.groupby(f1)
        for f2 in tqdm(num_col):
            # A list of functions is used instead of a {new_name: func}
            # renaming dict, which newer pandas releases reject for a
            # single-column groupby; the columns are renamed by position.
            stats = grouped[f2].agg(funcs)
            stats.columns = ['{}_{}_{}'.format(f1, f2, suffix) for suffix in suffixes]
            stats = stats.reset_index()
            df = df.merge(stats, on=f1, how='left')
            del stats
            gc.collect()
    return df


def arithmetic(df, cross_features):
    """
    Pairwise arithmetic combinations of numeric features.

    For every unordered pair (a, b) with a before b in ``cross_features`` the
    columns ``a_b_add``, ``a_b_subtract`` and ``a_bc_multiply`` are added; for
    every ordered pair of distinct features ``a_b_ratio`` = a / (b + 0.001)
    is added. Columns are written onto ``df`` in place.

    @param df: DataFrame holding the numeric columns.
    @param cross_features: sequence of numeric column names to combine.
    @return: the same DataFrame with the new columns appended.
    """
    n_features = len(cross_features)
    for i in tqdm(range(n_features)):
        left = cross_features[i]
        for j in range(i + 1, n_features):
            right = cross_features[j]
            df['{}_{}_add'.format(left, right)] = df[left] + df[right]
            df['{}_{}_subtract'.format(left, right)] = df[left] - df[right]
            # The stray 'c' is part of the established column name; keep it.
            df['{}_{}c_multiply'.format(left, right)] = df[left] * df[right]

    for numerator in tqdm(cross_features):
        denominators = [name for name in cross_features if numerator != name]
        for denominator in denominators:
            # 0.001 is added to the denominator to avoid division by zero.
            shifted = df[denominator].values + 0.001
            df['{}_{}_ratio'.format(numerator, denominator)] = df[numerator].values / shifted
    return df

In [3]:
def get_psi(c, x_train, x_test):
    """
    Population Stability Index (PSI) of one feature between two samples.

    Missing values are replaced by -998, the train column is split at its
    0/20/40/60/80/100 % quantiles (outer edges widened to -inf/+inf), and
    PSI = sum over bins of (test_rate - train_rate) * ln(test_rate / train_rate).
    A bin that is empty in one sample makes the logarithm non-finite, so the
    resulting PSI is then inf or NaN rather than a finite number.

    @param c: name of the column to evaluate.
    @param x_train: DataFrame providing the reference distribution.
    @param x_test: DataFrame providing the distribution to compare.
    @return: one-row DataFrame with columns ['变量名', 'PSI']; an empty
             DataFrame when the PSI could not be computed for this column.
    """
    try:
        t_train = x_train[c].fillna(-998)
        t_test = x_test[c].fillna(-998)
        # Cut points: distinct train quantiles, open-ended on both sides.
        bins = sorted({t_train.quantile(q) for q in np.arange(0, 1.1, 0.2)})
        bins[0] = -np.inf
        bins[-1] = np.inf
        # Per-bin counts; the 'test' column is aligned on the bin index.
        t_psi = pd.DataFrame()
        t_psi['train'] = pd.cut(t_train, bins).value_counts().sort_index()
        t_psi['test'] = pd.cut(t_test, bins).value_counts()
        t_psi.index = [str(x) for x in t_psi.index]
        # Totals row, used as the denominator of the rates below.
        t_psi.loc['总计', :] = t_psi.sum()
        t_psi['train_rate'] = t_psi['train'] / t_psi.loc['总计', 'train']
        t_psi['test_rate'] = t_psi['test'] / t_psi.loc['总计', 'test']
        t_psi['psi'] = (t_psi['test_rate'] - t_psi['train_rate']) * (
            np.log(t_psi['test_rate']) - np.log(t_psi['train_rate']))
        # The totals row itself contributes (1 - 1) * (ln 1 - ln 1) = 0.
        t_psi.loc['总计', 'psi'] = t_psi['psi'].sum()
        psi_res = pd.DataFrame([[c, t_psi.loc['总计', 'psi']]],
                               columns=['变量名', 'PSI'])
    except Exception as exc:
        # Best effort: a column that cannot be binned is reported and skipped.
        # Only ordinary errors are caught, so Ctrl-C / SystemExit still stop
        # the run, and the reason is logged instead of being discarded.
        print(c, 'error', repr(exc))
        return pd.DataFrame()
    print(c, 'done')
    return psi_res

In [4]:
def auc_select(X_train, y_train, X_valid, y_valid, cols, threshold=0.52):
    """
    Single-feature screening by validation AUC.

    For each column in ``cols`` a LightGBM binary classifier is trained on
    that column alone (up to 1000 rounds, early stopping after 50 rounds) and
    its best validation AUC is recorded. A feature is "useful" when that AUC
    is strictly greater than ``threshold``, otherwise "useless".

    @param X_train: training features (DataFrame).
    @param y_train: training labels.
    @param X_valid: validation features (DataFrame).
    @param y_valid: validation labels.
    @param cols: names of the columns to evaluate.
    @param threshold: minimum validation AUC (exclusive) for a useful feature.
    @return: (useful_dict, useless_dict, useful_cols, useless_cols) where the
             dicts map feature name -> validation AUC and the lists hold the
             corresponding feature names in evaluation order.
    """
    params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'metric': 'auc',
        'learning_rate': 0.1,
        'num_leaves': 31,
        'lambda_l1': 0,
        'lambda_l2': 1,
        'num_threads': 23,
        'min_data_in_leaf': 20,
        'first_metric_only': True,
        'is_unbalance': True,
        'max_depth': -1,
        'seed': 2020
    }
    useful_dict, useless_dict = {}, {}
    for feature in cols:
        print(feature)
        train_set = lgb.Dataset(X_train[[feature]].values, y_train)
        valid_set = lgb.Dataset(X_valid[[feature]].values, y_valid, reference=train_set)
        booster = lgb.train(
            params,
            train_set,
            # The validation set comes first, so its scores live under 'valid_0'.
            valid_sets=[valid_set, train_set],
            num_boost_round=1000,
            early_stopping_rounds=50,
            verbose_eval=500
        )
        print('*' * 10)
        valid_auc = booster.best_score['valid_0']['auc']
        print(valid_auc)
        bucket = useful_dict if valid_auc > threshold else useless_dict
        bucket[feature] = valid_auc
    return useful_dict, useless_dict, list(useful_dict.keys()), list(useless_dict.keys())

In [5]:
def correlation(df, useful_dict, threshold=0.98):
    """
    Pick features to drop because they are highly correlated with another one.

    Every pair of columns whose absolute correlation exceeds ``threshold`` is
    inspected; the column with the lower score in ``useful_dict`` is marked
    for removal (on a tie the column appearing earlier in ``df`` is marked).

    @param df: DataFrame of candidate features.
    @param useful_dict: mapping feature name -> score used to break the tie
                        between two correlated features (higher is kept).
    @param threshold: absolute correlation above which a pair is redundant.
    @return: list of column names to drop.
    """
    corr_matrix = df.corr()
    names = list(corr_matrix.columns)
    strength = corr_matrix.abs().values
    to_drop = set()
    # Walk the strict lower triangle: `later` is always positioned after `earlier`.
    for row, later in enumerate(names):
        for col in range(row):
            if not strength[row, col] > threshold:
                continue
            earlier = names[col]
            if useful_dict[later] >= useful_dict[earlier]:
                to_drop.add(earlier)
            else:
                to_drop.add(later)
    return list(to_drop)

In [6]:
def _add_group_count_nunique(data, key, target, prefix):
    """Left-merge onto `data`, per value of `key`, the count and distinct count of `target` as `<prefix>_count` / `<prefix>_nunique`."""
    # A list of functions plus positional renaming is used instead of a
    # {new_name: func} renaming dict, which newer pandas releases reject.
    stats = data.groupby(key)[target].agg(['count', 'nunique'])
    stats.columns = [prefix + '_count', prefix + '_nunique']
    data = data.merge(stats.reset_index(), on=key, how='left')
    del stats
    gc.collect()
    return data


def train_func(train_path, test_path, save_path):
    """
    End-to-end pipeline: feature engineering, feature selection, LightGBM
    training and writing of the submission file.

    Steps, in order:
      1. Read train/test CSVs, stack them and drop 'appProtocol'.
      2. Build address+port keys and count/nunique relations between source
         and destination addresses.
      3. Label-encode the categorical columns, then add frequency, group-by
         and arithmetic features.
      4. Keep features with PSI <= 0.2 between train and test, then features
         whose single-feature validation AUC exceeds 0.52, then drop one of
         each pair with |correlation| > 0.98.
      5. Train with early stopping on a 75/25 split, pick the probability
         cutoff maximising validation F1, retrain on all labelled rows and
         predict the test rows.

    @param train_path: path of the labelled training CSV.
    @param test_path: path of the unlabelled test CSV.
    @param save_path: prefix (directory ending with a separator) to which the
                      submission file name is appended.
    @return: None; the submission CSV is written to disk.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    data = pd.concat([train, test])
    del train, test
    gc.collect()

    single_cols = ['appProtocol']
    data.drop(single_cols, axis=1, inplace=True)
    gc.collect()

    cat_cols = ['srcAddress', 'destAddress',
                'tlsVersion', 'tlsSubject', 'tlsIssuerDn', 'tlsSni']

    data['srcAddressPort'] = data['srcAddress'].astype(str) + data['srcPort'].astype(str)
    data['destAddressPort'] = data['destAddress'].astype(str) + data['destPort'].astype(str)

    # (group key, counted column, feature prefix); the order fixes the column order.
    relation_specs = [
        ('srcAddress', 'destAddress', 's2d'),
        ('srcAddressPort', 'destAddressPort', 'sp2dp'),
        ('srcAddress', 'destAddressPort', 's2dp'),
        ('srcAddressPort', 'destAddress', 'sp2d'),
        ('destAddress', 'srcAddress', 'd2s'),
        ('destAddressPort', 'srcAddressPort', 'dp2sp'),
        ('destAddressPort', 'srcAddress', 'dp2s'),
        ('destAddress', 'srcAddressPort', 'd2sp'),
    ]
    for key, target, prefix in relation_specs:
        data = _add_group_count_nunique(data, key, target, prefix)

    cat_cols += ['srcAddressPort', 'destAddressPort']
    num_cols = ['bytesOut', 'bytesIn', 'pktsIn', 'pktsOut']

    # Coarse grouping of protocol versions; values not listed map to NaN.
    tlsVersion_map = {
        'TLSv1': 1,
        'TLS 1.2': 1,
        'TLS 1.3': 1,
        'SSLv2': 2,
        'SSLv3': 3,
        '0x4854': 4,
        '0x4752': 4,
        'UNDETERMINED': 5
    }
    data['tlsVersion_map'] = data['tlsVersion'].map(tlsVersion_map)
    cat_cols.append('tlsVersion_map')

    # NOTE(review): these columns end up with 'category' dtype; get_psi's
    # fillna(-998) appears to fail on that dtype, so they seem to be dropped by
    # the PSI step below (the recorded run's "PSI used features" list omits
    # them) - confirm this is intended.
    for col in cat_cols:
        print('-' * 20)
        print(col)
        lbl = LabelEncoder()
        data[col] = lbl.fit_transform(data[col].astype(str))
        data[col] = data[col].astype('category')

    data = count_encode(data, cat_cols)
    data = cross_cat_num(data, cat_cols, num_cols)
    data = arithmetic(data, num_cols)

    used_cols = [i for i in data.columns if i not in ['eventId', 'label']]
    train = data.loc[data['label'].notnull(), :]
    test = data.loc[data['label'].isnull(), :]
    # Independent copy so the prediction column can be added without writing
    # into a slice of `data`.
    sub = test[['eventId']].copy()

    # Each label now matches the column that is actually tested.
    print('*' * 20)
    for col in cat_cols:
        print('{}: '.format(col), col in used_cols)

    y = train['label']
    X = train[used_cols]
    X_test = test[used_cols]

    # PSI filter: keep features whose train/test distribution shift is small.
    psi_res = Parallel(n_jobs=4)(delayed(get_psi)(c, X, X_test) for c in used_cols)
    psi_df = pd.concat(psi_res)
    psi_used_cols = list(psi_df[psi_df['PSI'] <= 0.2]['变量名'].values)
    psi_not_used_cols = list(psi_df[psi_df['PSI'] > 0.2]['变量名'].values)
    print('PSI used features: \n', psi_used_cols)
    print('PSI drop features: \n', psi_not_used_cols)
    print('Error drop features: \n', list(set(used_cols) - set(psi_used_cols)))

    X = X[psi_used_cols]
    X_test = X_test[psi_used_cols]

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=2020)

    print('y_train mean: ', y_train.mean())
    print('y_valid mean: ', y_valid.mean())

    print('*' * 20)
    for col in cat_cols:
        print('{}: '.format(col), col in psi_used_cols)

    useful_dict, useless_dict, useful_cols, useless_cols = auc_select(
        X_train, y_train, X_valid, y_valid, psi_used_cols, threshold=0.52)
    print('AUC drop features: \n', useless_cols)

    X_train = X_train[useful_cols]
    X_valid = X_valid[useful_cols]
    X_test = X_test[useful_cols]

    col_corr = correlation(X_train, useful_dict, threshold=0.98)
    print('Correlation drop features: \n', col_corr)

    # Non-inplace drops: the frames above are slices of other frames.
    X_train = X_train.drop(col_corr, axis=1)
    X_valid = X_valid.drop(col_corr, axis=1)
    X_test = X_test.drop(col_corr, axis=1)

    used_cols = X_train.columns.to_list()

    train_dataset = lgb.Dataset(X_train, y_train)
    valid_dataset = lgb.Dataset(X_valid, y_valid, reference=train_dataset)
    all_dataset = lgb.Dataset(train[used_cols], y, reference=train_dataset)

    base_params = {'objective': 'binary',
                   'boosting': 'gbdt',
                   'metric': 'auc',
                   'learning_rate': 0.1,
                   'num_leaves': 31,
                   'lambda_l1': 0,
                   'lambda_l2': 1,
                   'num_threads': 23,
                   'min_data_in_leaf': 20,
                   'first_metric_only': True,
                   'is_unbalance': True,
                   'max_depth': -1,
                   'seed': 2020}

    # Validation run: effectively unbounded rounds, stopped by early stopping.
    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
    # are not accepted by LightGBM >= 4.0 - confirm the pinned version.
    valid_params = dict(base_params, num_boost_round=1000000)
    valid_model = lgb.train(valid_params,
                            train_dataset,
                            valid_sets=[valid_dataset, train_dataset],
                            early_stopping_rounds=200,
                            verbose_eval=300)
    pred = valid_model.predict(X_valid)

    # Scan cutoffs for the best validation F1. The 0.5 default is only used
    # when no cutoff reaches F1 > 0 (previously `threshold` stayed unbound).
    threshold = 0.5
    f1_best = 0
    for cutoff in np.arange(0.1, 1, 0.01):
        y_valid_pred = np.where(pred > cutoff, 1, 0)
        f1 = np.round(f1_score(y_valid, y_valid_pred), 5)
        if f1 > f1_best:
            threshold = cutoff
            f1_best = f1

    print('threshold: ', threshold)
    y_valid_pred = np.where(pred > threshold, 1, 0)
    print('Valid F1: ', np.round(f1_score(y_valid, y_valid_pred), 5))
    print('Valid mean label: ', np.mean(y_valid_pred))

    # Final model on all labelled rows, for the validated number of rounds
    # plus a margin of 20.
    train_model = lgb.train(base_params,
                            all_dataset,
                            num_boost_round=valid_model.best_iteration+20)
    y_test_pred = np.where(train_model.predict(X_test) > threshold, 1, 0)

    print('Test mean label: ', np.mean(y_test_pred))
    sub['label'] = y_test_pred
    sub.to_csv(save_path + '机器不学习原子弹也不学习_eta_submission_1020.csv', index=False)

In [7]:
if __name__ == '__main__':
    # Input CSVs and output prefix, relative to the notebook's working directory.
    train_path, test_path, save_path = (
        '../大数据队_eta_submission_1011/data/train.csv',
        '../大数据队_eta_submission_1011/data/test_1.csv',
        '../大数据队_eta_submission_1011/result/',
    )
    train_func(train_path=train_path, test_path=test_path, save_path=save_path)

--------------------
srcAddress
--------------------
destAddress
--------------------
tlsVersion
--------------------
tlsSubject
--------------------
tlsIssuerDn
--------------------
tlsSni
--------------------
srcAddressPort
--------------------
destAddressPort
--------------------
tlsVersion_map
srcAddress
destAddress
tlsVersion
tlsSubject
tlsIssuerDn
tlsSni
srcAddressPort


  0%|                                                                                            | 0/9 [00:00<?, ?it/s]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A

destAddressPort
tlsVersion_map



 25%|█████████████████████                                                               | 1/4 [00:17<00:52, 17.34s/it][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:34<00:34, 17.32s/it][A
 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:52<00:17, 17.50s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:09<00:00, 17.47s/it][A
 11%|█████████▎                                                                          | 1/9 [01:09<09:18, 69.87s/it]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
 25%|█████████████████████                                                               | 1/4 [00:23<01:09, 23.31s/it][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:47<00:46, 23.47s/it][A
 75%|█████████████

********************
srcAddress:  True
destAddress:  True
tlsVersion:  True
tlsSubject:  True
tlsIssuerDn:  True
tlsSni:  True
srcAddressPort:  True
tlsVersion_map:  True
PSI used features: 
 ['srcPort', 'destPort', 'bytesOut', 'bytesIn', 'pktsIn', 'pktsOut', 's2d_count', 's2d_nunique', 'sp2dp_count', 'sp2dp_nunique', 's2dp_count', 's2dp_nunique', 'sp2d_count', 'sp2d_nunique', 'd2s_count', 'd2s_nunique', 'dp2sp_count', 'dp2sp_nunique', 'dp2s_count', 'dp2s_nunique', 'd2sp_count', 'd2sp_nunique', 'srcAddress_count', 'destAddress_count', 'tlsVersion_count', 'tlsSubject_count', 'tlsIssuerDn_count', 'tlsSni_count', 'srcAddressPort_count', 'destAddressPort_count', 'tlsVersion_map_count', 'srcAddress_bytesOut_count', 'srcAddress_bytesOut_max', 'srcAddress_bytesOut_min', 'srcAddress_bytesOut_median', 'srcAddress_bytesOut_mean', 'srcAddress_bytesOut_sum', 'srcAddress_bytesOut_skew', 'srcAddress_bytesOut_std', 'srcAddress_bytesOut_nunique', 'srcAddress_bytesOut_max_min', 'srcAddress_bytesOut_qua

Early stopping, best iteration is:
[4]	training's auc: 0.930248	valid_0's auc: 0.935111
Evaluated only: auc
**********
0.9351113534779094
destPort
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[118]	training's auc: 0.644376	valid_0's auc: 0.624224
Evaluated only: auc
**********
0.6242243234558043
bytesOut
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[31]	training's auc: 0.921089	valid_0's auc: 0.911678
Evaluated only: auc
**********
0.9116779464607414
bytesIn
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[10]	training's auc: 0.853758	valid_0's auc: 0.857234
Evaluated only: auc
**********
0.8572344217117371
pktsIn
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[14]	training's auc: 0.826356	valid_0's auc: 0.818092
Evaluated only: auc
**********
0.818091945740133
pktsOut
Training until valida

Early stopping, best iteration is:
[26]	training's auc: 0.82905	valid_0's auc: 0.844143
Evaluated only: auc
**********
0.8441425246613627
srcAddress_bytesOut_max_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[80]	training's auc: 0.902771	valid_0's auc: 0.895926
Evaluated only: auc
**********
0.8959260305187406
srcAddress_bytesOut_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[20]	training's auc: 0.944988	valid_0's auc: 0.935245
Evaluated only: auc
**********
0.9352445686574286
srcAddress_bytesOut_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[131]	training's auc: 0.925043	valid_0's auc: 0.919013
Evaluated only: auc
**********
0.9190131559727727
srcAddress_bytesIn_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[90]	training's auc: 0.880897	valid_0's auc: 0.8

srcAddress_pktsOut_max_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[147]	training's auc: 0.874366	valid_0's auc: 0.865618
Evaluated only: auc
**********
0.8656178243468269
srcAddress_pktsOut_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[16]	training's auc: 0.859361	valid_0's auc: 0.861141
Evaluated only: auc
**********
0.8611414827005288
srcAddress_pktsOut_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[34]	training's auc: 0.851615	valid_0's auc: 0.849418
Evaluated only: auc
**********
0.8494175731076725
destAddress_bytesOut_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[305]	training's auc: 0.888345	valid_0's auc: 0.899283
Evaluated only: auc
**********
0.8992834815124987
destAddress_bytesOut_max
Training until validation scores don't improve for 50 rou

Early stopping, best iteration is:
[59]	training's auc: 0.873763	valid_0's auc: 0.877299
Evaluated only: auc
**********
0.8772994712292217
destAddress_pktsIn_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[15]	training's auc: 0.900812	valid_0's auc: 0.903208
Evaluated only: auc
**********
0.9032084603324537
destAddress_pktsIn_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[66]	training's auc: 0.85719	valid_0's auc: 0.855756
Evaluated only: auc
**********
0.855755616363654
destAddress_pktsOut_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[305]	training's auc: 0.888345	valid_0's auc: 0.899283
Evaluated only: auc
**********
0.8992834815124987
destAddress_pktsOut_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[80]	training's auc: 0.881057	valid_0's auc: 0.879432


Early stopping, best iteration is:
[46]	training's auc: 0.746003	valid_0's auc: 0.732783
Evaluated only: auc
**********
0.732783398739909
tlsVersion_bytesIn_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[37]	training's auc: 0.746003	valid_0's auc: 0.732783
Evaluated only: auc
**********
0.732783398739909
tlsVersion_pktsIn_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[43]	training's auc: 0.746003	valid_0's auc: 0.732784
Evaluated only: auc
**********
0.7327837882579779
tlsVersion_pktsIn_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[43]	training's auc: 0.746003	valid_0's auc: 0.732784
Evaluated only: auc
**********
0.7327837882579779
tlsVersion_pktsIn_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.746002	valid_0's auc: 0.732783
Evaluated only:

Early stopping, best iteration is:
[193]	training's auc: 0.982778	valid_0's auc: 0.977664
Evaluated only: auc
**********
0.9776644496596586
tlsSubject_bytesIn_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[140]	training's auc: 0.97326	valid_0's auc: 0.973665
Evaluated only: auc
**********
0.973665267647603
tlsSubject_bytesIn_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[51]	training's auc: 0.981415	valid_0's auc: 0.967201
Evaluated only: auc
**********
0.9672008257783058
tlsSubject_bytesIn_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[52]	training's auc: 0.97361	valid_0's auc: 0.965719
Evaluated only: auc
**********
0.965719099044707
tlsSubject_bytesIn_median
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[43]	training's auc: 0.979094	valid_0's auc: 0.969104
Evaluated only: 

Early stopping, best iteration is:
[14]	training's auc: 0.99031	valid_0's auc: 0.987472
Evaluated only: auc
**********
0.9874715408361006
tlsIssuerDn_bytesOut_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[284]	training's auc: 0.990885	valid_0's auc: 0.987643
Evaluated only: auc
**********
0.9876433183044279
tlsIssuerDn_bytesOut_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[43]	training's auc: 0.988908	valid_0's auc: 0.984237
Evaluated only: auc
**********
0.9842369827930393
tlsIssuerDn_bytesOut_median
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[76]	training's auc: 0.99044	valid_0's auc: 0.985438
Evaluated only: auc
**********
0.9854378669990554
tlsIssuerDn_bytesOut_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[102]	training's auc: 0.989973	valid_0's auc: 0.980628
Evaluat

Early stopping, best iteration is:
[16]	training's auc: 0.988348	valid_0's auc: 0.984588
Evaluated only: auc
**********
0.984588133332035
tlsIssuerDn_pktsOut_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[28]	training's auc: 0.854912	valid_0's auc: 0.845492
Evaluated only: auc
**********
0.84549181525158
tlsIssuerDn_pktsOut_median
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[24]	training's auc: 0.958266	valid_0's auc: 0.964925
Evaluated only: auc
**********
0.9649250664615204
tlsIssuerDn_pktsOut_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[113]	training's auc: 0.989633	valid_0's auc: 0.981694
Evaluated only: auc
**********
0.981694403599147
tlsIssuerDn_pktsOut_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[84]	training's auc: 0.99106	valid_0's auc: 0.986561
Evaluated only:

Early stopping, best iteration is:
[19]	training's auc: 0.850414	valid_0's auc: 0.850883
Evaluated only: auc
**********
0.8508831348414174
tlsSni_pktsIn_median
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[43]	training's auc: 0.883935	valid_0's auc: 0.883287
Evaluated only: auc
**********
0.8832873377413795
tlsSni_pktsIn_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[158]	training's auc: 0.933211	valid_0's auc: 0.934632
Evaluated only: auc
**********
0.9346322462533231
tlsSni_pktsIn_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[117]	training's auc: 0.928851	valid_0's auc: 0.926891
Evaluated only: auc
**********
0.9268911589136342
tlsSni_pktsIn_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[175]	training's auc: 0.929703	valid_0's auc: 0.933772
Evaluated only: auc
**********


Early stopping, best iteration is:
[10]	training's auc: 0.853757	valid_0's auc: 0.857234
Evaluated only: auc
**********
0.8572344217117371
srcAddressPort_bytesIn_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[10]	training's auc: 0.853757	valid_0's auc: 0.857234
Evaluated only: auc
**********
0.8572344217117371
srcAddressPort_bytesIn_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[64]	training's auc: 0.866612	valid_0's auc: 0.862873
Evaluated only: auc
**********
0.8628730852752432
srcAddressPort_bytesIn_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_0's auc: 0.5
Evaluated only: auc
**********
0.5
srcAddressPort_bytesIn_std
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.536863	valid_0's auc: 0.534905
Evaluated only: auc
********

Early stopping, best iteration is:
[169]	training's auc: 0.894324	valid_0's auc: 0.881083
Evaluated only: auc
**********
0.8810826654721446
destAddressPort_bytesOut_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[62]	training's auc: 0.893877	valid_0's auc: 0.881285
Evaluated only: auc
**********
0.881285409626939
destAddressPort_bytesOut_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[179]	training's auc: 0.923754	valid_0's auc: 0.92309
Evaluated only: auc
**********
0.9230902415985821
destAddressPort_bytesOut_std
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[140]	training's auc: 0.918182	valid_0's auc: 0.914459
Evaluated only: auc
**********
0.9144589107127207
destAddressPort_bytesOut_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[12]	training's auc: 0.845214	valid_0's auc

Early stopping, best iteration is:
[95]	training's auc: 0.906333	valid_0's auc: 0.894387
Evaluated only: auc
**********
0.8943872393880671
destAddressPort_pktsOut_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[52]	training's auc: 0.911269	valid_0's auc: 0.902792
Evaluated only: auc
**********
0.9027916759988704
destAddressPort_pktsOut_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[298]	training's auc: 0.919778	valid_0's auc: 0.921073
Evaluated only: auc
**********
0.9210729275204254
destAddressPort_pktsOut_std
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[256]	training's auc: 0.919649	valid_0's auc: 0.915879
Evaluated only: auc
**********
0.9158788988324196
destAddressPort_pktsOut_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4]	training's auc: 0.819649	valid_0's auc: 0.

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.505352	valid_0's auc: 0.506018
Evaluated only: auc
**********
0.5060180541624875
tlsVersion_map_pktsIn_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.505352	valid_0's auc: 0.506018
Evaluated only: auc
**********
0.5060180541624875
tlsVersion_map_pktsIn_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.505352	valid_0's auc: 0.506018
Evaluated only: auc
**********
0.5060180541624875
tlsVersion_map_pktsIn_std
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.505352	valid_0's auc: 0.506018
Evaluated only: auc
**********
0.5060180541624875
tlsVersion_map_pktsIn_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:

Early stopping, best iteration is:
[3]	training's auc: 0.792831	valid_0's auc: 0.787765
Evaluated only: auc
**********
0.7877654322189871
pktsIn_pktsOutc_multiply
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[41]	training's auc: 0.861911	valid_0's auc: 0.843885
Evaluated only: auc
**********
0.8438846636998374
bytesOut_bytesIn_ratio
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[15]	training's auc: 0.85025	valid_0's auc: 0.844126
Evaluated only: auc
**********
0.8441257753844057
bytesOut_pktsIn_ratio
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[233]	training's auc: 0.896224	valid_0's auc: 0.885914
Evaluated only: auc
**********
0.8859138580790916
bytesOut_pktsOut_ratio
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[22]	training's auc: 0.933428	valid_0's auc: 0.936526
Evaluated only: auc

Training until validation scores don't improve for 200 rounds
[300]	training's auc: 1	valid_0's auc: 0.999893
[600]	training's auc: 1	valid_0's auc: 0.999902
[900]	training's auc: 1	valid_0's auc: 0.999903
Early stopping, best iteration is:
[867]	training's auc: 1	valid_0's auc: 0.999905
Evaluated only: auc
threshold:  0.8299999999999996
Valid F1:  0.99122
Valid mean label:  0.09272727272727273
Test mean label:  0.09
