In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import lightgbm as lgb
import pickleshare


def count_encode(df, cols=None):
    """
    Frequency-encode columns.

    For every column in ``cols`` a new column ``<col>_count`` is added that
    holds the relative frequency of the row's value within that column
    (``value_counts(normalize=True)``), stored as float32. Missing values are
    excluded from the frequency table, so rows with a missing value get NaN.

    @param df: DataFrame to encode; it is modified in place and also returned.
    @param cols: iterable of column names to encode. ``None`` (the default)
                 or an empty iterable adds no columns.
    @return: ``df`` with the ``<col>_count`` columns added.
    """
    # ``None`` replaces the former mutable ``[]`` default, which would be a
    # single list object shared by every call.
    for col in (cols if cols is not None else ()):
        print(col)
        vc = df[col].value_counts(dropna=True, normalize=True)
        df[col + '_count'] = df[col].map(vc).astype('float32')
    return df


def cross_cat_num(df, cat_col, num_col):
    """
    Group-by statistics of numeric features per categorical feature.

    For every pair (f1 in ``cat_col``, f2 in ``num_col``) the rows are grouped
    by f1 and twelve statistics of f2 are computed per group: count, max, min,
    median, mean, sum, skew, std, nunique, max-min range and the 25% / 75%
    quantiles. They are left-merged back onto ``df`` as columns named
    ``<f1>_<f2>_<stat>``.

    @param df: input DataFrame.
    @param cat_col: iterable of categorical column names used as group keys.
    @param num_col: iterable of numeric column names to aggregate.
    @return: a new DataFrame (result of the merges) with the statistic columns
             appended; row count and order follow ``df``.
    """
    def max_min(s):
        return s.max() - s.min()

    # Distinctly named functions instead of two lambdas, so the aggregation
    # never has to disambiguate identical ``<lambda>`` names.
    def quantile_25(s):
        return s.quantile(0.25)

    def quantile_75(s):
        return s.quantile(0.75)

    # (column suffix, aggregation) in the order the columns are appended.
    stats = [
        ('count', 'count'),
        ('max', 'max'),
        ('min', 'min'),
        ('median', 'median'),
        ('mean', 'mean'),
        ('sum', 'sum'),
        ('skew', 'skew'),
        ('std', 'std'),
        ('nunique', 'nunique'),
        ('max_min', max_min),
        ('quantile_25', quantile_25),
        ('quantile_75', quantile_75),
    ]

    for f1 in cat_col:
        g = df.groupby(f1)
        for f2 in num_col:
            named = {
                '{}_{}_{}'.format(f1, f2, suffix): func
                for suffix, func in stats
            }
            # Keyword ("named") aggregation replaces the dict-renaming form
            # ``agg({new_name: func})``, which pandas removed in 1.0.
            tmp = g[f2].agg(**named).reset_index()
            df = df.merge(tmp, on=f1, how='left')
    return df


def arithmetic(df, cross_features):
    """
    Pairwise arithmetic crosses between numeric features.

    For every unordered pair (a, b) of ``cross_features`` (a before b) the
    columns ``a_b_add``, ``a_b_subtract`` and ``a_bc_multiply`` are added.
    For every ordered pair of different features a column ``a_b_ratio`` equal
    to ``a / (b + 0.001)`` is added.

    @param df: DataFrame holding the features; modified in place.
    @param cross_features: sequence of numeric column names to cross.
    @return: ``df`` with the crossed columns added.
    """
    # Sum / difference / product: each unordered pair once.
    for pos, left in enumerate(cross_features):
        for right in cross_features[pos + 1:]:
            lhs = df[left]
            rhs = df[right]
            df['{}_{}_add'.format(left, right)] = lhs + rhs
            df['{}_{}_subtract'.format(left, right)] = lhs - rhs
            # NOTE: the product suffix really is "c_multiply"; it is kept
            # unchanged so the generated column names stay the same.
            df['{}_{}c_multiply'.format(left, right)] = lhs * rhs

    # Ratios are not symmetric, so both directions are produced; the 0.001
    # offset in the denominator avoids dividing by an exact zero.
    for numerator in cross_features:
        for denominator in cross_features:
            if numerator == denominator:
                continue
            ratio_name = '{}_{}_ratio'.format(numerator, denominator)
            df[ratio_name] = df[numerator].values / (df[denominator].values + 0.001)
    return df


def get_psi(c, x_train, x_test):
    """
    Compute the population stability index (PSI) of one column between two frames.

    Missing values are replaced by -998. Bin edges are the 0/20/40/60/80/100%
    quantiles of the train column, de-duplicated, with the outermost edges
    widened to -inf / +inf. Per bin the share of train and test rows is
    computed and ``(test_rate - train_rate) * (ln(test_rate) - ln(train_rate))``
    is summed over the bins. A bin that is empty in one sample makes the log
    term infinite; no smoothing is applied.

    @param c: name of the column to evaluate.
    @param x_train: DataFrame providing the reference distribution (and bins).
    @param x_test: DataFrame providing the distribution to compare.
    @return: one-row DataFrame with columns '变量名' (column name) and 'PSI';
             an empty DataFrame if the computation failed (the failure is
             printed, not raised).
    """
    psi_res = pd.DataFrame()
    try:
        t_train = x_train[c].fillna(-998)
        t_test = x_test[c].fillna(-998)
        # Bin edges: train quantiles in steps of 20%.
        bins = []
        for i in np.arange(0, 1.1, 0.2):
            bins.append(t_train.quantile(i))
        bins = sorted(set(bins))
        # Open the outer bins so test values outside the train range still
        # fall into a bin.
        bins[0] = -np.inf
        bins[-1] = np.inf
        # Per-bin counts; the test counts align to the train bins by index.
        t_psi = pd.DataFrame()
        t_psi['train'] = pd.cut(t_train, bins).value_counts().sort_index()
        t_psi['test'] = pd.cut(t_test, bins).value_counts()
        t_psi.index = [str(x) for x in t_psi.index]
        # '总计' is the totals row.
        t_psi.loc['总计', :] = t_psi.sum()
        t_psi['train_rate'] = t_psi['train'] / t_psi.loc['总计', 'train']
        t_psi['test_rate'] = t_psi['test'] / t_psi.loc['总计', 'test']
        t_psi['psi'] = (t_psi['test_rate'] - t_psi['train_rate']) * (
            np.log(t_psi['test_rate']) - np.log(t_psi['train_rate']))
        # The totals row itself contributes 0 (both rates are 1).
        t_psi.loc['总计', 'psi'] = t_psi['psi'].sum()
        t_psi.index.name = c
        # Summary row for this column.
        t_res = pd.DataFrame([[c, t_psi.loc['总计', 'psi']]],
                             columns=['变量名', 'PSI'])
        psi_res = pd.concat([psi_res, t_res])
        print(c, 'done')
    except Exception as exc:
        # Best effort: report the failing column and its cause and return an
        # empty frame. KeyboardInterrupt / SystemExit are no longer swallowed.
        print(c, 'error', repr(exc))
    return psi_res


def auc_select(X_train, y_train, X_valid, y_valid, cols, threshold=0.52):
    """
    Single-feature screening by validation AUC.

    For each column in ``cols`` a LightGBM binary classifier is trained on
    that column alone (up to 1000 rounds, early stopping after 50 rounds
    without improvement) and the best AUC on the validation set is recorded.
    Columns whose AUC is strictly greater than ``threshold`` count as useful,
    all others as useless.

    @param X_train: training features (DataFrame).
    @param y_train: training labels.
    @param X_valid: validation features (DataFrame).
    @param y_valid: validation labels.
    @param cols: iterable of column names to evaluate one by one.
    @param threshold: minimum (exclusive) validation AUC for a useful column.
    @return: tuple ``(useful_dict, useless_dict, useful_cols, useless_cols)``;
             the dicts map column name -> validation AUC, the lists hold the
             dict keys in evaluation order.
    """
    # NOTE(review): ``early_stopping_rounds`` / ``verbose_eval`` are passed as
    # keyword arguments of ``lgb.train``; newer LightGBM releases expect
    # callbacks instead - confirm against the installed version.
    params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'metric': 'auc',
        'learning_rate': 0.1,
        'num_leaves': 31,
        'lambda_l1': 0,
        'lambda_l2': 1,
        'num_threads': 23,
        'min_data_in_leaf': 20,
        'first_metric_only': True,
        'is_unbalance': True,
        'max_depth': -1,
        'seed': 2020
    }
    kept = {}
    rejected = {}
    for feature in cols:
        print(feature)
        train_set = lgb.Dataset(X_train[[feature]].values, y_train)
        valid_set = lgb.Dataset(X_valid[[feature]].values, y_valid, reference=train_set)
        booster = lgb.train(
            params,
            train_set,
            valid_sets=[valid_set, train_set],
            num_boost_round=1000,
            early_stopping_rounds=50,
            verbose_eval=500
        )
        # 'valid_0' is the first entry of valid_sets, i.e. the validation set.
        valid_auc = booster.best_score['valid_0']['auc']
        print('*' * 10)
        print(valid_auc)
        bucket = kept if valid_auc > threshold else rejected
        bucket[feature] = valid_auc
    return kept, rejected, list(kept), list(rejected)


def correlation(df, useful_dict, threshold=0.98):
    """
    Pick features to drop because they are highly correlated with another one.

    Every pair of columns whose absolute correlation (``df.corr()``) is
    strictly greater than ``threshold`` loses one member: the column with the
    lower score in ``useful_dict``. On a tie the column that comes earlier in
    ``df`` is dropped.

    @param df: DataFrame of candidate features.
    @param useful_dict: mapping column name -> score (higher is kept).
    @param threshold: absolute correlation above which a pair is redundant.
    @return: set of column names to drop.
    """
    corr_matrix = df.corr()
    names = corr_matrix.columns
    to_drop = set()
    # Walk the strict lower triangle so each pair is examined once.
    for row, later_name in enumerate(names):
        for col in range(row):
            if not abs(corr_matrix.iloc[row, col]) > threshold:
                continue
            earlier_name = names[col]
            if useful_dict[later_name] >= useful_dict[earlier_name]:
                to_drop.add(earlier_name)
            else:
                to_drop.add(later_name)
    return to_drop


def train_test_label_encode(df, cat_col, type='save', path='./'):
    """
    Label-encode one column and persist the mapping, or load a stored mapping.

    ``type='save'``: builds a value -> integer mapping from the column's
    unique values (in order of appearance), replaces the column by the codes,
    writes the mapping to ``<path><cat_col>.npy`` and returns ``df``.

    ``type='load'``: reads ``<path><cat_col>.npy`` and returns the mapping
    dict; ``df`` is not used.

    @param df: DataFrame whose column is encoded in place (save mode only).
    @param cat_col: name of the categorical column.
    @param type: 'save' or 'load'; any other value returns None.
    @param path: prefix that is concatenated with the file name, so a
                 directory must end with a path separator.
    @return: the encoded DataFrame (save), the mapping dict (load), or None.
    """
    mapping_file = path + '{}.npy'.format(cat_col)

    if type == 'save':
        print(cat_col)
        d = dict(zip(df[cat_col].unique(), range(df[cat_col].nunique())))
        df[cat_col] = df[cat_col].map(d)
        np.save(mapping_file, d)
        return df
    elif type == 'load':
        # np.save stores the dict as a pickled 0-d object array, which
        # np.load refuses to read unless allow_pickle=True is given.
        # SECURITY: unpickling runs arbitrary code - only load files that
        # this function wrote itself.
        d = np.load(mapping_file, allow_pickle=True).item()
        return d
    return None


def _add_group_count_nunique(df, key, target, prefix):
    """Left-merge per-``key`` count and nunique of ``target`` as ``<prefix>_count`` / ``<prefix>_nunique``."""
    # Keyword ("named") aggregation replaces the dict-renaming form
    # ``agg({new_name: func})``, which pandas removed in 1.0.
    stats = df.groupby(key)[target].agg(**{
        prefix + '_count': 'count',
        prefix + '_nunique': 'nunique',
    }).reset_index()
    return df.merge(stats, on=key, how='left')


def train_func(train_path):
    """
    Train the LightGBM classifier end to end.

    Steps: read the CSV, build address/port link features, frequency and
    group-by statistics and arithmetic crosses, drop the raw high-cardinality
    columns, select features by single-feature AUC and by correlation, train
    with early stopping on a 75/25 stratified split, pick the probability
    cut-off that maximises validation F1, then retrain on all rows.

    Side effects: writes ``threshold.npy`` (chosen cut-off) and ``./lgb.txt``
    (final model) and prints progress information.

    @param train_path: path of the training CSV; it must contain the columns
                       referenced below, including 'eventId' and 'label'.
    @return: None.
    """
    train = pd.read_csv(train_path)

    single_cols = ['appProtocol']
    train.drop(single_cols, axis=1, inplace=True)

    cat_cols = ['srcAddress', 'destAddress',
                'tlsVersion', 'tlsSubject', 'tlsIssuerDn', 'tlsSni']

    train['srcAddressPort'] = train['srcAddress'].astype(str) + train['srcPort'].astype(str)
    train['destAddressPort'] = train['destAddress'].astype(str) + train['destPort'].astype(str)

    # (group key, counted column, output prefix); the order fixes the order
    # of the generated columns.
    link_features = [
        ('srcAddress', 'destAddress', 's2d'),
        ('srcAddressPort', 'destAddressPort', 'sp2dp'),
        ('srcAddress', 'destAddressPort', 's2dp'),
        ('srcAddressPort', 'destAddress', 'sp2d'),
        ('destAddress', 'srcAddress', 'd2s'),
        ('destAddressPort', 'srcAddressPort', 'dp2sp'),
        ('destAddressPort', 'srcAddress', 'dp2s'),
        ('destAddress', 'srcAddressPort', 'd2sp'),
    ]
    for key, target, prefix in link_features:
        train = _add_group_count_nunique(train, key, target, prefix)

    tlsVersion_map = {
        'TLSv1': 1,
        'TLS 1.2': 1,
        'TLS 1.3': 1,
        'SSLv2': 2,
        'SSLv3': 3,
        '0x4854': 4,
        '0x4752': 4,
        'UNDETERMINED': 5
    }
    train['tlsVersion_map'] = train['tlsVersion'].map(tlsVersion_map).astype('category')
    cat_cols.append('tlsVersion_map')

    cat_cols += ['srcAddressPort', 'destAddressPort']
    num_cols = ['bytesOut', 'bytesIn', 'pktsIn', 'pktsOut']

    train = count_encode(train, cat_cols)
    train = cross_cat_num(train, cat_cols, num_cols)
    train = arithmetic(train, num_cols)

    used_cols = [i for i in train.columns if i not in ['eventId', 'label']]
    y = train['label']
    train = train[used_cols].copy()

    # Raw categorical columns are removed before modelling; only their
    # derived statistics stay.
    psi_drop_cols = ['tlsSubject', 'destAddress', 'srcAddress', 'srcAddressPort', 'tlsIssuerDn', 'tlsSni',
                     'destAddressPort', 'tlsVersion']

    train.drop(psi_drop_cols, axis=1, inplace=True)

    X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=0.25, random_state=2020, stratify=y)

    print('y_train mean: ', y_train.mean())
    print('y_valid mean: ', y_valid.mean())

    used_cols = X_train.columns.to_list()

    useful_dict, useless_dict, useful_cols, useless_cols = auc_select(X_train, y_train, X_valid, y_valid, used_cols, threshold=0.52)
    print('AUC drop features: \n', useless_cols)

    X_train = X_train[useful_cols]
    X_valid = X_valid[useful_cols]

    col_corr = correlation(X_train, useful_dict, threshold=0.98)
    print('Correlation drop features: \n', col_corr)

    # Reassign instead of dropping in place: X_train / X_valid are column
    # selections of another frame, where in-place edits are unreliable.
    X_train = X_train.drop(list(col_corr), axis=1)
    X_valid = X_valid.drop(list(col_corr), axis=1)

    used_cols = X_train.columns.to_list()
    print('*' * 20)
    print('used_cols: \n', used_cols)
    print('len(used_cols): \n', len(used_cols))

    train_dataset = lgb.Dataset(X_train, y_train)
    valid_dataset = lgb.Dataset(X_valid, y_valid, reference=train_dataset)
    all_dataset = lgb.Dataset(train[used_cols], y, reference=train_dataset)

    params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'metric': 'auc',
        # 'metric': 'None',  # set metric to 'None' when using a custom eval function
        'learning_rate': 0.1,
        'num_leaves': 31,
        'lambda_l1': 0,
        'lambda_l2': 1,
        'num_threads': 23,
        'min_data_in_leaf': 20,
        'first_metric_only': True,
        'is_unbalance': True,
        'max_depth': -1,
        'seed': 2020
    }
    # NOTE(review): ``early_stopping_rounds`` / ``verbose_eval`` are keyword
    # arguments of ``lgb.train`` here; newer LightGBM releases expect
    # callbacks instead - confirm against the installed version.
    valid_model = lgb.train(
        params,
        train_dataset,
        valid_sets=[valid_dataset, train_dataset],
        early_stopping_rounds=200,
        num_boost_round=1000000,
        verbose_eval=300
    )
    pred = valid_model.predict(X_valid)

    # Scan cut-offs 0.10 .. 0.99 and keep the first one with the best F1.
    # The 0.5 fallback keeps ``threshold`` defined when no cut-off reaches
    # an F1 above 0.
    f1_best = 0
    threshold = 0.5
    for i in np.arange(0.1, 1, 0.01):
        y_valid_pred = np.where(pred > i, 1, 0)
        f1 = np.round(f1_score(y_valid, y_valid_pred), 5)
        if f1 > f1_best:
            threshold = i
            f1_best = f1

    print('threshold: ', threshold)
    np.save('threshold.npy', threshold)
    y_valid_pred = np.where(pred > threshold, 1, 0)
    print('Valid F1: ', np.round(f1_score(y_valid, y_valid_pred), 5))
    print('Valid mean label: ', np.mean(y_valid_pred))

    # Final model on all rows; 20 rounds are added to the early-stopped
    # iteration count of the validation run.
    train_model = lgb.train(
        params,
        all_dataset,
        num_boost_round=valid_model.best_iteration + 20
    )
    train_model.save_model('./lgb.txt')


# Script entry point: run the full training pipeline on the training CSV.
# train_func writes 'threshold.npy' and './lgb.txt' into the working directory.
if __name__ == '__main__':
    train_path = '../../大数据队_eta_submission_1011/data/train.csv'
    train_func(train_path)

srcAddress
destAddress
tlsVersion
tlsSubject
tlsIssuerDn
tlsSni
tlsVersion_map
srcAddressPort
destAddressPort
y_train mean:  0.09090909090909091
y_valid mean:  0.09090909090909091
srcPort
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[9]	training's auc: 0.93278	valid_0's auc: 0.935548
Evaluated only: auc
**********
0.9355476
destPort
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[162]	training's auc: 0.645706	valid_0's auc: 0.621792
Evaluated only: auc
**********
0.6217918
bytesOut
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[19]	training's auc: 0.917491	valid_0's auc: 0.914051
Evaluated only: auc
**********
0.9140512
bytesIn
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[44]	training's auc: 0.864614	valid_0's auc: 0.85094
Evaluated only: auc
**********
0.8509402
pktsIn
Training until va

Early stopping, best iteration is:
[150]	training's auc: 0.904968	valid_0's auc: 0.894777
Evaluated only: auc
**********
0.8947774
srcAddress_bytesOut_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[152]	training's auc: 0.825844	valid_0's auc: 0.834562
Evaluated only: auc
**********
0.8345622
srcAddress_bytesOut_max_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[294]	training's auc: 0.915109	valid_0's auc: 0.905322
Evaluated only: auc
**********
0.9053218
srcAddress_bytesOut_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[89]	training's auc: 0.95173	valid_0's auc: 0.944173
Evaluated only: auc
**********
0.9441734
srcAddress_bytesOut_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[116]	training's auc: 0.920158	valid_0's auc: 0.91632
Evaluated only: auc
*****

Early stopping, best iteration is:
[380]	training's auc: 0.857913	valid_0's auc: 0.85726
Evaluated only: auc
**********
0.8572604
srcAddress_pktsOut_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[13]	training's auc: 0.859902	valid_0's auc: 0.859634
Evaluated only: auc
**********
0.8596338
srcAddress_pktsOut_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[133]	training's auc: 0.859148	valid_0's auc: 0.848236
Evaluated only: auc
**********
0.8482358
destAddress_bytesOut_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[17]	training's auc: 0.877934	valid_0's auc: 0.892818
Evaluated only: auc
**********
0.8928176
destAddress_bytesOut_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[93]	training's auc: 0.914481	valid_0's auc: 0.900776
Evaluated only: auc
**********
0

Early stopping, best iteration is:
[83]	training's auc: 0.902992	valid_0's auc: 0.889735
Evaluated only: auc
**********
0.889735
destAddress_pktsIn_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[29]	training's auc: 0.854389	valid_0's auc: 0.848086
Evaluated only: auc
**********
0.8480862
destAddress_pktsOut_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[17]	training's auc: 0.877934	valid_0's auc: 0.892818
Evaluated only: auc
**********
0.8928176
destAddress_pktsOut_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[174]	training's auc: 0.865486	valid_0's auc: 0.840688
Evaluated only: auc
**********
0.8406878
destAddress_pktsOut_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[18]	training's auc: 0.869095	valid_0's auc: 0.870233
Evaluated only: auc
**********
0.8702332
d

Early stopping, best iteration is:
[1]	training's auc: 0.749695	valid_0's auc: 0.721082
Evaluated only: auc
**********
0.7210824
tlsVersion_pktsIn_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.720607	valid_0's auc: 0.688373
Evaluated only: auc
**********
0.6883728
tlsVersion_pktsIn_median
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.743022	valid_0's auc: 0.711548
Evaluated only: auc
**********
0.711548
tlsVersion_pktsIn_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.749696	valid_0's auc: 0.721082
Evaluated only: auc
**********
0.7210824
tlsVersion_pktsIn_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.749695	valid_0's auc: 0.721082
Evaluated only: auc
**********
0.7210824
tlsVersion_pktsIn_sk

Early stopping, best iteration is:
[93]	training's auc: 0.981688	valid_0's auc: 0.972236
Evaluated only: auc
**********
0.9722356
tlsSubject_bytesIn_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[202]	training's auc: 0.981988	valid_0's auc: 0.96703
Evaluated only: auc
**********
0.96703
tlsSubject_bytesIn_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[88]	training's auc: 0.982368	valid_0's auc: 0.981319
Evaluated only: auc
**********
0.9813186
tlsSubject_bytesIn_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[80]	training's auc: 0.965364	valid_0's auc: 0.964829
Evaluated only: auc
**********
0.9648288
tlsSubject_bytesIn_std
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[78]	training's auc: 0.973046	valid_0's auc: 0.973075
Evaluated only: auc
**********
0.9730746
tlsSubject_byt

Early stopping, best iteration is:
[10]	training's auc: 0.986577	valid_0's auc: 0.987416
Evaluated only: auc
**********
0.9874156
tlsIssuerDn_bytesOut_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[50]	training's auc: 0.975864	valid_0's auc: 0.978967
Evaluated only: auc
**********
0.9789672
tlsIssuerDn_bytesOut_std
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[46]	training's auc: 0.977995	valid_0's auc: 0.980327
Evaluated only: auc
**********
0.9803268
tlsIssuerDn_bytesOut_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[51]	training's auc: 0.98521	valid_0's auc: 0.98823
Evaluated only: auc
**********
0.9882302
tlsIssuerDn_bytesOut_max_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[294]	training's auc: 0.99021	valid_0's auc: 0.99188
Evaluated only: auc
**********
0.99188
tl

Early stopping, best iteration is:
[82]	training's auc: 0.979144	valid_0's auc: 0.980223
Evaluated only: auc
**********
0.980223
tlsIssuerDn_pktsOut_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[18]	training's auc: 0.984038	valid_0's auc: 0.988464
Evaluated only: auc
**********
0.9884642
tlsIssuerDn_pktsOut_max_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[67]	training's auc: 0.989262	valid_0's auc: 0.992292
Evaluated only: auc
**********
0.992292
tlsIssuerDn_pktsOut_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[25]	training's auc: 0.93171	valid_0's auc: 0.942702
Evaluated only: auc
**********
0.9427022
tlsIssuerDn_pktsOut_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[102]	training's auc: 0.95836	valid_0's auc: 0.959457
Evaluated only: auc
**********

Early stopping, best iteration is:
[33]	training's auc: 0.905495	valid_0's auc: 0.90244
Evaluated only: auc
**********
0.9024398
tlsSni_pktsIn_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[26]	training's auc: 0.920089	valid_0's auc: 0.914295
Evaluated only: auc
**********
0.9142946
tlsSni_pktsOut_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[34]	training's auc: 0.887717	valid_0's auc: 0.881619
Evaluated only: auc
**********
0.8816194
tlsSni_pktsOut_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[69]	training's auc: 0.913932	valid_0's auc: 0.90958
Evaluated only: auc
**********
0.9095798
tlsSni_pktsOut_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[24]	training's auc: 0.828601	valid_0's auc: 0.835015
Evaluated only: auc
**********
0.835015
tlsSni_pktsOut_median
Tr

Early stopping, best iteration is:
[1]	training's auc: 0.505201	valid_0's auc: 0.5065
Evaluated only: auc
**********
0.5065
tlsVersion_map_pktsIn_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.505135	valid_0's auc: 0.5064
Evaluated only: auc
**********
0.5064
tlsVersion_map_pktsIn_median
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.505201	valid_0's auc: 0.5065
Evaluated only: auc
**********
0.5065
tlsVersion_map_pktsIn_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.505201	valid_0's auc: 0.5065
Evaluated only: auc
**********
0.5065
tlsVersion_map_pktsIn_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.505201	valid_0's auc: 0.5065
Evaluated only: auc
**********
0.5065
tlsVersion_map_pktsIn_skew
T

Early stopping, best iteration is:
[44]	training's auc: 0.864614	valid_0's auc: 0.85094
Evaluated only: auc
**********
0.8509402
srcAddressPort_bytesIn_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[48]	training's auc: 0.867703	valid_0's auc: 0.857591
Evaluated only: auc
**********
0.857591
srcAddressPort_bytesIn_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_0's auc: 0.5
Evaluated only: auc
**********
0.5
srcAddressPort_bytesIn_std
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.5367	valid_0's auc: 0.5353
Evaluated only: auc
**********
0.5353
srcAddressPort_bytesIn_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_0's auc: 0.5
Evaluated only: auc
**********
0.5
srcAddressPort_bytesIn_max_min
Traini

Early stopping, best iteration is:
[302]	training's auc: 0.922894	valid_0's auc: 0.925119
Evaluated only: auc
**********
0.9251192
destAddressPort_bytesOut_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[90]	training's auc: 0.841767	valid_0's auc: 0.84689
Evaluated only: auc
**********
0.8468902
destAddressPort_bytesOut_max_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[264]	training's auc: 0.91901	valid_0's auc: 0.921151
Evaluated only: auc
**********
0.9211508
destAddressPort_bytesOut_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[41]	training's auc: 0.952725	valid_0's auc: 0.951432
Evaluated only: auc
**********
0.9514316
destAddressPort_bytesOut_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[65]	training's auc: 0.935244	valid_0's auc: 0.928347
Evaluat

Early stopping, best iteration is:
[50]	training's auc: 0.815296	valid_0's auc: 0.817017
Evaluated only: auc
**********
0.8170168
destAddressPort_pktsOut_max_min
Training until validation scores don't improve for 50 rounds
[500]	training's auc: 0.868539	valid_0's auc: 0.864179
Early stopping, best iteration is:
[552]	training's auc: 0.868549	valid_0's auc: 0.864194
Evaluated only: auc
**********
0.8641936
destAddressPort_pktsOut_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[14]	training's auc: 0.862332	valid_0's auc: 0.859148
Evaluated only: auc
**********
0.8591482
destAddressPort_pktsOut_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[213]	training's auc: 0.864192	valid_0's auc: 0.861265
Evaluated only: auc
**********
0.8612648
bytesOut_bytesIn_add
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[22]	training's auc

Correlation drop features: 
 {'srcAddressPort_pktsIn_quantile_75', 'tlsSubject_bytesIn_count', 'tlsSubject_pktsIn_min', 'srcAddress_pktsIn_max', 'srcAddressPort_bytesIn_quantile_75', 'destAddressPort_pktsOut_nunique', 'bytesIn_pktsOut_add', 'destAddressPort_pktsOut_max_min', 'destAddress_pktsOut_sum', 'srcAddress_pktsIn_max_min', 's2dp_count', 'tlsSni_bytesIn_median', 'bytesIn_pktsIn_add', 'dp2s_count', 'destAddress_pktsOut_max', 'tlsIssuerDn_pktsOut_median', 'srcAddressPort_bytesIn_count', 'tlsSubject_bytesOut_max', 'tlsSubject_bytesOut_count', 'tlsSni_bytesIn_mean', 'tlsSni_pktsOut_max_min', 'destAddress_count', 'destAddress_bytesOut_max', 'srcAddressPort_pktsOut_median', 'tlsVersion_bytesOut_max_min', 'srcAddressPort_bytesIn_max', 'destAddressPort_pktsOut_quantile_75', 'destAddressPort_bytesIn_skew', 'destAddressPort_bytesOut_std', 'destAddressPort_bytesOut_count', 'srcAddress_pktsIn_quantile_75', 'tlsVersion_bytesOut_min', 'srcAddressPort_pktsIn_quantile_25', 'destAddressPort_bytes

Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[42]	training's auc: 1	valid_0's auc: 0.99997
Evaluated only: auc
threshold:  0.5199999999999998
Valid F1:  0.99202
Valid mean label:  0.09127272727272727
