In [1]:
import numpy as np
import pandas as pd
import re
import gc
from tqdm import tqdm
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
def get_psi(c, train, test):
    psi_res = pd.DataFrame()
    psi_dict={}
    # for c in tqdm(f_cols):
    try:
        t_train = train[c].fillna(-998)
        t_test = test[c].fillna(-998)
        #获取切分点
        bins=[]
        for i in np.arange(0,1.1,0.2):
            bins.append(t_train.quantile(i))
        bins=sorted(set(bins))
        bins[0]=-np.inf
        bins[-1]=np.inf
        #计算psi
        t_psi = pd.DataFrame()
        t_psi['train'] = pd.cut(t_train,bins).value_counts().sort_index()
        t_psi['test'] = pd.cut(t_test,bins).value_counts()
        t_psi.index=[str(x) for x in t_psi.index]
        t_psi.loc['总计',:] = t_psi.sum()
        t_psi['train_rate'] = t_psi['train']/t_psi.loc['总计','train']
        t_psi['test_rate'] = t_psi['test']/t_psi.loc['总计','test']
        t_psi['psi'] = (t_psi['test_rate']-t_psi['train_rate'])*(np.log(t_psi['test_rate'])-np.log(t_psi['train_rate']))
        t_psi.loc['总计','psi'] = t_psi['psi'].sum()
        t_psi.index.name=c
        #汇总
        t_res = pd.DataFrame([[c,t_psi.loc['总计','psi']]],
                             columns=['变量名','PSI'])
        psi_res = pd.concat([psi_res,t_res])
        psi_dict[c]=t_psi
        print(c,'done')
    except:
        print(c,'error')
    return psi_res #, psi_dict

In [3]:
def arithmetic(df, cross_features):
    """
    数值特征之间的加减乘除
    @param df:
    @param cross_features: 交叉用的数值特征
    @return:
    """
    for i in tqdm(range(len(cross_features))):
        for j in range(i + 1, len(cross_features)):
            colname_add = '{}_{}_add'.format(cross_features[i], cross_features[j])
            colname_substract = '{}_{}_subtract'.format(cross_features[i], cross_features[j])
            colname_multiply = '{}_{}c_multiply'.format(cross_features[i], cross_features[j])
            df[colname_add] = df[cross_features[i]] + df[cross_features[j]]
            df[colname_substract] = df[cross_features[i]] - df[cross_features[j]]
            df[colname_multiply] = df[cross_features[i]] * df[cross_features[j]]

    for f1 in tqdm(cross_features):
        for f2 in cross_features:
            if f1 != f2:
                colname_ratio = '{}_{}_ratio'.format(f1, f2)
                df[colname_ratio] = df[f1].values / (df[f2].values + 0.001)
    return df

In [4]:
def correlation(df, threshold=0.98):
    """
    特征相关性计算
    @param df:
    @param threshold:
    @return:
    """
    col_corr = set()
    corr_matrix = df.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colName = corr_matrix.columns[i]
                col_corr.add(colName)
    return col_corr

In [5]:
def count_encode(df, cols=[]):
    for col in cols:
        print(col)
        vc = df[col].value_counts(dropna=True, normalize=True)
        df[col + '_count'] = df[col].map(vc).astype('float32')


# 交叉特征
def cross_cat_num(df, cat_col, num_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            df_new = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max',
                '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
                '{}_{}_mean'.format(f1, f2): 'mean',
                '{}_{}_sum'.format(f1, f2): 'sum',
                '{}_{}_skew'.format(f1, f2): 'skew',
                '{}_{}_nunique'.format(f1, f2): 'nunique'
            })
            df = df.merge(df_new, on=f1, how='left')
            del df_new
            gc.collect()
    return df

In [6]:
def train_func(train_path, test_path, save_path):
    # 请填写训练代码
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    data = pd.concat([train, test])
    del train, test
    gc.collect()

    single_cols = ['appProtocol']
    data.drop(single_cols, axis=1, inplace=True)
    gc.collect()

    cat_cols = ['srcAddress', 'destAddress',
                'tlsSubject', 'tlsIssuerDn', 'tlsSni']

    data['srcAddressPort'] = data['srcAddress'].astype(str) + data['srcPort'].astype(str)
    data['destAddressPort'] = data['destAddress'].astype(str) + data['destPort'].astype(str)
    
    # srcAddress To destAddress
    tmp = data.groupby('srcAddress', as_index=False)['destAddress'].agg({
        's2d_count': 'count',
        's2d_nunique': 'nunique'
    })
    data = data.merge(tmp, on='srcAddress', how='left')
    del tmp
    gc.collect()
    
    # srcAddressPort To destAddressPort
    tmp = data.groupby('srcAddressPort', as_index=False)['destAddressPort'].agg({
        'sp2dp_count': 'count',
        'sp2dp_nunique': 'nunique'
    })
    data = data.merge(tmp, on='srcAddressPort', how='left')
    del tmp
    gc.collect()
    
    # srcAddress To destAddressPort
    tmp = data.groupby('srcAddress', as_index=False)['destAddressPort'].agg({
        's2dp_count': 'count',
        's2dp_nunique': 'nunique'
    })
    data = data.merge(tmp, on='srcAddress', how='left')
    del tmp
    gc.collect()
    
    # srcAddressPort To destAddress
    tmp = data.groupby('srcAddressPort', as_index=False)['destAddress'].agg({
        'sp2d_count': 'count',
        'sp2d_nunique': 'nunique'
    })
    data = data.merge(tmp, on='srcAddressPort', how='left')
    del tmp
    gc.collect()
    
    # destAddress To srcAddress
    tmp = data.groupby('destAddress', as_index=False)['srcAddress'].agg({
        'd2s_count': 'count',
        'd2s_nunique': 'nunique'
    })
    data = data.merge(tmp, on='destAddress', how='left')
    del tmp
    gc.collect()
    
    # destAddressPort To srcAddressPort
    tmp = data.groupby('destAddressPort', as_index=False)['srcAddressPort'].agg({
        'dp2sp_count': 'count',
        'dp2sp_nunique': 'nunique'
    })
    data = data.merge(tmp, on='destAddressPort', how='left')
    del tmp
    gc.collect()
    
    # destAddressPort To srcAddress
    tmp = data.groupby('destAddressPort', as_index=False)['srcAddress'].agg({
        'dp2s_count': 'count',
        'dp2s_nunique': 'nunique'
    })
    data = data.merge(tmp, on='destAddressPort', how='left')
    del tmp
    gc.collect()
    
    # destAddress To srcAddressProt
    tmp = data.groupby('destAddress', as_index=False)['srcAddressPort'].agg({
        'd2sp_count': 'count',
        'd2sp_nunique': 'nunique'
    })
    data = data.merge(tmp, on='destAddress', how='left')
    del tmp
    gc.collect()
    
    cat_cols += ['srcAddressPort', 'destAddressPort']
    num_cols = ['bytesOut', 'bytesIn', 'pktsIn', 'pktsOut']
    
    arithmetic(data, num_cols)
#     data['bytesOut-bytesIn'] = data['bytesOut'] - data['bytesIn']
#     data['pktsOut-pktsIn'] = data['pktsOut'] - data['pktsIn']
    
#     data['bytesOut/bytesIn'] = data['bytesOut'] / data['bytesIn']
#     data['pktsOut/pktsIn'] = data['pktsOut'] / data['pktsIn']
    
#     num_cols += ['bytesOut-bytesIn', 'pktsOut-pktsIn']
    
    tlsVersion_map = {
        'TLS 1.2': 1,
        'TLS 1.3': 2,
        'TLSv1': 3,
        'UNDETERMINED': 4,
        'TLS 1.1': 5,
        'SSLv2': 6,
        'SSLv3': 6,
        '0x4854': 6,
        '0x4752': 6
    }
    data['tlsVersion_map'] = data['tlsVersion'].map(tlsVersion_map)
    cat_cols.append('tlsVersion_map')
    data.drop('tlsVersion', axis=1, inplace=True)

    count_encode(data, cat_cols)
    data = cross_cat_num(data, cat_cols, num_cols)

    for i in cat_cols:
        lbl = LabelEncoder()
        data[i] = lbl.fit_transform(data[i].astype(str))
        data[i] = data[i].astype('category')

#     for i in ['srcPort', 'destPort']:
#         data[i] = data[i].astype('category')
    
    used_cols = [i for i in data.columns if i not in ['eventId', 'label']]
    train = data.loc[data['label'].notnull(), :]
    y = train['label']
    test = data.loc[data['label'].isnull(), :]
    sub = test[['eventId']]
    
    psi_res = Parallel(n_jobs=4)(delayed(get_psi)(c, train, test) for c in tqdm(used_cols))
    psi_df = pd.concat(psi_res)
    print('psi_df.head(): \n', psi_df.head())
    features = list(psi_df[psi_df['PSI'] <= 0.2]['变量名'].values)
    print('not used features: ', list(set(used_cols) - set(features)))
    
    train = train[features]
    test = test[features]
    
    col_corr = correlation(train, 0.98)
    print(col_corr)
    train.drop(list(col_corr), axis=1, inplace=True)
    test.drop(list(col_corr), axis=1, inplace=True)
    
    X = train.copy()
    X_test = test.copy()
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=2020)
    
    print('y_train mean: ', y_train.mean())
    print('y_valid mean: ', y_valid.mean())
    
    prediction = pd.DataFrame()
    kfold = StratifiedKFold(n_splits=5)
    for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(X, y)):
        print('\nFold_{} Training ================================\n'.format(fold_id + 1))
    
        X_train = X.iloc[trn_idx]
        Y_train = y.iloc[trn_idx]
    
        X_val = X.iloc[val_idx]
        Y_val = y.iloc[val_idx]
    
        lgb_train = lgb.Dataset(X_train, Y_train)
        lgb_valid = lgb.Dataset(X_val, Y_val, reference=lgb_train)
    
        params = {
            'objective': 'binary',
            'boosting': 'gbdt',
            'metric': 'auc',
            # 'metric': 'None',  # 用自定义评估函数是将metric设置为'None'
            'learning_rate': 0.1,
            'num_leaves': 31,
            'lambda_l1': 0,
            'lambda_l2': 1,
            'num_threads': 23,
            'min_data_in_leaf': 20,
            'first_metric_only': True,
            'is_unbalance': True,
            'max_depth': -1,
            'seed': fold_id
        }
        lgb_model = lgb.train(params,
                              lgb_train,
                              num_boost_round=10000,
                              valid_sets=[lgb_valid, lgb_train],
                              early_stopping_rounds=200,
                              verbose_eval=200)
    
        pred_val = lgb_model.predict(X_val)
    
        f1_best = 0
        for i in np.arange(0.1, 1, 0.01):
            y_valid_pred = np.where(pred_val > i, 1, 0)
            f1 = np.round(f1_score(Y_val, y_valid_pred), 5)
            if f1 > f1_best:
                threshold = i
                f1_best = f1
        
        print('threshold: ', threshold)
        y_valid_pred = np.where(pred_val > threshold, 1, 0)
        print('Valid F1: ', np.round(f1_score(Y_val, y_valid_pred), 5))
        print('Valid mean label: ', np.mean(y_valid_pred))
        
        pred_test = lgb_model.predict(X_test)
        prediction['label_{}'.format(fold_id)] = pred_test
    
        del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
        gc.collect()
    
    prediction['label_sum'] = prediction.apply('sum', axis=1)
    prediction['label'] = np.where(prediction['label_sum'] > 2, 1, 0)
    
    print('Test mean label: ', np.mean(prediction['label']))
    sub['label'] = prediction['label']
    sub.to_csv(save_path + '机器不学习原子弹也不学习_eta_submission_1015.csv', index=False)
    
    
#     train_dataset = lgb.Dataset(X_train, y_train)
#     valid_dataset = lgb.Dataset(X_valid, y_valid, reference=train_dataset)
#     all_dataset = lgb.Dataset(X, y, reference=train_dataset)

#     params = {'objective': 'binary',
#               'boosting': 'gbdt',
#               'metric': 'auc',
#               # 'metric': 'None',  # 用自定义评估函数是将metric设置为'None'
#               'num_boost_round': 1000000,
#               'learning_rate': 0.1,
#               'num_leaves': 31,
#               'lambda_l1': 0,
#               'lambda_l2': 1,
#               'num_threads': 23,
#               'min_data_in_leaf': 20,
#               'first_metric_only': True,
#               'is_unbalance': True,
#               'max_depth': -1,
#               'seed': 2020}
#     valid_model = lgb.train(params,
#                             train_dataset,
#                             valid_sets=[train_dataset, valid_dataset],
#                             early_stopping_rounds=200,
#                             verbose_eval=300)
#     pred = valid_model.predict(X_valid)
    
#     f1_best = 0
#     for i in np.arange(0.1, 1, 0.01):
#         y_valid_pred = np.where(pred > i, 1, 0)
#         f1 = np.round(f1_score(y_valid, y_valid_pred), 5)
# #         print('f1: ', f1)
#         if f1 > f1_best:
#             threshold = i
#             f1_best = f1
        
#     print('threshold: ', threshold)
#     y_valid_pred = np.where(pred > threshold, 1, 0)
#     print('Valid F1: ', np.round(f1_score(y_valid, y_valid_pred), 5))
#     print('Valid mean label: ', np.mean(y_valid_pred))
#     binary_classification_report(y_valid, pred)
    
#     plt.figure(figsize=(8, 4))
#     ax = sns.kdeplot(pred, color='Red', shade=True)
#     ax.set_xlabel('pred')
#     ax.set_ylabel('Frequency')

#     params = {'objective': 'binary',
#               'boosting': 'gbdt',
#               'metric': 'auc',
#               # 'metric': 'None',  # 用自定义评估函数是将metric设置为'None'
#               # 'num_boost_round': 1000000,
#               'learning_rate': 0.1,
#               'num_leaves': 31,
#               'lambda_l1': 0,
#               'lambda_l2': 1,
#               'num_threads': 23,
#               'min_data_in_leaf': 20,
#               'first_metric_only': True,
#               'is_unbalance': True,
#               'max_depth': -1,
#               'seed': 2020}
#     train_model = lgb.train(params,
#                             all_dataset,
#                             num_boost_round=valid_model.best_iteration+20)
#     y_test_pred = np.where(train_model.predict(X_test) > threshold, 1, 0)

#     print('Test mean label: ', np.mean(y_test_pred))
#     sub['label'] = y_test_pred
#     sub.to_csv(save_path + '机器不学习原子弹也不学习_eta_submission_1014.csv', index=False)

In [7]:
if __name__ == '__main__':
    train_path = '../大数据队_eta_submission_1011/data/train.csv'
    test_path = '../大数据队_eta_submission_1011/data/test_1.csv'
    save_path = '../大数据队_eta_submission_1011/result/'
    train_func(train_path, test_path, save_path)

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 223.37it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 575.63it/s]
  0%|                                                                                            | 0/8 [00:00<?, ?it/s]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A

srcAddress
destAddress
tlsSubject
tlsIssuerDn
tlsSni
srcAddressPort
destAddressPort
tlsVersion_map



 25%|█████████████████████                                                               | 1/4 [00:01<00:04,  1.67s/it][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:03<00:03,  1.67s/it][A
 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:05<00:01,  1.69s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.69s/it][A
 12%|██████████▌                                                                         | 1/8 [00:06<00:47,  6.74s/it]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
 25%|█████████████████████                                                               | 1/4 [00:02<00:06,  2.20s/it][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:04<00:04,  2.21s/it][A
 75%|█████████████

psi_df.head(): 
         变量名       PSI
0   srcPort  0.000476
0  destPort  0.000010
0  bytesOut  0.000352
0   bytesIn  0.001010
0    pktsIn  0.001269
not used features:  ['destAddressPort', 'srcAddressPort', 'tlsSni', 'tlsSubject', 'srcAddress', 'tlsIssuerDn', 'destAddress', 'tlsVersion_map']
{'srcAddress_pktsOut_median', 'd2s_count', 'tlsSni_pktsOut_mean', 'destAddressPort_pktsOut_median', 'bytesIn_pktsInc_multiply', 'srcAddressPort_pktsIn_sum', 'srcAddressPort_pktsOut_max', 'tlsSubject_pktsOut_median', 'destAddressPort_pktsIn_skew', 's2dp_nunique', 'srcAddressPort_bytesIn_mean', 'tlsVersion_map_pktsIn_sum', 'destAddressPort_bytesIn_min', 'srcAddressPort_bytesIn_min', 'destAddressPort_pktsOut_min', 'destAddressPort_bytesOut_median', 'srcAddress_count', 'tlsVersion_map_bytesIn_mean', 'destAddressPort_pktsIn_median', 'destAddress_bytesIn_median', 'tlsVersion_map_bytesIn_median', 'srcAddressPort_pktsOut_nunique', 'srcAddress_pktsOut_min', 'tlsVersion_map_pktsOut_mean', 'tlsVersion_map_pkt