In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from joblib import Parallel, delayed
import warnings
# Silence library warnings to keep the notebook output readable.
warnings.filterwarnings('ignore')
# Show every column / row when a DataFrame is displayed.
# The fully qualified option names are used on purpose: the short patterns
# 'max_columns' / 'max_rows' also match the 'styler.render.*' options in
# newer pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
def count_encode(df, cols=None):
    """
    Frequency-encode the given columns.

    For every column in ``cols`` a new float32 column ``<col>_count`` is added
    to ``df``. It holds the relative frequency of the row's value among the
    non-null values of that column (``value_counts(normalize=True)``), so
    despite the ``_count`` suffix the values are shares, not raw counts.
    Rows whose value is null get NaN.

    @param df: DataFrame to encode; it is modified in place.
    @param cols: iterable of column names to encode. ``None`` (the default)
                 or an empty iterable leaves ``df`` untouched.
    @return: the same DataFrame object that was passed in.
    """
    # A None default replaces the former mutable ``[]`` default argument.
    for col in (cols if cols is not None else ()):
        print(col)
        freq = df[col].value_counts(dropna=True, normalize=True)
        df[col + '_count'] = df[col].map(freq).astype('float32')
    return df


def cross_cat_num(df, cat_col, num_col):
    """
    Add group-by statistics of numeric features per categorical feature.

    For every pair (f1 in ``cat_col``, f2 in ``num_col``) the rows are grouped
    by f1 and twelve statistics of f2 are computed per group: count, max, min,
    median, mean, sum, skew, std, nunique, max-min range and the 25% / 75%
    quantiles. Each statistic is merged back onto the rows as a column named
    ``<f1>_<f2>_<stat>``.

    @param df: input DataFrame; it is not modified, a merged frame is returned.
    @param cat_col: iterable of categorical column names to group by.
    @param num_col: iterable of numeric column names to aggregate.
    @return: new DataFrame with the statistic columns appended (left merge on
             f1, so every input row is kept).
    """
    def max_min(s):
        return s.max() - s.min()

    # Named functions instead of two lambdas: some pandas versions require
    # unique function names when several aggregations are applied at once.
    def quantile_25(s):
        return s.quantile(0.25)

    def quantile_75(s):
        return s.quantile(0.75)

    # (column suffix, aggregation) in output-column order.
    stats = [
        ('count', 'count'),
        ('max', 'max'),
        ('min', 'min'),
        ('median', 'median'),
        ('mean', 'mean'),
        ('sum', 'sum'),
        ('skew', 'skew'),
        ('std', 'std'),
        ('nunique', 'nunique'),
        ('max_min', max_min),
        ('quantile_25', quantile_25),
        ('quantile_75', quantile_75),
    ]

    for f1 in cat_col:
        g = df.groupby(f1)
        for f2 in num_col:
            # Named aggregation (output_name=func). Passing a renaming dict to
            # SeriesGroupBy.agg is no longer supported by pandas >= 1.0.
            named = {
                '{}_{}_{}'.format(f1, f2, suffix): func
                for suffix, func in stats
            }
            tmp = g[f2].agg(**named).reset_index()
            df = df.merge(tmp, on=f1, how='left')
    return df


def arithmetic(df, cross_features):
    """
    Add pairwise arithmetic combinations of numeric features.

    For every unordered pair (a, b) taken in list order, the sum, difference
    and product columns are added. For every ordered pair with a != b a ratio
    column ``a / (b + 0.001)`` is added; the small constant keeps a zero
    denominator from dividing by zero.

    @param df: DataFrame to extend; it is modified in place.
    @param cross_features: list of numeric column names to combine.
    @return: the same DataFrame object with the new columns appended.
    """
    # Sum / difference / product: each unordered pair once.
    for pos, left in enumerate(cross_features):
        for right in cross_features[pos + 1:]:
            add_name = '{}_{}_add'.format(left, right)
            subtract_name = '{}_{}_subtract'.format(left, right)
            multiply_name = '{}_{}c_multiply'.format(left, right)
            df[add_name] = df[left] + df[right]
            df[subtract_name] = df[left] - df[right]
            df[multiply_name] = df[left] * df[right]

    # Ratios are not symmetric, so both orders of every pair are produced.
    for numerator in cross_features:
        for denominator in cross_features:
            if numerator == denominator:
                continue
            ratio_name = '{}_{}_ratio'.format(numerator, denominator)
            df[ratio_name] = df[numerator].values / (df[denominator].values + 0.001)
    return df


def get_psi(c, x_train, x_test):
    """
    Compute the Population Stability Index (PSI) of one feature.

    The feature is binned on the train quantiles (0%, 20%, ..., 100%, with the
    outer edges opened to +/- infinity) and the share of rows per bin is
    compared between train and test:
    ``PSI = sum((test_rate - train_rate) * (ln(test_rate) - ln(train_rate)))``.

    This is a best-effort helper: if anything goes wrong for a feature (for
    example the column cannot be filled with the -998 sentinel, or it has too
    few distinct quantiles to form bins) ``"<c> error"`` is printed and an
    empty DataFrame is returned, so the caller can simply concatenate results.

    @param c: name of the feature column.
    @param x_train: train DataFrame containing column ``c``.
    @param x_test: test DataFrame containing column ``c``.
    @return: one-row DataFrame with columns ['变量名', 'PSI'] (feature name and
             its PSI), or an empty DataFrame on failure.
    """
    try:
        # Replace missing values with a sentinel so they are binned like
        # ordinary values instead of being dropped by pd.cut.
        t_train = x_train[c].fillna(-998)
        t_test = x_test[c].fillna(-998)
        # Cut points: distinct train quantiles, outer edges opened so every
        # test value falls into some bin.
        bins = sorted({t_train.quantile(q) for q in np.arange(0, 1.1, 0.2)})
        bins[0] = -np.inf
        bins[-1] = np.inf
        # Per-bin row counts; the test counts are aligned on the train bins.
        t_psi = pd.DataFrame()
        t_psi['train'] = pd.cut(t_train, bins).value_counts().sort_index()
        t_psi['test'] = pd.cut(t_test, bins).value_counts()
        t_psi.index = [str(x) for x in t_psi.index]
        # '总计' is the totals row used to turn counts into rates.
        t_psi.loc['总计', :] = t_psi.sum()
        t_psi['train_rate'] = t_psi['train'] / t_psi.loc['总计', 'train']
        t_psi['test_rate'] = t_psi['test'] / t_psi.loc['总计', 'test']
        # An empty test bin gives ln(0) = -inf and therefore an infinite PSI.
        t_psi['psi'] = (t_psi['test_rate'] - t_psi['train_rate']) * (
            np.log(t_psi['test_rate']) - np.log(t_psi['train_rate']))
        t_psi.loc['总计', 'psi'] = t_psi['psi'].sum()
        t_psi.index.name = c
        # Summary row for this feature.
        t_res = pd.DataFrame([[c, t_psi.loc['总计', 'psi']]],
                             columns=['变量名', 'PSI'])
    except Exception:
        # Only ordinary errors are swallowed; KeyboardInterrupt / SystemExit
        # propagate so a long parallel run can still be cancelled.
        print(c, 'error')
        return pd.DataFrame()
    print(c, 'done')
    return t_res


def auc_select(X_train, y_train, X_valid, y_valid, cols, threshold=0.52):
    """
    Single-feature selection based on validation AUC.

    For every feature in ``cols`` a LightGBM binary classifier is trained on
    that one feature only (up to 1000 rounds, early stopping after 50 rounds
    without improvement on the validation set). The best validation AUC
    decides whether the feature is kept.

    @param X_train: training features (DataFrame).
    @param y_train: training labels.
    @param X_valid: validation features (DataFrame).
    @param y_valid: validation labels.
    @param cols: feature names to evaluate one by one.
    @param threshold: a feature is "useful" when its best validation AUC is
                      strictly greater than this value.
    @return: (useful_dict, useless_dict, useful_cols, useless_cols) where the
             dicts map feature name -> best validation AUC and the lists hold
             the corresponding feature names in evaluation order.
    """
    params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'metric': 'auc',
        'learning_rate': 0.1,
        'num_leaves': 31,
        'lambda_l1': 0,
        'lambda_l2': 1,
        'num_threads': 23,
        'min_data_in_leaf': 20,
        'first_metric_only': True,
        'is_unbalance': True,
        'max_depth': -1,
        'seed': 2020
    }
    useful_dict, useless_dict = {}, {}
    for feature in cols:
        print(feature)
        train_set = lgb.Dataset(X_train[[feature]].values, y_train)
        valid_set = lgb.Dataset(X_valid[[feature]].values, y_valid, reference=train_set)
        booster = lgb.train(
            params,
            train_set,
            valid_sets=[valid_set, train_set],
            num_boost_round=1000,
            early_stopping_rounds=50,
            verbose_eval=500
        )
        # 'valid_0' is the first entry of valid_sets, i.e. the validation set.
        valid_auc = booster.best_score['valid_0']['auc']
        print('*' * 10)
        print(valid_auc)
        bucket = useful_dict if valid_auc > threshold else useless_dict
        bucket[feature] = valid_auc
    return useful_dict, useless_dict, list(useful_dict), list(useless_dict)


def correlation(df, useful_dict, threshold=0.98):
    """
    Find redundant features among highly correlated pairs.

    The pairwise Pearson correlation of the numeric columns of ``df`` is
    computed. For every pair whose absolute correlation exceeds ``threshold``
    the column with the lower score in ``useful_dict`` is marked for removal
    (on a tie the column that appears earlier in ``df`` is marked).

    @param df: DataFrame of candidate features. Non-numeric columns (strings,
               categoricals, ...) are ignored.
    @param useful_dict: mapping column name -> score (higher is better); it
                        must contain every numeric column of ``df`` that takes
                        part in a correlated pair.
    @param threshold: absolute correlation above which two columns are
                      considered redundant.
    @return: set of column names to drop.
    """
    col_corr = set()
    # Restrict to numeric/bool columns explicitly: pandas >= 2.0 no longer
    # drops non-numeric columns silently in DataFrame.corr() and raises.
    corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
    names = list(corr_matrix.columns)
    # Plain ndarray access avoids a slow .iloc lookup per matrix cell.
    values = corr_matrix.values
    for i, colName_i in enumerate(names):
        # Lower triangle only: every unordered pair is visited once.
        for j in range(i):
            if abs(values[i, j]) > threshold:
                colName_j = names[j]
                if useful_dict[colName_i] >= useful_dict[colName_j]:
                    col_corr.add(colName_j)
                else:
                    col_corr.add(colName_i)
    return col_corr

In [3]:
# Load the train and test CSV files from the data directory.
train = pd.read_csv('../大数据队_ad_submission_1009_1/data/train.csv')
test = pd.read_csv('../大数据队_ad_submission_1009_1/data/test_1.csv')
# Cell result: (rows, columns) of each frame.
train.shape, test.shape

((10396, 37), (15978, 36))

In [4]:
# Stack train and test so the feature engineering below is applied to both at
# once. No ignore_index is passed, so the original row labels are kept.
data = pd.concat([train, test])

In [5]:
# Column groups used by the cells below.
# NOTE(review): the name suggests these columns hold a single distinct value — confirm against the data.
single_cols = ['transProtocol', 'appProtocol', 'name']
# NOTE(review): presumably (almost) entirely null columns — confirm against the data.
null_cols = ['srcGeoCity', 'srcGeoAddress', 'srcGeoLatitude', 'srcGeoLongitude', 'destGeoAddress']
# Columns treated as categorical features; later cells append more names.
cat_cols = ['txId', 'destGeoCountry', 'destGeoRegion', 'destGeoCity', 'catOutcome', 'destHostName', 'responseCode']
# Numeric features used for group statistics and arithmetic crosses.
num_cols = ['bytesOut', 'bytesIn']

# Columns that are removed from the data without being used as features.
drop_cols = ['srcAddress', 'srcPort', 'destAddress', 'destPort', 'destGeoLatitude', 'destGeoLongitude',
             'requestUrlQuery', 'requestUrl', 'httpReferer', 'requestBody', 'startTime']

In [6]:
# Drop the explicitly listed columns together with the single_cols and
# null_cols groups; `data` is modified in place.
drop_cols = drop_cols + single_cols + null_cols
data.drop(drop_cols, axis=1, inplace=True)

In [7]:
# Upper-case the HTTP method so case variants collapse into one category,
# then register the column as categorical.
data['requestMethod'] = data['requestMethod'].str.upper()
cat_cols.append('requestMethod')
# Cell result: the distinct values after normalisation.
data['requestMethod'].unique()

array(['GET', 'POST', 'PUT', 'HEAD', 'OPTIONS', 'DELETE'], dtype=object)

In [8]:
# Upper-case the HTTP version string and register the column as categorical.
data['httpVersion'] = data['httpVersion'].str.upper()
cat_cols.append('httpVersion')
# Cell result: the distinct values after normalisation.
data['httpVersion'].unique()

array(['HTTP/1.1', 'HTTP/1.0', nan], dtype=object)

In [9]:
# Keep only the part of accessAgent before the first '/'; nulls stay NaN.
data['accessAgent'] = data['accessAgent'].map(lambda x: np.nan if pd.isnull(x) else str(x).split('/')[0])
cat_cols.append('accessAgent')
# Cell result: the distinct values after truncation.
data['accessAgent'].unique()

array(['MicroMessenger Client', 'Mozilla',
       'netdisk;6.8.9.1;PC;PC-Windows;10.0.18362;WindowsBaiduYunGuanJia',
       'NeteaseMusic 7.0.10', 'Post_Multipart', 'NeteaseMusic 6.4.2',
       '%E8%99%BE%E7%B1%B3%E9%9F%B3%E4%B9%90', 'sogou_ime', 'SogouPSI',
       nan, 'NewsApp', 'Dalvik',
       '7E0BAEBADE98106C1C455C47059C43B2\tBKL-AL00\tcom.baidu.input_huawei-8.2.8.80\t20191211\t1556251334604\t10144',
       '%E9%92%89%E9%92%89', 'SogouIMEMiniSetup_imepopup',
       'Youdao Desktop Dict (Windows NT 10.0)',
       'Android10-AndroidPhone-10042-56-0-Avatar-wifi', 'QYPlayer',
       'NeteaseMusic 5.6.0', 'LogStatistic', 'HCDNClient_IOS;libcurl',
       '() { _; OpenVAS-VT; } >_[$($())] { echo Content-Type: text',
       'ttuploadersdk(15825329774632)', 'SOGOU_POPUP_NEWS', 'Linux',
       'NeteaseMusic 6.4.4', 'MGTV-iPhone-appstore',
       '%E7%BD%91%E6%98%93%E4%BA%91%E9%9F%B3%E4%B9%90',
       'netdisk;10.0.150;iPhoneXR;ios-iphone;13.3.1;zh_CN',
       'bdtb for iOS 11.1.1', 'Youdao

In [10]:
# New feature: the first character of the response code's string form;
# nulls stay NaN.
data['responseCode_0'] = data['responseCode'].map(lambda x: np.nan if pd.isnull(x) else str(x)[0])
cat_cols.append('responseCode_0')
# Cell result: the distinct leading characters.
data['responseCode_0'].unique()

array(['4', '2', '5', '3', nan, '1'], dtype=object)

In [11]:
# Keep only the text before the first ':' of requestHeader; nulls stay NaN.
data['requestHeader'] = data['requestHeader'].map(lambda x: np.nan if pd.isnull(x) else str(x).split(':')[0])
cat_cols.append('requestHeader')
# Cell result: the distinct values after truncation.
data['requestHeader'].unique()

array(['Accept', 'Host', 'User-Agent', 'Origin', 'Accept-Encoding',
       'Connection', 'HOST', 'Referer', 'Content-MD5', 'Content-Type',
       'qyid', 'apkbus', 'RC-App-Key', 'Cookie', 'Authorization', 'X-Tk',
       'Content-Encoding', 'user-agent', 'Accept-Language', 'S-COOKIE',
       'X-Requested-With', 'wup_version', 'Content-Length', 'sign',
       'accept-language', 'retrofit_exec_time', 'Cache-Control', 'VER',
       'Q-UA2', nan, 'gzipped', 'et', 'Pragma', 'XRAY-TRACEID',
       'Accept-Charset', 'ext', 'townlocalid', 't', 'Add-To-Queue-Millis',
       'needginfo', 'host', 'connection', 'Charset', 'RTraceID',
       'x-r-i-i', 'client_ip', 'aps_c_key', 'Expect', 'content-type',
       'Content-type', 'SN-REQID', 'encrypt', 'X-Unity-Version',
       'platform'], dtype=object)

In [12]:
# Keep only the text before the first ':' of responseHeader; nulls stay NaN.
data['responseHeader'] = data['responseHeader'].map(lambda x: np.nan if pd.isnull(x) else str(x).split(':')[0])
cat_cols.append('responseHeader')
# Cell result: the distinct values after truncation.
data['responseHeader'].unique()

array(['Connection', 'Server', '***,Content-Type', 'Content-Length',
       'Accept-Ranges', 'X-AREQUESTID', nan, 'Content-Type',
       'Cache-Control', 'Transfer-Encoding',
       'Access-Control-Allow-Origin',
       '***,Content-Type,Accept,Authorization,DNT,X-CustomHeader,Keep-Alive,User-Agent,If-Modified-Since,Cache-Control,Access-Control-Allow-Methods',
       'content-type', 'Content-Encoding',
       'Access-Control-Allow-Credentials', 'CONTENT-LENGTH', 'Date',
       'Set-Cookie', 'Pragma', 'Upgrade', 'Vary', 'status',
       'Content-Range',
       '***,Content-Type,Range,accept,origin,user-ssn,user-yd,check-d,user-c,user-u,user-l,user-n,user-id,user-lc,user-d,user-tk,user-da,X-Via',
       'Location', '***,Content-Type,Authorization,', 'Cache-control',
       'Abtest', 'Access-Control-Allow-Methods',
       'Content-Security-Policy'], dtype=object)

In [13]:
# Keep only the part of requestContentType before the first '/';
# nulls stay NaN.
data['requestContentType'] = data['requestContentType'].map(lambda x: np.nan if pd.isnull(x) else str(x).split('/')[0])
cat_cols.append('requestContentType')
# Cell result: the distinct values after truncation.
data['requestContentType'].unique()

array(['application', nan, 'multipart', 'text', 'binary', 'image',
       "%{(#nike='multipart",
       "%{#context['com.opensymphony.xwork2.dispatcher.HttpServletResponse'].addHeader('St2_Head3rIKQY4','St2_Valu36REHPmultipart",
       'x'], dtype=object)

In [14]:
# Keep only the part of responseContentType before the first '/';
# nulls stay NaN.
data['responseContentType'] = data['responseContentType'].map(lambda x: np.nan if pd.isnull(x) else str(x).split('/')[0])
cat_cols.append('responseContentType')
# Cell result: the distinct values after truncation.
data['responseContentType'].unique()

array(['application', 'image', 'text', nan, 'audio', 'request', 'video'],
      dtype=object)

In [15]:
# New feature: country + region + city concatenated into one string.
# astype(str) turns missing parts into the literal text 'nan'.
data['destDetailCity'] = data['destGeoCountry'].astype(str) + data['destGeoRegion'].astype(str) + data['destGeoCity'].astype(str)
cat_cols.append('destDetailCity')
# Cell result: the distinct combined values.
data['destDetailCity'].unique()

array(['中国广东深圳', '局域网局域网nan', '中国浙江杭州', '中国陕西西安', '中国江苏无锡', '中国上海上海',
       '中国河南郑州', '中国江苏苏州', '中国山东青岛', '中国浙江宁波', '中国北京北京', '中国浙江温州',
       '中国广西南宁', '中国湖北武汉', '中国江苏南京', '中国浙江金华', '中国辽宁沈阳', '中国江苏常州',
       '中国宁夏中卫', '中国山东枣庄', '中国天津天津', '日本东京都东京', '美国加利福尼亚州洛杉矶', '中国福建福州',
       '中国湖北荆州', '中国广东广州', '中国浙江湖州', '中国湖南长沙', '中国广东佛山', '中国浙江嘉兴',
       '中国江苏镇江', '中国浙江绍兴', '中国四川成都', '中国江苏扬州', '中国河南洛阳', '爱尔兰都柏林郡都柏林',
       '中国新疆乌鲁木齐', '美国美国nan', '中国香港nan', '中国江西赣州', '中国浙江衢州', '美国华盛顿州西雅图',
       'nannannan', '中国广东nan', '中国浙江丽水', '中国安徽合肥', '中国中国nan', '中国广东云浮',
       '中国河北廊坊', '中国山西长治', '中国河北石家庄', '中国江苏盐城', '新加坡新加坡nan',
       '美国加利福尼亚州圣克拉拉', '中国四川乐山', '中国江西上饶', 'GOOGLE.COMGOOGLE.COMnan',
       '中国浙江台州', '中国广东茂名', '中国安徽芜湖', '中国浙江舟山', '中国江苏徐州', '中国安徽马鞍山',
       '中国广西桂林', '中国黑龙江哈尔滨', 'CLOUDFLARE.COMCLOUDFLARE.COMnan', '中国四川自贡',
       '中国山东济南', '中国江苏南通', '中国海南保亭黎族苗族自治县', '美国怀俄明州夏延', '中国广东东莞',
       'DNSPOD.COMDNSPOD.COMnan', '中国江苏泰州', '中国台湾台北市', '中国湖南株洲', '中国山东烟台',
       '中国广西柳州', '美国弗吉尼亚州

In [16]:
# Show the full list of categorical columns collected so far.
print(cat_cols)

['txId', 'destGeoCountry', 'destGeoRegion', 'destGeoCity', 'catOutcome', 'destHostName', 'responseCode', 'requestMethod', 'httpVersion', 'accessAgent', 'responseCode_0', 'requestHeader', 'responseHeader', 'requestContentType', 'responseContentType', 'destDetailCity']


In [17]:
# Label-encode every categorical column and store it with 'category' dtype.
for i in cat_cols:
    print(i)
    le = LabelEncoder()
    # astype(str) turns NaN into the string 'nan', so missing values get
    # their own integer code instead of making the encoder fail.
    data[i] = le.fit_transform(data[i].astype(str))
    data[i] = data[i].astype('category')

txId
destGeoCountry
destGeoRegion
destGeoCity
catOutcome
destHostName
responseCode
requestMethod
httpVersion
accessAgent
responseCode_0
requestHeader
responseHeader
requestContentType
responseContentType
destDetailCity


In [18]:
# Feature generation on the combined train+test frame:
# frequency encoding of the categorical columns,
data = count_encode(data, cat_cols)
# per-category statistics of the numeric columns,
data = cross_cat_num(data, cat_cols, num_cols)
# and pairwise arithmetic combinations of the numeric columns.
data = arithmetic(data, num_cols)

txId
destGeoCountry
destGeoRegion
destGeoCity
catOutcome
destHostName
responseCode
requestMethod
httpVersion
accessAgent
responseCode_0
requestHeader
responseHeader
requestContentType
responseContentType
destDetailCity


In [19]:
# Every column except the row id and the target is a candidate feature.
used_cols = [i for i in data.columns if i not in ['eventId', 'label']]
# Split the combined frame back: rows with a label form the training set,
# rows whose label is null form the test set.
train = data.loc[data['label'].notnull(), :]
test = data.loc[data['label'].isnull(), :]
# Explicit copy: a 'label' column is assigned to `sub` later on, which would
# otherwise be a chained assignment on a slice of `test`.
sub = test[['eventId']].copy()

y = train['label']
X = train[used_cols]
X_test = test[used_cols]

In [20]:
# --- Step 1: PSI filter ------------------------------------------------------
# Compute the train/test PSI of every candidate feature, one task per feature.
psi_res = Parallel(n_jobs=4)(delayed(get_psi)(c, X, X_test) for c in used_cols)
psi_df = pd.concat(psi_res)
psi_used_cols = list(psi_df[psi_df['PSI'] <= 0.2]['变量名'].values)
psi_not_used_cols = list(psi_df[psi_df['PSI'] > 0.2]['变量名'].values)
print('PSI used features: \n', psi_used_cols)
print('PSI drop features: \n', psi_not_used_cols)
# Features that are in neither list are the ones for which get_psi failed.
# (Previously the PSI-dropped features were reported here as well.)
psi_error_cols = list(set(used_cols) - set(psi_used_cols) - set(psi_not_used_cols))
print('Error drop features: \n', psi_error_cols)

# Categorical columns are added back by hand regardless of the PSI result.
cat_ = ['accessAgent', 'responseContentType', 'requestContentType', 'httpVersion', 'requestMethod',
        'destGeoCity', 'destDetailCity', 'destGeoCountry', 'responseHeader', 'destGeoRegion',
        'requestHeader', 'destHostName', 'catOutcome', 'responseCode', 'responseCode_0']
# Skip names that are already selected so no column is listed twice.
psi_used_cols += [c for c in cat_ if c not in psi_used_cols]

X = X[psi_used_cols]
X_test = X_test[psi_used_cols]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=2020)

print('y_train mean: ', y_train.mean())
print('y_valid mean: ', y_valid.mean())

# --- Step 2: single-feature AUC filter ---------------------------------------
useful_dict, useless_dict, useful_cols, useless_cols = auc_select(X_train, y_train, X_valid, y_valid, psi_used_cols, threshold=0.52)
print('AUC drop features: \n', useless_cols)

X_train = X_train[useful_cols]
X_valid = X_valid[useful_cols]
X_test = X_test[useful_cols]

# --- Step 3: correlation filter ----------------------------------------------
col_corr = correlation(X_train, useful_dict, threshold=0.98)
print('Correlation drop features: \n', col_corr)

# Reassign instead of dropping in place: these frames are slices of X.
X_train = X_train.drop(columns=list(col_corr))
X_valid = X_valid.drop(columns=list(col_corr))
X_test = X_test.drop(columns=list(col_corr))

used_cols = X_train.columns.to_list()

# --- Step 4: validation model (finds the number of rounds and the threshold) --
train_dataset = lgb.Dataset(X_train, y_train)
valid_dataset = lgb.Dataset(X_valid, y_valid, reference=train_dataset)
all_dataset = lgb.Dataset(train[used_cols], y, reference=train_dataset)

params = {'objective': 'binary',
          'boosting': 'gbdt',
          'metric': 'auc',
          # 'metric': 'None',  # set metric to 'None' when using a custom eval function
          'num_boost_round': 1000000,
          'learning_rate': 0.1,
          'num_leaves': 31,
          'lambda_l1': 0,
          'lambda_l2': 1,
          'num_threads': 23,
          'min_data_in_leaf': 20,
          'first_metric_only': True,
          'is_unbalance': True,
          'max_depth': -1,
          'seed': 2020}
valid_model = lgb.train(params,
                        train_dataset,
                        valid_sets=[valid_dataset, train_dataset],
                        early_stopping_rounds=200,
                        verbose_eval=300)
pred = valid_model.predict(X_valid)

# Scan decision thresholds 0.10 .. 0.99 and keep the one with the best F1.
# 0.5 is the fallback so `threshold` is defined even if no F1 exceeds 0.
threshold = 0.5
f1_best = 0
for i in np.arange(0.1, 1, 0.01):
    y_valid_pred = np.where(pred > i, 1, 0)
    f1 = np.round(f1_score(y_valid, y_valid_pred), 5)
    if f1 > f1_best:
        threshold = i
        f1_best = f1

print('threshold: ', threshold)
y_valid_pred = np.where(pred > threshold, 1, 0)
print('Valid F1: ', np.round(f1_score(y_valid, y_valid_pred), 5))
print('Valid mean label: ', np.mean(y_valid_pred))

# --- Step 5: final model on all labelled rows --------------------------------
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'metric': 'auc',
    # 'metric': 'None',  # set metric to 'None' when using a custom eval function
    # 'num_boost_round': 1000000,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'lambda_l1': 0,
    'lambda_l2': 1,
    'num_threads': 23,
    'min_data_in_leaf': 20,
    'first_metric_only': True,
    'is_unbalance': True,
    'max_depth': -1,
    'seed': 2020
}
# No validation set here, so the round count is fixed: the best iteration of
# the validation model plus 20 extra rounds.
train_model = lgb.train(
    params,
    all_dataset,
    num_boost_round=valid_model.best_iteration+20
)
y_test_pred = np.where(train_model.predict(X_test) > threshold, 1, 0)

print('Test mean label: ', np.mean(y_test_pred))
sub['label'] = y_test_pred
sub.to_csv('../sub/机器不学习原子弹也不学习_ad_submission_1022.csv', index=False, encoding='utf-8')

PSI used features: 
 ['bytesOut', 'bytesIn', 'txId_count', 'destGeoCountry_count', 'destGeoRegion_count', 'destGeoCity_count', 'catOutcome_count', 'destHostName_count', 'responseCode_count', 'requestMethod_count', 'httpVersion_count', 'accessAgent_count', 'responseCode_0_count', 'requestHeader_count', 'responseHeader_count', 'requestContentType_count', 'responseContentType_count', 'destDetailCity_count', 'txId_bytesOut_count', 'txId_bytesOut_max', 'txId_bytesOut_min', 'txId_bytesOut_median', 'txId_bytesOut_mean', 'txId_bytesOut_sum', 'txId_bytesOut_skew', 'txId_bytesOut_std', 'txId_bytesOut_nunique', 'txId_bytesOut_max_min', 'txId_bytesOut_quantile_25', 'txId_bytesOut_quantile_75', 'txId_bytesIn_count', 'txId_bytesIn_max', 'txId_bytesIn_min', 'txId_bytesIn_median', 'txId_bytesIn_mean', 'txId_bytesIn_sum', 'txId_bytesIn_skew', 'txId_bytesIn_std', 'txId_bytesIn_nunique', 'txId_bytesIn_max_min', 'txId_bytesIn_quantile_25', 'txId_bytesIn_quantile_75', 'destGeoCountry_bytesOut_count', 'dest

Early stopping, best iteration is:
[256]	training's auc: 0.983016	valid_0's auc: 0.976567
Evaluated only: auc
**********
0.9765672741971918
bytesIn
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[223]	training's auc: 0.928302	valid_0's auc: 0.907369
Evaluated only: auc
**********
0.9073688340407319
txId_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[15]	training's auc: 0.581509	valid_0's auc: 0.593159
Evaluated only: auc
**********
0.5931593370664955
destGeoCountry_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.594929	valid_0's auc: 0.58005
Evaluated only: auc
**********
0.5800504690230767
destGeoRegion_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.641851	valid_0's auc: 0.629007
Evaluated only: auc
**********
0.6290066103125631
de

Early stopping, best iteration is:
[22]	training's auc: 0.581128	valid_0's auc: 0.594388
Evaluated only: auc
**********
0.5943877732995684
txId_bytesIn_max_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[65]	training's auc: 0.581338	valid_0's auc: 0.590765
Evaluated only: auc
**********
0.5907654765726242
txId_bytesIn_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	training's auc: 0.576516	valid_0's auc: 0.587789
Evaluated only: auc
**********
0.5877890525614962
txId_bytesIn_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[26]	training's auc: 0.579692	valid_0's auc: 0.591066
Evaluated only: auc
**********
0.5910662695723509
destGeoCountry_bytesOut_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.594929	valid_0's auc: 0.58005
Evaluated onl

Early stopping, best iteration is:
[16]	training's auc: 0.642309	valid_0's auc: 0.631276
Evaluated only: auc
**********
0.6312759329933064
destGeoRegion_bytesOut_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.610463	valid_0's auc: 0.607426
Evaluated only: auc
**********
0.6074259014873203
destGeoRegion_bytesOut_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.594417	valid_0's auc: 0.581159
Evaluated only: auc
**********
0.5811594202898551
destGeoRegion_bytesIn_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.641851	valid_0's auc: 0.629007
Evaluated only: auc
**********
0.6290066103125631
destGeoRegion_bytesIn_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.642492	valid_0's auc: 

Early stopping, best iteration is:
[115]	training's auc: 0.646985	valid_0's auc: 0.63239
Evaluated only: auc
**********
0.6323902343332025
catOutcome_bytesOut_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.78004	valid_0's auc: 0.771951
Evaluated only: auc
**********
0.7719507555492148
catOutcome_bytesOut_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.78004	valid_0's auc: 0.771951
Evaluated only: auc
**********
0.7719507555492148
catOutcome_bytesOut_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.779734	valid_0's auc: 0.772477
Evaluated only: auc
**********
0.7724774405250205
catOutcome_bytesOut_median
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.779734	valid_0's auc: 0.772477
Evaluated only: 

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.725183	valid_0's auc: 0.706858
Evaluated only: auc
**********
0.7068581992842791
destHostName_bytesIn_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[7]	training's auc: 0.735105	valid_0's auc: 0.704641
Evaluated only: auc
**********
0.7046405939770066
destHostName_bytesIn_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[441]	training's auc: 0.652475	valid_0's auc: 0.634664
Evaluated only: auc
**********
0.6346643126344949
destHostName_bytesIn_median
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[111]	training's auc: 0.746976	valid_0's auc: 0.716006
Evaluated only: auc
**********
0.7160056354103506
destHostName_bytesIn_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:


Early stopping, best iteration is:
[18]	training's auc: 0.504561	valid_0's auc: 0.504721
Evaluated only: auc
**********
0.5047211423000558
requestMethod_bytesOut_median
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[39]	training's auc: 0.540782	valid_0's auc: 0.546084
Evaluated only: auc
**********
0.5460843409304371
requestMethod_bytesOut_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[35]	training's auc: 0.540388	valid_0's auc: 0.542582
Evaluated only: auc
**********
0.5425818263960719
requestMethod_bytesOut_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.539498	valid_0's auc: 0.540375
Evaluated only: auc
**********
0.540375218461319
requestMethod_bytesOut_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.539498	valid_0's auc: 0.540375
E

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.502701	valid_0's auc: 0.502461
Evaluated only: auc
**********
0.5024610336341263
httpVersion_bytesIn_std
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.502701	valid_0's auc: 0.502461
Evaluated only: auc
**********
0.5024610336341263
httpVersion_bytesIn_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.502701	valid_0's auc: 0.502461
Evaluated only: auc
**********
0.5024610336341263
httpVersion_bytesIn_max_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.502701	valid_0's auc: 0.502461
Evaluated only: auc
**********
0.5024610336341263
httpVersion_bytesIn_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration 

[1]	training's auc: 0.783014	valid_0's auc: 0.773865
Evaluated only: auc
**********
0.7738648928202019
responseCode_0_bytesOut_max_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.783011	valid_0's auc: 0.773865
Evaluated only: auc
**********
0.7738648928202019
responseCode_0_bytesOut_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.782571	valid_0's auc: 0.774364
Evaluated only: auc
**********
0.7743639357515664
responseCode_0_bytesOut_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.783014	valid_0's auc: 0.773865
Evaluated only: auc
**********
0.7738648928202019
responseCode_0_bytesIn_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.783011	valid_0's auc: 0.773865
Evaluated only: au

Early stopping, best iteration is:
[4]	training's auc: 0.542022	valid_0's auc: 0.525292
Evaluated only: auc
**********
0.5252921734374815
requestHeader_bytesIn_max_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[20]	training's auc: 0.5427	valid_0's auc: 0.523566
Evaluated only: auc
**********
0.5235658831781812
requestHeader_bytesIn_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[54]	training's auc: 0.5231	valid_0's auc: 0.516733
Evaluated only: auc
**********
0.5167332453543532
requestHeader_bytesIn_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.542922	valid_0's auc: 0.527161
Evaluated only: auc
**********
0.5271611323132527
responseHeader_bytesOut_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[52]	training's auc: 0.599511	valid_0's auc

[1]	training's auc: 0.567886	valid_0's auc: 0.55918
Evaluated only: auc
**********
0.5591801310173461
requestContentType_bytesOut_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.567824	valid_0's auc: 0.559263
Evaluated only: auc
**********
0.5592630571506699
requestContentType_bytesOut_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_0's auc: 0.5
Evaluated only: auc
**********
0.5
requestContentType_bytesIn_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.567886	valid_0's auc: 0.55918
Evaluated only: auc
**********
0.5591801310173461
requestContentType_bytesIn_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.567886	valid_0's auc: 0.55918
Evaluated only: auc
**********
0.55

Early stopping, best iteration is:
[1]	training's auc: 0.553046	valid_0's auc: 0.539102
Evaluated only: auc
**********
0.5391016038330302
responseContentType_bytesIn_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.552974	valid_0's auc: 0.539502
Evaluated only: auc
**********
0.5395016704117178
responseContentType_bytesIn_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.553016	valid_0's auc: 0.538898
Evaluated only: auc
**********
0.5388980038282746
destDetailCity_bytesOut_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[18]	training's auc: 0.647681	valid_0's auc: 0.63147
Evaluated only: auc
**********
0.6314703189832483
destDetailCity_bytesOut_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[11]	training's auc: 0.647897	v

[5]	training's auc: 0.646118	valid_0's auc: 0.632478
Evaluated only: auc
**********
0.6324776188607911
destDetailCity
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[22]	training's auc: 0.647759	valid_0's auc: 0.631313
Evaluated only: auc
**********
0.6313130862788459
destGeoCountry
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.594103	valid_0's auc: 0.581522
Evaluated only: auc
**********
0.5815217391304348
responseHeader
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[50]	training's auc: 0.598865	valid_0's auc: 0.583813
Evaluated only: auc
**********
0.583812759329933
destGeoRegion
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[13]	training's auc: 0.641693	valid_0's auc: 0.629858
Evaluated only: auc
**********
0.629858163617125
requestHeader
Training until validation s

Training until validation scores don't improve for 200 rounds
[300]	training's auc: 1	valid_0's auc: 0.995653
[600]	training's auc: 1	valid_0's auc: 0.995817
Early stopping, best iteration is:
[642]	training's auc: 1	valid_0's auc: 0.995857
Evaluated only: auc
threshold:  0.5199999999999998
Valid F1:  0.9769
Valid mean label:  0.4636398614851866
Test mean label:  0.43666291150331704
