> 主要目的：排查对浅橙使用 random forest 算法得到的 maxent_id 比例过低的问题：
- 模型在全集上的预测结果没有达到浅橙 5%-10% 的比例，需要检查原因。
- 理论上，模型预测为 1 的样本都应该出现在后续提供给浅橙的 maxent_id 里，而实际上只出现了几个。

这一版针对的是给浅橙报告训练模型所对应的数据

### label为1的情况下的训练数据集聚合数据的检查

In [1]:
from __future__ import print_function, division
from Utils.qiancheng.get_data import read_data
from Utils.common.MultiColumnLabelEncoder import MultiColumnLabelEncoder
import numpy as np
import pandas as pd
from Algorithm.qiancheng_tree import qiancheng_tree_rf
from Utils.common.transfer_event_dev import transfer_event_to_device
import warnings
warnings.filterwarnings("ignore")



In [2]:
def df_os(df,os='ios'):
    """Build the feature frame for a single operating system.

    Rows whose ``os`` column equals *os* are kept, every boolean or
    object-typed column except ``maxent_id`` is passed through
    ``MultiColumnLabelEncoder``, and the platform-specific list of columns
    below is dropped.

    Args:
        df: DataFrame holding an ``os`` column plus the columns dropped below.
        os: Platform value to keep. ``'ios'`` selects the first drop list;
            any other value selects the second one.

    Returns:
        The encoded frame with the selected columns removed.
    """
    platform_rows = df.loc[df.os == os]

    # Boolean columns first, then object columns (same order as a plain
    # list concatenation of the two selections).
    categorical = []
    for dtype in (np.bool_, np.object_):
        categorical.extend(
            platform_rows.select_dtypes(include=[dtype]).columns.tolist()
        )
    # maxent_id is excluded from encoding; it is dropped further down anyway.
    if 'maxent_id' in categorical:
        categorical.remove('maxent_id')

    encoded = MultiColumnLabelEncoder(columns=categorical).fit_transform(platform_rows)

    ios_drop = ['os','maxent_id', 'aid_loan', 'imei_loan', 'mac_loan', 'imei_counts', 'mac_counts', 'aid_counts']
    other_drop = ['os','maxent_id', 'idfa_loan', 'idfa_counts', 'idfv_counts', 'imei_loan']
    unwanted = ios_drop if os == 'ios' else other_drop

    return encoded.drop(unwanted,axis=1)

In [3]:
# Local training sample, loaded through the project's read_data helper
# (its cleaning steps are not visible in this notebook).
file_name = "/Users/chaoxu/code/local-spark/Data/qiancheng_data/qiancheng_sample_new_merge_0.09/data.csv"
df = read_data(file_name)

In [23]:
# Re-read the same CSV with plain pandas, bypassing read_data.
df_o = pd.read_csv(file_name)

In [24]:
# Frequency of each event_type value in the raw file.
df_o.event_type.value_counts()

ACT    28569
Name: event_type, dtype: int64

#### 数据准备

##### 读入训练数据集

In [4]:
# iOS rows of the local training data, encoded and trimmed by df_os.
df_ios = df_os(df=df,os='ios')

In [5]:
# Keep only the positive samples (label == 1) for the comparison.
df_ios_chk=df_ios.loc[df_ios.label == 1]

In [6]:
# Android rows of the local training data, encoded and trimmed by df_os.
df_android = df_os(df=df,os='android')

In [7]:
# Keep only the positive samples (label == 1) for the comparison.
df_android_chk=df_android.loc[df_android.label == 1]

##### 读入线上所有数据

In [8]:
import re

def read_data_ol(path):
    """Load the online data CSV without filling missing feature values.

    Only the ``label`` column has its NaN values replaced (with 0). Columns
    whose name ends in ``anomaly`` or ``value`` are renamed by turning every
    ``_`` into ``.``; all other columns keep their names. NaN values in the
    feature columns are left untouched, and the columns that still contain
    nulls are printed so they can be inspected.

    Args:
        path: Passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame with the filled ``label`` column and the
        renamed anomaly/value columns.
    """
    df = pd.read_csv(path)
    df['label'] = df['label'].fillna(0)

    # A plain suffix test replaces the compiled-regex/np.vectorize matchers.
    renamed = {
        name: name.replace("_", ".")
        for name in df.columns
        if name.endswith(('anomaly', 'value'))
    }
    df.rename(columns=renamed, inplace=True)

    # Show which columns still hold NaN after loading.
    print(df.columns[df.isnull().any()])
    return df

In [9]:
# CSV holding the online data to compare against the local training sample.
file_ol =  "/Users/chaoxu/code/local-spark/Data/qiancheng_data/qiancheng_dev_merge/data.csv"

In [10]:
# Load the online data; read_data_ol prints the columns that contain nulls.
df_ol = read_data_ol(file_ol)

Index([u'cracked.anomaly', u'did.15m.anomaly', u'did.1d.anomaly',
       u'did.1h.anomaly', u'did.1m.anomaly', u'did.5m.anomaly',
       u'did.6h.anomaly', u'did.7d.anomaly', u'idcIP.anomaly',
       u'ipGeo.15m.anomaly', u'ipGeo.1d.anomaly', u'ipGeo.1h.anomaly',
       u'ipGeo.1m.anomaly', u'ipGeo.5m.anomaly', u'ipGeo.6h.anomaly',
       u'ipGeo.7d.anomaly', u'ipSeg24.15m.anomaly', u'ipSeg24.1d.anomaly',
       u'ipSeg24.1h.anomaly', u'ipSeg24.1m.anomaly', u'ipSeg24.5m.anomaly',
       u'ipSeg24.6h.anomaly', u'ipSeg24.7d.anomaly', u'maxentID.15m.anomaly',
       u'maxentID.1d.anomaly', u'maxentID.1h.anomaly', u'maxentID.1m.anomaly',
       u'maxentID.5m.anomaly', u'maxentID.6h.anomaly', u'maxentID.7d.anomaly',
       u'proxyIP.anomaly', u'uaMismatch.anomaly', u'aid_loan', u'idfa_loan',
       u'imei_loan', u'mac_loan', u'did.15m.value', u'did.1d.value',
       u'did.1h.value', u'did.1m.value', u'did.5m.value', u'did.6h.value',
       u'did.7d.value', u'ipGeo.15m.value', u'ipGeo.1d.val

In [11]:
# iOS rows of the online data, encoded and trimmed by df_os.
df_ol_ios = df_os(df=df_ol,os='ios')

In [12]:
# Keep only the positive samples (label == 1) for the comparison.
df_ol_ios_chk=df_ol_ios.loc[df_ol_ios.label == 1]

##### 检查ios

- anomaly columns检查

In [13]:
# Split the columns of the label-1 iOS frame into four families by name
# suffix (anomaly / value / counts / loan) so that each family can be
# compared separately in the cells below.
ios_chk_columns = df_ios_chk.columns.values

# One pattern per family.
anormaly = re.compile('.*anomaly$')
value = re.compile('.*value$')
count = re.compile('.*counts$')
loan = re.compile('.*loan$')

# Element-wise matchers that turn the array of column names into a boolean mask.
anormaly_match = np.vectorize(lambda x: bool(anormaly.match(x)))
value_match = np.vectorize(lambda x: bool(value.match(x)))
count_match = np.vectorize(lambda x: bool(count.match(x)))
loan_match = np.vectorize(lambda x: bool(loan.match(x)))

# Column names of each family.
anormaly_cols = ios_chk_columns[anormaly_match(ios_chk_columns)]
value_cols = ios_chk_columns[value_match(ios_chk_columns)]
count_cols = ios_chk_columns[count_match(ios_chk_columns)]
loan_cols = ios_chk_columns[loan_match(ios_chk_columns)]

In [14]:
# Shapes of the local and the online positive subsets, side by side.
print(df_ios_chk.shape)
print(df_ol_ios_chk.shape)

(159, 68)
(159, 68)


In [15]:
# Spot check: summary statistics of one anomaly column as a plain array.
a = df_ios_chk['maxentID.6h.anomaly'].describe()
a.values

array([ 159.        ,    1.03773585,    0.19115877,    1.        ,
          1.        ,    1.        ,    1.        ,    2.        ])

In [16]:
# Compare the describe() summary of every anomaly column between the local
# and the online positive samples; columns whose summaries differ are
# collected in anomaly_dif_cols.
anomaly_dif_cols = []
for col in anormaly_cols:
    print("column:{0} description:\n".format(col))
    local_desc = df_ios_chk[col].describe()
    print(local_desc)
    ol_desc = df_ol_ios_chk[col].describe()
    print(ol_desc)
    # describe() leaves NaN out of its statistics, so a different number of
    # non-null rows already makes the summaries differ. equal_nan=True keeps
    # statistics that are NaN on both sides from being reported as a change.
    if not np.allclose(local_desc.values,ol_desc.values,rtol=1e-6,equal_nan=True):
        print("!!!{0} desc is not same".format(col))
        anomaly_dif_cols.append(col)
    print("-----------------------\n")
    print("\n")
print("oline anormaly cols is not same as local:\n{0}".format(anomaly_dif_cols))

column:maxentID.6h.anomaly description:

count    159.000000
mean       1.037736
std        0.191159
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        2.000000
Name: maxentID.6h.anomaly, dtype: float64
count    159.000000
mean       1.037736
std        0.191159
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        2.000000
Name: maxentID.6h.anomaly, dtype: float64
-----------------------



column:did.5m.anomaly description:

count    159.0
mean       1.0
std        0.0
min        1.0
25%        1.0
50%        1.0
75%        1.0
max        1.0
Name: did.5m.anomaly, dtype: float64
count    159.0
mean       1.0
std        0.0
min        1.0
25%        1.0
50%        1.0
75%        1.0
max        1.0
Name: did.5m.anomaly, dtype: float64
-----------------------



column:maxentID.7d.anomaly description:

count    159.000000
mean       1.188679
std        0.955794
min        1.000000
25%        1.000000
50%       

发生变化的 anomaly cols:

['maxentID.7d.anomaly', 'ipGeo.5m.anomaly', 'ipSeg24.1h.anomaly', 'did.7d.anomaly', 'ipGeo.15m.anomaly', 'ipGeo.1m.anomaly', 'ipSeg24.1d.anomaly', 'ipSeg24.6h.anomaly', 'ipSeg24.7d.anomaly', 'ipGeo.6h.anomaly', 'ipSeg24.5m.anomaly', 'ipGeo.1h.anomaly', 'ipSeg24.15m.anomaly']

- value columns检查

In [17]:
# Compare the describe() summary of every value column between the local
# and the online positive samples; columns whose summaries differ are
# collected in value_dif_cols.
value_dif_cols = []
for col in value_cols:
    print("column:{0} description:\n".format(col))
    local_desc = df_ios_chk[col].describe()
    print(local_desc)
    ol_desc = df_ol_ios_chk[col].describe()
    print(ol_desc)
    # describe() leaves NaN out of its statistics, so a different number of
    # non-null rows already makes the summaries differ. equal_nan=True keeps
    # statistics that are NaN on both sides from being reported as a change.
    if not np.allclose(local_desc.values,ol_desc.values,rtol=1e-6,equal_nan=True):
        print("!!!{0} desc is not same".format(col))
        value_dif_cols.append(col)
    print("-----------------------\n")
    print("\n")
print("oline value cols is not same as local:\n{0}".format(value_dif_cols))

column:ipSeg24.15m.value description:

count    159.000000
mean       3.137636
std        4.174800
min        1.000000
25%        1.000000
50%        1.500000
75%        3.500000
max       33.000000
Name: ipSeg24.15m.value, dtype: float64
count    159.000000
mean       3.137636
std        4.174800
min        1.000000
25%        1.000000
50%        1.500000
75%        3.500000
max       33.000000
Name: ipSeg24.15m.value, dtype: float64
-----------------------



column:cracked.value description:

count    159.000000
mean       0.031447
std        0.175073
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
Name: cracked.value, dtype: float64
count    159.000000
mean       0.031447
std        0.175073
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
Name: cracked.value, dtype: float64
-----------------------



column:idcIP.value description:

count    159.000000
mean       0.004193
std    

value columns 均未发生变化。

- count columns检查

In [18]:
# Compare the describe() summary of every count column between the local
# and the online positive samples; columns whose summaries differ are
# collected in count_dif_cols.
count_dif_cols = []
for col in count_cols:
    print("column:{0} description:\n".format(col))
    local_desc = df_ios_chk[col].describe()
    print(local_desc)
    ol_desc = df_ol_ios_chk[col].describe()
    print(ol_desc)
    # describe() leaves NaN out of its statistics, so a different number of
    # non-null rows already makes the summaries differ. equal_nan=True keeps
    # statistics that are NaN on both sides from being reported as a change.
    if not np.allclose(local_desc.values,ol_desc.values,rtol=1e-6,equal_nan=True):
        print("!!!{0} desc is not same".format(col))
        count_dif_cols.append(col)
    print("-----------------------\n")
    print("\n")
# NOTE(review): the summary text says "value cols" although this cell checks
# the count columns; the string is left unchanged so that it still matches
# the recorded output.
print("oline value cols is not same as local:\n{0}".format(count_dif_cols))

column:mcid_counts description:

count    159.000000
mean       1.314465
std        0.607322
min        0.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        5.000000
Name: mcid_counts, dtype: float64
count    158.000000
mean       1.322785
std        0.600095
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        5.000000
Name: mcid_counts, dtype: float64
!!!mcid_counts desc is not same
-----------------------



column:idfv_counts description:

count    159.000000
mean       1.754717
std        1.344077
min        0.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        9.000000
Name: idfv_counts, dtype: float64
count    158.000000
mean       1.765823
std        1.341013
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        9.000000
Name: idfv_counts, dtype: float64
!!!idfv_counts desc is not same
-----------------------



column:idfa_counts description:

count  

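count columns 的取值本身没有发生变化；上面被标记为不一致，是因为线上数据的这些列存在缺失值（例如 mcid_counts、idfv_counts 线上 count 为 158，本地为 159 且最小值为 0），describe 统计时会跳过缺失值。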

In [19]:
# Compare the describe() summary of every loan column between the local
# and the online positive samples; columns whose summaries differ are
# collected in loan_dif_cols.
loan_dif_cols = []
for col in loan_cols:
    print("column:{0} description:\n".format(col))
    local_desc = df_ios_chk[col].describe()
    print(local_desc)
    ol_desc = df_ol_ios_chk[col].describe()
    print(ol_desc)
    # describe() leaves NaN out of its statistics, so a different number of
    # non-null rows already makes the summaries differ. equal_nan=True keeps
    # statistics that are NaN on both sides from being reported as a change.
    if not np.allclose(local_desc.values,ol_desc.values,rtol=1e-6,equal_nan=True):
        print("!!!{0} desc is not same".format(col))
        loan_dif_cols.append(col)
    print("-----------------------\n")
    print("\n")
# NOTE(review): the summary text says "value cols" although this cell checks
# the loan columns; the string is left unchanged so that it still matches
# the recorded output.
print("oline value cols is not same as local:\n{0}".format(loan_dif_cols))

column:idfa_loan description:

count    159.000000
mean       0.433962
std        0.497186
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: idfa_loan, dtype: float64
count    155.000000
mean       0.445161
std        0.498595
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: idfa_loan, dtype: float64
!!!idfa_loan desc is not same
-----------------------



oline value cols is not same as local:
['idfa_loan']


多头借贷（loan）特征的取值一致；idfa_loan 被标记为不一致，是因为线上数据中该列有 4 个缺失值（线上 count 为 155，本地为 159）。

In [20]:
# Compare the proxy_ua summary between the local and the online positives.
print(df_ios_chk['proxy_ua'].describe())
print(df_ol_ios_chk['proxy_ua'].describe())

count    159.000000
mean       0.088050
std        0.284263
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
Name: proxy_ua, dtype: float64
count    159.000000
mean       0.088050
std        0.284263
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
Name: proxy_ua, dtype: float64


#### 结论：
iOS 上主要是 anomaly 特征发生了变化，体现为均值和标准差都变小；本地与线上 label 为 1 的样本数都是 159。

In [21]:
# Columns of the local positives that contain at least one null.
df_ios_chk.columns[df_ios_chk.isnull().any()]

Index([], dtype='object')

In [22]:
# Columns of the online positives that contain at least one null.
df_ol_ios_chk.columns[df_ol_ios_chk.isnull().any()]

Index([u'idfa_loan', u'mcid_counts', u'idfa_counts', u'idfv_counts'], dtype='object')