> 主要目的是检查对浅橙使用random forest算法获取的maxent_id比例过低的问题：
- 模型在全集上的预测的结果，没有达到浅橙5%-10%的比例，需要检查一下原因。
- 理论上模型预测出的1都应该出现在后续给浅橙的maxent_id里面，而不是实际中的只有几个。

这一版使用的数据，针对的是前面检查发现的anomaly columns特征不一致的问题。由于在pyspark代码里面没有找到对anomaly处理的不同点，只能判断为中间数据错位造成的，因此优化了之前获取数据的流程，event到dev的转换全部在线上完成。

优化之前的Python代码，将共同的函数放到module里面打包，使用--py-files传递到各个worker，避免两个长的Python代码文件修改一处导致另一处没有起作用的问题。同时修改调度脚本，加入相关参数

初步判断原因是spark中fillna值时，如果schema不一致的话会失败，比如往string类型中fillna 0值的话会失败，而我们beta环境原始数据中所有的数据类型都是string的；fillna失败，但是不会报错

以上问题已经解决，还有一个问题是之前的feature analysis是基于event分析的，实际上需要基于设备分析才是对的

### label为1的情况下的训练数据集聚合数据的检查

In [1]:
from __future__ import print_function, division
from Utils.qiancheng.get_data import read_data
from Utils.common.MultiColumnLabelEncoder import MultiColumnLabelEncoder
import numpy as np
import pandas as pd
from Algorithm.qiancheng_tree import qiancheng_tree_rf
from Utils.common.transfer_event_dev import transfer_event_to_device
import re
import warnings
warnings.filterwarnings("ignore")



In [2]:

def read_data_ol(path):
    """Load a feature CSV and put its anomaly/value column names in dotted form.

    Every column whose name ends in ``anomaly`` or ``value`` has all of its
    underscores replaced by dots (``a_1d_anomaly`` -> ``a.1d.anomaly``);
    all other columns keep their names.  As a quick sanity check, the
    columns that contain at least one null value are printed.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or buffer).

    Returns:
        The loaded ``pandas.DataFrame`` with the renamed columns.
    """
    df = pd.read_csv(path)
    # Only these two column families are renamed.  The previous version also
    # matched '*counts' and '*loan' columns but never used the result, so
    # that dead work is dropped here.
    dotted_suffix = re.compile('.*(?:anomaly|value)$')
    renames = dict(
        (name, name.replace("_", "."))
        for name in df.columns
        if dotted_suffix.match(name)
    )
    df.rename(columns=renames, inplace=True)
    print(df.columns[df.isnull().any()])
    return df

#### 数据准备

##### 读入训练数据集

In [3]:
# CSV holding the training data set (the local side of the comparison).
file_name = "/Users/chaoxu/code/local-spark/Data/qiancheng_dev/qiancheng_sample_new_merge_0.09/data.csv"

In [4]:
# Load the training CSV; read_data_ol also prints the columns that contain nulls.
df = read_data_ol(path=file_name)

Index([], dtype='object')


In [5]:
# Training rows whose os is 'ios'.
df_ios = df.loc[df['os'] == 'ios']

In [6]:
# Training rows whose os is 'android'.
df_android = df.loc[df['os'] == 'android']

In [7]:
# iOS training rows with label == 1: the subset every check below uses.
df_ios_chk = df_ios.loc[df_ios['label'] == 1]

In [8]:
# Android training rows with label == 1.
df_android_chk = df_android.loc[df_android['label'] == 1]

##### 读入线上所有数据

In [9]:
# CSV holding the full online data set (the "ol" side of the comparison).
file_ol =  "/Users/chaoxu/code/local-spark/Data/qiancheng_dev/qiancheng_dev_merge/data.csv"

In [10]:
# Load the online CSV; read_data_ol also prints the columns that contain nulls.
df_ol = read_data_ol(file_ol)

Index([], dtype='object')


In [11]:
# Online rows whose os is 'ios'.
df_ol_ios = df_ol.loc[df_ol['os'] == 'ios']

In [29]:
# Online rows whose os is 'android'.
df_ol_android = df_ol.loc[df_ol['os'] == 'android']

In [12]:
# Online iOS rows with label == 1.
df_ol_ios_chk = df_ol_ios.loc[df_ol_ios['label'] == 1]

In [30]:
# Online Android rows with label == 1.
df_ol_android_chk = df_ol_android.loc[df_ol_android['label'] == 1]

##### 检查ios

- anomaly columns检查

In [13]:
# Split the columns of the iOS label==1 subset into four groups by name
# suffix: *anomaly, *value, *counts and *loan.
ios_col_names = df_ios_chk.columns.values

anormaly = re.compile('.*anomaly$')
anormaly_match = np.vectorize(lambda name: bool(anormaly.match(name)))
anormaly_cols = ios_col_names[anormaly_match(ios_col_names)]

value = re.compile('.*value$')
value_match = np.vectorize(lambda name: bool(value.match(name)))
value_cols = ios_col_names[value_match(ios_col_names)]

count = re.compile('.*counts$')
count_match = np.vectorize(lambda name: bool(count.match(name)))
count_cols = ios_col_names[count_match(ios_col_names)]

loan = re.compile('.*loan$')
loan_match = np.vectorize(lambda name: bool(loan.match(name)))
loan_cols = ios_col_names[loan_match(ios_col_names)]

In [14]:
# Shape of the label==1 iOS subset: local first, then online.
for frame in (df_ios_chk, df_ol_ios_chk):
    print(frame.shape)

(159, 77)
(159, 77)


In [15]:
# Spot check: describe() of one anomaly column, shown as a raw ndarray.
# This is the form the np.allclose comparisons in the following cells use.
a = df_ios_chk['maxentID.6h.anomaly'].describe()
a.values

array([ 159.        ,    1.03773585,    0.19115877,    1.        ,
          1.        ,    1.        ,    1.        ,    2.        ])

In [16]:
# Compare the describe() summary of every anomaly column between the local
# training subset and the online subset, collecting the columns that differ.
anomaly_dif_cols = []
for col in anormaly_cols:
    print("column:{0} description:\n".format(col))
    local_desc = df_ios_chk[col].describe()
    print(local_desc)
    ol_desc = df_ol_ios_chk[col].describe()
    print(ol_desc)
    # equal_nan=True: a statistic that is NaN on both sides counts as a
    # match instead of being reported as a spurious difference.
    if not np.allclose(local_desc.values, ol_desc.values, rtol=1e-6, equal_nan=True):
        print("!!!{0} desc is not same".format(col))
        anomaly_dif_cols.append(col)
    print("-----------------------\n")
    print("\n")
print("oline anormaly cols is not same as local:\n{0}".format(anomaly_dif_cols))

column:cracked.anomaly description:

count    159.0
mean       1.0
std        0.0
min        1.0
25%        1.0
50%        1.0
75%        1.0
max        1.0
Name: cracked.anomaly, dtype: float64
count    159.0
mean       1.0
std        0.0
min        1.0
25%        1.0
50%        1.0
75%        1.0
max        1.0
Name: cracked.anomaly, dtype: float64
-----------------------



column:did.15m.anomaly description:

count    159.000000
mean       1.018868
std        0.136488
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        2.000000
Name: did.15m.anomaly, dtype: float64
count    159.000000
mean       1.018868
std        0.136488
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        2.000000
Name: did.15m.anomaly, dtype: float64
-----------------------



column:did.1d.anomaly description:

count    159.000000
mean       1.075472
std        0.382318
min        1.000000
25%        1.000000
50%        1.000000
75% 

没有发生变化的anormaly cols:

- value columns检查

In [17]:
# Compare the describe() summary of every value column between the local
# training subset and the online subset, collecting the columns that differ.
value_dif_cols = []
for col in value_cols:
    print("column:{0} description:\n".format(col))
    local_desc = df_ios_chk[col].describe()
    print(local_desc)
    ol_desc = df_ol_ios_chk[col].describe()
    print(ol_desc)
    # equal_nan=True: a statistic that is NaN on both sides counts as a
    # match instead of being reported as a spurious difference.
    if not np.allclose(local_desc.values, ol_desc.values, rtol=1e-6, equal_nan=True):
        print("!!!{0} desc is not same".format(col))
        value_dif_cols.append(col)
    print("-----------------------\n")
    print("\n")
print("oline value cols is not same as local:\n{0}".format(value_dif_cols))

column:did.15m.value description:

count    159.000000
mean       1.077805
std        0.160256
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        1.750000
Name: did.15m.value, dtype: float64
count    159.000000
mean       1.077805
std        0.160256
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        1.750000
Name: did.15m.value, dtype: float64
-----------------------



column:did.1d.value description:

count    159.000000
mean       1.235676
std        0.341117
min        1.000000
25%        1.000000
50%        1.000000
75%        1.500000
max        2.666667
Name: did.1d.value, dtype: float64
count    159.000000
mean       1.235676
std        0.341117
min        1.000000
25%        1.000000
50%        1.000000
75%        1.500000
max        2.666667
Name: did.1d.value, dtype: float64
-----------------------



column:did.1h.value description:

count    159.000000
mean       1.137737
std        0.217467
m

没有发生变化的value columns

- count columns检查

In [18]:
# Compare the describe() summary of every count column between the local
# training subset and the online subset, collecting the columns that differ.
count_dif_cols = []
for col in count_cols:
    print("column:{0} description:\n".format(col))
    local_desc = df_ios_chk[col].describe()
    print(local_desc)
    ol_desc = df_ol_ios_chk[col].describe()
    print(ol_desc)
    # equal_nan=True: a statistic that is NaN on both sides counts as a
    # match instead of being reported as a spurious difference.
    if not np.allclose(local_desc.values, ol_desc.values, rtol=1e-6, equal_nan=True):
        print("!!!{0} desc is not same".format(col))
        count_dif_cols.append(col)
    print("-----------------------\n")
    print("\n")
# The summary used to say "value cols" (copy-paste from the value check).
print("oline count cols is not same as local:\n{0}".format(count_dif_cols))

column:mcid_counts description:

count    159.000000
mean       1.314465
std        0.607322
min        0.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        5.000000
Name: mcid_counts, dtype: float64
count    159.000000
mean       1.314465
std        0.607322
min        0.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        5.000000
Name: mcid_counts, dtype: float64
-----------------------



column:imei_counts description:

count    159.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: imei_counts, dtype: float64
count    159.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: imei_counts, dtype: float64
-----------------------



column:mac_counts description:

count    159.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: mac_counts, dtype: float64

没有发生变化的count values:

In [19]:
# Compare the describe() summary of every loan column between the local
# training subset and the online subset, collecting the columns that differ.
loan_dif_cols = []
for col in loan_cols:
    print("column:{0} description:\n".format(col))
    local_desc = df_ios_chk[col].describe()
    print(local_desc)
    ol_desc = df_ol_ios_chk[col].describe()
    print(ol_desc)
    # equal_nan=True: a statistic that is NaN on both sides counts as a
    # match instead of being reported as a spurious difference.
    if not np.allclose(local_desc.values, ol_desc.values, rtol=1e-6, equal_nan=True):
        print("!!!{0} desc is not same".format(col))
        loan_dif_cols.append(col)
    print("-----------------------\n")
    print("\n")
# The summary used to say "value cols" (copy-paste from the value check).
print("oline loan cols is not same as local:\n{0}".format(loan_dif_cols))

column:aid_loan description:

count    159.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: aid_loan, dtype: float64
count    159.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: aid_loan, dtype: float64
-----------------------



column:idfa_loan description:

count    159.000000
mean       0.433962
std        0.497186
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: idfa_loan, dtype: float64
count    159.000000
mean       0.433962
std        0.497186
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: idfa_loan, dtype: float64
-----------------------



column:imei_loan description:

count    159.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: imei_loan, dtype: float64
count    159.0
m

多头借贷的数据一致

In [20]:
# proxy_ua summary for the local iOS subset, then for the online one.
for frame in (df_ios_chk, df_ol_ios_chk):
    print(frame['proxy_ua'].describe())

count    159.000000
mean       0.088050
std        0.284263
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
Name: proxy_ua, dtype: float64
count    159.000000
mean       0.088050
std        0.284263
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
Name: proxy_ua, dtype: float64


In [21]:
# Columns of the local iOS label==1 subset that contain at least one null
# (displayed as the cell result).
df_ios_chk.columns[df_ios_chk.isnull().any()]

Index([], dtype='object')

In [22]:
# Columns of the online iOS label==1 subset that contain at least one null
# (displayed as the cell result).
df_ol_ios_chk.columns[df_ol_ios_chk.isnull().any()]

Index([], dtype='object')

#### 检查ios的特征变化

In [24]:
# Every column of the iOS subset except 'maxent_id' and 'os'; this is the
# list the full iOS feature comparison iterates over.
chk_cols = df_ios_chk.columns.difference(['maxent_id','os'])
# chk_cols

Index([u'aid_counts', u'aid_loan', u'cracked.anomaly', u'cracked.value',
       u'did.15m.anomaly', u'did.15m.value', u'did.1d.anomaly',
       u'did.1d.value', u'did.1h.anomaly', u'did.1h.value', u'did.1m.anomaly',
       u'did.1m.value', u'did.5m.anomaly', u'did.5m.value', u'did.6h.anomaly',
       u'did.6h.value', u'did.7d.anomaly', u'did.7d.value', u'event_num',
       u'idcIP.anomaly', u'idcIP.value', u'idfa_counts', u'idfa_loan',
       u'idfv_counts', u'imei_counts', u'imei_loan', u'ipGeo.15m.anomaly',
       u'ipGeo.15m.value', u'ipGeo.1d.anomaly', u'ipGeo.1d.value',
       u'ipGeo.1h.anomaly', u'ipGeo.1h.value', u'ipGeo.1m.anomaly',
       u'ipGeo.1m.value', u'ipGeo.5m.anomaly', u'ipGeo.5m.value',
       u'ipGeo.6h.anomaly', u'ipGeo.6h.value', u'ipGeo.7d.anomaly',
       u'ipGeo.7d.value', u'ipSeg24.15m.anomaly', u'ipSeg24.15m.value',
       u'ipSeg24.1d.anomaly', u'ipSeg24.1d.value', u'ipSeg24.1h.anomaly',
       u'ipSeg24.1h.value', u'ipSeg24.1m.anomaly', u'ipSeg24.1m.value'

In [25]:
# Compare the describe() summary of every checked iOS feature between the
# local training subset and the online subset, collecting the columns that
# differ.
ios_diff_cols = []
for col in chk_cols:
    print("column:{0} description:\n".format(col))
    local_desc = df_ios_chk[col].describe()
    print(local_desc)
    ol_desc = df_ol_ios_chk[col].describe()
    print(ol_desc)
    # equal_nan=True: a statistic that is NaN on both sides counts as a
    # match instead of being reported as a spurious difference.
    if not np.allclose(local_desc.values, ol_desc.values, rtol=1e-6, equal_nan=True):
        print("!!!{0} desc is not same".format(col))
        ios_diff_cols.append(col)
    print("-----------------------\n")
    print("\n")
# The summary used to say "value cols" although all iOS features are checked.
print("oline ios cols is not same as local:\n{0}".format(ios_diff_cols))

column:aid_counts description:

count    159.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: aid_counts, dtype: float64
count    159.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: aid_counts, dtype: float64
-----------------------



column:aid_loan description:

count    159.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: aid_loan, dtype: float64
count    159.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: aid_loan, dtype: float64
-----------------------



column:cracked.anomaly description:

count    159.0
mean       1.0
std        0.0
min        1.0
25%        1.0
50%        1.0
75%        1.0
max        1.0
Name: cracked.anomaly, dtype: float64
count    159.0
mean       1.0
std        0.0
min        1.0
25%        1.0
50%   

ios特征没有变化

#### 检查Android特征变化

In [27]:
# Every column of the Android subset except 'maxent_id' and 'os'
# (displayed as the cell result).
android_chk_cols = df_android_chk.columns.difference(['maxent_id','os'])
android_chk_cols

Index([u'aid_counts', u'aid_loan', u'cracked.anomaly', u'cracked.value',
       u'did.15m.anomaly', u'did.15m.value', u'did.1d.anomaly',
       u'did.1d.value', u'did.1h.anomaly', u'did.1h.value', u'did.1m.anomaly',
       u'did.1m.value', u'did.5m.anomaly', u'did.5m.value', u'did.6h.anomaly',
       u'did.6h.value', u'did.7d.anomaly', u'did.7d.value', u'event_num',
       u'idcIP.anomaly', u'idcIP.value', u'idfa_counts', u'idfa_loan',
       u'idfv_counts', u'imei_counts', u'imei_loan', u'ipGeo.15m.anomaly',
       u'ipGeo.15m.value', u'ipGeo.1d.anomaly', u'ipGeo.1d.value',
       u'ipGeo.1h.anomaly', u'ipGeo.1h.value', u'ipGeo.1m.anomaly',
       u'ipGeo.1m.value', u'ipGeo.5m.anomaly', u'ipGeo.5m.value',
       u'ipGeo.6h.anomaly', u'ipGeo.6h.value', u'ipGeo.7d.anomaly',
       u'ipGeo.7d.value', u'ipSeg24.15m.anomaly', u'ipSeg24.15m.value',
       u'ipSeg24.1d.anomaly', u'ipSeg24.1d.value', u'ipSeg24.1h.anomaly',
       u'ipSeg24.1h.value', u'ipSeg24.1m.anomaly', u'ipSeg24.1m.value'

In [31]:
# Compare the describe() summary of every checked Android feature between
# the local training subset and the online subset, collecting the columns
# that differ.
android_diff_cols = []
# Iterate the Android column list; the loop previously reused the iOS list
# (chk_cols) and left android_chk_cols unused.
for col in android_chk_cols:
    print("column:{0} description:\n".format(col))
    local_desc = df_android_chk[col].describe()
    print(local_desc)
    ol_desc = df_ol_android_chk[col].describe()
    print(ol_desc)
    # equal_nan=True: a statistic that is NaN on both sides counts as a
    # match instead of being reported as a spurious difference.
    if not np.allclose(local_desc.values, ol_desc.values, rtol=1e-6, equal_nan=True):
        print("!!!{0} desc is not same".format(col))
        android_diff_cols.append(col)
    print("-----------------------\n")
    print("\n")
# The summary used to say "value cols" although all Android features are checked.
print("oline android cols is not same as local:\n{0}".format(android_diff_cols))

column:aid_counts description:

count    303.000000
mean       1.019802
std        0.256577
min        0.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        4.000000
Name: aid_counts, dtype: float64
count    303.000000
mean       1.019802
std        0.256577
min        0.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        4.000000
Name: aid_counts, dtype: float64
-----------------------



column:aid_loan description:

count    303.000000
mean       0.438944
std        0.497079
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: aid_loan, dtype: float64
count    303.000000
mean       0.438944
std        0.497079
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: aid_loan, dtype: float64
-----------------------



column:cracked.anomaly description:

count    303.0
mean       1.0
std        0.0
min        1.0
25%        1.0
50% 

android特征没有变化

In [32]:
from Utils.common.check_features import check_files

In [33]:
# Run the packaged check_files helper on the training CSV and the online CSV.
# NOTE(review): presumably it automates the per-OS comparison done by hand in
# the cells before it; confirm against Utils.common.check_features.
check_files(file1=file_name,file2=file_ol)

check ios
^_^ ios
check android
^_^ android
