主要负责检查之前的机器学习模型是否有效：
- 捞取的事件范围是20170801-20170820
- 从上述的事件里面的不重复maxent_id里面获取1000个maxent_id
- 如果上述的数据作为泛化测试集，检测出的欺诈准确率高于10%，或者至少高于5%，那么就说明模型是有效的
- 如果低于或者等于5%，那么说明模型失效

In [7]:
from __future__ import print_function,division
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from Pic.maxent_font import tick_font
from datetime import datetime, timedelta
from Utils.common.transfer_event_dev import transfer_event_to_device
from sklearn.svm import SVC
from Utils.common.MultiColumnLabelEncoder import MultiColumnLabelEncoder
# from Algorithm.qiancheng_stack_algorithm import stack_algorithm
from sklearn.externals import joblib
import warnings
warnings.filterwarnings("ignore")

In [8]:
def read_data(path):
    """Read the feature CSV at ``path`` and return a cleaned DataFrame.

    Columns are grouped by the suffix of their name and missing values are
    filled per group: ``*anomaly`` and ``*value`` columns with 1, ``*loan``
    and ``*counts`` columns with 0, and the ``label`` column with 0.
    Afterwards every underscore in the names of the ``*anomaly`` and
    ``*value`` columns is replaced by a dot.
    """
    df = pd.read_csv(path)
    column_names = df.columns.values

    def _columns_matching(pattern):
        # Column names the regex matches, in their original order.
        regex = re.compile(pattern)
        return [name for name in column_names if regex.match(name)]

    anomaly_cols = _columns_matching('.*anomaly$')
    value_cols = _columns_matching('.*value$')
    count_cols = _columns_matching('.*counts$')
    loan_cols = _columns_matching('.*loan$')

    # (columns, fill value) pairs, applied in this order.
    fill_plan = (
        (anomaly_cols, 1),
        (value_cols, 1),
        (loan_cols, 0),
        (count_cols, 0),
    )
    for cols, fill_value in fill_plan:
        df[cols] = df[cols].fillna(fill_value)
    df['label'] = df['label'].fillna(0)

    # Only the anomaly/value columns get the dotted naming.
    dotted_names = {
        name: name.replace("_", ".") for name in anomaly_cols + value_cols
    }
    df.rename(columns=dotted_names, inplace=True)
    return df

def df_main(df, os='ios'):
    """Select the rows of one platform and prepare them as model input.

    Keeps only the rows whose ``os`` column equals ``os``, label-encodes
    every boolean and object column except ``maxent_id`` with
    ``MultiColumnLabelEncoder``, then drops ``os`` plus a fixed list of
    columns: one list when ``os == 'ios'``, another for any other value.
    ``maxent_id`` itself is kept in the result.
    """
    platform_df = df.loc[df.os == os]

    encoder_cols = (
        platform_df.select_dtypes(include=[np.bool_]).columns.tolist()
        + platform_df.select_dtypes(include=[np.object_]).columns.tolist()
    )
    try:
        encoder_cols.remove('maxent_id')
    except ValueError:
        # maxent_id is not among the bool/object columns; nothing to exclude.
        pass

    encoded = MultiColumnLabelEncoder(columns=encoder_cols).fit_transform(platform_df)

    if os != 'ios':
        col_drop = ['os','idfa_loan', 'idfa_counts', 'idfv_counts', 'imei_loan']
    else:
        col_drop = ['os', 'aid_loan', 'imei_loan', 'mac_loan', 'imei_counts', 'mac_counts', 'aid_counts']

    return encoded.drop(col_drop, axis=1)

def get_maxent_ids(df,os,path,num=None):
    """Write out the maxent_ids that the saved random forest predicts as 1.

    The frame is filtered and encoded with ``df_main`` for the given ``os``,
    every column except ``maxent_id`` and ``label`` is fed to the pickled
    random-forest model for that platform, and the ``maxent_id`` of each row
    predicted as 1 is collected.

    Args:
        df: frame accepted by ``df_main``; must contain ``maxent_id``.
        os: platform value; passed to ``df_main`` and used in the model and
            output file names.
        path: directory that holds ``qiancheng_fraud_maxent_id.csv`` and
            receives the two output files.
        num: when not None, a random sample of ``num`` predicted ids is used
            instead of all of them (``Series.sample`` raises ``ValueError``
            if fewer than ``num`` ids were predicted).

    Writes, both without header or index:
        ``<path>/<os>_maxent_id.csv``: the selected ids.
        ``<path>/<os>_label_maxent_id.csv``: rows of the qiancheng file whose
        ``maxent_id`` is among the selected ids.
    """
    print("get {0} maxent_id to {1}".format(os,path))
    csv_file = path + "/{0}_maxent_id.csv".format(os)
    label_csv_file = path + "/{0}_label_maxent_id.csv".format(os)
    qiancheng_maxent_ids_file = path + "/qiancheng_fraud_maxent_id.csv"

    df = df_main(df=df,os=os)
    # Reset to a 0..n-1 index so row positions line up with y_pred below.
    df = df.reset_index(drop=True)
    maxent_id_all = df['maxent_id']
    df = df.drop(['maxent_id'], axis=1)
    # .loc replaces the .ix indexer, which newer pandas no longer provides.
    X_test = df.loc[:, df.columns != 'label']

    rf_model_path = '/Users/chaoxu/code/local-spark/Analysis/qiancheng/script/{0}_random_forest.pkl'\
        .format(os)
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    clf_rf = joblib.load(rf_model_path)
    y_pred = clf_rf.predict(X_test)

    predicted_positions = np.where(y_pred == 1)[0]
    maxent_id = maxent_id_all.iloc[predicted_positions]

    qiancheng_maxent_ids = pd.read_csv(qiancheng_maxent_ids_file,names=['maxent_id'])
    selected_ids = maxent_id.sample(n=num) if num is not None else maxent_id

    fraud_qiancheng_index = qiancheng_maxent_ids['maxent_id'].isin(selected_ids)
    fraud_qiancheng = qiancheng_maxent_ids[fraud_qiancheng_index]
    fraud_qiancheng.to_csv(path_or_buf=label_csv_file,index=False,header=False)
    # The path is passed positionally: the keyword was `path` in older pandas
    # and `path_or_buf` in newer releases, the position is the same in both.
    selected_ids.to_csv(csv_file,index=False,header=False)

In [9]:
# Input CSV for this notebook; loaded below with read_data().
file = "/Users/chaoxu/code/local-spark/Data/qiancheng_data/qiancheng_dev_merge/data.csv"

In [10]:
# Load the CSV with missing values filled and anomaly/value columns renamed.
df = read_data(path=file)

In [11]:
# Number of distinct (non-null) maxent_id values in the loaded data.
df['maxent_id'].nunique()

1970042

In [13]:
# Android rows only, encoded, with the columns df_main drops for non-iOS removed.
df_android = df_main(df,os='android')

In [14]:
# Label distribution of the Android subset.
df_android.label.value_counts()

0.0    1327943
1.0        303
Name: label, dtype: int64

In [15]:
# Preview the first rows of the Android frame.
df_android.head()

Unnamed: 0,maxent_id,label,proxy_ua,cracked.anomaly,did.15m.anomaly,did.1d.anomaly,did.1h.anomaly,did.1m.anomaly,did.5m.anomaly,did.6h.anomaly,...,maxentID.1m.value,maxentID.5m.value,maxentID.6h.value,maxentID.7d.value,cracked.value,idcIP.value,mcid_counts,imei_counts,mac_counts,aid_counts
0,0003762805b2f59d07100d0ab4bd07ca,0.0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
1,000685036de8974f06df357480384980,0.0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
3,000fa917c56fcf97fac0bfd596089301,0.0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
4,00281f52f8742a1ac8e0dc0f7036c927,0.0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.333333,2.0,0.0,0.0,1.0,1.0,1.0,1.0
8,004f563311ff7e84b46f343c913c2dfd,0.0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# Label distribution over the whole dataset, all platforms together.
df.label.value_counts()

0.0    1969580
1.0        462
Name: label, dtype: int64

In [16]:
# iOS rows only, encoded, with the columns df_main drops for iOS removed.
df_ios = df_main(df,os='ios')

In [17]:
# Label distribution of the iOS subset.
df_ios.label.value_counts()

0.0    641600
1.0       159
Name: label, dtype: int64

In [18]:
# Preview the first rows of the iOS frame.
df_ios.head()

Unnamed: 0,maxent_id,label,proxy_ua,cracked.anomaly,did.15m.anomaly,did.1d.anomaly,did.1h.anomaly,did.1m.anomaly,did.5m.anomaly,did.6h.anomaly,...,maxentID.1h.value,maxentID.1m.value,maxentID.5m.value,maxentID.6h.value,maxentID.7d.value,cracked.value,idcIP.value,mcid_counts,idfa_counts,idfv_counts
2,000eaefcfb24d0403430129a885ce77b,0.0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.5,1.5,0.0,0.0,1.0,1.0,1.0
5,003ab3f3deb8a635b4d04b6a717f92c0,0.0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.5,1.5,0.0,0.0,0.0,0.0,0.0
6,0042cb2fd7d76d48be647edbf326b7c3,0.0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
7,004beeefe19b2edc0db8c66bbe6f912f,0.0,0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,1.375,1.0,1.0,1.5,3.5,0.0,0.0,2.0,2.0,3.0
9,00542571f3c67a55e990515db054b0b6,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0


In [41]:
# Summary statistics of did.15m.anomaly over the iOS rows labelled 1
# (row filter and column selected in a single .loc call).
df_ios.loc[df_ios.label == 1, 'did.15m.anomaly'].describe()

count    159.000000
mean       1.018868
std        0.136488
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        2.000000
Name: did.15m.anomaly, dtype: float64

In [19]:
# Training-set CSV; loaded below with the project's own reader.
train_file = "/Users/chaoxu/code/local-spark/Data/qiancheng_data/qiancheng_sample_new_merge_0.09/data.csv"

In [29]:
# The training data is read with the project's read_data (aliased so it does
# not shadow the notebook-local read_data defined above).
# NOTE(review): whether both readers clean the data identically is not
# visible here - confirm.
from Utils.qiancheng.get_data import read_data as tr_read_data
df_train = tr_read_data(train_file)

In [24]:
def get_os_train(df,os='ios'):
    """Prepare the rows of one platform as a training frame.

    Applies exactly the preparation done by ``df_main`` (keep rows whose
    ``os`` column equals ``os``, label-encode the boolean/object columns
    other than ``maxent_id``, drop ``os`` and the platform-specific columns)
    and additionally drops ``maxent_id``.

    Args:
        df: frame containing at least ``os``, ``maxent_id`` and the columns
            that ``df_main`` drops for the chosen platform.
        os: platform value to keep; ``'ios'`` selects the iOS drop list, any
            other value the non-iOS one.

    Returns:
        The encoded frame without ``os``, ``maxent_id`` and the dropped
        platform-specific columns.
    """
    # Delegate to df_main so that evaluation and training frames cannot
    # drift apart; the only difference is that maxent_id is removed here.
    prepared = df_main(df, os=os)
    return prepared.drop(['maxent_id'], axis=1)

In [30]:
# iOS training rows (os defaults to 'ios'), encoded and without maxent_id.
df_train_ios = get_os_train(df_train)

In [31]:
# Label distribution of the iOS training subset.
df_train_ios.label.value_counts()

0    1501
1     159
Name: label, dtype: int64

In [42]:
# Summary statistics of did.15m.anomaly over the iOS training rows labelled 1.
# NOTE(review): the recorded count and statistics are identical to those of
# the label-1 rows in df_ios - the evaluation data may contain the same
# positive samples as the training data; confirm before trusting the results.
df_train_ios.loc[df_train_ios.label == 1, 'did.15m.anomaly'].describe()

count    159.000000
mean       1.018868
std        0.136488
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        2.000000
Name: did.15m.anomaly, dtype: float64

In [33]:
# Android training rows, encoded and without maxent_id.
df_train_android = get_os_train(df_train,os='android')

In [34]:
# Label distribution of the Android training subset.
df_train_android.label.value_counts()

0    3119
1     303
Name: label, dtype: int64

In [43]:
# First of two column-name lists that are compared with each other below.
# NOTE(review): where this list was copied from is not shown in the notebook -
# presumably the feature columns of one of the frames; confirm.
a = [u'maxentID.6h.anomaly', u'did.5m.anomaly', u'maxentID.7d.anomaly',
       u'maxentID.5m.anomaly', u'mcid_counts', u'cracked.anomaly',
       u'ipSeg24.15m.value', u'ipGeo.1d.anomaly', u'ipGeo.5m.anomaly',
       u'cracked.value', u'idcIP.value', u'did.15m.anomaly',
       u'maxentID.1h.value', u'maxentID.5m.value', u'ipSeg24.1h.anomaly',
       u'idfv_counts', u'ipGeo.7d.anomaly', u'uaMismatch.anomaly',
       u'ipGeo.1d.value', u'did.7d.anomaly', u'maxentID.15m.anomaly',
       u'idfa_loan', u'ipGeo.7d.value', u'ipGeo.15m.value', u'did.5m.value',
       u'ipSeg24.1h.value', u'maxentID.6h.value', u'ipSeg24.5m.value',
       u'did.6h.anomaly', u'did.7d.value', u'maxentID.1d.anomaly',
       u'ipGeo.15m.anomaly', u'ipGeo.5m.value', u'did.1h.anomaly', u'proxy_ua',
       u'did.6h.value', u'idfa_counts', u'idcIP.anomaly', u'ipGeo.1m.anomaly',
       u'ipGeo.1h.value', u'ipSeg24.1d.value', u'maxentID.1m.value',
       u'proxyIP.anomaly', u'maxentID.1d.value', u'maxentID.1h.anomaly',
       u'ipSeg24.6h.value', u'ipSeg24.1d.anomaly', u'did.1d.value',
       u'ipSeg24.6h.anomaly', u'did.1m.value', u'ipGeo.6h.value',
       u'did.1d.anomaly', u'ipSeg24.1m.value', u'ipSeg24.7d.anomaly',
       u'ipGeo.1m.value', u'ipGeo.6h.anomaly', u'ipSeg24.1m.anomaly',
       u'maxentID.15m.value', u'ipSeg24.7d.value', u'ipSeg24.5m.anomaly',
       u'did.1m.anomaly', u'maxentID.7d.value', u'did.1h.value',
       u'maxentID.1m.anomaly', u'did.15m.value', u'ipGeo.1h.anomaly',
       u'ipSeg24.15m.anomaly']

In [44]:
# Second of the two column-name lists that are compared with each other below.
# NOTE(review): where this list was copied from is not shown in the notebook -
# presumably the feature columns of the other frame; confirm.
b = [u'maxentID.6h.anomaly', u'did.5m.anomaly', u'maxentID.7d.anomaly',
       u'maxentID.5m.anomaly', u'mcid_counts', u'cracked.anomaly',
       u'ipSeg24.15m.value', u'ipGeo.1d.anomaly', u'ipGeo.5m.anomaly',
       u'cracked.value', u'idcIP.value', u'did.15m.anomaly',
       u'maxentID.1h.value', u'maxentID.5m.value', u'ipSeg24.1h.anomaly',
       u'idfv_counts', u'ipGeo.7d.anomaly', u'uaMismatch.anomaly',
       u'ipGeo.1d.value', u'did.7d.anomaly', u'maxentID.15m.anomaly',
       u'idfa_loan', u'ipGeo.7d.value', u'ipGeo.15m.value', u'did.5m.value',
       u'ipSeg24.1h.value', u'maxentID.6h.value', u'ipSeg24.5m.value',
       u'did.6h.anomaly', u'did.7d.value', u'maxentID.1d.anomaly',
       u'ipGeo.15m.anomaly', u'ipGeo.5m.value', u'did.1h.anomaly', u'proxy_ua',
       u'did.6h.value', u'idfa_counts', u'idcIP.anomaly', u'ipGeo.1m.anomaly',
       u'ipGeo.1h.value', u'ipSeg24.1d.value', u'maxentID.1m.value',
       u'proxyIP.anomaly', u'maxentID.1d.value', u'maxentID.1h.anomaly',
       u'ipSeg24.6h.value', u'ipSeg24.1d.anomaly', u'did.1d.value',
       u'ipSeg24.6h.anomaly', u'did.1m.value', u'ipGeo.6h.value',
       u'did.1d.anomaly', u'ipSeg24.1m.value', u'ipSeg24.7d.anomaly',
       u'ipGeo.1m.value', u'ipGeo.6h.anomaly', u'ipSeg24.1m.anomaly',
       u'maxentID.15m.value', u'ipSeg24.7d.value', u'ipSeg24.5m.anomaly',
       u'did.1m.anomaly', u'maxentID.7d.value', u'did.1h.value',
       u'maxentID.1m.anomaly', u'did.15m.value', u'ipGeo.1h.anomaly',
       u'ipSeg24.15m.anomaly']

In [46]:
# Symmetric difference: names that occur in only one of the two lists.
# A one-sided `set(a) - set(b)` would miss names present only in b.
set(a) ^ set(b)

set()

In [49]:
# Number of entries in list a.
len(a)

67