In [1]:
from semi_auto_ml import AutoCreate,AutoSelect,DataCheck,ModelSelect
import featuretools as ft
import pandas as pd
import numpy as np
import sklearn as sk
from itertools import chain



In [2]:
# Directory that holds the exported CSV tables.
path = r'E:/新建文件夹/'
# The 'Unnamed: 0' column is dropped from every table right after it is read.
target_df = pd.read_csv(path + 'target_df.csv').drop(columns='Unnamed: 0')
fact1_df = pd.read_csv(path + 'fact1_df.csv').drop(columns='Unnamed: 0')
dim1_df = pd.read_csv(path + 'dim1_df.csv').drop(columns='Unnamed: 0')
dim2_df = pd.read_csv(path + 'dim2_df.csv').drop(columns='Unnamed: 0')

## 原始数据清洗
#### 使用 DataCheck

In [4]:
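# Use the checks step to detect outliers: at prediction time every sample first goes
# through the checks model, and samples flagged -1 are treated as outliers and are
# not passed on to the model for prediction.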

## 特征生成

In [3]:
# Create the 'ISL' AutoCreate instance and register the four tables as entities.
auto_c = AutoCreate('ISL')
auto_c.create_entity('target', target_df, index='key_id')
# The fact/dimension tables are registered with make_index=True.
for entity_name, entity_df, entity_index in (
    ('fact', fact1_df, 'apply_id'),
    ('dim1', dim1_df, 'dim1_id'),
    ('dim2', dim2_df, 'dim2_id'),
):
    auto_c.create_entity(entity_name, entity_df, index=entity_index, make_index=True)
# Flat list of 'entity.column' references handed to add_relation.
# NOTE(review): presumably consecutive items form one relationship — confirm
# against semi_auto_ml's add_relation.
relationships = [
    'target.key_id', 'fact.key_id',
    'fact.work_id', 'dim1.work_id',
    'fact.customer_id', 'dim2.customer_id',
]
auto_c.add_relation(relationships)

In [4]:
# Get the computed features; feature_f is used as the model.
# NOTE(review): presumably feature_m is the feature matrix and feature_f the
# feature definitions — confirm against AutoCreate.make_features.
feature_m,feature_f=auto_c.make_features(target_entity="target",drop_contains=['_id'])



In [5]:
# Keep the label column before it is removed from the feature set.
train_y = feature_m['is_loan']
train_matrix, train_def = auto_c.remove_features(['is_loan'], feature_m, feature_f)
num_matrix, num_def = auto_c.clean_features(train_matrix, train_def)
# Remove six further features by name.
num_matrix, num_def = auto_c.remove_features(
    [
        'MODE(dim2.cust_category)',
        'MODE(dim2.issue)',
        'MODE(dim2.sex)',
        'COUNT(fact)',
        'COUNT(dim1)',
        'COUNT(dim2)',
    ],
    num_matrix,
    features_enc=num_def,
)

In [6]:
# Rebinds num_matrix/num_def to the values returned by get_final_data;
# re-running this cell feeds its own output back in as input.
num_matrix,num_def=auto_c.get_final_data(num_matrix,num_def)

In [7]:
# Save the feature-generation model under 'make_features.json'
# (the prediction section loads the same file name later).
auto_c.deploy_features_create(num_def,'make_features.json')

In [8]:
#对空值和负值进行处理
from sklearn.preprocessing import FunctionTransformer
def transforme(X,fill_value=0,map_func=abs):
    """Cast ``X`` to float, fill missing values, then map every cell.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose values can all be cast to float.
    fill_value : scalar, default 0
        Replacement for missing values after the float cast.
    map_func : callable, default ``abs``
        Applied to each cell individually; the default replaces negative
        values with their magnitude.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``X``.
    """
    filled = X.astype('float').fillna(fill_value)
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
    # name is deprecated. Prefer the new method when it exists and fall back on
    # older pandas. The lookup is done on the class so that a column that
    # happens to be called "map" cannot shadow the method.
    frame_cls = type(filled)
    elementwise = getattr(frame_cls, 'map', None) or frame_cls.applymap
    return elementwise(filled, map_func)
# Wrap transforme in a FunctionTransformer so the same step can be saved and
# later reused as a pipeline stage.
sp_clf = FunctionTransformer(transforme,kw_args={'fill_value':0,'map_func':abs})

In [9]:
# Apply the null/negative handling; num_matrix is rebound to the result.
num_matrix=sp_clf.transform(num_matrix)

In [10]:
from semi_auto_ml.utils.extract_funcs import load_sk_model,save_sk_model

In [11]:
# Store the preprocessing transformer as 'model_file/f1.joblib'.
save_sk_model(sp_clf,'model_file/f1.joblib')

## 特征选择

In [12]:
# Set up feature selection on the preprocessed matrix and the is_loan labels.
auto_s = AutoSelect(num_matrix,train_y)
# NOTE(review): the meaning of 0.99 is defined by AutoSelect.sk_feature_importances
# (presumably a cumulative-importance threshold) — confirm.
sk_fm = auto_s.sk_feature_importances(0.99)

In [13]:
# Plot the feature importances held in sk_fm.
auto_s.plotly_feature_importances(sk_fm)

In [14]:
# Flatten auto_s.removed_features (an iterable of iterables) into one set of names.
features_to_drop = set(chain.from_iterable(auto_s.removed_features))

In [15]:
def drop_columns(X,remove_features):
    """Return ``X`` without the columns named in ``remove_features``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to filter; it is not modified.
    remove_features : container
        Column labels to leave out; membership is tested with ``in``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``X`` in their original order.
    """
    # Select by position with a boolean mask rather than by label: label-based
    # selection (X[labels]) returns every match for each label, so a frame with
    # duplicate column names would come back with those columns multiplied.
    keep_mask = [column not in remove_features for column in X.columns]
    return X.loc[:, keep_mask]
# Bind the selected feature names to drop_columns so the same columns can be
# removed again when the transformer is reloaded for prediction.
rm1_clf = FunctionTransformer(drop_columns,kw_args={'remove_features':features_to_drop})

In [16]:
# Rebinds train_matrix (earlier the output of the first remove_features call)
# to the matrix with the de-selected columns removed.
train_matrix = rm1_clf.transform(num_matrix)

In [17]:
# Store the column-dropping transformer as 'model_file/f2.joblib'.
save_sk_model(rm1_clf,'model_file/f2.joblib')

In [18]:
# Preview the first rows of the final training matrix.
train_matrix.head()

Unnamed: 0_level_0,MAX(fact.apply_amount),MAX(dim1.salary),SUM(dim1.salary),MAX(dim2.regis_date_key),MAX(fact.SUM(dim1.salary)),MAX(dim1.fact.apply_amount),STD(dim2.fact.apply_amount),SUM(dim2.fact.apply_amount)
key_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.0,100000.0,20000.0,20000.0,20201020.0,6300.0,100000.0,0.0,100000.0
2.0,20000.0,6000.0,6000.0,20200316.0,7000.0,20000.0,0.0,20000.0
4.0,150000.0,15000.0,15000.0,20201126.0,0.0,150000.0,0.0,150000.0
6.0,70000.0,10500.0,10500.0,20200710.0,3500.0,70000.0,0.0,70000.0
7.0,50000.0,6000.0,6000.0,20201008.0,4420.0,50000.0,0.0,50000.0


## 模型检索

In [19]:
# Search candidate pipelines for the binary problem; max_batches=2 corresponds to
# the "Searching up to 2 batches" line in the recorded output.
ms = ModelSelect('binary',max_batches=2)
ranking = ms.search(train_matrix,train_y)

`X` passed was not a DataTable. EvalML will try to convert the input as a Woodwork DataTable and types will be inferred. To control this behavior, please pass in a Woodwork DataTable instead.
`y` passed was not a DataColumn. EvalML will try to convert the input as a Woodwork DataTable and types will be inferred. To control this behavior, please pass in a Woodwork DataTable instead.
Generating pipelines to search over...
*****************************
* Beginning pipeline search *
*****************************

Optimizing for Log Loss Binary. 
Lower score is better.

Searching up to 2 batches for a total of 14 pipelines. 
Allowed model families: extra_trees, random_forest, linear_model, catboost, decision_tree, xgboost, lightgbm

Batch 1: (1/14) Mode Baseline Binary Classification P... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 7.045
Batch 1: (2/14) Decision Tree Classifier w/ Imputer      Elapsed:00:00
	Starting cross validation
	Finished

In [20]:
# Get the best model found by the search.
bm = ms.auto_ml.best_pipeline

In [21]:
# Fit the best pipeline on the training matrix and labels.
pipline = bm.fit(train_matrix,train_y)



In [22]:
# Save the fitted pipeline as 'm3.joblib'.
# NOTE(review): this uses the pipeline's own save(), while the prediction section
# reads the file back with load_sk_model — confirm the two formats are compatible.
pipline.save('m3.joblib')

## LOAD MODEL PREDICT

In [25]:
# Get the source data for the prediction demo.
# Once the model_deploy part is finished, the model-loading code will be revised.
auto_c2 = AutoCreate('PISL')
# A fixed random_state makes the 200-row sample, and with it the predictions
# shown below, reproducible across kernel restarts.
auto_c2.create_entity('target', target_df.sample(n=200, random_state=42), index='key_id')
auto_c2.create_entity('fact', fact1_df, index='apply_id', make_index=True)
auto_c2.create_entity('dim1', dim1_df, index='dim1_id', make_index=True)
auto_c2.create_entity('dim2', dim2_df, index='dim2_id', make_index=True)
# Same 'entity.column' references as used for the training entity set.
relationships = ['target.key_id', 'fact.key_id', 'fact.work_id', 'dim1.work_id',
                 'fact.customer_id', 'dim2.customer_id']
auto_c2.add_relation(relationships)

In [26]:
# Load the feature definitions saved earlier under 'make_features.json'.
feature_df_load = AutoCreate.load_features_create('make_features.json')

In [27]:
# Compute the loaded features for the sampled targets.
# NOTE: this rebinds feature_m, which until now held the training feature matrix,
# and unlike the training call the result is bound to a single name.
feature_m=auto_c2.make_features(target_entity="target",features=feature_df_load)

In [29]:
# Load the null/negative-handling transformer saved as f1.
lsp_clf = load_sk_model('model_file/f1.joblib')

In [30]:
# Load the column-dropping transformer saved as f2.
lfs_clf = load_sk_model('model_file/f2.joblib')

In [32]:
# Load the fitted model pipeline that was saved as 'm3.joblib'.
l_model = load_sk_model('m3.joblib')

In [33]:
# Step 1: run the loaded null/negative-handling transformer on the new features.
test_m = lsp_clf.transform(feature_m)

In [34]:
# Step 2: drop the de-selected columns; test_m is rebound to the result.
test_m = lfs_clf.transform(test_m)

In [35]:
# Run the prediction and count how often each label is predicted.
l_model.predict(test_m).value_counts()

已放款    169
未放款     31
dtype: int64

In [36]:
from sklearn.pipeline import Pipeline
# Load the models once from their paths; model_deploy can be used to save and load models.
# Chain the two loaded transformers and the loaded model into one sklearn Pipeline.
pipe = Pipeline([('f1', lsp_clf), ('f2', lfs_clf),('m1',l_model)])

In [37]:
# Predict directly from the untransformed feature matrix; the pipeline runs the
# f1 and f2 steps before handing the data to the model.
pipe.predict(feature_m)

0      已放款
1      已放款
2      已放款
3      已放款
4      已放款
      ... 
195    已放款
196    已放款
197    未放款
198    已放款
199    已放款
Length: 200, dtype: object