Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

select best from batch algorithms and autoclf introduction #25

Open
mylamour opened this issue May 3, 2018 · 0 comments
Open

select best from batch algorithms and autoclf introduction #25

mylamour opened this issue May 3, 2018 · 0 comments

Comments

@mylamour
Copy link
Owner

mylamour commented May 3, 2018

无论是在参加比赛,还是在实际工程中,把数据处理完之后,肯定不会只用一种算法进行测试,不可避免的采用多种算法进行比较。而autoclf就是这次在阿里风险支付大赛进行中,花了两天撸出来的一个简单框架。

目录结构为

├── [4.0K]  clf
│   │
│   ├── [4.0K]  nn
│
├── [4.0K]  data
│
├── [4.0K]  pipe
│
└── [4.0K]  saved

clf 目录下是常见的或者自定义的算法, 而nn 目录下是自定义的深度学习算法,和sklearn接口绑定的,自定义的算法是以类的形式存在的,只需要实现fit,score,predict即可,但是如果自己的fit使用了
sklearn的训练,就无需再自定义score,predict了。例如这样: (clf/isvc.py)

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest,chi2

from sklearn.decomposition import PCA, NMF
from sklearn.svm import SVC



class IGridSVC():
    N_FEATURES_OPTIONS = [2, 4]
    C_OPTIONS = [1, 10, 100, 1000]
    param_grid = [
            {
            'reduce_dim': [PCA(iterated_power=7), NMF()],
            'reduce_dim__n_components': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
            {
            'reduce_dim': [SelectKBest(chi2)],
            'reduce_dim__k': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },]

    pipe = Pipeline([
        ('reduce_dim', PCA()),
        ('classify', SVC( kernel="linear", probability=True))
    ])

    def __init__(self):
        self.model = None

    def fit(self,x_train, y_train):
        self.model = GridSearchCV(IGridSVC.pipe, cv=3, n_jobs=-1, param_grid=IGridSVC.param_grid)
        self.model = self.model.fit(x_train,y_train)

    def score(self,x_test,y_test):
        return self.model.score(x_test,y_test)
    
    def predict(self, x_test):
        return self.model.predict(x_test)

    def predict_proba(self, x_test):
        return self.model.predict_proba(x_test)

然后在train.py里调用即可,工程还在继续完善,准备增加命令行接口。然而在训练中最重要的无疑是数据预处理和特征选择,而在train.py中传入的数据就是来自pipe文件夹里定义的预处理函数。
train.py

import os

import pandas as pd
import numpy as np

# Sklearn Common Import
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

# Decomposition
# PCA 无监督, LDA 有监督
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Some Classifier Algorithms

from sklearn.ensemble import  RandomForestClassifier, AdaBoostClassifier,\
                             ExtraTreesClassifier,GradientBoostingClassifier,VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression


def train(x_train,y_train,x_test,y_test):

    clf1 = DecisionTreeClassifier()     #max_depth=4a
    clf1_1 = RandomForestClassifier()
    clf2 = KNeighborsClassifier(n_neighbors=7)
    clf3 = SVC(kernel='rbf', probability=True)      # So slowly
    clf4 = LogisticRegression(random_state=1)
    clf5 = XGBClassifier()
    clf6 = GaussianNB()
    clf7 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                            algorithm="SAMME",n_estimators=200)

    from clf import IGridSVC
    c_clf1 = IGridSVC()

    voting1 = VotingClassifier(
        estimators=[
            ('dt',clf1),
            ('knn', clf2),
            ('svc', clf3),
            ('lg',clf4)    
        ],
        voting='soft'
    )

    voting2 = VotingClassifier(
        estimators=[
            ('dt',clf1),
            ('knn', clf2),
            ('svc', clf3),
            ('lg',clf4),    
            ('xgb',clf5)
        ],
        voting='soft'
    )

    clfs = [  
        c_clf1,
        clf3,
        clf1,clf1_1,clf2,clf4,clf5,clf6,clf7,voting1,voting2
    ]

    for clf in clfs:
        name = clf.__class__.__name__
        modeldumpname = "saved/{}.pkl".format(name.lower())

        print("[*] Now Training With {:<10s}".format(name))

        try:
            clf.fit(x_train,y_train)
            score = clf.score(x_test,y_test)
            if os.path.isfile(modeldumpname):
                print("[x] {} Already Exists".format(modeldumpname))
                modeldumpname = "{}.second".format(modeldumpname)
                print("[-] Rename {}".format(modeldumpname))
                
            joblib.dump(clf,modeldumpname)

            print("[+] Saving Model {:<10s} with accuracy: {}".format(modeldumpname,score))

        except KeyboardInterrupt:
            print("[-] Skip {}".format(name))
        # if not name.startswith("i"):    # custom class not implement cross valdation 
        #     score = np.mean(cross_val_score(clf, x_train, y_train, cv=10))
        # else:
        #     score = np.mean(clf.cross_val_score(x_train,y_train,cv=10))


if __name__ == '__main__':
    
    print('Loading Data....',end='',flush=True)
    from pipe import iload_iris_pipe
    x_train, y_train, x_test, y_test = iload_iris_pipe()
    print('\tDone')
    train(x_train, y_train, x_test, y_test)

pipe/iload_aliaetc.py

import pandas as pd
import os

from sklearn.model_selection import train_test_split

train_data_path = 'data/atec_anti_fraud_train.csv'
predict_data_path = 'data/atec_anti_fraud_test_a.csv'

DROPCOLUMS = ["id","label","date"]
# 0 .... 1, 0 is safe / 1 is not safe

def iload_aliatec_pipe():

    if os.path.isfile(train_data_path) and os.path.isfile(predict_data_path):
        print("[√] Path Checked, File Exists")
    else: 
        print("[X] Please Make Sure Your Datasets Was Exists")
        import sys
        sys.exit(1)
        
    data = pd.read_csv(train_data_path)
    data = data.fillna(0)
    unlabeled = data[data['label'] == -1]
    labeled = data[data['label'] != -1]

    train, test = train_test_split(labeled, test_size=0.2, random_state=42)

    cols = [c for c in DROPCOLUMS if c in train.columns]
    x_train = train.drop(cols,axis=1)

    cols = [c for c in DROPCOLUMS if c in test.columns]
    x_test = test.drop(cols,axis=1)

    y_train = train['label']
    y_test = test['label']
    return x_train, y_train, x_test, y_test

def iload_predict_data():
    upload_test = pd.read_csv(predict_data_path)
    upload_test = upload_test.fillna(0)
    upload_id = upload_test['id']
    
    cols = [c for c in DROPCOLUMS if c in upload_test.columns]
    upload_test = upload_test.drop(cols,axis=1)

    return upload_id, upload_test


def isave_predict_data(data_id,predict,filename):
    p = pd.DataFrame(predict,columns=["score"])
    res = pd.concat([data_id,p],axis=1)
    res.to_csv(filename,index=False)
    print("[+] Save Predict Result To {} Sucessful".format(filename))

而预测是单独的predict.py文件,会自动加载由train.py训练完成后的模型,然后进行批量预测,并保存到相应的文件夹中。

ps: 1332 人参赛,目前居然只有36人提交结果。大佬都是在后面提交的啊。自己还是真的菜。不过让我想不明白的是,这个比赛每天只启动一次系统测评,不像之前的腾讯广告回流的那个,每次提交都会即时评测,可以方便调优,但这个又不是,搞不懂。不过我已经有了新的思路去做这个了。关键点不一定是那些个-1的标签,但一定是个突破点。

screenshot from 2018-05-03 23-58-02

Resources

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

1 participant