We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
无论是在参加比赛,还是在实际工程中,把数据处理完之后,肯定不会只用一种算法进行测试,不可避免的采用多种算法进行比较。而autoclf就是这次在阿里风险支付大赛进行中,花了两天撸出来的一个简单框架。
目录结构为
├── [4.0K] clf │ │ │ ├── [4.0K] nn │ ├── [4.0K] data │ ├── [4.0K] pipe │ └── [4.0K] saved
clf 目录下是常见的或者自定义的算法, 而nn 目录下是自定义的深度学习算法,和sklearn接口绑定的,自定义的算法是以类的形式存在的,只需要实现fit,score,predict即可,但是如果自己的fit使用了 sklearn的训练,就无需再自定义score,predict了。例如这样: (clf/isvc.py)
clf
nn
sklearn
fit
score
predict
clf/isvc.py
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA, NMF
from sklearn.svm import SVC


class IGridSVC:
    """Grid-searched linear SVC behind a dimensionality-reduction step.

    Wraps a Pipeline(reduce_dim -> SVC) in GridSearchCV so the custom-classifier
    protocol (fit / score / predict) expected by train.py is satisfied.  The
    search compares PCA, NMF and chi2-based univariate selection, crossed with
    several SVC regularization strengths.
    """

    # Candidate reduced dimensionalities and SVC C values explored by the grid.
    N_FEATURES_OPTIONS = [2, 4]
    C_OPTIONS = [1, 10, 100, 1000]

    param_grid = [
        {
            'reduce_dim': [PCA(iterated_power=7), NMF()],
            'reduce_dim__n_components': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS,
        },
        {
            'reduce_dim': [SelectKBest(chi2)],
            'reduce_dim__k': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS,
        },
    ]

    # probability=True so predict_proba works (needed by soft VotingClassifier).
    pipe = Pipeline([
        ('reduce_dim', PCA()),
        ('classify', SVC(kernel="linear", probability=True)),
    ])

    def __init__(self):
        # Holds the fitted GridSearchCV instance; None until fit() is called.
        self.model = None

    def fit(self, x_train, y_train):
        """Run the grid search over param_grid and keep the fitted searcher.

        Returns self (sklearn convention) so calls can be chained; callers
        that ignore the return value are unaffected.
        """
        self.model = GridSearchCV(IGridSVC.pipe, cv=3, n_jobs=-1,
                                  param_grid=IGridSVC.param_grid)
        self.model = self.model.fit(x_train, y_train)
        return self

    def score(self, x_test, y_test):
        """Mean accuracy of the best found estimator on the given test data."""
        return self.model.score(x_test, y_test)

    def predict(self, x_test):
        """Predicted class labels from the best found estimator."""
        return self.model.predict(x_test)

    def predict_proba(self, x_test):
        """Per-class probability estimates from the best found estimator."""
        return self.model.predict_proba(x_test)
然后在train.py里调用即可,工程还在继续完善,准备增加命令行接口。然而在训练中最重要的无疑是数据预处理和特征选择,而在train.py中传入的数据就是来自pipe文件夹里定义的预处理函数。 train.py
train.py
pipe
import os

import pandas as pd
import numpy as np

# Sklearn common imports
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.pipeline import Pipeline

# sklearn.externals.joblib was removed in scikit-learn >= 0.23; fall back to
# the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Decomposition: PCA is unsupervised, LDA is supervised.
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Classifier algorithms
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
    ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression


def train(x_train, y_train, x_test, y_test):
    """Fit a battery of classifiers, score each on the held-out split, and
    persist every fitted model under saved/.

    An existing model file is never overwritten: a ".second" suffix is
    appended instead.  Pressing Ctrl+C skips only the classifier currently
    training and moves on to the next one.
    """
    clf1 = DecisionTreeClassifier()  # max_depth=4
    clf1_1 = RandomForestClassifier()
    clf2 = KNeighborsClassifier(n_neighbors=7)
    clf3 = SVC(kernel='rbf', probability=True)  # So slowly
    clf4 = LogisticRegression(random_state=1)
    clf5 = XGBClassifier()
    clf6 = GaussianNB()
    clf7 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                              algorithm="SAMME", n_estimators=200)

    # Custom grid-searched SVC from the clf package (implements fit/score/predict).
    from clf import IGridSVC
    c_clf1 = IGridSVC()

    voting1 = VotingClassifier(
        estimators=[
            ('dt', clf1), ('knn', clf2), ('svc', clf3), ('lg', clf4)
        ],
        voting='soft'
    )
    voting2 = VotingClassifier(
        estimators=[
            ('dt', clf1), ('knn', clf2), ('svc', clf3), ('lg', clf4),
            ('xgb', clf5)
        ],
        voting='soft'
    )

    clfs = [
        c_clf1,
        clf3,
        clf1, clf1_1, clf2, clf4, clf5, clf6, clf7, voting1, voting2
    ]
    for clf in clfs:
        name = clf.__class__.__name__
        modeldumpname = "saved/{}.pkl".format(name.lower())
        print("[*] Now Training With {:<10s}".format(name))
        try:
            clf.fit(x_train, y_train)
            score = clf.score(x_test, y_test)
            if os.path.isfile(modeldumpname):
                # Never clobber a previously saved model; dump beside it.
                print("[x] {} Already Exists".format(modeldumpname))
                modeldumpname = "{}.second".format(modeldumpname)
                print("[-] Rename {}".format(modeldumpname))
            joblib.dump(clf, modeldumpname)
            print("[+] Saving Model {:<10s} with accuracy: {}".format(modeldumpname, score))
        except KeyboardInterrupt:
            # Ctrl+C skips the current (slow) classifier instead of aborting.
            print("[-] Skip {}".format(name))
        # TODO: cross-validated scoring; custom "i*" classes do not implement
        # sklearn's cross_val_score interface and need their own method.


if __name__ == '__main__':
    print('Loading Data....', end='', flush=True)
    # NOTE(review): the pipe package also exposes iload_aliatec_pipe; the iris
    # loader is presumably a quick smoke-test dataset — confirm before release.
    from pipe import iload_iris_pipe
    x_train, y_train, x_test, y_test = iload_iris_pipe()
    print('\tDone')
    train(x_train, y_train, x_test, y_test)
pipe/iload_aliatec.py
import pandas as pd
import os
from sklearn.model_selection import train_test_split

train_data_path = 'data/atec_anti_fraud_train.csv'
predict_data_path = 'data/atec_anti_fraud_test_a.csv'
# Metadata columns stripped before training.  The name keeps its original
# misspelling ("DROPCOLUMS") because other modules may import it.
DROPCOLUMS = ["id", "label", "date"]
# label semantics: 0 is safe, 1 is fraud, -1 is unlabeled.


def _drop_meta_columns(frame):
    """Return *frame* without the id/label/date columns that are present."""
    cols = [c for c in DROPCOLUMS if c in frame.columns]
    return frame.drop(cols, axis=1)


def iload_aliatec_pipe():
    """Load the ATEC training CSV and return x_train, y_train, x_test, y_test.

    NaNs are filled with 0, rows labeled -1 (unlabeled) are discarded, and the
    remaining labeled rows are split 80/20 with a fixed random seed.  Exits
    the process when either dataset file is missing.
    """
    if os.path.isfile(train_data_path) and os.path.isfile(predict_data_path):
        print("[√] Path Checked, File Exists")
    else:
        print("[X] Please Make Sure Your Datasets Was Exists")
        import sys
        sys.exit(1)
    data = pd.read_csv(train_data_path)
    data = data.fillna(0)
    # Only labeled rows (label != -1) are usable for supervised training.
    labeled = data[data['label'] != -1]
    train, test = train_test_split(labeled, test_size=0.2, random_state=42)
    x_train = _drop_meta_columns(train)
    x_test = _drop_meta_columns(test)
    y_train = train['label']
    y_test = test['label']
    return x_train, y_train, x_test, y_test


def iload_predict_data():
    """Load the submission CSV and return (ids, feature frame)."""
    upload_test = pd.read_csv(predict_data_path)
    upload_test = upload_test.fillna(0)
    upload_id = upload_test['id']
    upload_test = _drop_meta_columns(upload_test)
    return upload_id, upload_test


def isave_predict_data(data_id, predict, filename):
    """Write (id, score) pairs to *filename* as a headerless-index CSV."""
    p = pd.DataFrame(predict, columns=["score"])
    # Relies on data_id and p sharing a default RangeIndex for row alignment.
    res = pd.concat([data_id, p], axis=1)
    res.to_csv(filename, index=False)
    print("[+] Save Predict Result To {} Successful".format(filename))
而预测是单独的predict.py文件,会自动加载由train.py训练完成后的模型,然后进行批量预测,并保存到相应的文件夹中。
ps: 1332 人参赛,目前居然只有36人提交结果。大佬都是在后面提交的啊。自己还是真的菜。不过让我想不明白的是,这个比赛每天只启动一次系统测评,不像之前的腾讯广告回流的那个,每次提交都会即时评测,可以方便调优,但这个又不是,搞不懂。不过我已经有了新的思路去做这个了。关键点不一定是那些个-1的标签,但一定是个突破点。
The text was updated successfully, but these errors were encountered:
No branches or pull requests
无论是在参加比赛,还是在实际工程中,把数据处理完之后,肯定不会只用一种算法进行测试,不可避免的采用多种算法进行比较。而autoclf就是这次在阿里风险支付大赛进行中,花了两天撸出来的一个简单框架。
目录结构为
clf
目录下是常见的或者自定义的算法, 而nn
目录下是自定义的深度学习算法,和sklearn
接口绑定的,自定义的算法是以类的形式存在的,只需要实现fit
,score
,predict
即可,但是如果自己的fit
使用了sklearn
的训练,就无需再自定义score
,predict
了。例如这样: (clf/isvc.py
)然后在
train.py
里调用即可,工程还在继续完善,准备增加命令行接口。然而在训练中最重要的无疑是数据预处理和特征选择,而在train.py
中传入的数据就是来自pipe
文件夹里定义的预处理函数。train.py
pipe/iload_aliatec.py
而预测是单独的predict.py文件,会自动加载由
train.py
训练完成后的模型,然后进行批量预测,并保存到相应的文件夹中。ps: 1332 人参赛,目前居然只有36人提交结果。大佬都是在后面提交的啊。自己还是真的菜。不过让我想不明白的是,这个比赛每天只启动一次系统测评,不像之前的腾讯广告回流的那个,每次提交都会即时评测,可以方便调优,但这个又不是,搞不懂。不过我已经有了新的思路去做这个了。关键点不一定是那些个-1的标签,但一定是个突破点。
Resources
The text was updated successfully, but these errors were encountered: