In [1]:
# 1、数据获取
import pandas as pd
from sklearn.model_selection import learning_curve, train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Absolute location of the UCI_Credit_Card CSV file, read into a DataFrame.
DATA_PATH = '/Users/zhangqihao/Desktop/PYTHON/机器学习/信用卡/UCI_Credit_Card.csv'
data = pd.read_csv(DATA_PATH)

In [2]:
# 2. Data exploration
# Uncomment for a quick look at the dataset:
# print(data.shape)       # dataset dimensions
# print(data.describe())  # summary statistics

# Remove the 'ID' column so it is not part of the feature matrix.
# The frame is rebound (not mutated in place) and errors='ignore' is used so
# that this cell can be executed more than once: with the former in-place
# drop, a second run raised KeyError because 'ID' had already been removed.
data = data.drop(columns=['ID'], errors='ignore')

# Target vector: the label column as a NumPy array.
target = data['default.payment.next.month'].values

# Feature matrix: every remaining column except the label.
columns = data.columns.tolist()
columns.remove('default.payment.next.month')
features = data[columns].values

In [3]:
# 4. Train/test split
# Hold out 30% of the rows for testing; stratify on the target so both parts
# keep the same class proportions, and fix the seed for reproducibility.
split_options = dict(test_size=0.30, stratify=target, random_state=1)
x_train, x_test, y_train, y_test = train_test_split(features, target, **split_options)

In [4]:
# 5. Workflow
#
# One (step name, estimator, hyper-parameter grid) triple per candidate model.
# Every grid key is spelled '<step name>__<parameter>', so the name and the
# grid of a model have to stay in sync; keeping them side by side makes that
# easy to check.
model_specs = [
    # SVM
    ('svc',
     SVC(random_state = 1, kernel = 'rbf'),
     {'svc__C':[1], 'svc__gamma':[0.01]}),
    # Decision tree
    ('decisiontreeclassifier',
     DecisionTreeClassifier(random_state = 1, criterion = 'gini'),
     {'decisiontreeclassifier__max_depth':[6,9,11]}),
    # Random forest
    ('randomforestclassifier',
     RandomForestClassifier(random_state = 1, criterion = 'gini'),
     {'randomforestclassifier__n_estimators':[3,5,6]}),
    # KNN
    ('kneighborsclassifier',
     KNeighborsClassifier(metric = 'minkowski'),
     {'kneighborsclassifier__n_neighbors':[4,6,8]}),
    # AdaBoost
    ('adaboostclassifier',
     AdaBoostClassifier(random_state = 1),
     {'adaboostclassifier__n_estimators':[10,50,100]}),
]

# Parallel lists (same order as model_specs).
# Estimators:
classifiers = [estimator for _, estimator, _ in model_specs]
# Estimator names:
classifier_names = [step_name for step_name, _, _ in model_specs]
# Hyper-parameter grids:
classifier_param_grid = [grid for _, _, grid in model_specs]

In [5]:
def GridSearchCV_work(pipeline, x_train, x_test, y_train, y_test, param_grid, score = 'accuracy'):
    """Tune one estimator with GridSearchCV and evaluate it on the test data.

    Parameters
    ----------
    pipeline : estimator handed to ``GridSearchCV`` as ``estimator``.
    x_train, y_train : data the grid search is fitted on.
    x_test, y_test : data used for the final accuracy report.
    param_grid : handed to ``GridSearchCV`` as ``param_grid``.
    score : handed to ``GridSearchCV`` as ``scoring`` (default ``'accuracy'``).
        It only drives the search; the test-set figure is always computed
        with ``accuracy_score``.

    Returns
    -------
    dict
        ``'y_predict'``      - predictions of the fitted search for ``x_test``
        ``'accuracy_score'`` - ``accuracy_score(y_test, y_predict)``
        ``'best_params'``    - ``best_params_`` of the fitted search
        ``'best_score'``     - ``best_score_`` of the fitted search

    Side effects: prints the best parameters, the best search score and the
    test accuracy.
    """
    gridsearch = GridSearchCV(estimator = pipeline, param_grid = param_grid, scoring = score)

    # Find the best parameters and the best cross-validated score.
    search = gridsearch.fit(x_train, y_train)
    print("GridSearch最优参数：", search.best_params_)
    print("GridSearch最优分数： %0.4lf" %search.best_score_)

    # Score the tuned model on the test data; compute the accuracy once and
    # reuse it for both the printout and the returned value.
    y_predict = gridsearch.predict(x_test)
    test_accuracy = accuracy_score(y_test, y_predict)
    print("准确率 %0.4lf" %test_accuracy)

    return {
        'y_predict': y_predict,
        'accuracy_score': test_accuracy,
        # Also hand back what was previously only printed, so callers can
        # compare models programmatically.
        'best_params': search.best_params_,
        'best_score': search.best_score_,
    }

In [6]:
# The three lists are consumed in lock-step; zip() would silently stop at the
# shortest one, so fail loudly if they ever get out of sync.
assert len(classifiers) == len(classifier_names) == len(classifier_param_grid), \
    "classifiers, classifier_names and classifier_param_grid must have the same length"

# Evaluation of every model, keyed by model name. Previously each iteration
# overwrote `result`, so only the last model's evaluation was kept.
results = {}
for model, model_name, model_param_grid in zip(classifiers, classifier_names, classifier_param_grid):
    # Standardise the features, then fit the classifier. The step is named
    # after the model so that the '<model_name>__<param>' grid keys resolve.
    pipeline = Pipeline([
            ('scaler', StandardScaler()),
            (model_name, model)
    ])
    result = GridSearchCV_work(pipeline, x_train, x_test, y_train, y_test, model_param_grid , score = 'accuracy')
    results[model_name] = result

GridSearch最优参数： {'svc__C': 1, 'svc__gamma': 0.01}
GridSearch最优分数： 0.8174
准确率 0.8172
GridSearch最优参数： {'decisiontreeclassifier__max_depth': 6}
GridSearch最优分数： 0.8186
准确率 0.8113
GridSearch最优参数： {'randomforestclassifier__n_estimators': 6}
GridSearch最优分数： 0.7998
准确率 0.7994
GridSearch最优参数： {'kneighborsclassifier__n_neighbors': 8}
GridSearch最优分数： 0.8040
准确率 0.8036
GridSearch最优参数： {'adaboostclassifier__n_estimators': 10}
GridSearch最优分数： 0.8187
准确率 0.8129
