# Env pycaret

In [None]:
from pycaret.classification import *
from pycaret.datasets import get_data
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import numpy as np
import shap

#### 1.1 data info

In [None]:
# Load the training table; the CSV is decoded with the GB18030 codec.
data_selected = pd.read_csv(
    "train.csv",
    encoding='gb18030',
)
# Print the first two rows as a quick check of the parsed columns.
print(data_selected.head(n=2))

In [None]:
# Draw one stratified 80/20 shuffle split on the target column `y` and keep
# the row positions of the held-out 20%.
# NOTE(review): `seed` is not defined in the cells visible here -- confirm it
# is assigned before this cell runs, otherwise this line raises NameError.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(data_selected, data_selected['y']))
validation_indices = test_index

#### 1.2 set env

In [None]:
# Initialise the PyCaret classification experiment on `data_selected` with
# `y` as the target: 80% of the rows go to training (shuffled, stratified),
# 5 cross-validation folds, feature transformation and normalisation enabled,
# missing values imputed with the mode (categorical) / median (numeric).
# NOTE(review): `seed` must already be defined when this cell runs.
eda_env = setup(
    data=data_selected,
    target='y',
    transformation=True,
    normalize=True,
    use_gpu=False,
    fold=5,
    train_size=0.8,
    session_id=seed,
    data_split_shuffle=True,
    data_split_stratify=True,
    categorical_imputation='mode',
    numeric_imputation='median',
)

#### 1.3 model training and selection

In [None]:
# Hyper-parameter search grids, keyed by estimator class name.
# The training loop looks a grid up by class name and hands it to
# `tune_model(custom_grid=...)`; each inner dict maps a constructor argument
# to the list of candidate values to try.
#
# `max_features='auto'` was deprecated in scikit-learn 1.1 and removed in 1.3.
# For the forest classifiers it meant sqrt(n_features), so the grids below use
# the explicit 'sqrt' spelling, which selects the same setting on every
# scikit-learn version.
model_params = {

    'AdaBoostClassifier': {
        'n_estimators': [10, 10000],
        'learning_rate': [0.1, 1],
    },

    'BaggingClassifier': {
        'n_estimators': [10, 100, 1000, 10000],
        'max_samples': [1, 10, 100, 1000],
    },

    'GradientBoostingClassifier': {
        'n_estimators': [10, 100, 1000, 10000],
        'learning_rate': [0.001, 0.01, 0.1, 1],
    },

    'LassoRegression': {
        'alpha': [0.01, 0.1, 0.5, 1],
    },

    'RandomForestClassifier': {
        'n_estimators': [10, 100],
        'max_features': ['sqrt'],  # was 'auto' (same meaning, removed in sklearn 1.3)
        'max_depth': [5, 10],
        'criterion': ['gini'],
    },

    'XGBClassifier': {
        'max_depth': [2, 3],
        'n_estimators': [100, 1000],
        'learning_rate': [0.1, 0.2],
        'eval_metric': ['mlogloss'],
    },

    'ExtraTreesClassifier': {
        'n_estimators': [100, 1000, 10000],
        'criterion': ['gini', 'entropy'],
        'max_features': ['sqrt', 'log2'],  # 'sqrt' was 'auto' (removed in sklearn 1.3)
        'max_depth': [2, 5, 10, 20, 50, 100],
    },

    # NOTE(review): 'l1' is only accepted by some solvers (e.g. liblinear,
    # saga); with another solver those grid points presumably fail to fit --
    # confirm against the solver the base model is created with.
    'LogisticRegression': {
        'penalty': ['l1', 'l2'],
        'C': [0.1, 1, 10, 100, 200, 500, 1000],
    },

    'PassiveAggressiveClassifier': {
        'C': [0.0001, 0.0003, 0.001, 0.003, 0.01],
        'loss': ['hinge', 'squared_hinge'],
        'n_iter_no_change': [5, 10, 30, 100, 300],
    },

    'SGDClassifier': {
        'loss': ['modified_huber'],
        'alpha': [0.01, 0.1, 0.5, 1],
        'penalty': ['l2', 'l1', None],
    },

    'Perceptron': {
        'alpha': [0.0001, 0.001, 0.01],
        'penalty': ['l2', 'l1', None],
    },

    'BernoulliNB': {
        'alpha': np.linspace(0.1, 1, 10),
        'binarize': [0, None],
        'fit_prior': [True, False],
    },

    # No tunable grid: the training loop tunes this model with defaults.
    'GaussianNB': {},

    # NOTE(review): p=0.5 is not a true Minkowski metric (p < 1); the tree
    # based algorithms presumably reject it -- confirm it is intended.
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 10, 20],
        'leaf_size': [2, 5, 10, 20],
        'p': [0.5, 1, 2, 5],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    },

    'LinearSVC': {
        'penalty': ['l1', 'l2'],
        'loss': ['hinge', 'squared_hinge'],
        'C': [0.1, 0.5, 1, 5, 10],
    },

    'DecisionTreeClassifier': {
        'criterion': ['entropy', 'gini'],
        'splitter': ['random', 'best'],
        'max_depth': [1, 2, 3, 5, 6, 8, 10],
        'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    },

    'LinearDiscriminantAnalysis': {
        'solver': ['svd', 'lsqr', 'eigen'],
        'tol': [0.00001, 0.0001, 0.0003],
    },

    'QuadraticDiscriminantAnalysis': {
        'reg_param': [0.1, 0.5, 0.7, 0.9],
        'tol': [0.00001, 0.0001, 0.0003],
    },

    'MLPClassifier': {
        'solver': ['lbfgs', 'adam'],
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
        'hidden_layer_sizes': [(10, 7, 3), (30, 20, 12), (50, 35, 25), (70, 50, 35)],
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
    },
}

In [None]:
import re
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from scipy import stats 

def string_match_(params, string_dict):
    """Extract the ``name=value`` tokens of the tuned hyper-parameters.

    ``string_dict`` (typically ``str(estimator)``) is split on semicolons,
    commas, whitespace and parentheses. A token is kept when it starts with
    ``<key>=`` for one of the keys in ``params``.

    Matching on the ``key=`` prefix rather than on a bare substring avoids
    false positives such as the key ``'C'`` matching ``'LogisticRegression'``
    or the key ``'p'`` matching ``'metric_params=None'``.

    Args:
        params: Mapping whose keys are the hyper-parameter names to look for;
            the values are not used.
        string_dict: Text to scan.

    Returns:
        List of matching tokens, grouped by key in ``params`` iteration order
        and, within a key, in order of appearance in ``string_dict``.

    Note:
        Because parentheses and commas are separators, a tuple-valued
        parameter (e.g. ``hidden_layer_sizes=(10, 7, 3)``) is reported as the
        bare ``'hidden_layer_sizes='`` token without its value.
    """
    tokens = re.split(r'[;,\s()]\s*', string_dict)
    return [
        token
        for key in params
        for token in tokens
        if token.startswith(f"{key}=")
    ]

def bootstrap_auc(y, pred, classes, bootstraps=100, random_state=None):
    """Estimate a 95% confidence interval for the ROC AUC by bootstrapping.

    Rows with ``y == 1`` and rows with ``y == 0`` are resampled separately
    (with replacement, each to its own original size) so every bootstrap
    replicate keeps the observed class prevalence. The AUC of each replicate
    is computed with ``roc_auc_score`` and a Student-t interval is built
    around the mean of the replicate scores using their standard error.

    Args:
        y: 1-D array-like of binary labels (0 / 1).
        pred: 1-D array-like of scores for the positive class, aligned with
            ``y`` by position.
        classes: Sequence whose length sets how many times the whole bootstrap
            is repeated; its entries are not used to select data. Must not be
            empty.
        bootstraps: Number of bootstrap replicates per repetition.
        random_state: Optional seed for reproducible resampling. ``None``
            (default) draws from NumPy's global random state.

    Returns:
        ``(lower, upper)`` bounds of the interval computed for the last entry
        of ``classes``.

    Raises:
        ValueError: If ``classes`` is empty.
    """
    if len(classes) == 0:
        raise ValueError("classes must contain at least one entry")

    # Build the frame once; it does not depend on the class being processed.
    frame = pd.DataFrame({'y': np.asarray(y), 'pred': np.asarray(pred)})
    df_pos = frame[frame.y == 1]
    df_neg = frame[frame.y == 0]
    # Use the exact class counts: int(len * prevalence) can truncate to one
    # less than the true count because of floating-point rounding.
    n_pos = len(df_pos)
    n_neg = len(df_neg)

    # A single stateful generator so successive draws differ but the whole
    # sequence is reproducible when a seed is supplied.
    rng = None if random_state is None else np.random.RandomState(random_state)

    statistics = np.zeros((len(classes), bootstraps))
    for c in range(len(classes)):
        for i in range(bootstraps):
            pos_sample = df_pos.sample(n=n_pos, replace=True, random_state=rng)
            neg_sample = df_neg.sample(n=n_neg, replace=True, random_state=rng)
            y_sample = np.concatenate([pos_sample.y.values, neg_sample.y.values])
            pred_sample = np.concatenate([pos_sample.pred.values, neg_sample.pred.values])
            statistics[c][i] = roc_auc_score(y_sample, pred_sample)
        # The confidence level is passed positionally: SciPy renamed the
        # keyword from `alpha` to `confidence`, so the positional form is the
        # one that works on both old and new releases.
        CI = stats.t.interval(
            0.95,
            df=len(statistics[c]) - 1,
            loc=np.mean(statistics[c]),
            scale=stats.sem(statistics[c]),
        )
    return CI

In [None]:
# Result containers filled by the loop below.
#   data_all / val_all1     : one summary row per model (hold-out / external set)
#   data_proba / val_proba1 : per-model positive-class probabilities and labels
# val_all2 / val_proba2 are initialised here but not filled in this cell.
data_all = []
val_all1 = []
val_all2 = []
data_proba = {}
val_proba1 = {}
val_proba2 = {}
# PyCaret model ids that are tuned with tune_model's defaults (no custom grid).
no_grid = ['nb']
# (PyCaret model id, key into `model_params`) pairs to train.
para_model = [['lr', 'LogisticRegression'],          # - Logistic Regression
              ['nb', 'GaussianNB'],                  # - Naive Bayes
              ['mlp', 'MLPClassifier'],              # - MLP Classifier
              ['rf', 'RandomForestClassifier'],      # - Random Forest Classifier
              ['ada', 'AdaBoostClassifier'],         # - Ada Boost Classifier
              ['xgboost', 'XGBClassifier'],          # - Extreme Gradient Boosting
              ['lda', 'LinearDiscriminantAnalysis'], # - Linear Discriminant Analysis
              ['knn', 'KNeighborsClassifier']        # - K Neighbors Classifier
              ]


def _positive_class_proba(pred_df):
    """Return P(class 1) per row from a `predict_model` output frame.

    `prediction_score` is the score of the *predicted* label, so rows
    predicted as 0 are flipped to ``1 - score``. Vectorised so the label
    column is read once instead of being rebuilt for every row.
    """
    labels = pred_df['prediction_label'].to_numpy()
    scores = pred_df['prediction_score'].to_numpy()
    return np.where(labels == 0, 1 - scores, scores).tolist()


# The external validation file is the same for every model: read it once.
test1 = pd.read_csv("test.csv", encoding='gb18030')

for single_model, single_para in para_model:
    model = create_model(single_model)
    params = model_params[single_para]
    if single_model in no_grid:
        tuned_model = tune_model(model)
    else:
        # optimize: Accuracy/AUC/F1/Recall/Precision
        # NOTE(review): n_iter presumably has no effect with an exhaustive
        # grid search -- confirm; it is kept as in the original call.
        tuned_model = tune_model(model, custom_grid=params, optimize='AUC',
                                 search_library='scikit-learn', search_algorithm='grid',
                                 choose_better=True, n_iter=50)
    tuned_model_result = pull()

    # Internal validation on the hold-out split created by setup().
    pred = predict_model(tuned_model, probability_threshold=0.5)
    tuned_model_result_2 = pull()
    data_proba[single_para] = _positive_class_proba(pred)
    data_proba[single_para + "_y"] = list(pred['y'])

    data_single = [
        single_para,
        # Take the scalar from the metrics table rather than storing the
        # whole one-row Series in the result cell.
        tuned_model_result_2['AUC'].iloc[0],
        bootstrap_auc(np.array(data_proba[single_para + "_y"]),
                      np.array(data_proba[single_para]),
                      [single_para]),
        string_match_(params, str(tuned_model)),
    ]
    data_all.append(data_single)

    # Finalize the model before scoring the external set.
    final_model = finalize_model(tuned_model)

    # External validation: score a fresh copy so every model sees the
    # unmodified external data.
    val_pred1 = predict_model(final_model, data=test1.copy())
    final_model_result_1 = pull()
    val_proba1[single_para] = _positive_class_proba(val_pred1)
    val_proba1[single_para + "_y"] = list(val_pred1['y'])
    final_single1 = [
        single_para,
        final_model_result_1['AUC'].iloc[0],
        bootstrap_auc(np.array(val_proba1[single_para + "_y"]),
                      np.array(val_proba1[single_para]),
                      [single_para]),
    ]
    val_all1.append(final_single1)

In [None]:
# Internal validation results.
# No dtype is forced: the rows mix model names (str), AUC values and
# confidence-interval tuples, so a float cast cannot apply to the whole frame
# and raises on current pandas.
# NOTE(review): the output paths are absolute and machine-specific.
df = pd.DataFrame(data_all, columns=['Models', 'AUC_M', 'AUC', 'best_hyper'])
df.to_csv('C:/Users/mialu/Desktop/AUC_ivs.csv')
df_pro = pd.DataFrame(data_proba)
df_pro.to_csv('C:/Users/mialu/Desktop/prob_ivs.csv')

# External validation results.
val1 = pd.DataFrame(val_all1, columns=['Models', 'AUC_M', 'AUC'])
val1.to_csv('C:/Users/mialu/Desktop/AUC_val1.csv')
val_pro1 = pd.DataFrame(val_proba1)
val_pro1.to_csv('C:/Users/mialu/Desktop/prob_val1.csv')