In [None]:
import numpy as np
from scipy.stats import randint
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV

In [None]:
import numpy as np
from scipy.stats import randint
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# Attach the target from `df`; column assignment aligns df['y'] on the index
# of `cleaned_data`.
cleaned_data["y"] = df['y']

# target -- pop() returns the column and removes it from `cleaned_data` in
# place, so the frame holds features only afterwards.
y = cleaned_data.pop('y')

# features (same object as `cleaned_data`, not a copy)
X = cleaned_data

#Instantiate CatBoostClassifier; class 1 is weighted 6x relative to class 0
cbc = CatBoostClassifier(class_weights={0:1, 1:6})

# Earlier randomized-search experiment, kept for reference:
# # Creating the hyperparameter grid
# param_dist = { "learning_rate": np.linspace(0,0.2,5),
#                "max_depth": randint(3, 10)}
# #Instantiate RandomSearchCV object
# rscv = RandomizedSearchCV(cbc , param_dist, scoring='accuracy', cv =5)
# Presumably the best parameters from a previous run -- TODO confirm:
# {'learning_rate': 0.5, 'max_depth': 6, 'n_estimators': 1000}

# Exhaustive grid: 4 depths x 3 ensemble sizes x 4 learning rates = 48
# candidates, each fitted on 5 folds.
# NOTE(review): learning rates of 1 and 1.5 are unusually large -- confirm intended.
grid = {'max_depth': [3,4,5,6], 'n_estimators':[200, 500, 1000], "learning_rate": [0.01, 0.5, 1, 1.5]}
#Instantiate GridSearchCV
# NOTE(review): candidates are ranked by plain accuracy even though the
# classifier is class-weighted -- confirm this is the intended selection metric.
gscv = GridSearchCV(estimator = cbc, param_grid = grid, scoring ='accuracy', cv = 5)


#Fit the model
# Every column whose dtype is not float64 (integers included) is passed to
# CatBoost as categorical. The builtin `float` replaces `np.float`, which was
# only an alias for it and no longer exists in NumPy >= 1.24.
categorical_features_indices = np.where(X.dtypes != float)[0]
gscv.fit(X,y,cat_features=categorical_features_indices, plot = True)

# Print the tuned parameters and score
print(gscv.best_params_)
print(gscv.best_score_)

In [None]:
def reporting(ensem_preds, targets, thresholds=None):
    """Print AUC, the F1-optimal threshold and confusion-matrix metrics.

    Relies on ``f1_score``, ``roc_auc_score`` and ``confusion_matrix``
    (scikit-learn metrics) being available at notebook level.

    Parameters
    ----------
    ensem_preds : array-like
        Predicted scores for class 1, one per sample.
    targets : array-like
        True binary labels (0 / 1), same length as ``ensem_preds``.
    thresholds : iterable of float, optional
        Candidate decision thresholds. Defaults to ``np.arange(0.0, 0.6, 0.01)``.

    Returns
    -------
    None
        All results are printed.
    """
    if thresholds is None:
        thresholds = np.arange(0.0, 0.6, 0.01)

    # Accept lists / Series as well as arrays for the element-wise comparison.
    scores = np.asarray(ensem_preds)

    best_th = 0
    best_score = 0

    # Keep the first threshold that achieves the highest F1 (strict '>').
    for th in thresholds:
        pred = (scores > th).astype(int)
        score = f1_score(targets, pred)
        if score > best_score:
            best_th = th
            best_score = score

    print(f"\nAUC score: {roc_auc_score(targets, ensem_preds):12.4f}")
    print(f"Best threshold {best_th:12.4f}")

    preds = (scores > best_th).astype(int)
    # print(classification_report(targets, preds, digits=3))

    cm1 = confusion_matrix(targets, preds)
    print('\nConfusion Matrix : \n', cm1)

    # scikit-learn layout: rows are true labels, columns are predictions,
    # i.e. [[TN, FP], [FN, TP]] for labels (0, 1).
    tn, fp = cm1[0, 0], cm1[0, 1]
    fn, tp = cm1[1, 0], cm1[1, 1]
    total1 = cm1.sum()

    print('\n=============')
    accuracy1 = (tn + tp) / total1
    print (f'Accuracy    : {accuracy1:12.4f}')

    # Sensitivity = recall of the positive class (true-positive rate).
    sensitivity1 = tp / (tp + fn)
    print(f'Sensitivity : {sensitivity1:12.4f}')

    # Specificity = recall of the negative class (true-negative rate).
    specificity1 = tn / (tn + fp)
    print(f'Specificity : {specificity1:12.4f}')

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import ExtraTreesClassifier

class Model(BaseEstimator, ClassifierMixin):
    """Blend of two LightGBM, two XGBoost and one ExtraTrees classifier.

    Each member is fitted on the same data; ``predict`` returns the weighted
    mean of column 1 of every member's ``predict_proba`` output.

    Constructor arguments are stored unchanged as attributes so that
    ``get_params`` / ``set_params`` / ``clone`` (and therefore ``repr``,
    ``GridSearchCV`` and ``cross_val_score``) work.

    Parameters
    ----------
    seed : int
        Offset added to every member's random seed.
    nest_lgb, nest_xgb, nest_et : float
        Multipliers applied to the base number of estimators of the LightGBM
        (100 / 200), XGBoost (100 / 200) and ExtraTrees (100) members.
    cbt : float
        Passed as ``colsample_bytree`` to the boosted members and as
        ``max_features`` to ExtraTrees.
    ss : float
        Passed as ``subsample`` to the LightGBM and XGBoost members.
    alpha : float
        Every member's weight is ``1 - alpha``. Because the blend is divided
        by the weight sum, ``alpha`` cancels out; ``alpha == 1`` gives a zero
        weight sum and a NaN blend.
    pos_scale : float
        ``scale_pos_weight`` of the second XGBoost member.

    Attributes
    ----------
    models : list
        The five member estimators, available right after construction.
    weights : list of float
        One blending weight per member.
    classes_ : list
        Hard-coded to ``[0, 1]``.
    """

    def __init__(self, seed=0, nest_lgb=1.0, nest_xgb=1.0, nest_et=1.0, cbt=0.75, ss=0.75, alpha=0.5, pos_scale=1.):
        """
        Top 3 tree models
        """
        # scikit-learn introspects these attributes by constructor-argument
        # name, so they must be stored exactly as received.
        self.seed = seed
        self.nest_lgb = nest_lgb
        self.nest_xgb = nest_xgb
        self.nest_et = nest_et
        self.cbt = cbt
        self.ss = ss
        self.alpha = alpha
        self.pos_scale = pos_scale

        self.classes_ = [0,1]
        self._build_members()

    def _hyperparams(self):
        """Return the current constructor hyperparameters as a tuple."""
        return (self.seed, self.nest_lgb, self.nest_xgb, self.nest_et,
                self.cbt, self.ss, self.alpha, self.pos_scale)

    def _build_members(self):
        """(Re)create the member estimators and weights from the stored hyperparameters."""
        seed, cbt, ss = self.seed, self.cbt, self.ss
        # NOTE(review): LightGBM presumably only applies `subsample` when
        # `subsample_freq` > 0, which is not set here -- confirm bagging is active.
        self.models = [
            lgb.LGBMClassifier(class_weight='balanced',
                               num_leaves=31,
                               max_depth=-1,
                               min_child_samples=2,
                               learning_rate=0.02,
                               n_estimators=int(100*self.nest_lgb),
                               colsample_bytree=cbt,
                               subsample=ss,
                               n_jobs=-1,
                               random_state=0+seed
                              ),
            lgb.LGBMClassifier(class_weight=None,
                               num_leaves=31,
                               max_depth=-1,
                               min_child_samples=2,
                               learning_rate=0.01,
                               n_estimators=int(200*self.nest_lgb),
                               colsample_bytree=cbt,
                               subsample=ss,
                               n_jobs=-1,
                               random_state=1+seed
                              ),
            xgb.XGBClassifier(max_depth=12,
                              scale_pos_weight=1.,
                              learning_rate=0.01,
                              n_estimators=int(100 * self.nest_xgb),
                              subsample=ss, # 0.75
                              colsample_bytree=cbt, # 0.75
                              nthread=-1,
                              seed=0+seed,
                              eval_metric='logloss',
                              use_label_encoder=False
                             ),
            xgb.XGBClassifier(max_depth=6,
                              scale_pos_weight=self.pos_scale,
                              learning_rate=0.01,
                              n_estimators=int(200 * self.nest_xgb),
                              subsample=ss, # 0.75
                              colsample_bytree=cbt, # 0.75
                              nthread=-1,
                              seed=1+seed,
                              eval_metric='logloss',
                              use_label_encoder=False
                             ),
            ExtraTreesClassifier(class_weight='balanced',
                                 bootstrap=False,
                                 criterion='entropy',
                                 max_features=cbt,
                                 min_samples_leaf=4,
                                 min_samples_split=3,
                                 n_estimators= int(100 * self.nest_et),
                                 random_state=0+seed,
                                 n_jobs=-1),
        ]
        # One identical weight per member.
        self.weights = [(1-self.alpha)*1 for _ in self.models]
        # Snapshot used by fit() to detect hyperparameters changed afterwards.
        self._built_from = self._hyperparams()

    def fit(self, X, y=None):
        """Fit every member on ``(X, y)`` and return ``self``.

        Members are rebuilt first only if the hyperparameters changed since
        they were created (e.g. through ``set_params``); otherwise the
        existing ``models`` list, including any manual tweaks, is fitted.
        """
        if self._hyperparams() != getattr(self, '_built_from', None):
            self._build_members()
        for clf in self.models:
            clf.fit(X, y)
        return self

    def predict(self, X):
        """Return the weighted mean of the members' column-1 probabilities.

        The result is a 1-D array of scores, not class labels.
        """
        suma = 0.0
        for weight, clf in zip(self.weights, self.models):
            suma += weight * clf.predict_proba(X)[:, 1]
        return (suma / sum(self.weights))

    def predict_proba(self, X):
        """Return the same 1-D blended score array as ``predict``."""
        return (self.predict(X))