In [None]:
#import numpy as np
#import pandas as pd
#import pandas_profiling as pd_pi
#import seaborn as sns
#%matplotlib inline
#from sklearn import preprocessing as pre
#from sklearn.pipeline import Pipeline as pipe
#from sklearn.model_selection import train_test_split
#from sklearn.linear_model import RidgeClassifier
##from sklearn.linear_model import Lasso
#from sklearn.linear_model import LogisticRegression
#from sklearn.linear_model import SGDClassifier
#from sklearn import metrics
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.model_selection import GridSearchCV
#from sklearn.model_selection import RandomizedSearchCV
#from scipy import stats
#import random
#from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import cross_val_predict
#from sklearn.model_selection import cross_validate
#from sklearn.ensemble import AdaBoostClassifier
#from sklearn.ensemble import GradientBoostingClassifier
#from sklearn.model_selection import StratifiedKFold
#from sklearn.metrics import roc_auc_score
#from sklearn.metrics import cohen_kappa_score
#import matplotlib.pyplot as plt

In [1]:
#ORANGE JUICE DATA
###################
%matplotlib inline
import pandas as pd
# Load the orange-juice data, drop the 'Id' and 'STORE' columns and map the
# text codes in 'Purchase' ('CH'/'MM') and 'Store7' ('Yes'/'No') to 1/0.
data = (
    pd.read_csv('OJ.csv')
    .drop(columns=['Id', 'STORE'])
    .assign(
        Purchase=lambda frame: frame['Purchase'].replace({'CH': 1, 'MM': 0}),
        Store7=lambda frame: frame['Store7'].replace({'Yes': 1, 'No': 0}),
    )
)

# Dummy-encoded copy of the cleaned frame.
data_dummy = pd.get_dummies(data)

In [2]:
def setup(data, 
          target, 
          split=0.7):
    
    from sklearn.model_selection import train_test_split
    X = data.drop(target,axis=1)
    y = data[target]
    global X_train, X_test, y_train, y_test, seed
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-split)
    import random
    seed = random.randint(150,900)
    return X_train, X_test, y_train, y_test, seed

In [3]:
# setup() returns (X_train, X_test, y_train, y_test, seed) and also stores them as globals.
s = setup(data=data, target='Purchase')

In [4]:
def create_model(estimator = None, 
                 ensemble = False, 
                 method = 'Bagging', 
                 fold = 10, 
                 round = 4,  
                 verbose = True):
    """
    Description:
    ------------
    Builds a classifier and scores it with stratified k-fold cross validation
    on the training data stored by setup(). The score grid shows Accuracy, AUC,
    Recall, Precision, F1 and Kappa per fold, followed by the mean and standard
    deviation across folds.

    setup() must be called before create_model(): this function reads the
    globals X_train, y_train and seed.

        Example
        -------
        lr = create_model('lr')

        This returns a Logistic Regression model.

    Parameters
    ----------
    estimator : string or estimator object, default = None
        Either one of the abbreviations below or an already constructed
        estimator object exposing fit() and predict().

        Estimator                   Abbreviated String     Original Implementation
        ---------                   ------------------     -----------------------
        Logistic Regression         'lr'                   linear_model.LogisticRegression
        K Nearest Neighbour         'knn'                  neighbors.KNeighborsClassifier
        Naives Bayes                'nb'                   naive_bayes.GaussianNB
        Decision Tree               'dt'                   tree.DecisionTreeClassifier
        SVM (Linear)                'svm'                  linear_model.SGDClassifier
        SVM (RBF)                   'rbfsvm'               svm.SVC
        Gaussian Process            'gpc'                  gaussian_process.GPC
        Multi Level Perceptron      'mlp'                  neural_network.MLPClassifier
        Ridge Classifier            'ridge'                linear_model.RidgeClassifier
        Random Forest               'rf'                   ensemble.RandomForestClassifier
        Quadratic Disc. Analysis    'qda'                  discriminant_analysis.QDA
        AdaBoost                    'ada'                  ensemble.AdaBoostClassifier
        Gradient Boosting           'gbc'                  ensemble.GradientBoostingClassifier
        Linear Disc. Analysis       'lda'                  discriminant_analysis.LDA
        Extra Trees Classifier      'et'                   ensemble.ExtraTreesClassifier

    ensemble : Boolean, default = False
        When True the model is wrapped in an ensemble chosen by `method`.

    method : String, 'Bagging' or 'Boosting', default = 'Bagging'
        Only used when ensemble = True. 'Bagging' wraps the model in
        BaggingClassifier, 'Boosting' in AdaBoostClassifier. Any other value
        leaves the model unwrapped.

    fold : integer, default = 10
        Number of folds used in the stratified k-fold cross validation.

    round : integer, default = 4
        Number of decimal places the metrics are rounded to.

    verbose : Boolean, default = True
        The score grid is not displayed when verbose is False.

    Returns:
    --------
    model : the estimator object. It is re-fitted on every fold, so the
            returned object holds the fit from the last fold's training part.

    The score grid is displayed (when verbose is True), not returned.
    AUC is reported as 0.0 for models without predict_proba.

    Raises:
    -------
    SystemExit : if no estimator is given.
    ValueError : if `estimator` is a string that is not a known abbreviation.
    """

    import importlib
    import sys

    # abbreviation -> (module, class name, fixed keyword arguments, takes random_state)
    library = {
        'lr':     ('sklearn.linear_model', 'LogisticRegression', {}, True),
        'knn':    ('sklearn.neighbors', 'KNeighborsClassifier', {}, False),
        'nb':     ('sklearn.naive_bayes', 'GaussianNB', {}, False),
        'dt':     ('sklearn.tree', 'DecisionTreeClassifier', {}, True),
        'svm':    ('sklearn.linear_model', 'SGDClassifier', {'max_iter': 1000, 'tol': 0.001}, True),
        'rbfsvm': ('sklearn.svm', 'SVC',
                   {'gamma': 'auto', 'C': 1, 'probability': True, 'kernel': 'rbf'}, True),
        'gpc':    ('sklearn.gaussian_process', 'GaussianProcessClassifier', {}, True),
        'mlp':    ('sklearn.neural_network', 'MLPClassifier', {'max_iter': 500}, True),
        'ridge':  ('sklearn.linear_model', 'RidgeClassifier', {}, True),
        'rf':     ('sklearn.ensemble', 'RandomForestClassifier', {'n_estimators': 10}, True),
        'qda':    ('sklearn.discriminant_analysis', 'QuadraticDiscriminantAnalysis', {}, False),
        'ada':    ('sklearn.ensemble', 'AdaBoostClassifier', {}, True),
        'gbc':    ('sklearn.ensemble', 'GradientBoostingClassifier', {}, True),
        'lda':    ('sklearn.discriminant_analysis', 'LinearDiscriminantAnalysis', {}, False),
        'et':     ('sklearn.ensemble', 'ExtraTreesClassifier', {}, True),
    }

    #error handling (done before any widget is displayed)
    if estimator is None:
        print("Please enter your custom model as on object or choose from model library. If you have previously defined the estimator, the output is generated using the same estimator") 
        sys.exit('Exception Handling XXX')

    if isinstance(estimator, str) and estimator not in library:
        raise ValueError("Unknown estimator abbreviation %r; choose one of: %s"
                         % (estimator, ", ".join(sorted(library))))

    #progress bar
    import ipywidgets as ipw
    from IPython.display import display, HTML, clear_output
    progress = ipw.IntProgress(value=0, min=0, max=fold+3, step=1 , description='Processing: ')
    display(progress)

    #training data published by setup()
    data_X = X_train
    data_y = y_train

    #ignore warnings
    import warnings
    warnings.filterwarnings('ignore') 

    #dependencies
    import numpy as np
    import pandas as pd
    from sklearn import metrics
    from sklearn.model_selection import StratifiedKFold

    progress.value += 1

    #setting cross validation
    # random_state is deliberately not passed: without shuffle=True it has no
    # effect, and recent scikit-learn versions raise a ValueError for it.
    kf = StratifiedKFold(n_splits=fold)

    #building the model
    if isinstance(estimator, str):
        module_name, class_name, params, seeded = library[estimator]
        if seeded:
            params = dict(params, random_state=seed)
        model = getattr(importlib.import_module(module_name), class_name)(**params)
    else:
        model = estimator

    progress.value += 1

    #checking ensemble method (only applies when ensemble is requested)
    if ensemble and method == 'Bagging':

        from sklearn.ensemble import BaggingClassifier
        model = BaggingClassifier(model,bootstrap=True,n_estimators=10, random_state=seed)

    elif ensemble and method == 'Boosting':

        from sklearn.ensemble import AdaBoostClassifier
        model = AdaBoostClassifier(model, random_state=seed)

    #cross validation
    metric_names = ['Accuracy', 'AUC', 'Recall', 'Prec.', 'F1', 'Kappa']
    fold_scores = {name: [] for name in metric_names}

    for train_i , test_i in kf.split(data_X,data_y):

        Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i]
        ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i]

        has_proba = hasattr(model, 'predict_proba')
        model.fit(Xtrain,ytrain)
        pred_ = model.predict(Xtest)

        if has_proba:
            pred_prob = model.predict_proba(Xtest)[:,1]
            auc = metrics.roc_auc_score(ytest,pred_prob)
        else:
            #no probability estimates available: AUC is reported as 0
            auc = 0.00

        fold_scores['Accuracy'].append(metrics.accuracy_score(ytest,pred_))
        fold_scores['AUC'].append(auc)
        fold_scores['Recall'].append(metrics.recall_score(ytest,pred_))
        fold_scores['Prec.'].append(metrics.precision_score(ytest,pred_))
        fold_scores['F1'].append(metrics.f1_score(ytest,pred_))
        fold_scores['Kappa'].append(metrics.cohen_kappa_score(ytest,pred_))

        progress.value += 1

    #mean and (population) standard deviation across folds
    model_avgs = pd.DataFrame(
        {name: [np.mean(fold_scores[name]), np.std(fold_scores[name])] for name in metric_names},
        index=['Mean', 'SD'], columns=metric_names)

    progress.value += 1

    model_results = pd.DataFrame(fold_scores, columns=metric_names)
    # DataFrame.append was removed in pandas 2.0; concat gives the same grid
    model_results = pd.concat([model_results, model_avgs])
    model_results = model_results.round(round)  

    clear_output()
    if verbose:
        display(HTML(model_results.to_html()))
    return model

In [5]:
def ensemble_model(estimator,
                   method = 'Bagging', 
                   fold = 10,
                   n_estimators = 10,
                   round = 4,  
                   verbose = True):
    """
    Description:
    ------------
    Wraps a base estimator in a Bagging or Boosting ensemble and scores the
    ensemble with stratified k-fold cross validation on the training data
    stored by setup(). The score grid shows Accuracy, AUC, Recall, Precision,
    F1 and Kappa per fold, followed by the mean and standard deviation across
    folds.

    setup() must be called first: this function reads the globals X_train,
    y_train and seed.

        Example:
        --------
        ensembled_lr = ensemble_model(lr)

        This returns a bagged Logistic Regression, where 'lr' was created
        with lr = create_model('lr').

    Parameters
    ----------
    estimator : estimator object
        Base estimator to ensemble.

    method : String, default = 'Bagging'
        'Bagging' uses sklearn.ensemble.BaggingClassifier; any other value
        uses sklearn.ensemble.AdaBoostClassifier.

    fold : integer, default = 10
        Number of folds used in the stratified k-fold cross validation.

    n_estimators : integer, default = 10
        Number of base estimators in the ensemble (applies to both methods).

    round : integer, default = 4
        Number of decimal places the metrics are rounded to.

    verbose : Boolean, default = True
        The score grid is not displayed when verbose is False.

    Returns:
    --------
    model : the ensembled estimator object. It is re-fitted on every fold, so
            the returned object holds the fit from the last fold's training
            part.

    The score grid is displayed (when verbose is True), not returned.
    AUC is reported as 0.0 for models without predict_proba.
    """

    #progress bar
    import ipywidgets as ipw
    from IPython.display import display, clear_output
    progress = ipw.IntProgress(value=0, min=0, max=fold+3, step=1 , description='Processing: ')
    display(progress)

    #dependencies
    import numpy as np
    import pandas as pd
    from sklearn import metrics
    from sklearn.model_selection import StratifiedKFold

    #training data published by setup()
    data_X = X_train
    data_y = y_train

    #ignore co-linearity warnings for qda and lda 
    import warnings
    warnings.filterwarnings('ignore') 

    progress.value += 1

    #wrapping the base estimator
    if method == 'Bagging':
        from sklearn.ensemble import BaggingClassifier
        model = BaggingClassifier(estimator,bootstrap=True,n_estimators=n_estimators, random_state=seed)

    else:
        from sklearn.ensemble import AdaBoostClassifier
        # n_estimators used to be ignored here, so boosting always ran with
        # the AdaBoost default instead of the documented parameter
        model = AdaBoostClassifier(estimator, n_estimators=n_estimators, random_state=seed)

    progress.value += 1

    # random_state is deliberately not passed: without shuffle=True it has no
    # effect, and recent scikit-learn versions raise a ValueError for it.
    kf = StratifiedKFold(n_splits=fold)

    metric_names = ['Accuracy', 'AUC', 'Recall', 'Prec.', 'F1', 'Kappa']
    fold_scores = {name: [] for name in metric_names}

    for train_i , test_i in kf.split(data_X,data_y):

        Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i]
        ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i]

        has_proba = hasattr(model, 'predict_proba')
        model.fit(Xtrain,ytrain)
        pred_ = model.predict(Xtest)

        if has_proba:
            pred_prob = model.predict_proba(Xtest)[:,1]
            auc = metrics.roc_auc_score(ytest,pred_prob)
        else:
            #no probability estimates available: AUC is reported as 0
            auc = 0.00

        fold_scores['Accuracy'].append(metrics.accuracy_score(ytest,pred_))
        fold_scores['AUC'].append(auc)
        fold_scores['Recall'].append(metrics.recall_score(ytest,pred_))
        fold_scores['Prec.'].append(metrics.precision_score(ytest,pred_))
        fold_scores['F1'].append(metrics.f1_score(ytest,pred_))
        fold_scores['Kappa'].append(metrics.cohen_kappa_score(ytest,pred_))

        progress.value += 1

    #mean and (population) standard deviation across folds
    model_avgs = pd.DataFrame(
        {name: [np.mean(fold_scores[name]), np.std(fold_scores[name])] for name in metric_names},
        index=['Mean', 'SD'], columns=metric_names)

    model_results = pd.DataFrame(fold_scores, columns=metric_names)
    # DataFrame.append was removed in pandas 2.0; concat gives the same grid
    model_results = pd.concat([model_results, model_avgs])
    model_results = model_results.round(round)  

    progress.value += 1

    clear_output()
    if verbose:
        display(model_results)
    return model

In [6]:
def plot_model(estimator, 
               plot = 'auc',  
               manifold='tsne', 
               features=5): 
    """
    Description:
    ------------
    Draws a diagnostic plot for a model using the train/test data stored by
    setup() (globals X_train, X_test, y_train, y_test and seed).
    Most plots re-fit the estimator that is passed in on X_train. The
    'boundary' plot fits a clone instead, because it trains on a 2-component
    PCA projection and would otherwise leave the caller's model fitted on
    different features.

        Example:
        --------
        plot_model(lr)

        This draws the AUC plot of a Logistic Regression,
        where 'lr' was created with lr = create_model('lr').

    Parameters
    ----------
    estimator : estimator object
        Model created using create_model() / ensemble_model() or any other
        scikit-learn compatible estimator.

    plot : string, default = 'auc'
        Abbreviation of the plot to draw:

        Name                        Abbreviated String     Original Implementation
        ---------                   ------------------     -----------------------
        Area Under the Curve         'auc'                 .. / rocauc.html
        Discrimination Threshold     'threshold'           .. / threshold.html
        Precision Recall Curve       'pr'                  .. / prcurve.html
        Confusion Matrix             'confusion_matrix'    .. / confusion_matrix.html
        Class Prediction Error       'error'               .. / class_prediction_error.html
        Classification Report        'class_report'        .. / classification_report.html
        Decision Boundary            'boundary'            .. / boundaries.html
        Recursive Feat. Selection    'rfe'                 .. / rfecv.html
        Learning Curve               'learning'            .. / learning_curve.html
        Manifold Learning            'manifold'            .. / manifold.html
        Calibration Curve            'calibration'         .. / calibration_curve.html
        Validation Curve             'vc'                  .. / validation_curve.html
        Dimension Learning           'dimension'           .. / radviz.html
        Feature Importance           'feature'             ..... N/A .....

        ** https://www.scikit-yb.org/en/latest/api/classifier/<reference>

    manifold : string, default = 'tsne'
        Only used by the 'manifold' plot. Other options are:
        'lle', 'ltsa', 'hessian', 'modified', 'isomap', 'mds' and 'spectral'

    features : integer, default = 5
        Only used by the 'dimension' plot: number of PCA components the
        feature set is reduced to before plotting.

    Returns:
    --------
    None. The plot is displayed as a side effect.

    Raises:
    -------
    ValueError     : if `plot` is not a supported abbreviation, or if 'vc' is
                     requested for an estimator without a max_depth parameter.
    AttributeError : if 'feature' is requested for an estimator exposing
                     neither coef_ nor feature_importances_.
    """  

    supported_plots = ('auc', 'threshold', 'pr', 'confusion_matrix', 'error',
                       'class_report', 'boundary', 'rfe', 'learning', 'manifold',
                       'calibration', 'vc', 'dimension', 'feature')

    #argument checks (done before any widget is displayed)
    if plot not in supported_plots:
        raise ValueError("Unknown plot %r; choose one of: %s"
                         % (plot, ", ".join(supported_plots)))

    if plot == 'vc' and not hasattr(estimator, 'max_depth'):
        # the validation curve is drawn over max_depth only
        raise ValueError("plot='vc' requires an estimator with a 'max_depth' parameter")

    #progress bar
    import ipywidgets as ipw
    from IPython.display import display, clear_output
    progress = ipw.IntProgress(value=0, min=0, max=5, step=1 , description='Processing: ')
    display(progress)

    #dependencies
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    def _render(viz):
        # yellowbrick 1.0 renamed poof() to show(); use whichever is available
        (getattr(viz, 'show', None) or viz.poof)()

    progress.value += 1

    model = estimator

    progress.value += 1

    #plots that follow the fit(train) / score(test) pattern; built lazily so
    #that `seed` is only read by the plots that need it
    score_plots = {
        'auc': lambda yb: yb.ROCAUC(model),
        'threshold': lambda yb: yb.DiscriminationThreshold(model, random_state=seed),
        'pr': lambda yb: yb.PrecisionRecallCurve(model, random_state=seed),
        'confusion_matrix': lambda yb: yb.ConfusionMatrix(model, random_state=seed, fontsize=15, cmap="Greens"),
        'error': lambda yb: yb.ClassPredictionError(model, random_state=seed),
        'class_report': lambda yb: yb.ClassificationReport(model, random_state=seed, support=True),
    }

    if plot in score_plots:

        import yellowbrick.classifier as yb_classifier
        progress.value += 1
        visualizer = score_plots[plot](yb_classifier)
        visualizer.fit(X_train, y_train)
        progress.value += 1
        visualizer.score(X_test, y_test)
        progress.value += 1
        clear_output()
        _render(visualizer)

    elif plot == 'boundary':

        from sklearn.base import clone
        from sklearn.preprocessing import StandardScaler
        from sklearn.decomposition import PCA
        from yellowbrick.contrib.classifier import DecisionViz        

        progress.value += 1

        # Scaler and PCA are fitted on the training data only and then applied
        # to the test data, so both sets live in the same 2-D projection.
        scaler = StandardScaler()
        pca = PCA(n_components=2, random_state = seed)
        X_train_transformed = pca.fit_transform(
            scaler.fit_transform(X_train.select_dtypes(include='float64')))
        X_test_transformed = pca.transform(
            scaler.transform(X_test.select_dtypes(include='float64')))

        progress.value += 1

        y_train_transformed = np.array(y_train)
        y_test_transformed = np.array(y_test)

        # fit a clone: the caller's model must not be re-fitted on PCA features
        model_transformed = clone(model)

        viz = DecisionViz(model_transformed)
        viz.fit(X_train_transformed, y_train_transformed, features=['Feature One', 'Feature Two'], classes=['A', 'B'])
        viz.draw(X_test_transformed, y_test_transformed)
        progress.value += 1
        clear_output()
        _render(viz)

    elif plot == 'rfe':

        from yellowbrick.model_selection import RFECV 
        progress.value += 1
        visualizer = RFECV(model, cv=10)
        progress.value += 1
        visualizer.fit(X_train, y_train)
        progress.value += 1
        clear_output()
        _render(visualizer)

    elif plot == 'learning':

        from yellowbrick.model_selection import LearningCurve
        progress.value += 1
        sizes = np.linspace(0.3, 1.0, 10)  
        visualizer = LearningCurve(model, cv=10, scoring='f1_weighted', train_sizes=sizes, n_jobs=1, random_state=seed)
        progress.value += 1
        visualizer.fit(X_train, y_train)
        progress.value += 1
        clear_output()
        _render(visualizer)

    elif plot == 'manifold':

        from yellowbrick.features import Manifold
        progress.value += 1
        X_train_transformed = X_train.select_dtypes(include='float64') 
        visualizer = Manifold(manifold=manifold, random_state = seed)
        progress.value += 1
        visualizer.fit_transform(X_train_transformed, y_train)
        progress.value += 1
        clear_output()
        _render(visualizer)

    elif plot == 'calibration':      

        from sklearn.calibration import calibration_curve

        model_name = str(model).split("(")[0]

        plt.figure(figsize=(7, 6))
        ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)

        ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
        progress.value += 1

        if hasattr(model, 'predict_proba'):
            # real probabilities are used as they are; rescaling them would
            # distort the reliability curve
            prob_pos = model.predict_proba(X_test)[:, 1]
        else:
            # decision scores are not probabilities: min-max scale them to [0, 1]
            scores = model.decision_function(X_test)
            span = scores.max() - scores.min()
            if span > 0:
                prob_pos = (scores - scores.min()) / span
            else:
                prob_pos = np.zeros_like(scores, dtype=float)

        fraction_of_positives, mean_predicted_value = calibration_curve(y_test, prob_pos, n_bins=10)
        progress.value += 1
        ax1.plot(mean_predicted_value, fraction_of_positives, "s-",label="%s" % (model_name, ))

        ax1.set_ylabel("Fraction of positives")
        ax1.set_ylim([0, 1])
        ax1.set_xlim([0, 1])
        ax1.legend(loc="lower right")
        ax1.set_title('Calibration plots  (reliability curve)')
        ax1.set_facecolor('white')
        # positional flag: the `b` keyword was renamed to `visible` in matplotlib 3.5
        ax1.grid(True, color='grey', linewidth=0.5, linestyle = '-')
        plt.tight_layout()
        progress.value += 1
        clear_output()
        plt.show() 

    elif plot == 'vc':

        param_name = 'max_depth'
        progress.value += 1

        from yellowbrick.model_selection import ValidationCurve
        viz = ValidationCurve(model, param_name=param_name, param_range=np.arange(1,11), scoring='f1_weighted',cv=10, 
                              random_state=seed)
        progress.value += 1
        viz.fit(X_train, y_train)
        progress.value += 1
        clear_output()
        _render(viz)

    elif plot == 'dimension':

        from yellowbrick.features import RadViz
        from sklearn.preprocessing import StandardScaler
        from sklearn.decomposition import PCA
        progress.value += 1
        X_train_transformed = X_train.select_dtypes(include='float64') 
        X_train_transformed = StandardScaler().fit_transform(X_train_transformed)
        y_train_transformed = np.array(y_train)

        pca = PCA(n_components=features, random_state=seed)
        X_train_transformed = pca.fit_transform(X_train_transformed)
        progress.value += 1
        # legend names must follow the sorted order of the labels in y
        classes = [str(label) for label in np.unique(y_train_transformed)]
        visualizer = RadViz(classes=classes, alpha=0.25)
        visualizer.fit(X_train_transformed, y_train_transformed)     
        visualizer.transform(X_train_transformed)
        progress.value += 1
        clear_output()
        _render(visualizer)

    elif plot == 'feature':

        if hasattr(model, 'coef_'):
            # linear models: absolute coefficients of the first coefficient row
            variables = np.abs(np.atleast_2d(model.coef_)[0])
        elif hasattr(model, 'feature_importances_'):
            # tree based models
            variables = np.asarray(model.feature_importances_)
        else:
            clear_output()
            raise AttributeError("plot='feature' requires an estimator with "
                                 "'coef_' or 'feature_importances_'")

        coef_df = pd.DataFrame({'Variable': X_train.columns, 'Value': variables})
        sorted_df = coef_df.sort_values(by='Value')
        my_range=range(1,len(sorted_df.index)+1)
        progress.value += 1
        plt.figure(figsize=(8,5))
        plt.hlines(y=my_range, xmin=0, xmax=sorted_df['Value'], color='skyblue')
        plt.plot(sorted_df['Value'], my_range, "o")
        progress.value += 1
        plt.yticks(my_range, sorted_df['Variable'])
        plt.title("Feature Importance Plot")
        plt.xlabel('Variable Importance')
        plt.ylabel('Features') 
        progress.value += 1
        clear_output()
        plt.show()

In [13]:
def compare_models(model_library = 'All', 
                   fold = 10, 
                   round = 4, 
                   sort = 'Accuracy', 
                   blacklist = None):
    
    """
    
  Description:
  ------------
  This function trains several models and scores each of them using Stratified 
  Cross Validation on the training data prepared by setup(). The output is a 
  score grid showing the Accuracy, AUC, Recall, Precision, F1 and Kappa of every 
  model, averaged over the folds (default = 10).
  
  List of models in Model Library
  
  Estimator                   Abbreviated String     sklearn Implementation 
  ---------                   ------------------     -----------------------
  Logistic Regression         'lr'                   linear_model.LogisticRegression
  K Nearest Neighbour         'knn'                  neighbors.KNeighborsClassifier
  Naives Bayes                'nb'                   naive_bayes.GaussianNB
  Decision Tree               'dt'                   tree.DecisionTreeClassifier
  SVM (Linear)                'svm'                  linear_model.SGDClassifier
  SVM (RBF)                   'rbfsvm'               svm.SVC
  Gaussian Process            'gpc'                  gaussian_process.GPC
  Multi Level Perceptron      'mlp'                  neural_network.MLPClassifier
  Ridge Classifier            'ridge'                linear_model.RidgeClassifier
  Random Forest               'rf'                   ensemble.RandomForestClassifier
  Quadratic Disc. Analysis    'qda'                  discriminant_analysis.QDA 
  AdaBoost                    'ada'                  ensemble.AdaBoostClassifier
  Gradient Boosting           'gbc'                  ensemble.GradientBoostingClassifier
  Linear Disc. Analysis       'lda'                  discriminant_analysis.LDA 
  Extra Trees Classifier      'et'                   ensemble.ExtraTreesClassifier
  
    Example:
    --------
    
    compare_models() 
    
    This will return the score grid of all the models. 
    ** all other parameters (see below) for compare_models are optional.
    
      Alternate use of compare_models() could be:
    
      compare_models( [lr, rf] )
      where the lr and rf variables were created using create_model()
      
      If used this way, the function will return the averaged result
      comparison of the lr and rf objects instead of all models.
 
  Parameters
  ----------
  
  model_library : string or list of model objects, default = 'All'
  ** Only 'All' can be passed as string. Model objects passed in a list are
  re-fitted in place on every fold.
  
  fold: integer, default = 10
  Number of folds used in the Stratified Kfold cross validation.
  
  round: integer, default = 4
  The number of decimal places metrics will be rounded to.

  sort: string, default = 'Accuracy'
  The score grid is sorted (descending) on this column. 
  Other options are 'AUC', 'Recall', 'Prec.', 'F1' and 'Kappa'

  blacklist: list of strings (or a single string), default = None
  In order to omit certain models from the comparison, the abbreviation string 
  of such models (see above list) can be passed as a list of strings. This is 
  normally done to be more efficient with time. By default, None is chosen, 
  which means no models are black listed. blacklist is only applied when 
  model_library = 'All'.
  
    Example
    -------
    compare_models( blacklist = [ 'rbfsvm', 'mlp' ] ) 
    
    This will return comparison of all models except 
    Support Vector Machine (RBF) and Multi Level Perceptron.
  
  Returns:
  --------
  
  score grid:   A styled table (pandas Styler) with one row per model holding 
  -----------   the mean Accuracy, AUC, Recall, Precision, F1 and Kappa across 
                the folds. The best value in every metric column is highlighted.
                AUC is reported as 0 for models without predict_proba.

  Raises:
  -------
  ValueError if model_library is a string other than 'All', if blacklist 
  contains an unknown abbreviation, or if no model is left to compare.

  Warnings:
  ---------
  compare_models() though attractive, might be time consuming with large datasets
  and users might want to limit the models they chose to compare by either
  blacklisting certain models or only passing certain models as object 
  in model_library parameter.
  
    
    """
    
    #abbreviations accepted by blacklist, in the same order as the estimators built below
    model_library_values = ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'gpc', 'mlp', 'ridge', 'rf', 'qda', 
                            'ada', 'gbc', 'lda', 'et']
    
    #check input parameters before any widget is shown or any model is trained
    use_all = isinstance(model_library, str)
    blacklisted = set()
    
    if use_all:
        
        if model_library != 'All':
            raise ValueError("model_library only accepts 'All' as a string. Pass a list of model objects otherwise.")
        
        if blacklist is not None:
            #a single abbreviation is accepted as well as a list of them
            requested = [blacklist] if isinstance(blacklist, str) else list(blacklist)
            unknown = [item for item in requested if item not in model_library_values]
            if unknown:
                raise ValueError("Unknown model abbreviation(s) in blacklist: " + str(unknown))
            blacklisted = set(requested)
        
        if len(blacklisted) == len(model_library_values):
            raise ValueError("Every model is blacklisted. There is nothing to compare.")
    
    else:
        
        #materialise once: the library is walked twice (names, then training)
        model_library = list(model_library)
        if len(model_library) == 0:
            raise ValueError("model_library is empty. There is nothing to compare.")
    
    #progress bar (max is corrected below once the number of models is known)
    import ipywidgets as ipw
    from IPython.display import display, clear_output
    progress = ipw.IntProgress(value=0, min=0, max=(fold*15)+5, step=1 , description='Processing: ')
    display(progress)
    
    #ignore warnings (note: this changes the warning filter for the whole session)
    import warnings
    warnings.filterwarnings('ignore') 

    #X_train, y_train and seed are the globals published by setup()
    data_X = X_train
    data_y = y_train

    import re
    import numpy as np
    import pandas as pd
    from sklearn import metrics
    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import RidgeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.svm import SVC
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    
    progress.value += 1
    
    #display names that cannot be derived by just splitting the class name on capitals
    special_names = {'Gaussian N B': 'Naive Bayes',
                     'M L P Classifier': 'MLP Classifier',
                     'S G D Classifier': 'SVM - Linear Kernel',
                     'S V C': 'SVM - Radial Kernel'}
    
    def get_display_name(model):
        #turn e.g. 'KNeighborsClassifier(...)' into 'K Neighbors Classifier'
        class_name = str(model).split("(")[0]
        spaced = ' '.join(re.findall('[A-Z][a-z]*', class_name))
        return special_names.get(spaced, spaced)
    
    if use_all:
        
        lr = LogisticRegression(random_state=seed)
        knn = KNeighborsClassifier()
        nb = GaussianNB()
        dt = DecisionTreeClassifier(random_state=seed)
        svm = SGDClassifier(max_iter=1000, tol=0.001, random_state=seed)
        rbfsvm = SVC(gamma='auto', C=1, probability=True, kernel='rbf', random_state=seed)
        gpc = GaussianProcessClassifier(random_state=seed)
        mlp = MLPClassifier(max_iter=500, random_state=seed)
        ridge = RidgeClassifier(random_state=seed)
        rf = RandomForestClassifier(n_estimators=10, random_state=seed)
        qda = QuadraticDiscriminantAnalysis()
        ada = AdaBoostClassifier(random_state=seed)
        gbc = GradientBoostingClassifier(random_state=seed)
        lda = LinearDiscriminantAnalysis()
        et = ExtraTreesClassifier(random_state=seed)
        
        all_models = [lr, knn, nb, dt, svm, rbfsvm, gpc, mlp, ridge, rf, qda, ada, gbc, lda, et]
        
        #filter by abbreviation instead of deleting by position: deleting by index 
        #shifts the remaining items, so later deletions would hit the wrong model
        model_library = [model for abbreviation, model in zip(model_library_values, all_models)
                         if abbreviation not in blacklisted]
    
    progress.value += 1
    
    #names are derived once, from the raw model objects, so an already formatted 
    #name is never pushed through the formatting a second time
    model_names = [get_display_name(model) for model in model_library]
    
    #one tick per fold plus one per model, plus the five fixed ticks of this function
    progress.max = len(model_library) * (fold + 1) + 5
    
    progress.value += 1
    
    #shuffle is left off, so the folds are deterministic and random_state must not 
    #be passed (it has no effect and recent scikit-learn versions reject it)
    kf = StratifiedKFold(n_splits=fold)

    metric_names = ['Accuracy', 'AUC', 'Recall', 'Prec.', 'F1', 'Kappa']
    avg_scores = {name: [] for name in metric_names}
      
    for model in model_library:
        
        fold_scores = {name: [] for name in metric_names}
 
        for train_i , test_i in kf.split(data_X,data_y):
     
            Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i]
            ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i]
            
            has_proba = hasattr(model, 'predict_proba')
            
            model.fit(Xtrain,ytrain)
            pred_ = model.predict(Xtest)
            
            if has_proba:
                #probability of the second class (column 1) drives the AUC
                pred_prob = model.predict_proba(Xtest)[:,1]
                auc = metrics.roc_auc_score(ytest,pred_prob)
            else:
                #no probability estimates available: AUC is reported as 0
                auc = 0.00
            
            fold_scores['Accuracy'].append(metrics.accuracy_score(ytest,pred_))
            fold_scores['AUC'].append(auc)
            fold_scores['Recall'].append(metrics.recall_score(ytest,pred_))
            fold_scores['Prec.'].append(metrics.precision_score(ytest,pred_))
            fold_scores['F1'].append(metrics.f1_score(ytest,pred_))
            fold_scores['Kappa'].append(metrics.cohen_kappa_score(ytest,pred_))
            
            progress.value += 1
        
        progress.value += 1
        
        for name in metric_names:
            avg_scores[name].append(np.mean(fold_scores[name]))
  
    progress.value += 1
    
    def highlight_max(s):
        is_max = s == s.max()
        return ['background-color: yellow' if v else '' for v in is_max]

    score_grid = pd.DataFrame({'Model':model_names, 'Accuracy':avg_scores['Accuracy'], 'AUC':avg_scores['AUC'], 
                     'Recall':avg_scores['Recall'], 'Prec.':avg_scores['Prec.'], 
                     'F1':avg_scores['F1'], 'Kappa': avg_scores['Kappa']})
    
    #round here is the integer parameter of this function (it shadows the builtin)
    score_grid = score_grid.round(round).sort_values(by=[sort], ascending=False).reset_index(drop=True)
    
    compare_models_ = score_grid.style.apply(highlight_max, subset=metric_names)
    compare_models_ = compare_models_.set_properties(**{'text-align': 'left'})
    compare_models_ = compare_models_.set_table_styles([dict(selector='th', props=[('text-align', 'left')])])
    
    progress.value += 1
    
    clear_output()

    return compare_models_

In [17]:
def tune_model(estimator = None, 
               fold = 10, 
               round = 4, 
               n_iter = 10, 
               optimize = 'accuracy',
               ensemble = False, 
               method = 'Bagging',
               verbose = True):
    
      
    """
    
  Description:
  ------------
  This function tunes hyperparameter of a model and scores it using Stratified 
  Cross Validation. The output prints the score grid that shows Accuracy, AUC,
  Recall, Precision, F1 and Kappa by fold (default = 10).

  Function also return a trained model object that can be used for further 
  processing in pycaret or can be used to call any method available in sklearn. 
  
  tune_model() accepts string parameter for estimator.
  
    Example
    -------
    tune_model('lr') 
    
    This will tune the hyperparameters of Logistic Regression
    
    tune_model('lr', ensemble = True) 
    
    This will tune the hyperparameters of Logistic Regression wrapped around 
    Bagging Classifier. 
    
    
  Parameters
  ----------
  
  estimator : string, default = None
  
  Enter abbreviated name of the estimator class. List of estimators supported:
  
  Estimator                   Abbreviated String     Original Implementation 
  ---------                   ------------------     -----------------------
  Logistic Regression         'lr'                   linear_model.LogisticRegression
  K Nearest Neighbour         'knn'                  neighbors.KNeighborsClassifier
  Naives Bayes                'nb'                   naive_bayes.GaussianNB
  Decision Tree               'dt'                   tree.DecisionTreeClassifier
  SVM (Linear)                'svm'                  linear_model.SGDClassifier
  SVM (RBF)                   'rbfsvm'               svm.SVC
  Gaussian Process            'gpc'                  gaussian_process.GPC
  Multi Level Perceptron      'mlp'                  neural_network.MLPClassifier
  Ridge Classifier            'ridge'                linear_model.RidgeClassifier
  Random Forest               'rf'                   ensemble.RandomForestClassifier
  Quadratic Disc. Analysis    'qda'                  discriminant_analysis.QDA 
  AdaBoost                    'ada'                  ensemble.AdaBoostClassifier
  Gradient Boosting           'gbc'                  ensemble.GradientBoostingClassifier
  Linear Disc. Analysis       'lda'                  discriminant_analysis.LDA 
  Extra Trees Classifier      'et'                   ensemble.ExtraTreesClassifier
   
  fold: integer, default = 10
  Number of folds will determine how many folds would be done in the Kfold CV.
  
  round: integer, default = 4
  The number indicates the number of decimal places metrics will be rounded to. 

  n_iter: integer, default = 10
  Number of iterations within the Random Grid Search. For every iteration, 
  the model randomly selects one value from the pre-defined grid of hyperparameters.

  optimize: string, default = 'accuracy'
  Measure used to select the best model through the hyperparameter tuning.
  The default scoring measure is 'accuracy'. Other common measures include
  'f1', 'recall', 'precision', 'roc_auc'. Complete list available at:
  https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

  ensemble: Boolean, default = False
  True would enable ensembling of models through Bagging/Boosting method to be defined by 'method'.
  
  method: String, 'Bagging' or 'Boosting', default = Bagging
  method comes into effect only when ensemble = True. Default is set to Bagging. 

  verbose: Boolean, default = True
  Score grid is not printed when verbose is set to False.
  
  Returns:
  --------
  
  score grid:   A table containing the scores of the model across the kfolds. 
  -----------   Scoring metrics used are Accuracy, AUC, Recall, Precision, F1 
                and Kappa. Mean and standard deviation of the scores across the 
                folds is also returned.
  
  model:        trained model object
  -----------

  Warnings:
  ---------
  estimator parameter takes an abbreviated string. passing a trained model object
  returns an error. tune_model('lr') function internally calls create_model() before
  tuning the hyperparameters.

  
    """
   
    #progress bar
    import ipywidgets as ipw
    from IPython.display import display, HTML, clear_output
    progress = ipw.IntProgress(value=0, min=0, max=fold+5, step=1 , description='Processing: ')
    display(progress)
    
    #check input parameter
    import sys
    if type(estimator) != str:   
        print("Estimator is expecting abbreviated string for model to be tuned. Please see docstring complete list of models.") 
        sys.exit('Exception Handling XXX') 
    
    #ignore warnings
    import warnings
    warnings.filterwarnings('ignore')    

    data_X = X_train
    data_y = y_train

    progress.value += 1
    
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.svm import SVC
    from sklearn.gaussian_process.kernels import RBF
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
    from sklearn.neural_network import MLPClassifier
    from sklearn.model_selection import StratifiedKFold  
    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import RidgeClassifier
    import numpy as np
    import pandas as pd
    import pandas_profiling as pd_p
    import seaborn as sns
    from sklearn import preprocessing as pre
    from sklearn.pipeline import Pipeline as pipe
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import Lasso
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import SGDClassifier
    from sklearn import metrics
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import RandomizedSearchCV
    from scipy import stats
    import random
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import cross_val_predict
    from sklearn.model_selection import cross_validate
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import roc_auc_score
    from sklearn.metrics import cohen_kappa_score
    from sklearn.ensemble import BaggingClassifier
    
    progress.value += 1
    
    kf = StratifiedKFold(fold, random_state=seed)

    score_auc =np.empty((0,0))
    score_acc =np.empty((0,0))
    score_recall =np.empty((0,0))
    score_precision =np.empty((0,0))
    score_f1 =np.empty((0,0))
    score_kappa =np.empty((0,0))
    avgs_auc =np.empty((0,0))
    avgs_acc =np.empty((0,0))
    avgs_recall =np.empty((0,0))
    avgs_precision =np.empty((0,0))
    avgs_f1 =np.empty((0,0))
    avgs_kappa =np.empty((0,0))
    
    if estimator == 'knn':
        
        param_grid = {'n_neighbors': range(1,51),
                 'weights' : ['uniform', 'distance'],
                 'metric':["euclidean", "manhattan"]
                     }        
        model_grid = RandomizedSearchCV(estimator=KNeighborsClassifier(), param_distributions=param_grid, 
                                        scoring=optimize, n_iter=n_iter, cv=fold, random_state=seed,
                                       n_jobs=-1, iid=False)

        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_
 
    elif estimator == 'lr':

        param_grid = {'C': [1,5,10,25,50,100],
                  "penalty": [ 'l1', 'l2'],
                  "class_weight": ["balanced", None]
                     }
        model_grid = RandomizedSearchCV(estimator=LogisticRegression(random_state=seed), 
                                        param_distributions=param_grid, scoring=optimize, n_iter=n_iter, cv=fold, 
                                        random_state=seed, iid=False,n_jobs=-1)
        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_

    elif estimator == 'dt':
        
        param_grid = {"max_depth": np.random.randint(3, (len(X_train.columns)*.85),4),
                  "max_features": np.random.randint(3, len(X_train.columns),4),
                  "min_samples_leaf": [2,3,4],
                  "criterion": ["gini", "entropy"]}

        model_grid = RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=seed), param_distributions=param_grid,
                                       scoring=optimize, n_iter=n_iter, cv=fold, random_state=seed,
                                       iid=False, n_jobs=-1)

        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_
 
    elif estimator == 'mlp':
    
        param_grid = {'learning_rate': ['constant', 'invscaling', 'adaptive'],
                 'solver' : ['lbfgs', 'sgd', 'adam'],
                 'alpha': [0.0001, 0.05],
                 'hidden_layer_sizes': np.random.randint(5,15,5),
                 'activation': ["tanh", "identity", "logistic","relu"]
                 }

        model_grid = RandomizedSearchCV(estimator=MLPClassifier(max_iter=1000, random_state=seed), 
                                        param_distributions=param_grid, scoring=optimize, n_iter=n_iter, cv=fold, 
                                        random_state=seed, iid=False, n_jobs=-1)

        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_
    
    elif estimator == 'gpc':
    
        param_grid = {"max_iter_predict":[100,200,300,400,500,600,700,800,900,1000]}

        model_grid = RandomizedSearchCV(estimator=GaussianProcessClassifier(random_state=seed), param_distributions=param_grid,
                                       scoring=optimize, n_iter=n_iter, cv=fold, random_state=seed,
                                       n_jobs=-1)

        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_    

    elif estimator == 'rbfsvm':

        param_grid = {'C': [.5,1,10,50,100],
                "class_weight": ["balanced", None]}

        model_grid = RandomizedSearchCV(estimator=SVC(gamma='auto', C=1, probability=True, kernel='rbf', random_state=seed), 
                                        param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                        cv=fold, random_state=seed, n_jobs=-1)

        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_    
  
    elif estimator == 'nb':

        param_grid = {'var_smoothing': [0.000000001, 0.0000001, 0.00001, 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007,
                                        0.008, 0.009, 0.01, 0.1, 1]}

        model_grid = RandomizedSearchCV(estimator=GaussianNB(), 
                                        param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                        cv=fold, random_state=seed, n_jobs=-1)
 
        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_        

    elif estimator == 'svm':
   
        param_grid = {'penalty': ['l2', 'l1','elasticnet'],
                      'l1_ratio': [0,0.1,0.15,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
                      'alpha': [0.0001, 0.001, 0.01, 0.0002, 0.002, 0.02, 0.0005, 0.005, 0.05],
                      'fit_intercept': [True, False],
                      'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
                      'eta0': [0.001, 0.01,0.05,0.1,0.2,0.3,0.4,0.5]
                     }    

        model_grid = RandomizedSearchCV(estimator=SGDClassifier(loss='hinge', random_state=seed), 
                                        param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                        cv=fold, random_state=seed, n_jobs=-1)

        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_     

    elif estimator == 'ridge':

        param_grid = {'alpha': [0.0001,0.001,0.1,0.15,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
                      'fit_intercept': [True, False],
                      'normalize': [True, False]
                     }    

        model_grid = RandomizedSearchCV(estimator=RidgeClassifier(random_state=seed), 
                                        param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                        cv=fold, random_state=seed, n_jobs=-1)

        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_     
   
    elif estimator == 'rf':

        param_grid = {'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                      'criterion': ['gini', 'entropy'],
                      'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
                      'min_samples_split': [2, 5, 7, 9, 10],
                      'min_samples_leaf' : [1, 2, 4],
                      'max_features' : ['auto', 'sqrt', 'log2'],
                      'bootstrap': [True, False]
                     }    

        model_grid = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=seed), 
                                        param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                        cv=fold, random_state=seed, n_jobs=-1)

        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_     
   
    elif estimator == 'ada':

        param_grid = {'n_estimators': [10, 40, 70, 80, 90, 100, 120, 140, 150],
                      'learning_rate': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
                      'algorithm' : ["SAMME", "SAMME.R"]
                     }    

        model_grid = RandomizedSearchCV(estimator=AdaBoostClassifier(random_state=seed), 
                                        param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                        cv=fold, random_state=seed, n_jobs=-1)

        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_   

    elif estimator == 'gbc':

        param_grid = {'loss': ['deviance', 'exponential'],
                      'n_estimators': [10, 40, 70, 80, 90, 100, 120, 140, 150],
                      'learning_rate': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
                      'subsample' : [0.1,0.3,0.5,0.7,0.9,1],
                      'min_samples_split' : [2,4,5,7,9,10],
                      'min_samples_leaf' : [1,2,3,4,5],
                      'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
                      'max_features' : ['auto', 'sqrt', 'log2']
                     }    

        model_grid = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=seed), 
                                        param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                        cv=fold, random_state=seed, n_jobs=-1)

        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_   

    elif estimator == 'qda':

        param_grid = {'reg_param': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]}    

        model_grid = RandomizedSearchCV(estimator=QuadraticDiscriminantAnalysis(), 
                                        param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                        cv=fold, random_state=seed, n_jobs=-1)

        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_      

    elif estimator == 'lda':

        param_grid = {'solver' : ['lsqr', 'eigen'],
                      'shrinkage': [0.0001, 0.001, 0.01, 0.0005, 0.005, 0.05, 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
                     }    

        model_grid = RandomizedSearchCV(estimator=LinearDiscriminantAnalysis(), 
                                        param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                        cv=fold, random_state=seed, n_jobs=-1)

        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_        

    elif estimator == 'et':

        param_grid = {'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                      'criterion': ['gini', 'entropy'],
                      'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
                      'min_samples_split': [2, 5, 7, 9, 10],
                      'min_samples_leaf' : [1, 2, 4],
                      'max_features' : ['auto', 'sqrt', 'log2'],
                      'bootstrap': [True, False]
                     }    

        model_grid = RandomizedSearchCV(estimator=ExtraTreesClassifier(random_state=seed), 
                                        param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                        cv=fold, random_state=seed, n_jobs=-1)

        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_          
    
    progress.value += 1
    
    if estimator == 'dt' and ensemble == True and method == 'Bagging':
    
    #when using normal BaggingClassifier() DT estimator raise's an exception for max_features parameter. Hence a separate 
    #call has been made for estimator='dt' and method = 'Bagging' where max_features has been removed from param_grid_dt.
    
        param_grid = {'n_estimators': [10,15,20,25,30],
                     'max_samples': [0.3,0.5,0.6,0.7,0.8,0.9],
                     'max_features':[0.3,0.5,0.6,0.7,0.8,0.9],
                     'bootstrap': [True, False],
                     'bootstrap_features': [True, False],
                     }

        param_grid_dt = {"max_depth": np.random.randint(3, (len(X_train.columns)*.85),4),
                      "min_samples_leaf": [2,3,4],
                      "criterion": ["gini", "entropy"]}


        model_grid = RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=seed), param_distributions=param_grid_dt,
                                       scoring=optimize, n_iter=n_iter, cv=fold, random_state=seed,
                                       iid=False, n_jobs=-1)

        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_

        best_model = BaggingClassifier(best_model, random_state=seed)

        model_grid = RandomizedSearchCV(estimator=best_model, 
                                        param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                        cv=fold, random_state=seed, iid=False, n_jobs=-1)

        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_    
  
        progress.value += 1
    
    elif ensemble and method == 'Bagging':
    
        param_grid = {'n_estimators': [10,15,20,25,30],
                     'max_samples': [0.3,0.5,0.6,0.7,0.8,0.9],
                     'max_features':[0.3,0.5,0.6,0.7,0.8,0.9],
                     'bootstrap': [True, False],
                     'bootstrap_features': [True, False],
                     }

        best_model = BaggingClassifier(best_model, random_state=seed)

        model_grid = RandomizedSearchCV(estimator=best_model, 
                                        param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                        cv=fold, random_state=seed, iid=False, n_jobs=-1)

        model_grid.fit(X_train,y_train)
        model = model_grid.best_estimator_
        best_model = model_grid.best_estimator_
        best_model_param = model_grid.best_params_    
     
    elif ensemble and method =='Boosting':
        
        param_grid = {'n_estimators': [25,35,50,60,70,75],
                     'learning_rate': [1,0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2],
                     }        

        best_model = AdaBoostClassifier(best_model, random_state=seed)

        model_grid = RandomizedSearchCV(estimator=best_model, 
                                        param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                        cv=fold, random_state=seed, iid=False, n_jobs=-1)

    progress.value += 1
    
    for train_i , test_i in kf.split(data_X,data_y):
    
        Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i]
        ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i]
    
        if hasattr(best_model, 'predict_proba'):  
        
            model.fit(Xtrain,ytrain)
            pred_prob = model.predict_proba(Xtest)
            pred_prob = pred_prob[:,1]
            pred_ = model.predict(Xtest)
            sca = metrics.accuracy_score(ytest,pred_)
            sc = metrics.roc_auc_score(ytest,pred_prob)
            recall = metrics.recall_score(ytest,pred_)
            precision = metrics.precision_score(ytest,pred_)
            kappa = cohen_kappa_score(ytest,pred_)
            f1 = metrics.f1_score(ytest,pred_)
            score_acc = np.append(score_acc,sca)
            score_auc = np.append(score_auc,sc)
            score_recall = np.append(score_recall,recall)
            score_precision = np.append(score_precision,precision)
            score_f1 =np.append(score_f1,f1)
            score_kappa =np.append(score_kappa,kappa)
        
        else:
        
            model.fit(Xtrain,ytrain)
            pred_prob = 0.00
            pred_prob = 0.00
            pred_ = model.predict(Xtest)
            sca = metrics.accuracy_score(ytest,pred_)
            sc = 0.00
            recall = metrics.recall_score(ytest,pred_)
            precision = metrics.precision_score(ytest,pred_) #change pred_prob to pred_
            kappa = cohen_kappa_score(ytest,pred_)
            f1 = metrics.f1_score(ytest,pred_)
            score_acc = np.append(score_acc,sca)
            score_auc = np.append(score_auc,sc)
            score_recall = np.append(score_recall,recall)
            score_precision = np.append(score_precision,precision)
            score_f1 =np.append(score_f1,f1)
            score_kappa =np.append(score_kappa,kappa) 
    
    progress.value += 1
    
    mean_acc=np.mean(score_acc)
    mean_auc=np.mean(score_auc)
    mean_recall=np.mean(score_recall)
    mean_precision=np.mean(score_precision)
    mean_f1=np.mean(score_f1)
    mean_kappa=np.mean(score_kappa)
    std_acc=np.std(score_acc)
    std_auc=np.std(score_auc)
    std_recall=np.std(score_recall)
    std_precision=np.std(score_precision)
    std_f1=np.std(score_f1)
    std_kappa=np.std(score_kappa)

    avgs_acc = np.append(avgs_acc, mean_acc)
    avgs_acc = np.append(avgs_acc, std_acc) 
    avgs_auc = np.append(avgs_auc, mean_auc)
    avgs_auc = np.append(avgs_auc, std_auc)
    avgs_recall = np.append(avgs_recall, mean_recall)
    avgs_recall = np.append(avgs_recall, std_recall)
    avgs_precision = np.append(avgs_precision, mean_precision)
    avgs_precision = np.append(avgs_precision, std_precision)
    avgs_f1 = np.append(avgs_f1, mean_f1)
    avgs_f1 = np.append(avgs_f1, std_f1)
    avgs_kappa = np.append(avgs_kappa, mean_kappa)
    avgs_kappa = np.append(avgs_kappa, std_kappa)

    progress.value += 1
    
    model_results = pd.DataFrame({'Accuracy': score_acc, 'AUC': score_auc, 'Recall' : score_recall, 'Prec.' : score_precision , 
                     'F1' : score_f1, 'Kappa' : score_kappa})
    model_avgs = pd.DataFrame({'Accuracy': avgs_acc, 'AUC': avgs_auc, 'Recall' : avgs_recall, 'Prec.' : avgs_precision , 
                     'F1' : avgs_f1, 'Kappa' : avgs_kappa},index=['Mean', 'SD'])

    model_results = model_results.append(model_avgs)
    model_results = model_results.round(round)

    progress.value += 1
    
    if verbose:
        clear_output()
        display(model_results)
        return best_model
    else:
        clear_output()
        return best_model

In [119]:
def blend_models(estimator_list = 'All', 
                 fold = 10, 
                 round = 4, 
                 method = 'hard'):
    
    """
    
  Description:
  ------------
  This function creates a Soft Voting / Majority Rule classifier for the list of 
  estimators provided, or for all estimators in the model library, and scores it 
  using Stratified Cross Validation. The output prints the score grid that shows 
  Accuracy, AUC, Recall, Precision, F1 and Kappa by fold (default = 10). 

  The function also returns the VotingClassifier object, which can be used for 
  further processing or to call any method available in sklearn. 
  
    Example:
    --------
    
    blend_models() 
    
    This will result in a VotingClassifier for all models in the library.   
    ** All other parameters are optional.
    
    For specific models, you can use:
    
    lr = create_model( 'lr' )
    rf = create_model( 'rf' )
    
    blend_models( [ lr, rf ] )
    
    This will result in a VotingClassifier of lr and rf.
    
  Parameters
  ----------
  
  estimator_list : string ('All') or list of object, default = 'All'
  When 'All', one default-configured instance of every model in the library is 
  blended. Otherwise the estimators in the list are blended; the same object 
  passed twice is only used once.

  fold: integer, default = 10
  Number of folds used in the Stratified K-fold cross validation.
  
  round: integer, default = 4
  The number of decimal places metrics will be rounded to. 

  method: string, default = 'hard'
  
  If 'hard', uses predicted class labels for majority rule voting. 
  Else if 'soft', predicts the class label based on the argmax of the sums 
  of the predicted probabilities, which is recommended for an ensemble of 
  well-calibrated classifiers. When estimator_list is set as 'All', 
  method is forced to be 'hard'. 
  
  Returns:
  --------
  
  score grid:   A table containing the scores of the model across the kfolds. 
  -----------   Scoring metrics used are Accuracy, AUC, Recall, Precision, F1 
                and Kappa. Mean and standard deviation of the scores across the 
                folds is also returned.
  
  model:        VotingClassifier object. It is re-fitted inside every fold, so 
  -----------   the returned object is the fit from the last fold's training split.

  Warnings:
  ---------
  - AUC is not computed for 'hard' voting and is reported as 0 in the score grid.
  - Reads the globals X_train, y_train and seed created by setup(), and 
    overwrites the global model_names_final with the display names of the 
    blended estimators.
  - Silences all Python warnings via warnings.filterwarnings('ignore').
  
    """
    
    #dependencies (kept function-local, like the other helpers in this notebook)
    import re
    import warnings
    import numpy as np
    import pandas as pd
    import ipywidgets as ipw
    from IPython.display import display, clear_output
    from sklearn import metrics
    from sklearn.model_selection import StratifiedKFold  
    from sklearn.ensemble import VotingClassifier
    
    #progress bar: 3 setup steps + 1 step per fold
    progress = ipw.IntProgress(value=0, min=0, max=fold+3, step=1 , description='Processing: ')
    display(progress)
    
    #ignore warnings
    warnings.filterwarnings('ignore') 
    
    #defining X_train and y_train called from setup() into variable data_X and data_y to be used in cross validation   
    data_X = X_train
    data_y = y_train
    
    progress.value += 1
    
    #random_state is deliberately not passed: without shuffle=True it has no effect on the splits and
    #newer scikit-learn versions raise a ValueError for that combination.
    kf = StratifiedKFold(n_splits=fold)
        
    if isinstance(estimator_list, str) and estimator_list == 'All':

        from sklearn.linear_model import LogisticRegression
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.naive_bayes import GaussianNB
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.linear_model import SGDClassifier
        from sklearn.svm import SVC
        from sklearn.gaussian_process import GaussianProcessClassifier
        from sklearn.neural_network import MLPClassifier
        from sklearn.linear_model import RidgeClassifier
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
        from sklearn.ensemble import AdaBoostClassifier
        from sklearn.ensemble import GradientBoostingClassifier    
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        from sklearn.ensemble import ExtraTreesClassifier

        lr = LogisticRegression(random_state=seed)
        knn = KNeighborsClassifier()
        nb = GaussianNB()
        dt = DecisionTreeClassifier(random_state=seed)
        svm = SGDClassifier(max_iter=1000, tol=0.001, random_state=seed)
        rbfsvm = SVC(gamma='auto', C=1, probability=True, kernel='rbf', random_state=seed)
        gpc = GaussianProcessClassifier(random_state=seed)
        mlp = MLPClassifier(max_iter=500, random_state=seed)
        ridge = RidgeClassifier(random_state=seed)
        rf = RandomForestClassifier(n_estimators=10, random_state=seed)
        qda = QuadraticDiscriminantAnalysis()
        ada = AdaBoostClassifier(random_state=seed)
        gbc = GradientBoostingClassifier(random_state=seed)
        lda = LinearDiscriminantAnalysis()
        et = ExtraTreesClassifier(random_state=seed)  
        
        estimator_list = [lr,knn,nb,dt,svm,rbfsvm,gpc,mlp,ridge,rf,qda,ada,gbc,lda,et]
        
        #several library models (e.g. ridge) have no predict_proba, so soft voting is not possible here
        voting = 'hard'

    else:

        estimator_list = list(estimator_list)
        voting = method  
        
    progress.value += 1
    
    #class names that do not read well once split on capital letters
    display_names = {'Gaussian N B' : 'Naive Bayes',
                     'M L P Classifier' : 'MLP Classifier',
                     'S G D Classifier' : 'SVM - Linear Kernel',
                     'S V C' : 'SVM - Radial Kernel'}
    
    global model_names_final
    
    model_names_final = []
    
    for est in estimator_list:
        
        #'RandomForestClassifier(...)' -> 'RandomForestClassifier' -> 'Random Forest Classifier'
        class_name = str(est).split("(")[0]
        spaced_name = ' '.join(re.findall('[A-Z][a-z]*', class_name))
        model_names_final.append(display_names.get(spaced_name, spaced_name))
    
    #(name, estimator) pairs for VotingClassifier, built once after ALL names are known. 
    #The same estimator object passed twice is kept once; first-seen order is preserved.
    estimator_list_ = []
    seen_pairs = set()
    
    for est_name, est in zip(model_names_final, estimator_list):
        
        pair_key = (est_name, id(est))
        
        if pair_key not in seen_pairs:
            seen_pairs.add(pair_key)
            estimator_list_.append((est_name, est))
    
    if not estimator_list_:
        raise ValueError("estimator_list must contain at least one estimator.")
    
    model = VotingClassifier(estimators=estimator_list_, voting=voting, n_jobs=-1)
    
    progress.value += 1
    
    metric_cols = ['Accuracy', 'AUC', 'Recall', 'Prec.', 'F1', 'Kappa']
    fold_scores = {col : [] for col in metric_cols}
    
    for train_i , test_i in kf.split(data_X,data_y):
    
        Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i]
        ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i]    
    
        model.fit(Xtrain,ytrain)
        pred_ = model.predict(Xtest)
        
        if voting == 'hard':
            #hard voting exposes no probabilities; AUC is reported as 0
            sc = 0.0
        else:
            pred_prob = model.predict_proba(Xtest)[:,1]
            sc = metrics.roc_auc_score(ytest,pred_prob)
        
        fold_scores['Accuracy'].append(metrics.accuracy_score(ytest,pred_))
        fold_scores['AUC'].append(sc)
        fold_scores['Recall'].append(metrics.recall_score(ytest,pred_))
        fold_scores['Prec.'].append(metrics.precision_score(ytest,pred_))
        fold_scores['F1'].append(metrics.f1_score(ytest,pred_))
        fold_scores['Kappa'].append(metrics.cohen_kappa_score(ytest,pred_))
        
        progress.value += 1
    
    #per-fold rows followed by the 'Mean' and 'SD' (population standard deviation) rows
    model_results = pd.DataFrame(fold_scores, columns=metric_cols)
    model_avgs = pd.DataFrame({col : [np.mean(fold_scores[col]), np.std(fold_scores[col])] for col in metric_cols},
                              columns=metric_cols, index=['Mean', 'SD'])

    #pd.concat replaces DataFrame.append, which is no longer available in recent pandas versions
    model_results = pd.concat([model_results, model_avgs])
    model_results = model_results.round(round)
    
    clear_output()
    display(model_results)
    return model

In [10]:
def stack_models(estimator_list, 
                 meta_model = None, 
                 fold = 10,
                 round = 4, 
                 method = 'hard', 
                 restack = False, 
                 plot = False):
    
    """
     
  Description:
  ------------
  This function creates a meta model and scores it using Stratified Cross Validation.
  The cross validated predictions of the base level models passed as estimator_list 
  are used as input features for the meta model. The restack parameter controls 
  whether the raw features are also exposed to the meta model (default = False). 

  The output prints the score grid that shows Accuracy, AUC, Recall, Precision, 
  F1 and Kappa by fold (default = 10). Function returns a container which is the 
  list of all models. 
  
  This is an original implementation of pycaret.
  
    Example:
    --------
    
    nb = create_model('nb')
    rf = create_model('rf')
    ada = create_model('ada')
    ridge = create_model('ridge')
    knn = create_model('knn')
    
    stack_models( [ nb, rf, ada, ridge, knn ] )
    
    This will result in creation of a meta model that uses the predictions of 
    all the models provided as its input features. By default the meta model 
    is Logistic Regression but can be changed with the meta_model param.
    
  Parameters
  ----------
  
  estimator_list : list of object
  
  meta_model : object, default = None
  if set to None, Logistic Regression is used as a meta model.

  fold: integer, default = 10
  Number of folds used both for the cross validated base predictions and for 
  the Stratified K-fold scoring of the meta model.
  
  round: integer, default = 4
  The number of decimal places metrics will be rounded to. 

  method: string, default = 'hard'
  'hard', uses predicted class labels as input to meta model. 
  'soft', uses predicted probabilities as input to meta model.
  
  restack: Boolean, default = False
  When restack is set to True, it will expose raw data to meta model.
  
  plot: Boolean, default = False
  When plot is set to True, the correlation heatmap of the predictions from all 
  base models in estimator_list is drawn INSTEAD of the score grid, and nothing 
  is returned.
  
  Returns:
  --------
  
  score grid:   A table containing the scores of the model across the kfolds. 
  -----------   Scoring metrics used are Accuracy, AUC, Recall, Precision, F1 
                and Kappa. Mean and standard deviation of the scores across the 
                folds is also returned. Only displayed when plot = False.
  
  models:       list holding the estimators of estimator_list followed by the 
  -----------   meta model (only returned when plot = False). The meta model is 
                re-fitted inside every fold, so it holds the fit from the last 
                fold's training split.
  
  Warnings:
  ---------
  - When an estimator doesn't support 'predict_proba' (for example: ridge) and 
    method is forced to 'soft', stack_models() will return an error. 
  - When the meta model doesn't support 'predict_proba', AUC is reported as 0.
  - Reads the globals X_train and y_train created by setup().
   
  
  """
    
    #dependencies
    import numpy as np
    import pandas as pd
    from IPython.display import display
    from sklearn import metrics
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import cross_val_predict
    
    #Capturing the method of stacking required by user. method='soft' means 'predict_proba' else 'predict'
    
    if method == 'soft':
        predict_method = 'predict_proba'
    elif method == 'hard':
        predict_method = 'predict'
    else:
        raise ValueError("method must be either 'hard' or 'soft'.")
    
    #Defining meta model. Logistic Regression is the default
    
    if meta_model is None:
        from sklearn.linear_model import LogisticRegression
        meta_model = LogisticRegression()
    
    #defining model_library model names, e.g. 'RandomForestClassifier(...)' -> 'RandomForestClassifier'
    
    model_names = [str(item).split("(")[0] for item in estimator_list]
    
    #first column holds the target, the following columns hold one cross validated prediction per base model.
    #index is reset so that the positional output of cross_val_predict lines up with the target.
    base_prediction = pd.DataFrame(y_train).reset_index(drop=True)
    target_col_name = base_prediction.columns[0]
    
    for base_model in estimator_list:
        base_array = cross_val_predict(base_model,X_train,y_train,cv=fold, method=predict_method)
        if method == 'soft':
            #keep only the probability of the positive class
            base_array = base_array[:,1]
        base_prediction = pd.concat([base_prediction,pd.DataFrame(base_array)],axis=1)
        
    #defining column names now
    base_prediction.columns = [target_col_name] + model_names
    
    #defining data_X and data_y dataframe to be used in next stage.
    
    data_y = base_prediction.iloc[:,0]
    meta_features = base_prediction.iloc[:,1:]
    
    if restack:
        #raw features first, base model predictions appended as extra columns
        data_X = pd.concat([X_train.reset_index(drop=True),meta_features],axis=1)
    else:
        data_X = meta_features
    
    #Meta Modeling Starts Here
    
    model = meta_model #this defines model to be used below as model = meta_model (as captured above)

    #random_state is deliberately not passed: without shuffle=True it has no effect on the splits and
    #newer scikit-learn versions raise a ValueError for that combination.
    kf = StratifiedKFold(n_splits=fold) #capturing fold requested by user
    
    #meta models without predict_proba are scored with AUC = 0 instead of failing
    supports_proba = hasattr(model, 'predict_proba')

    metric_cols = ['Accuracy', 'AUC', 'Recall', 'Prec.', 'F1', 'Kappa']
    fold_scores = {col : [] for col in metric_cols}
    
    for train_i , test_i in kf.split(data_X,data_y):
        Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i]
        ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i]

        model.fit(Xtrain,ytrain)
        pred_ = model.predict(Xtest)
        
        if supports_proba:
            pred_prob = model.predict_proba(Xtest)[:,1]
            sc = metrics.roc_auc_score(ytest,pred_prob)
        else:
            sc = 0.0
        
        fold_scores['Accuracy'].append(metrics.accuracy_score(ytest,pred_))
        fold_scores['AUC'].append(sc)
        fold_scores['Recall'].append(metrics.recall_score(ytest,pred_))
        #precision of the predicted labels (not average precision of the probabilities), so that the
        #'Prec.' column means the same thing as in the other score grids of this notebook
        fold_scores['Prec.'].append(metrics.precision_score(ytest,pred_))
        fold_scores['F1'].append(metrics.f1_score(ytest,pred_))
        fold_scores['Kappa'].append(metrics.cohen_kappa_score(ytest,pred_))
     
    #per-fold rows followed by the 'Mean' and 'SD' (population standard deviation) rows
    model_results = pd.DataFrame(fold_scores, columns=metric_cols)
    model_avgs = pd.DataFrame({col : [np.mean(fold_scores[col]), np.std(fold_scores[col])] for col in metric_cols},
                              columns=metric_cols, index=['Mean', 'SD'])
  
    #pd.concat replaces DataFrame.append, which is no longer available in recent pandas versions
    model_results = pd.concat([model_results, model_avgs])
    model_results = model_results.round(round)  
    
    #container of all base models followed by the meta model
    models = list(estimator_list)
    models.append(meta_model)
    
    if plot:
        import seaborn as sns
        
        #Correlation matrix of the base model predictions
        base_prediction_cor = meta_features.corr()
        sns.heatmap(base_prediction_cor, vmin=-0.5, vmax=1, center=0,cmap='magma', square=True, annot=True, 
                    linewidths=1)
    
    else:
        display(model_results)
        return models

In [11]:
def create_stacknet(estimator_list,
                    meta_model = None,
                    fold = 10,
                    round = 4,
                    method = 'hard',
                    restack = False):
    """
     
  Description:
  ------------
  This function creates a sequential stack net using cross validated predictions at
  each layer. Base level models are passed in the estimator_list parameter, with each
  layer organized as a sub list. Every model in a layer is trained on the out-of-fold
  predictions of the previous layer (the first layer is trained on X_train). The score
  grid reports the meta model, evaluated with Stratified Cross Validation, when it is
  trained on the predictions produced by the stack.
  
    Example:
    --------
    
    nb = create_model( 'nb' )
    rf = create_model( 'rf' )
    ada = create_model( 'ada' )
    ridge = create_model( 'ridge' )
    knn = create_model( 'knn' )
    
    create_stacknet( [ [ nb, rf ], [ ada, ridge, knn] ] )
    
    This will result in stacking of models in multiple layers. The first layer 
    contains nb and rf, the predictions of which is used by models in second layer
    to produce predictions which is used by meta model to generate final predictions.
    By default meta model is Logistic Regression but can be changed with meta_model.
    
  Parameters
  ----------
  
  estimator_list : nested list of object
  One sub list of estimators per layer, in the order the layers are applied.
  
  meta_model : object, default = None
  if set to None, Logistic Regression is used as a meta model.

  fold: integer, default = 10
  Number of folds will determine how many folds would be done in the Kfold CV.
  
  round: integer, default = 4
  The number indicates the number of decimal places metrics will be rounded to. 

  method: string, default = 'hard'
  'hard', uses predicted class labels as input to the next layer / meta model. 
  'soft', uses predicted probabilities (second column of predict_proba) instead.
  
  restack: Boolean, default = False
  When False, each layer only passes its own predictions forward. When True, the 
  predictions of earlier layers are carried forward next to the new ones, so the 
  meta model sees the output of every layer.
  NOTE(review): raw features are not added to the meta model input here; confirm 
  whether restack is also expected to expose X_train itself.
  
  Returns:
  --------
  
  None. The score grid is rendered with display().
  
  score grid:   A table containing the scores of the meta model across the kfolds. 
  -----------   Scoring metrics used are Accuracy, AUC, Recall, Precision, F1 
                and Kappa. Mean and standard deviation of the scores across the 
                folds are appended as the 'Mean' and 'SD' rows.
  
  Raises:
  -------
  ValueError when method is neither 'hard' nor 'soft'.
  
  Warnings:
  ---------
  When estimator doesn't support 'predict_proba' (for example: ridge) and method is 
  forced to 'soft', create_stacknet() will return an error. 
  
  When the meta model doesn't support 'predict_proba', AUC is reported as 0.
  
  This function reads X_train and y_train from the notebook globals created by setup().
  
    """

    #Capturing the method of stacking required by user. method='soft' means 'predict_proba' else 'predict'
    #validated first so that a typo fails immediately instead of after the base models were trained
    predict_methods = {'soft': 'predict_proba', 'hard': 'predict'}
    if method not in predict_methods:
        raise ValueError("method must be either 'hard' or 'soft', got %r" % (method,))
    predict_method = predict_methods[method]

    #dependencies
    import numpy as np
    import pandas as pd
    from IPython.display import display
    from sklearn import metrics
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import cross_val_predict
    
    base_level = estimator_list[0]
    inter_level = estimator_list[1:]
    
    #defining meta model
    if meta_model is None:
        from sklearn.linear_model import LogisticRegression
        meta_model = LogisticRegression()
    
    #flat target array, aligned by position with the prediction matrices built below
    data_y = np.ravel(y_train)
    
    def _layer_predictions(models, features):
        #one column of out-of-fold predictions per model; every model sees the same features
        columns = []
        for layer_model in models:
            predictions = cross_val_predict(layer_model, features, data_y, cv=fold, method=predict_method)
            if method == 'soft':
                predictions = predictions[:,1]
            columns.append(np.asarray(predictions))
        return np.column_stack(columns)
    
    #base layer is trained on the raw training features
    stack_features = _layer_predictions(base_level, X_train)
    
    #each following layer is trained on what the previous layer(s) produced
    for level in inter_level:
        level_features = _layer_predictions(level, stack_features)
        if restack:
            stack_features = np.hstack([level_features, stack_features])
        else:
            stack_features = level_features
    
    #the meta model is evaluated on the stacked predictions, not on the raw features
    data_X = stack_features
    
    #random_state is left out on purpose: without shuffling it has no effect on the folds 
    #and recent scikit-learn versions reject the combination
    kf = StratifiedKFold(fold) #capturing fold requested by user

    metric_columns = ['Accuracy', 'AUC', 'Recall', 'Prec.', 'F1', 'Kappa']
    fold_scores = []
    
    for train_i , test_i in kf.split(data_X,data_y):
        
        Xtrain,Xtest = data_X[train_i], data_X[test_i]
        ytrain,ytest = data_y[train_i], data_y[test_i]

        meta_model.fit(Xtrain,ytrain)
        pred_ = meta_model.predict(Xtest)
        
        if hasattr(meta_model, 'predict_proba'):
            pred_prob = meta_model.predict_proba(Xtest)[:,1]
            sc = metrics.roc_auc_score(ytest,pred_prob)
        else:
            #same convention as automl(): AUC is 0 when no probabilities are available
            sc = 0.00
        
        fold_scores.append({'Accuracy': metrics.accuracy_score(ytest,pred_),
                            'AUC': sc,
                            'Recall': metrics.recall_score(ytest,pred_),
                            'Prec.': metrics.precision_score(ytest,pred_),
                            'F1': metrics.f1_score(ytest,pred_),
                            'Kappa': metrics.cohen_kappa_score(ytest,pred_)})
    
    model_results = pd.DataFrame(fold_scores, columns=metric_columns)
    
    #ddof=0 keeps the population standard deviation that np.std reports
    model_avgs = pd.DataFrame([model_results.mean(), model_results.std(ddof=0)],
                              index=['Mean', 'SD'])
  
    model_results = pd.concat([model_results, model_avgs])
    model_results = model_results.round(round)      
    
    display(model_results)

In [12]:
def interpret_model(estimator,
                   type = 'summary',
                   feature = None, 
                   observation = 'All'):
    
    """
      
  Description:
  ------------
  This function takes a trained model object and returns the interpretation plot on
  test set. This function only supports tree based algorithm. 
  
  This function is implemented based on original implementation in package 'shap'.
  SHAP (SHapley Additive exPlanations) is a unified approach to explain the output 
  of any machine learning model. SHAP connects game theory with local explanations.
  
  For more information : https://shap.readthedocs.io/en/latest/

    Example:
    --------
    
    dt = create_model('dt')
    interpret_model(dt)
    
    This will return the summary interpretation plot of Decision Tree model.
  
  Parameters
  ----------
  
  estimator : object
  
  A trained tree based model object should be passed as an estimator. 
  Supported: RandomForestClassifier, DecisionTreeClassifier, ExtraTreesClassifier
  and GradientBoostingClassifier (matched on the class name in str(estimator)).
  
  type : string, default = 'summary'
  other available options are 'dependence' and 'prediction'.
  
  feature: string, default = None
  This parameter is only needed when type = 'dependence'. By default feature is set
  to None which means the first column of dataset will be used as a variable. 
  To change feature param must be passed. 
  
  observation: integer or string (when set to 'All'), default = 'All'
  This parameter is only needed when type = 'prediction'. By default the plot 
  will  return the analysis for all observations with option to select the feature 
  on x and y axis through drop down interactivity. For analysis of individual
  observation, observation parameter must be passed with the positional index 
  of the observation in test set. 

  Returns:
  --------
  
  Visual Plot:  Draws the plot when type is 'summary' or 'dependence' (returns None).
                Returns the interactive JS plot when type = 'prediction'.
              
  Warnings:
  ---------
  Exits through sys.exit() when the estimator is not a supported tree model or when
  type is not one of the supported options.
  
  This function reads X_test from the notebook globals created by setup().
    
    """
    import sys
    
    model = estimator
    model_name = str(model).split("(")[0]
    
    #defining type of classifier
    #type1 results are indexed at [1] below (presumably the positive class - confirm 
    #against the installed shap version); type2 results are used as returned
    type1 = ['RandomForestClassifier','DecisionTreeClassifier','ExtraTreesClassifier']
    type2 = ['GradientBoostingClassifier']
    
    #allowed models
    allowed_models = type1 + type2
    
    #checking if model passed is acceptable or not
    if model_name not in allowed_models:
        sys.exit('Not Allowed')
    
    #an unknown plot type used to fall through silently without drawing anything
    if type not in ('summary', 'dependence', 'prediction'):
        sys.exit("type must be 'summary', 'dependence' or 'prediction'")
    
    #imported after validation so that bad input fails before the heavy dependency is loaded
    import shap
    
    explainer = shap.TreeExplainer(model)
    
    if type == 'summary':
        
        shap_values = explainer.shap_values(X_test)
        shap.summary_plot(shap_values, X_test)
                              
    elif type == 'dependence':
        
        dependence = X_test.columns[0] if feature is None else feature
        
        shap_values = explainer.shap_values(X_test)
        if model_name in type1:
            shap_values = shap_values[1]
        shap.dependence_plot(dependence, shap_values, X_test)
        
    elif type == 'prediction':
        
        if model_name in type1:
            
            if observation == 'All':
                data_for_prediction = X_test
            else:
                data_for_prediction = X_test.iloc[observation]
            
            shap_values = explainer.shap_values(data_for_prediction)
            shap.initjs()
            return shap.force_plot(explainer.expected_value[1], shap_values[1], data_for_prediction)
        
        shap_values = explainer.shap_values(X_test)
        shap.initjs()
        
        if observation == 'All':
            return shap.force_plot(explainer.expected_value, shap_values, X_test)
        
        #use the requested row; the first row used to be plotted regardless of 'observation'
        return shap.force_plot(explainer.expected_value, shap_values[observation,:], X_test.iloc[observation,:])

In [None]:
def automl(qualifier = 5,
           target_metric = 'Accuracy',
           fold = 10, 
           round = 4):
    
    """
      
  Description:
  ------------
  This function is an original implementation of pycaret. It sequentially creates
  various model and apply different techniques for Ensembling and Stacking. It returns
  the best model based on 'target_metric' parameter defined. To limit the processing
  time, 'qualifier' param can be reduced (by default = 5).  
  
    Example:
    --------
    
    automl = automl()
    
    ** All parameters are optional
    
  Parameters
  ----------
  
  qualifier : integer, default = None
  Number of top models considered for further processing to return the best model.
  Higher number will result in longer process times.
  
  target_metric : String, default = 'Accuracy'
  Metric to use for qualifying models and tuning the hyperparameters.

  fold: integer, default = 10
  Number of folds will determine how many folds would be done in the Kfold CV.
  
  round: integer, default = 4
  The number indicates the number of decimal places metrics will be rounded to. 

  Attributes
  ----------
  All original attributes available in sklearn for a given estimator.
  
  Returns:
  --------
  
  score grid:   A table containing the averaged Kfold scores of all the models
  -----------   Scoring metrics used are Accuracy, AUC, Recall, Precision, F1 
                and Kappa. 
  
  model:        trained model object (best model selected using target metric param)
  -----------
  
  Warnings:
  ---------
  None
    
    """
    
    #base dependencies
    from IPython.display import clear_output
    import numpy as np
    import pandas as pd
    import random
    import sys
    
    #master collector
    #This is being used for appending throughout the process 1/N
    global master, master_results
    master = []
    master_results = pd.DataFrame(columns=['Model', 'Accuracy','AUC','Recall', 'Prec.', 'F1', 'Kappa'])
    #master_display = master_results
    
    #progress bar
    import ipywidgets as ipw
    progress = ipw.IntProgress(value=0, min=0, max=12, step=1 , description='Processing: ')
    display(progress)
    display(master_results)
    
    #automl parameters to be used in this function
    top_n = qualifier #top_n candidates for processing
    
    if target_metric == 'Accuracy':
        optimize = target_metric.lower()
        sort = 'Accuracy'
        
    elif target_metric == 'AUC':
        optimize = 'roc_auc'
        sort = 'AUC'     
        
    elif target_metric == 'Recall':
        optimize = target_metric.lower()
        sort = 'Recall'        

    elif target_metric == 'Precision':
        optimize = target_metric.lower()
        sort = 'Prec.'
   
    elif target_metric == 'F1':
        optimize = target_metric.lower()
        sort = 'F1'
        
    elif target_metric == 'Kappa':
        optimize = 'roc_auc'
        sort = 'Kappa'
        
    n_iter = 10 #number of iteration for tuning
    
    #ignore warnings
    import warnings
    warnings.filterwarnings('ignore') 

    #defining X_train and y_train
    data_X = X_train
    data_y=y_train
    
    #sklearn dependencies
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.svm import SVC
    from sklearn.gaussian_process.kernels import RBF
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.linear_model import RidgeClassifier
    from sklearn.linear_model import Lasso
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import SGDClassifier
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import ExtraTreesClassifier    
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import RandomForestClassifier
    
    #sklearn ensembling dependencies
    from sklearn.ensemble import BaggingClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import VotingClassifier
    
    #other imports from sklearn
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.model_selection import cross_val_predict
    from sklearn import metrics
    
    #create sklearn model objects
    lr = LogisticRegression(random_state=seed)
    knn = KNeighborsClassifier()
    nb = GaussianNB()
    dt = DecisionTreeClassifier(random_state=seed)
    svm = SGDClassifier(max_iter=1000, tol=0.001, random_state=seed)
    rbfsvm = SVC(gamma='auto', C=1, probability=True, kernel='rbf', random_state=seed)
    gpc = GaussianProcessClassifier(random_state=seed)
    mlp = MLPClassifier(max_iter=500, random_state=seed)
    ridge = RidgeClassifier(random_state=seed)
    rf = RandomForestClassifier(n_estimators=10, random_state=seed)
    qda = QuadraticDiscriminantAnalysis()
    ada = AdaBoostClassifier(random_state=seed)
    gbc = GradientBoostingClassifier(random_state=seed)
    lda = LinearDiscriminantAnalysis()
    et = ExtraTreesClassifier(random_state=seed)
    
    #defining model library 
    model_library = [lr, knn, nb, dt, svm, rbfsvm, gpc, mlp, ridge, rf, qda, ada, gbc, lda, et]

    #defining model names
    model_names = []

    for names in model_library:
        model_names = np.append(model_names, str(names).split("(")[0])
    
    
    progress.value += 1
    
    '''
    Step 1 - Run all the models in model library.
    This function is equivalent to compare_models() without any blacklist model

    '''
    #cross validation
    kf = StratifiedKFold(fold, random_state=seed)

    score_acc =np.empty((0,0))
    score_auc =np.empty((0,0))
    score_recall =np.empty((0,0))
    score_precision =np.empty((0,0))
    score_f1 =np.empty((0,0))
    score_kappa =np.empty((0,0))
    score_acc_running = np.empty((0,0)) ##running total
    avg_acc = np.empty((0,0))
    avg_auc = np.empty((0,0))
    avg_recall = np.empty((0,0))
    avg_precision = np.empty((0,0))
    avg_f1 = np.empty((0,0))
    avg_kappa = np.empty((0,0))
      
    for model in model_library:
 
        for train_i , test_i in kf.split(data_X,data_y):
     
            Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i]
            ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i]
        
            if hasattr(model, 'predict_proba'):               
        
                model.fit(Xtrain,ytrain)
                pred_prob = model.predict_proba(Xtest)
                pred_prob = pred_prob[:,1]
                pred_ = model.predict(Xtest)
                sca = metrics.accuracy_score(ytest,pred_)
                sc = metrics.roc_auc_score(ytest,pred_prob)
                recall = metrics.recall_score(ytest,pred_)
                precision = metrics.precision_score(ytest,pred_)
                kappa = metrics.cohen_kappa_score(ytest,pred_)
                f1 = metrics.f1_score(ytest,pred_)
                score_acc = np.append(score_acc,sca)
                score_auc = np.append(score_auc,sc)
                score_recall = np.append(score_recall,recall)
                score_precision = np.append(score_precision,precision)
                score_f1 =np.append(score_f1,f1)
                score_kappa =np.append(score_kappa,kappa)              
        
            else:        

                model.fit(Xtrain,ytrain)
                pred_prob = 0.00
                pred_prob = 0.00
                pred_ = model.predict(Xtest)
                sca = metrics.accuracy_score(ytest,pred_)
                sc = 0.00
                recall = metrics.recall_score(ytest,pred_)
                precision = metrics.precision_score(ytest,pred_) #change pred_prob to pred_
                kappa = metrics.cohen_kappa_score(ytest,pred_)
                f1 = metrics.f1_score(ytest,pred_)
                score_acc = np.append(score_acc,sca)
                score_auc = np.append(score_auc,sc)
                score_recall = np.append(score_recall,recall)
                score_precision = np.append(score_precision,precision)
                score_f1 =np.append(score_f1,f1)
                score_kappa =np.append(score_kappa,kappa) 
        
        avg_acc = np.append(avg_acc,np.mean(score_acc))
        avg_auc = np.append(avg_auc,np.mean(score_auc))
        avg_recall = np.append(avg_recall,np.mean(score_recall))
        avg_precision = np.append(avg_precision,np.mean(score_precision))
        avg_f1 = np.append(avg_f1,np.mean(score_f1))
        avg_kappa = np.append(avg_kappa,np.mean(score_kappa))
        score_acc =np.empty((0,0))
        score_auc =np.empty((0,0))
        score_recall =np.empty((0,0))
        score_precision =np.empty((0,0))
        score_f1 =np.empty((0,0))
        score_kappa =np.empty((0,0))

    compare_models_ = pd.DataFrame({'Model':model_names, 'Accuracy':avg_acc, 'AUC':avg_auc, 
                     'Recall':avg_recall, 'Prec.':avg_precision, 
                     'F1':avg_f1, 'Kappa': avg_kappa})
    
    compare_models_ = compare_models_.sort_values(by=sort, ascending=False).reset_index(drop=True)
    compare_models_ = compare_models_.round(round)
    
    top_n_model_names = list(compare_models_.iloc[0:top_n]['Model'])  #DO NOT DELETE - IT IS USED BELOW
    top_n_model_results = compare_models_[:top_n] #DO NOT DELETE - IT IS USED - IT IS USED TO APPEND TO MASTER_RESULTS
    master_results = master_results.append(top_n_model_results)
    
    progress.value += 1
    clear_output()
    master_display = master_results.sort_values(by=sort,ascending=False)
    master_display.reset_index(drop=True, inplace=True)
    display(progress)
    display(master_display)
    
    '''

    The section below is still part of Step 1. The purpose of this chunk is to 
    take the name string from 'top_n_model_names' and create a model that is being
    appended to master list. Models are re-created (In future, re-creation must be
    replaced by already created object for efficiency purpose).
    
    '''
    top_n_models = []
    
    for i in top_n_model_names:
        
        if i == 'LinearDiscriminantAnalysis':
            
            model = LinearDiscriminantAnalysis()
            top_n_models.append(model)
            
        elif i == 'LogisticRegression':
            
            model = LogisticRegression(random_state=seed)
            top_n_models.append(model)
            
        elif i == 'GradientBoostingClassifier':
            
            model = GradientBoostingClassifier(random_state=seed)
            top_n_models.append(model)
            
        elif i == 'AdaBoostClassifier':
            
            model =  AdaBoostClassifier(random_state=seed)           
            top_n_models.append(model)
            
        elif i == 'MLPClassifier':
            
            model = MLPClassifier(max_iter=500, random_state=seed)
            top_n_models.append(model)
            
        elif i == 'RandomForestClassifier':
            
            model = RandomForestClassifier(n_estimators=10, random_state=seed)
            top_n_models.append(model)
            
        elif i == 'GaussianNB':
            
            model = GaussianNB()
            top_n_models.append(model)
            
        elif i == 'DecisionTreeClassifier':
            
            model = DecisionTreeClassifier(random_state=seed)
            top_n_models.append(model)
            
        elif i == 'ExtraTreesClassifier':
            
            model = ExtraTreesClassifier(random_state=seed)
            top_n_models.append(model)
            
        elif i == 'SVC':
            
            model = SVC(gamma='auto', C=1, probability=True, kernel='rbf', random_state=seed)
            top_n_models.append(model)
            
        elif i == 'KNeighborsClassifier':
            
            model = KNeighborsClassifier()
            top_n_models.append(model)
            
        elif i == 'GaussianProcessClassifier':
            
            model = GaussianProcessClassifier(random_state=seed)
            top_n_models.append(model)
            
        elif i == 'QuadraticDiscriminantAnalysis':
            
            model = QuadraticDiscriminantAnalysis()
            top_n_models.append(model)
            
        elif i == 'SGDClassifier':
            
            model = SGDClassifier(max_iter=1000, tol=0.001, random_state=seed)
            top_n_models.append(model)
            
        elif i == 'RidgeClassifier':
            
            model = RidgeClassifier(random_state=seed)
            top_n_models.append(model)
    
    master.append(top_n_models) #appending top_n models to master list
    
    progress.value += 1
    clear_output()
    master_display = master_results.sort_values(by=sort,ascending=False)
    master_display.reset_index(drop=True, inplace=True)
    display(progress)
    display(master_display)
    
    '''
    
    Step 2 - Create Ensemble Bagging using BaggingClassifier() from sklearn for all the 
    models in 'top_n_models' param defined above. Number of models at this stage in 
    'top_n_models' param is equal to # of models in 'master' param.
    
    This function is equivalent to ensemble_model().
    
    '''    

    top_n_bagged_models = []
    top_n_bagged_model_results = pd.DataFrame(columns=['Model', 'Accuracy','AUC','Recall', 'Prec.', 'F1', 'Kappa'])
    
    #defining names
    bagging_model_names = []
    for i in top_n_model_names:
        s = 'Bagging ' + i
        bagging_model_names.append(s)
    
    #counter for naming
    name_counter = 0 
    
    for i in top_n_models:
       
        #from sklearn.ensemble import BaggingClassifier
        model = BaggingClassifier(i,bootstrap=True,n_estimators=10, random_state=seed)
        top_n_bagged_models.append(model)
    
        #setting cross validation
        kf = StratifiedKFold(fold, random_state=seed)

        score_auc =np.empty((0,0))
        score_acc =np.empty((0,0))
        score_recall =np.empty((0,0))
        score_precision =np.empty((0,0))
        score_f1 =np.empty((0,0))
        score_kappa =np.empty((0,0))
        avgs_auc =np.empty((0,0))
        avgs_acc =np.empty((0,0))
        avgs_recall =np.empty((0,0))
        avgs_precision =np.empty((0,0))
        avgs_f1 =np.empty((0,0))
        avgs_kappa =np.empty((0,0))
        
        for train_i , test_i in kf.split(data_X,data_y):
    
            Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i]
            ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i]
        
            if hasattr(model, 'predict_proba'):
        
                model.fit(Xtrain,ytrain)
                pred_prob = model.predict_proba(Xtest)
                pred_prob = pred_prob[:,1]
                pred_ = model.predict(Xtest)
                sca = metrics.accuracy_score(ytest,pred_)
                sc = metrics.roc_auc_score(ytest,pred_prob)
                recall = metrics.recall_score(ytest,pred_)
                precision = metrics.precision_score(ytest,pred_)
                kappa = metrics.cohen_kappa_score(ytest,pred_)
                f1 = metrics.f1_score(ytest,pred_)
                score_acc = np.append(score_acc,sca)
                score_auc = np.append(score_auc,sc)
                score_recall = np.append(score_recall,recall)
                score_precision = np.append(score_precision,precision)
                score_f1 =np.append(score_f1,f1)
                score_kappa =np.append(score_kappa,kappa)

            else:
            
                model.fit(Xtrain,ytrain)
                pred_prob = 0.00
                pred_prob = 0.00
                pred_ = model.predict(Xtest)
                sca = metrics.accuracy_score(ytest,pred_)
                sc = 0.00
                recall = metrics.recall_score(ytest,pred_)
                precision = metrics.precision_score(ytest,pred_) #change pred_prob to pred_
                kappa = metrics.cohen_kappa_score(ytest,pred_)
                f1 = metrics.f1_score(ytest,pred_)
                score_acc = np.append(score_acc,sca)
                score_auc = np.append(score_auc,sc)
                score_recall = np.append(score_recall,recall)
                score_precision = np.append(score_precision,precision)
                score_f1 =np.append(score_f1,f1)
                score_kappa =np.append(score_kappa,kappa) 
       
        mean_acc=np.mean(score_acc)
        mean_auc=np.mean(score_auc)
        mean_recall=np.mean(score_recall)
        mean_precision=np.mean(score_precision)
        mean_f1=np.mean(score_f1)
        mean_kappa=np.mean(score_kappa)

        avgs_acc = np.append(avgs_acc, mean_acc)
        avgs_auc = np.append(avgs_auc, mean_auc)
        avgs_recall = np.append(avgs_recall, mean_recall)
        avgs_precision = np.append(avgs_precision, mean_precision)
        avgs_f1 = np.append(avgs_f1, mean_f1)
        avgs_kappa = np.append(avgs_kappa, mean_kappa)
        
        #model_name = 'Bagging' + str(i).split("(")[0]
        model_results = pd.DataFrame({'Model': bagging_model_names[name_counter], 'Accuracy': avgs_acc, 'AUC': avgs_auc, 
                                      'Recall' : avgs_recall, 'Prec.' : avgs_precision , 'F1' : avgs_f1, 
                                      'Kappa' : avgs_kappa}).reset_index(drop=True)
        model_results = model_results.round(round)
        name_counter += 1
        top_n_bagged_model_results = pd.concat([top_n_bagged_model_results, model_results],ignore_index=True)
        
    master_results = master_results.append(top_n_bagged_model_results)
    master.append(top_n_bagged_models) 
    
    progress.value += 1
    clear_output()
    master_display = master_results.sort_values(by=sort,ascending=False)
    master_display.reset_index(drop=True, inplace=True)
    display(progress)
    display(master_display)
    
    '''
    
    Step 3 - Create Ensemble Boosting using AdaBoostClassifier() from sklearn for all the 
    models in 'top_n_models' param defined above. 
    
    This function is equivalent to ensemble_model(method = 'Boosting').
    
    '''        
    
    top_n_boosted_models = []
    top_n_boosted_model_results = pd.DataFrame(columns=['Model','Accuracy','AUC','Recall', 'Prec.', 'F1', 'Kappa'])
    
    boosting_model_names = []
    for i in top_n_model_names:
        s = 'Boosting ' + i
        boosting_model_names.append(s)
     
    #counter for naming
    name_counter = 0 
        
    for i in top_n_models:
       
        if hasattr(i,'predict_proba') and hasattr(i,'class_weight'):
            model = AdaBoostClassifier(i, random_state=seed)
            top_n_boosted_models.append(model)
            
        else:
            model = i
            top_n_boosted_models.append(model)
    
        #setting cross validation
        kf = StratifiedKFold(fold, random_state=seed)

        score_auc =np.empty((0,0))
        score_acc =np.empty((0,0))
        score_recall =np.empty((0,0))
        score_precision =np.empty((0,0))
        score_f1 =np.empty((0,0))
        score_kappa =np.empty((0,0))
        avgs_auc =np.empty((0,0))
        avgs_acc =np.empty((0,0))
        avgs_recall =np.empty((0,0))
        avgs_precision =np.empty((0,0))
        avgs_f1 =np.empty((0,0))
        avgs_kappa =np.empty((0,0))
        
        for train_i , test_i in kf.split(data_X,data_y):
    
            Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i]
            ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i]
        
            if hasattr(model, 'predict_proba'):
        
                model.fit(Xtrain,ytrain)
                pred_prob = model.predict_proba(Xtest)
                pred_prob = pred_prob[:,1]
                pred_ = model.predict(Xtest)
                sca = metrics.accuracy_score(ytest,pred_)
                sc = metrics.roc_auc_score(ytest,pred_prob)
                recall = metrics.recall_score(ytest,pred_)
                precision = metrics.precision_score(ytest,pred_)
                kappa = metrics.cohen_kappa_score(ytest,pred_)
                f1 = metrics.f1_score(ytest,pred_)
                score_acc = np.append(score_acc,sca)
                score_auc = np.append(score_auc,sc)
                score_recall = np.append(score_recall,recall)
                score_precision = np.append(score_precision,precision)
                score_f1 =np.append(score_f1,f1)
                score_kappa =np.append(score_kappa,kappa)

            else:
            
                model.fit(Xtrain,ytrain)
                pred_prob = 0.00
                pred_prob = 0.00
                pred_ = model.predict(Xtest)
                sca = metrics.accuracy_score(ytest,pred_)
                sc = 0.00
                recall = metrics.recall_score(ytest,pred_)
                precision = metrics.precision_score(ytest,pred_) #change pred_prob to pred_
                kappa = metrics.cohen_kappa_score(ytest,pred_)
                f1 = metrics.f1_score(ytest,pred_)
                score_acc = np.append(score_acc,sca)
                score_auc = np.append(score_auc,sc)
                score_recall = np.append(score_recall,recall)
                score_precision = np.append(score_precision,precision)
                score_f1 =np.append(score_f1,f1)
                score_kappa =np.append(score_kappa,kappa) 
       
        mean_acc=np.mean(score_acc)
        mean_auc=np.mean(score_auc)
        mean_recall=np.mean(score_recall)
        mean_precision=np.mean(score_precision)
        mean_f1=np.mean(score_f1)
        mean_kappa=np.mean(score_kappa)

        avgs_acc = np.append(avgs_acc, mean_acc)
        avgs_auc = np.append(avgs_auc, mean_auc)
        avgs_recall = np.append(avgs_recall, mean_recall)
        avgs_precision = np.append(avgs_precision, mean_precision)
        avgs_f1 = np.append(avgs_f1, mean_f1)
        avgs_kappa = np.append(avgs_kappa, mean_kappa)
        
        #model_name = 'Boosting' + str(i).split("(")[0]
        model_results = pd.DataFrame({'Model': boosting_model_names[name_counter],'Accuracy': avgs_acc, 
                                      'AUC': avgs_auc, 'Recall' : avgs_recall, 'Prec.' : avgs_precision, 
                                      'F1' : avgs_f1, 'Kappa' : avgs_kappa}).reset_index(drop=True)
        model_results = model_results.round(round)
        name_counter += 1
        top_n_boosted_model_results = pd.concat([top_n_boosted_model_results, model_results],ignore_index=True)
        
    master_results = master_results.append(top_n_boosted_model_results)
    master.append(top_n_boosted_models)
    
    progress.value += 1
    clear_output()
    master_display = master_results.sort_values(by=sort,ascending=False)
    master_display.reset_index(drop=True, inplace=True)
    display(progress)
    display(master_display) 
    
    '''

    Step 4 - Tune all models in 'top_n_models' param defined in Step 1 above.
    This function is equivalent to tune_model().


    '''           
    
    #4.1 Store tuned model objects in the list 'top_n_tuned_models'
    
    top_n_tuned_models = []
    
    for i in top_n_model_names:
        
        if i == 'RidgeClassifier':
            
            param_grid = {'alpha': [0.0001,0.001,0.1,0.15,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
                          'fit_intercept': [True, False],
                          'normalize': [True, False]
                         }    

            model_grid = RandomizedSearchCV(estimator=RidgeClassifier(random_state=seed), 
                                            param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                            cv=fold, random_state=seed, n_jobs=-1)

            model_grid.fit(X_train,y_train)
            model = model_grid.best_estimator_
            best_model = model_grid.best_estimator_
            best_model_param = model_grid.best_params_
            top_n_tuned_models.append(best_model)
        
        elif i == 'LogisticRegression':
            
            param_grid = {'C': [1,5,10,25,50,100],
                      "penalty": [ 'l1', 'l2'],
                      "class_weight": ["balanced", None]
                         }
            model_grid = RandomizedSearchCV(estimator=LogisticRegression(random_state=seed), 
                                            param_distributions=param_grid, scoring=optimize, n_iter=n_iter, cv=fold, 
                                            random_state=seed, iid=False,n_jobs=-1)
            model_grid.fit(X_train,y_train)
            model = model_grid.best_estimator_
            best_model = model_grid.best_estimator_
            best_model_param = model_grid.best_params_
            top_n_tuned_models.append(best_model)
        
        elif i == 'GradientBoostingClassifier':
            
            param_grid = {'loss': ['deviance', 'exponential'],
                          'n_estimators': [10, 40, 70, 80, 90, 100, 120, 140, 150],
                          'learning_rate': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
                          'subsample' : [0.1,0.3,0.5,0.7,0.9,1],
                          'min_samples_split' : [2,4,5,7,9,10],
                          'min_samples_leaf' : [1,2,3,4,5],
                          'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
                          'max_features' : ['auto', 'sqrt', 'log2']
                         }    

            model_grid = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=seed), 
                                            param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                            cv=fold, random_state=seed, n_jobs=-1)

            model_grid.fit(X_train,y_train)
            model = model_grid.best_estimator_
            best_model = model_grid.best_estimator_
            best_model_param = model_grid.best_params_
            top_n_tuned_models.append(best_model)
        
        elif i == 'LinearDiscriminantAnalysis':
            
            param_grid = {'solver' : ['lsqr', 'eigen'],
                          'shrinkage': [0.0001, 0.001, 0.01, 0.0005, 0.005, 0.05, 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
                         }    

            model_grid = RandomizedSearchCV(estimator=LinearDiscriminantAnalysis(), 
                                            param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                            cv=fold, random_state=seed, n_jobs=-1)

            model_grid.fit(X_train,y_train)
            model = model_grid.best_estimator_
            best_model = model_grid.best_estimator_
            best_model_param = model_grid.best_params_
            top_n_tuned_models.append(best_model)
        
        elif i == 'AdaBoostClassifier':
            
            param_grid = {'n_estimators': [10, 40, 70, 80, 90, 100, 120, 140, 150],
                          'learning_rate': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
                          'algorithm' : ["SAMME", "SAMME.R"]
                         }    

            model_grid = RandomizedSearchCV(estimator=AdaBoostClassifier(random_state=seed), 
                                            param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                            cv=fold, random_state=seed, n_jobs=-1)

            model_grid.fit(X_train,y_train)
            model = model_grid.best_estimator_
            best_model = model_grid.best_estimator_
            best_model_param = model_grid.best_params_
            top_n_tuned_models.append(best_model)
        
        elif i == 'RandomForestClassifier':
            
            param_grid = {'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                          'criterion': ['gini', 'entropy'],
                          'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
                          'min_samples_split': [2, 5, 7, 9, 10],
                          'min_samples_leaf' : [1, 2, 4],
                          'max_features' : ['auto', 'sqrt', 'log2'],
                          'bootstrap': [True, False]
                         }    

            model_grid = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=seed), 
                                            param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                            cv=fold, random_state=seed, n_jobs=-1)

            model_grid.fit(X_train,y_train)
            model = model_grid.best_estimator_
            best_model = model_grid.best_estimator_
            best_model_param = model_grid.best_params_
            top_n_tuned_models.append(best_model)
        
        elif i == 'DecisionTreeClassifier':
            
            param_grid = {"max_depth": np.random.randint(3, (len(X_train.columns)*.85),4),
                      "max_features": np.random.randint(3, len(X_train.columns),4),
                      "min_samples_leaf": [2,3,4],
                      "criterion": ["gini", "entropy"]}

            model_grid = RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=seed), param_distributions=param_grid,
                                           scoring=optimize, n_iter=n_iter, cv=fold, random_state=seed,
                                           iid=False, n_jobs=-1)

            model_grid.fit(X_train,y_train)
            model = model_grid.best_estimator_
            best_model = model_grid.best_estimator_
            best_model_param = model_grid.best_params_
            top_n_tuned_models.append(best_model)
        
        elif i == 'MLPClassifier':
            
            param_grid = {'learning_rate': ['constant', 'invscaling', 'adaptive'],
                     'solver' : ['lbfgs', 'sgd', 'adam'],
                     'alpha': [0.0001, 0.05],
                     'hidden_layer_sizes': np.random.randint(5,15,5),
                     'activation': ["tanh", "identity", "logistic","relu"]
                     }

            model_grid = RandomizedSearchCV(estimator=MLPClassifier(max_iter=1000, random_state=seed), 
                                            param_distributions=param_grid, scoring=optimize, n_iter=n_iter, cv=fold, 
                                            random_state=seed, iid=False, n_jobs=-1)

            model_grid.fit(X_train,y_train)
            model = model_grid.best_estimator_
            best_model = model_grid.best_estimator_
            best_model_param = model_grid.best_params_
            top_n_tuned_models.append(best_model)
        
        elif i == 'ExtraTreesClassifier':
            

            param_grid = {'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                          'criterion': ['gini', 'entropy'],
                          'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
                          'min_samples_split': [2, 5, 7, 9, 10],
                          'min_samples_leaf' : [1, 2, 4],
                          'max_features' : ['auto', 'sqrt', 'log2'],
                          'bootstrap': [True, False]
                         }    

            model_grid = RandomizedSearchCV(estimator=ExtraTreesClassifier(random_state=seed), 
                                            param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                            cv=fold, random_state=seed, n_jobs=-1)

            model_grid.fit(X_train,y_train)
            model = model_grid.best_estimator_
            best_model = model_grid.best_estimator_
            best_model_param = model_grid.best_params_
            top_n_tuned_models.append(best_model)
        
        elif i == 'GaussianProcessClassifier':
            
            param_grid = {"max_iter_predict":[100,200,300,400,500,600,700,800,900,1000]}

            model_grid = RandomizedSearchCV(estimator=GaussianProcessClassifier(random_state=seed), param_distributions=param_grid,
                                           scoring=optimize, n_iter=n_iter, cv=fold, random_state=seed,
                                           n_jobs=-1)

            model_grid.fit(X_train,y_train)
            model = model_grid.best_estimator_
            best_model = model_grid.best_estimator_
            best_model_param = model_grid.best_params_
            top_n_tuned_models.append(best_model)
        
        elif i == 'KNeighborsClassifier':
            
            param_grid = {'n_neighbors': range(1,51),
                     'weights' : ['uniform', 'distance'],
                     'metric':["euclidean", "manhattan"]
                         }        
            model_grid = RandomizedSearchCV(estimator=KNeighborsClassifier(), param_distributions=param_grid, 
                                            scoring=optimize, n_iter=n_iter, cv=fold, random_state=seed,
                                           n_jobs=-1, iid=False)

            model_grid.fit(X_train,y_train)
            model = model_grid.best_estimator_
            best_model = model_grid.best_estimator_
            best_model_param = model_grid.best_params_
            top_n_tuned_models.append(best_model)
        
        elif i == 'GaussianNB':
            
            param_grid = {'var_smoothing': [0.000000001, 0.0000001, 0.00001, 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007,
                                            0.008, 0.009, 0.01, 0.1, 1]}

            model_grid = RandomizedSearchCV(estimator=GaussianNB(), 
                                            param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                            cv=fold, random_state=seed, n_jobs=-1)

            model_grid.fit(X_train,y_train)
            model = model_grid.best_estimator_
            best_model = model_grid.best_estimator_
            best_model_param = model_grid.best_params_
            top_n_tuned_models.append(best_model)
        
        elif i == 'SVC':
            
            param_grid = {'C': [.5,1,10,50,100],
                    "class_weight": ["balanced", None]}

            model_grid = RandomizedSearchCV(estimator=SVC(gamma='auto', C=1, probability=True, kernel='rbf', random_state=seed), 
                                            param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                            cv=fold, random_state=seed, n_jobs=-1)

            model_grid.fit(X_train,y_train)
            model = model_grid.best_estimator_
            best_model = model_grid.best_estimator_
            best_model_param = model_grid.best_params_
            top_n_tuned_models.append(best_model)
        
        elif i == 'QuadraticDiscriminantAnalysis':
            
            param_grid = {'reg_param': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]}    

            model_grid = RandomizedSearchCV(estimator=QuadraticDiscriminantAnalysis(), 
                                            param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                            cv=fold, random_state=seed, n_jobs=-1)

            model_grid.fit(X_train,y_train)
            model = model_grid.best_estimator_
            best_model = model_grid.best_estimator_
            best_model_param = model_grid.best_params_
            top_n_tuned_models.append(best_model)
        
        elif i == 'SGDClassifier':
            
            param_grid = {'penalty': ['l2', 'l1','elasticnet'],
                          'l1_ratio': [0,0.1,0.15,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
                          'alpha': [0.0001, 0.001, 0.01, 0.0002, 0.002, 0.02, 0.0005, 0.005, 0.05],
                          'fit_intercept': [True, False],
                          'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
                          'eta0': [0.001, 0.01,0.05,0.1,0.2,0.3,0.4,0.5]
                         }    

            model_grid = RandomizedSearchCV(estimator=SGDClassifier(loss='hinge', random_state=seed), 
                                            param_distributions=param_grid, scoring=optimize, n_iter=n_iter, 
                                            cv=fold, random_state=seed, n_jobs=-1)

            model_grid.fit(X_train,y_train)
            model = model_grid.best_estimator_
            best_model = model_grid.best_estimator_
            best_model_param = model_grid.best_params_
            top_n_tuned_models.append(best_model)
    
    master.append(top_n_tuned_models)
    
    progress.value += 1
    clear_output()
    master_display = master_results.sort_values(by=sort,ascending=False)
    master_display.reset_index(drop=True, inplace=True)
    display(progress)
    display(master_display)
    
    '''
    
    This section below is still continued from Step 4. In the part above tuned model
    object is stored in the list. In the part below the CV results are generated using
    stored objects in above step.
    
    '''
    
    tuning_model_names = []
    top_n_tuned_model_results = pd.DataFrame(columns=['Model', 'Accuracy','AUC','Recall', 'Prec.', 'F1', 'Kappa'])
    
    for i in top_n_model_names:
        s = 'Tuning ' + i
        tuning_model_names.append(s)
    
    #defining name counter
    name_counter = 0
    
    for i in top_n_tuned_models:
        model = i
    
        #setting cross validation
        kf = StratifiedKFold(fold, random_state=seed)

        score_auc =np.empty((0,0))
        score_acc =np.empty((0,0))
        score_recall =np.empty((0,0))
        score_precision =np.empty((0,0))
        score_f1 =np.empty((0,0))
        score_kappa =np.empty((0,0))
        avgs_auc =np.empty((0,0))
        avgs_acc =np.empty((0,0))
        avgs_recall =np.empty((0,0))
        avgs_precision =np.empty((0,0))
        avgs_f1 =np.empty((0,0))
        avgs_kappa =np.empty((0,0))
        
        for train_i , test_i in kf.split(data_X,data_y):
    
            Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i]
            ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i]
        
            if hasattr(model, 'predict_proba'):
        
                model.fit(Xtrain,ytrain)
                pred_prob = model.predict_proba(Xtest)
                pred_prob = pred_prob[:,1]
                pred_ = model.predict(Xtest)
                sca = metrics.accuracy_score(ytest,pred_)
                sc = metrics.roc_auc_score(ytest,pred_prob)
                recall = metrics.recall_score(ytest,pred_)
                precision = metrics.precision_score(ytest,pred_)
                kappa = metrics.cohen_kappa_score(ytest,pred_)
                f1 = metrics.f1_score(ytest,pred_)
                score_acc = np.append(score_acc,sca)
                score_auc = np.append(score_auc,sc)
                score_recall = np.append(score_recall,recall)
                score_precision = np.append(score_precision,precision)
                score_f1 =np.append(score_f1,f1)
                score_kappa =np.append(score_kappa,kappa)

            else:
            
                model.fit(Xtrain,ytrain)
                pred_prob = 0.00
                pred_prob = 0.00
                pred_ = model.predict(Xtest)
                sca = metrics.accuracy_score(ytest,pred_)
                sc = 0.00
                recall = metrics.recall_score(ytest,pred_)
                precision = metrics.precision_score(ytest,pred_) #change pred_prob to pred_
                kappa = metrics.cohen_kappa_score(ytest,pred_)
                f1 = metrics.f1_score(ytest,pred_)
                score_acc = np.append(score_acc,sca)
                score_auc = np.append(score_auc,sc)
                score_recall = np.append(score_recall,recall)
                score_precision = np.append(score_precision,precision)
                score_f1 =np.append(score_f1,f1)
                score_kappa =np.append(score_kappa,kappa) 
       
        mean_acc=np.mean(score_acc)
        mean_auc=np.mean(score_auc)
        mean_recall=np.mean(score_recall)
        mean_precision=np.mean(score_precision)
        mean_f1=np.mean(score_f1)
        mean_kappa=np.mean(score_kappa)

        avgs_acc = np.append(avgs_acc, mean_acc)
        avgs_auc = np.append(avgs_auc, mean_auc)
        avgs_recall = np.append(avgs_recall, mean_recall)
        avgs_precision = np.append(avgs_precision, mean_precision)
        avgs_f1 = np.append(avgs_f1, mean_f1)
        avgs_kappa = np.append(avgs_kappa, mean_kappa)
        
        #model_name = 'Tuned' + str(i).split("(")[0]
        model_results = pd.DataFrame({'Model': tuning_model_names[name_counter], 'Accuracy': avgs_acc, 
                                      'AUC': avgs_auc, 'Recall' : avgs_recall, 'Prec.' : avgs_precision, 
                                      'F1' : avgs_f1, 'Kappa' : avgs_kappa}).reset_index(drop=True)
        model_results = model_results.round(round)
        name_counter += 1
        top_n_tuned_model_results = pd.concat([top_n_tuned_model_results, model_results],ignore_index=True)
        
    master_results = master_results.append(top_n_tuned_model_results)
    
    progress.value += 1
    clear_output()
    master_display = master_results.sort_values(by=sort,ascending=False)
    master_display.reset_index(drop=True, inplace=True)
    display(progress)
    display(master_display)
    
    '''
    
    Unpacking Master into master_unpack so it can be used for sampling 
    for VotingClassifier and Stacking in Step 5 and Step 6 below. Note that
    master_unpack is not the most updated list by the end of code as the 
    models created in Step 5 and 6 below are not unpacked into master_unpack.
    Last part of code used object 'master_final' to unpack all the models from
    object 'master'.
    
    '''
    
    master_unpack = []
    for i in master:
        for k in i:
            master_unpack.append(k)
    
    '''
    
    This is the loop created for random sampling index numbers in master_unpack list
    for models that can be used in VotingClassifier in Step 5 below. Same sampling i.e.
    variable mix and mix_names is used in Stacking in Step 6 below.
    
    
    '''
    
    count_while = 0
    
    mix = []
    mix_names = []
    while count_while < top_n:
        sub_list = []
        sub_list_names = []
        generator = random.sample(range(len(master_results)), random.randint(3,len(master_results)))
        for r in generator:
            sub_list.append(master_unpack[r])
            sub_list_names.append(master_results.iloc[r]['Model'])
        mix.append(sub_list)
        mix_names.append(sub_list_names)
        count_while += 1
    
    progress.value += 1
    
    '''

    Step 5 - Using mix and mix_names created above, build voting classifier n # of times.
    This is equivalent to blend_models()

    '''    
    
    top_n_voting_models = []
    top_n_voting_model_results = pd.DataFrame(columns=['Model', 'Accuracy','AUC','Recall', 'Prec.', 'F1', 'Kappa'])
    
    for i,j in zip(mix,mix_names):
        
        estimator_list = zip(j, i)
        estimator_list = list(estimator_list)    
        model = VotingClassifier(estimators=estimator_list, voting='hard', n_jobs=-1)
        top_n_voting_models.append(model)
    
        #setting cross validation
        kf = StratifiedKFold(fold, random_state=seed)

        score_auc =np.empty((0,0))
        score_acc =np.empty((0,0))
        score_recall =np.empty((0,0))
        score_precision =np.empty((0,0))
        score_f1 =np.empty((0,0))
        score_kappa =np.empty((0,0))
        avgs_auc =np.empty((0,0))
        avgs_acc =np.empty((0,0))
        avgs_recall =np.empty((0,0))
        avgs_precision =np.empty((0,0))
        avgs_f1 =np.empty((0,0))
        avgs_kappa =np.empty((0,0))
        
        for train_i , test_i in kf.split(data_X,data_y):
    
            Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i]
            ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i]
            
            model.fit(Xtrain,ytrain)
            pred_prob = 0.00
            pred_prob = 0.00
            pred_ = model.predict(Xtest)
            sca = metrics.accuracy_score(ytest,pred_)
            sc = 0.00
            recall = metrics.recall_score(ytest,pred_)
            precision = metrics.precision_score(ytest,pred_) #change pred_prob to pred_
            kappa = metrics.cohen_kappa_score(ytest,pred_)
            f1 = metrics.f1_score(ytest,pred_)
            score_acc = np.append(score_acc,sca)
            score_auc = np.append(score_auc,sc)
            score_recall = np.append(score_recall,recall)
            score_precision = np.append(score_precision,precision)
            score_f1 =np.append(score_f1,f1)
            score_kappa =np.append(score_kappa,kappa) 
       
        mean_acc=np.mean(score_acc)
        mean_auc=np.mean(score_auc)
        mean_recall=np.mean(score_recall)
        mean_precision=np.mean(score_precision)
        mean_f1=np.mean(score_f1)
        mean_kappa=np.mean(score_kappa)

        avgs_acc = np.append(avgs_acc, mean_acc)
        avgs_auc = np.append(avgs_auc, mean_auc)
        avgs_recall = np.append(avgs_recall, mean_recall)
        avgs_precision = np.append(avgs_precision, mean_precision)
        avgs_f1 = np.append(avgs_f1, mean_f1)
        avgs_kappa = np.append(avgs_kappa, mean_kappa)

        model_results = pd.DataFrame({'Model': 'Voting Classifier', 'Accuracy': avgs_acc, 'AUC': avgs_auc, 
                                      'Recall' : avgs_recall, 'Prec.' : avgs_precision , 'F1' : avgs_f1, 
                                      'Kappa' : avgs_kappa}).reset_index(drop=True)
        model_results = model_results.round(round)
        top_n_voting_model_results = pd.concat([top_n_voting_model_results, model_results],ignore_index=True)
        
    master_results = master_results.append(top_n_voting_model_results)
    master_results = master_results.reset_index(drop=True)
    master.append(top_n_voting_models)
    
    progress.value += 1
    clear_output()
    master_display = master_results.sort_values(by=sort,ascending=False)
    master_display.reset_index(drop=True, inplace=True)
    display(progress)
    display(master_display) 
    
    '''

    Step 6 - Stacking for all the models using same sample as above that are stored in
    mix and mix_names. 
    
    This is equivalent to stack_models()


    '''    
    
    top_n_stacking_models = []
    top_n_stacking_model_results = pd.DataFrame(columns=['Model', 'Accuracy','AUC','Recall', 'Prec.', 'F1', 'Kappa'])
    
    meta_model = LogisticRegression()
    
    for i in mix:
        
        estimator_list = i
        top_n_stacking_models.append(i)
        
        #defining model_library model names
        model_names = np.zeros(0)
        for item in estimator_list:
            model_names = np.append(model_names, str(item).split("(")[0])
    
        base_array = np.zeros((0,0))
        base_prediction = pd.DataFrame(y_train)
        base_prediction = base_prediction.reset_index(drop=True)
    
        for model in estimator_list:
            base_array = cross_val_predict(model,X_train,y_train,cv=fold, method='predict')
            base_array = base_array
            base_array_df = pd.DataFrame(base_array)
            base_prediction = pd.concat([base_prediction,base_array_df],axis=1)
            base_array = np.empty((0,0))
        
        #defining column names now
        target_col_name = np.array(base_prediction.columns[0])
        model_names = np.append(target_col_name, model_names)
        base_prediction.columns = model_names #defining colum names now
        data_X = base_prediction.drop(base_prediction.columns[0],axis=1)
        data_y = base_prediction[base_prediction.columns[0]]

        #Meta Modeling Starts Here

        model = meta_model 
        
        kf = StratifiedKFold(fold, random_state=seed)

        score_auc =np.empty((0,0))
        score_acc =np.empty((0,0))
        score_recall =np.empty((0,0))
        score_precision =np.empty((0,0))
        score_f1 =np.empty((0,0))
        score_kappa =np.empty((0,0))
        avgs_auc =np.empty((0,0))
        avgs_acc =np.empty((0,0))
        avgs_recall =np.empty((0,0))
        avgs_precision =np.empty((0,0))
        avgs_f1 =np.empty((0,0))
        avgs_kappa =np.empty((0,0))

        for train_i , test_i in kf.split(data_X,data_y):
            Xtrain,Xtest = data_X.iloc[train_i], data_X.iloc[test_i]
            ytrain,ytest = data_y.iloc[train_i], data_y.iloc[test_i]

            model.fit(Xtrain,ytrain)
            pred_prob = model.predict_proba(Xtest)
            pred_prob = pred_prob[:,1]
            pred_ = model.predict(Xtest)
            sca = metrics.accuracy_score(ytest,pred_)
            sc = metrics.roc_auc_score(ytest,pred_prob)
            recall = metrics.recall_score(ytest,pred_)
            precision = metrics.average_precision_score(ytest,pred_prob)
            kappa = metrics.cohen_kappa_score(ytest,pred_)
            f1 = metrics.f1_score(ytest,pred_)
            score_acc = np.append(score_acc,sca)
            score_auc = np.append(score_auc,sc)
            score_recall = np.append(score_recall,recall)
            score_precision = np.append(score_precision,precision)
            score_f1 =np.append(score_f1,f1)
            score_kappa =np.append(score_kappa,kappa)

        mean_acc=np.mean(score_acc)
        mean_auc=np.mean(score_auc)
        mean_recall=np.mean(score_recall)
        mean_precision=np.mean(score_precision)
        mean_f1=np.mean(score_f1)
        mean_kappa=np.mean(score_kappa)
        std_acc=np.std(score_acc)
        std_auc=np.std(score_auc)
        std_recall=np.std(score_recall)
        std_precision=np.std(score_precision)
        std_f1=np.std(score_f1)
        std_kappa=np.std(score_kappa)

        avgs_acc = np.append(avgs_acc, mean_acc)
        avgs_auc = np.append(avgs_auc, mean_auc)
        avgs_recall = np.append(avgs_recall, mean_recall)
        avgs_precision = np.append(avgs_precision, mean_precision)
        avgs_f1 = np.append(avgs_f1, mean_f1)
        avgs_kappa = np.append(avgs_kappa, mean_kappa)

        model_results = pd.DataFrame({'Model': 'Stacking Classifier',  'Accuracy': avgs_acc, 'AUC': avgs_auc, 
                                      'Recall' : avgs_recall, 'Prec.' : avgs_precision , 'F1' : avgs_f1, 
                                      'Kappa' : avgs_kappa})
        top_n_stacking_model_results = pd.concat([top_n_stacking_model_results, model_results],ignore_index=True)
        top_n_stacking_model_results = top_n_stacking_model_results.round(round)  


    master_results = master_results.append(top_n_stacking_model_results)
    master_results = master_results.reset_index(drop=True)
    master.append(top_n_stacking_models)
    
    progress.value += 1
    clear_output()
    master_display = master_results.sort_values(by=sort,ascending=False)
    master_display.reset_index(drop=True, inplace=True)
    display(progress)
    display(master_display)
    
    '''

    Step 7 - Unpacking final master list stored in object 'master'. The one unpacked
    before step 4 was used for sampling in Step 5 and 6.
    
    THIS IS THE FINAL UNPACKING.
    
    ''' 
    #global master_final
    master_final = []
    for i in master:
        for k in i:
            master_final.append(k)
    
    #renaming
    master = master_final
    del(master_final) #remove master_final
    
    progress.value += 1
    
    '''
    
    Step 8 - This is the final step in which master_results is sorted based on defined metric
    to get the index of best model so that master can return the final best model.
    also master_results is sorted and index is reset before display.
    
    ''' 
    best_model_position = master_results.sort_values(by=sort,ascending=False).index[0]
    best_model = master[best_model_position]
    
    master_results_sorted = master_results.sort_values(by=sort,ascending=False)
    master_results_sorted.reset_index(drop=True, inplace=True)
    
    progress.value += 1
    
    clear_output()
    
    display(master_results_sorted)
    return best_model

## Testing Section

# Work in Progress / Future Release 

In [None]:
def optimize_model(data_X=X_train, n=3):
    """
    Remove the first `n` entries of `var_imp_array_top_n` from the columns of
    the module-level `X_train`, modifying it in place.

    Parameters
    ----------
    data_X : optional
        Defaults to the `X_train` object that exists when this cell is run.
        NOTE(review): this argument is accepted but never read by the body;
        the global `X_train` is what actually gets modified.
    n : int, default 3
        How many leading entries of `var_imp_array_top_n` to drop.

    Returns
    -------
    None
    """
    global X_train
    # var_imp_array_top_n is defined outside this cell; presumably a ranked
    # collection of column names -- TODO confirm against where it is built.
    columns_to_remove = var_imp_array_top_n[:n]
    X_train.drop(columns=columns_to_remove, inplace=True)

In [None]:
def save_model(model, model_name):
    """
    Persist `model` to disk with joblib under the path `model_name + '.pkl'`.

    Parameters
    ----------
    model : object
        The object to serialize; passed unchanged to `joblib.dump`.
    model_name : str
        Target file name (or path) without extension; '.pkl' is appended.

    Returns
    -------
    None
    """
    # `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
    # removed in 0.23. Prefer the standalone package and fall back to the
    # legacy location only for old installations that lack it.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib
    model_name = model_name + '.pkl'
    joblib.dump(model, model_name)

In [None]:
def load_model(model_name):
    """
    Load and return the object stored with joblib at `model_name + '.pkl'`.

    Parameters
    ----------
    model_name : str
        File name (or path) without extension; '.pkl' is appended.

    Returns
    -------
    object
        Whatever `joblib.load` reconstructs from the file.

    Notes
    -----
    joblib files are pickle-based: loading one can execute arbitrary code.
    Only load files that come from a trusted source.
    """
    # `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
    # removed in 0.23. Prefer the standalone package and fall back to the
    # legacy location only for old installations that lack it.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib
    model_name = model_name + '.pkl'
    return joblib.load(model_name)

# Modules now Available

## 1.0. compare_models

## 2.0. create_model

## 3.0. plot_model 

## 4.0. tune_model

## 5.0. ensemble_model 

## 6.0. blend_models

## 7.0. stack_models

## 8.0. create_stacknet

## 9.0. save_model 

## 10.0. load_model

## 11.0. optimize_model (Future Release)

## 12.0. predict_stacknet (Future Release)

## 13.0. calibrate_model (Future Release)