### Importing Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import operator
import numpy as np
import regex as re
import joblib
import os
import copy
from tqdm import tqdm
from prettytable import PrettyTable

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, SVMSMOTE
from xgboost import XGBClassifier
from mlxtend.classifier import StackingClassifier

### Defining methods

In [2]:
#Reference: https://github.com/mdmub0587/Dont-Overfit-II/blob/master/ML_Models.ipynb
def get_classifiers_pipeline(scaler = None, sampler = None ,dimRed = None):
    '''
    Builds one tunable pipeline per supported classifier (knn, naive bayes, logistic regression, support vector
    machine, decision tree, random forest, xgboost and stacking) together with the hyperparameter grid to search.
    Every pipeline starts with the optional preprocessing steps that were supplied, in the order
    sampler -> scaler -> dimRed, and ends with a step named 'model'.
    
    ----------------------
    Parameter
    scaler  : A Scaler to add to the pipeline (skipped when falsy).
                   
    sampler : A Sampler to add to the pipeline (skipped when falsy).
    
    dimRed  : A dimensionality reduction technique to add to the pipeline (skipped when falsy).
    
    ----------------------
    Returns
    classifiers : A map with key as the model name and value as a tuple (pipeline, params), where params is the
                  grid of hyperparameter values keyed by the pipeline parameter names.
    '''
    # NOTE(review): the grid keys assume CalibratedClassifierCV exposes the wrapped model as `base_estimator`;
    # confirm this against the installed scikit-learn version.
    optional_steps = (('sampler', sampler), ('scaler', scaler), ('dimRed', dimRed))
    steps = [(label, component) for label, component in optional_steps if component]
    
    classifiers = {}
    
    def _register(name, estimator, grid):
        #each pipeline gets its own deep copy of the shared preprocessing steps
        pipeline_steps = copy.deepcopy(steps)
        pipeline_steps.append(('model', estimator))
        classifiers[name] = (Pipeline(steps=pipeline_steps), grid)
    
    def _calibrated(estimator):
        #wraps the model in sigmoid calibration, fitted on all available cores
        return CalibratedClassifierCV(estimator, method='sigmoid', n_jobs = -1)
    
    #K Nearest Neighbour
    _register('KNeighborsClassifier', _calibrated(KNeighborsClassifier()), {
        'model__base_estimator__n_neighbors':[2,4,5,7,9,15],
        'model__base_estimator__leaf_size':[1,2,5,10],
        'model__base_estimator__weights':['uniform', 'distance'],
        'model__base_estimator__algorithm':['auto', 'ball_tree','kd_tree','brute'],
    })
    
    #Naive Bayes
    _register('GaussianNB', _calibrated(GaussianNB()), {
        'model__base_estimator__var_smoothing':[0.0001,0.005,0.001,0.05,0.01,0.1,10]
    })
    
    #Logistic Regression
    _register('LogisticRegression', _calibrated(LogisticRegression()), {
        'model__base_estimator__solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'model__base_estimator__penalty' : ['l2','l1','elasticnet', 'none'],
        'model__base_estimator__C' :[0.001,0.01, 0.1, 1, 10, 100],
    })
    
    #Support Vector 
    _register('SVC', _calibrated(SVC()), {
        'model__base_estimator__kernel': ['linear','poly','rbf'], 
        'model__base_estimator__gamma': [0.001, 0.01, 0.1, 1,'auto'],
        'model__base_estimator__C': [0.001,0.01, 0.1, 1, 10, 100],
        'model__base_estimator__class_weight': ['balanced', None],
        'model__base_estimator__probability': [True]
    })
    
    #Decision Tree
    _register('DecisionTreeClassifier', _calibrated(DecisionTreeClassifier()), {
        'model__base_estimator__max_depth': [3, 5 , 10, None],
        'model__base_estimator__criterion': ['gini', 'entropy'],
        'model__base_estimator__min_samples_leaf': [1,2,3,4,5],
        'model__base_estimator__max_features': ['auto', 'sqrt', 'log2', None],
        'model__base_estimator__class_weight':['balanced', None]
    })
    
    #Random Forest
    _register('RandomForestClassifier', _calibrated(RandomForestClassifier()), {
        'model__base_estimator__n_estimators':[10,20,50,70],
        'model__base_estimator__criterion':['gini','entropy'],
        'model__base_estimator__min_samples_leaf':[1,2,3,4,5],
        'model__base_estimator__max_features':['auto', 'sqrt', 'log2'],
        'model__base_estimator__class_weight': ['balanced', 'balanced_subsample', None]
    })
    
    #XGBoost (the calibration wrapper is created without n_jobs here, unlike the other models)
    xgb_model = XGBClassifier(eval_metric = 'error', use_label_encoder=False)
    _register('XGBClassifier', CalibratedClassifierCV(xgb_model, method='sigmoid'), {
        'model__base_estimator__learning_rate':[0.01,0.03,0.1,0.15,0.2],
        'model__base_estimator__n_estimators':[10,20,50,100],
        'model__base_estimator__max_depth':[3,5,10],
        'model__base_estimator__colsample_bytree':[0.1,0.3,0.5,1],
        'model__base_estimator__subsample':[0.1,0.3,0.5,1]
    })
    
    #Stacking (not calibrated; only the logistic regression meta classifier is tuned)
    base_learners = [GaussianNB(),
                     SVC(),
                     DecisionTreeClassifier(),
                     RandomForestClassifier(),
                     XGBClassifier(eval_metric = 'error', use_label_encoder=False)]
    stacked_model = StackingClassifier(classifiers = base_learners,
                                       meta_classifier=LogisticRegression())
    _register('StackingClassifier', stacked_model, {
        'model__meta_classifier__solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'model__meta_classifier__penalty' : ['l2','l1','elasticnet', 'none'],
        'model__meta_classifier__C' :[0.001,0.01, 0.1]
    })
    
    return classifiers

In [3]:
def feature_engineering_perm(data, fe_imp):
    '''
    This method is used to engineer new features(+, -, *, /, cos, cosh, sin , sinh, exp of existing features) from the given
    list of feature transformations and the dataset.
    
    The transformation is decoded from each index label of fe_imp:
      * '12'     -> column '12' of data is copied unchanged,
      * '12+13'  -> binary operation applied to columns '12' and '13',
      * 'cos_21' -> unary function applied to column '21'.
    Any inf, -inf or nan produced by a transformation is replaced with the mean of the remaining values of
    that engineered column.
    
    ----------------------
    Parameter
    data    : The input dataset.
    
    fe_imp  : A pandas series whose index labels name the features / feature combinations of arithmetic and 
              trigonometric operations to build (only the index is read).
              
    ----------------------                       
    Returns
    data_fe : A pandas DataFrame containing dataset with only the given transformed features, in the order of
              fe_imp.index.
              
    ----------------------
    Raises
    KeyError : If a label contains an operation that is not one of the supported ones, or refers to a column
               missing from data.
    '''
        
    #defining a dict of operation name and the operations
    op_dict = {'+': operator.add, '-': operator.sub, '*': operator.mul, '/': operator.truediv,
              'cos': np.cos, 'sin': np.sin, 'cosh': np.cosh, 'sinh': np.sinh, 'exp': np.exp}
    
    #columns are collected in a dict and turned into a frame once, instead of growing a DataFrame column by column
    engineered = {}
    
    for i in fe_imp.index:
        oper = ''.join(re.findall('[^0-9_]', i)) #extracting the arith or trig operation from the name of col.Eg:12+32 or cos_21
        if(not oper):
            col = data[i].copy() #if there is no operation involved then the same feature is taken as it is.
        else:
            op = op_dict[oper]
            if('_' in i): #checking if the operation is trig
                cols = i.split(oper+'_') #splitting based on the '_' Eg: cos_12 -> '', 12
                col = op(data[cols[1]]) #applying the trig operation
            else: 
                cols = i.split(oper) #splitting based on the operation Eg: 12+13 -> splitting on '+' -> 12,13
                col = op(data[cols[0]], data[cols[1]]) #applying the arith operation
                
        if(col.isin([np.inf, -np.inf, np.nan]).any()):
            #The cleaned column is assigned back explicitly: an in-place replace on `frame[col]` acts on a
            #temporary object under pandas copy-on-write and would leave the inf/nan values in the result.
            col = col.replace([np.inf, -np.inf], np.nan) #inf -> nan first, because the mean of a column with inf is not finite
            col = col.replace(np.nan, col.mean()) #replaces nan with mean val (mean skips nan)
        
        engineered[i] = col

    return pd.DataFrame(engineered)

In [4]:
def print_pretty_table(results, header, index):
    '''
    Prints a table with one numbered row per entry of results, showing only the selected fields of each entry.
    ----------------------
    Parameter
    results : The list of tuples containing information about the classifier.
    
    header  : The pretty print table header (the first column holds the running number).
    
    index   : Positions inside each result tuple whose values are printed, in this order.
    
    ----------------------      
    Returns
    None
    '''
        
    table = PrettyTable(header)
    for row_number, entry in enumerate(results, start=1):
        selected_fields = [entry[position] for position in index]
        table.add_row([row_number] + selected_fields) #the running number goes first, then the chosen fields
    print(table)

In [5]:
def print_model_hyperpar(cv_results_):
    '''
    Displays the five best parameter combinations of a search, ordered by descending mean test score.
    Only the entries of cv_results_ whose key starts with 'mean', 'rank' or 'param' (text before the first '_')
    are shown.
    ----------------------
    Parameter 
    cv_results_ : Contains the results of the GridSearchCV(GridSearchCV.cv_results_)
    
    ----------------------
    Returns
    None
    '''
        
    wanted_prefixes = ['mean', 'rank', 'param']
    #keeping only the keys whose first '_'-separated token is one of the wanted prefixes
    selected = {key: values for key, values in cv_results_.items()
                if key.split('_')[0] in wanted_prefixes}
    ranked = pd.DataFrame(selected).sort_values(by='mean_test_score',ascending=False)
    display(ranked.head())

In [6]:
def save_models(model, category):
    '''
    This method saves every trained model of the given list into the 'saved_models' directory (created when 
    missing) with a file name that represents the additional information about the trained model:
    <category>_<display_name>_<position in the list>.sav
    
    ----------------------
    Parameter
    model    : A list of tuples (clf_name, display_name, params, clf, score); clf is the object that is written
               to disk. The position in the list is used as the rank in the file name.
    
    category : The name of the category the model belongs to.
    
    ----------------------              
    Returns
    None
    '''
        
    path = 'saved_models'
    os.makedirs(path, exist_ok=True) #no error when the directory already exists
        
    filename = None #stays None when there is nothing to save
    for i, (clf_name, display_name, params, clf, score) in enumerate(model):
        filename = path+'/'+'_'.join([category, display_name, str(i+1)])+'.sav' #the filename will contain the category, display name and the rank(position in the list)
        joblib.dump(clf, filename) #saving the model
    
    #the name of the last written file is shown as an example; skipped for an empty list, where no file exists
    if filename is not None:
        print('Example File Name: ',filename)

In [7]:
def get_classifier_score(x_train, y_train, classifiers, display_hyper = False, save_model = True, category = 'default'):
    '''
    This method tunes every given classifier pipeline with a 5-fold GridSearchCV (scoring: roc_auc) on the given
    dataset, optionally saves the best estimators and prints the best CV AUC score of each model.
    
    ----------------------
    Parameter
    x_train       : The dataframe containing the independent variables of the train set.
    
    y_train       : The dataframe containing the dependent variable(target) of the train set.
    
    classifiers   : A map with key as the model name and value is a tuple of pipeline object containing the 
                    sampler, scaler, dimensionality reduction technique, classifiers and with their params.
                    
    display_hyper : A flag to enable or disable printing the hyperparameters and their corresponding score for all the models.
    
    save_model    : A flag to enable or disable saving models.
    
    category      : The category the model belongs to (used in the saved file names).
    
    
    ----------------------              
    Returns
    results: A list of tuples (clf_name, display_name, best_params, best_estimator, best_score), one per model,
             sorted by best CV score in descending order.
    '''
        
    results = []
    
    for clf_name, (clf, params) in tqdm(classifiers.items()):
        search_clf = GridSearchCV(clf, param_grid=params, return_train_score= True,cv = 5,
                                        scoring = 'roc_auc', n_jobs=-1)
        search_clf.fit(x_train, y_train)
        
        if(display_hyper):
            print(clf_name, end = '\n')
            print_model_hyperpar(search_clf.cv_results_)
        
        #The display name is made of the class names of the preprocessing steps followed by the model name.
        #Steps exposing `base_estimator` or `classifiers` (the calibrated / stacked model itself) are left out.
        #The steps are read directly from the (name, estimator) pairs instead of going through an object
        #ndarray, whose construction depends on how numpy coerces the estimator objects.
        steps_list = [str(step).split('(')[0] for _, step in clf.steps
                      if not (hasattr(step,'base_estimator') or hasattr(step,'classifiers'))]
        display_name = '_'.join(steps_list+[clf_name])
        
        results.append((clf_name, display_name, search_clf.best_params_, search_clf.best_estimator_, search_clf.best_score_))
         
    results.sort(reverse=True, key=lambda x: x[-1]) #best CV score first
    
    if(save_model):
        save_models(results, category)
    
    header = ['No.','Classifier','CV AUC Score']
    index = [0, 4] #index on the results list's tuple eg:[clf_name, search_clf.best_score_]
    print_pretty_table(results, header, index)
    
    
    return results

###  Importing Data

In [8]:
#Training data: every column except 'id' and 'target' is a feature, 'target' is the label.
train = pd.read_csv('dont-overfit-ii/train.csv')
x_train = train.drop(['id','target'], axis=1)
y_train = train['target']


#Feature ranking table; its index (the CSV column 'Unnamed: 0') holds the feature / transformation names.
feature_imp = pd.read_csv('dont-overfit-ii/processed_data/feature_importance.csv', index_col = 'Unnamed: 0')
#Test data without the 'id' column.
test = pd.read_csv('dont-overfit-ii/test.csv').drop(['id'], axis=1)

#Shapes of the loaded sets, printed as a quick sanity check.
print('Train Data:', end = '')
print(x_train.shape, y_train.shape)
print('Test Data(Without Target Variable):', end = '')
print(test.shape)

Train Data:(250, 300) (250,)
Test Data(Without Target Variable):(19750, 300)


## Training

### Baseline Models

In [9]:
#Training every classifier pipeline on the original features, without scaler, sampler or dimensionality reduction.
classifiers = get_classifiers_pipeline() #returns all the classifier pipelines
display_hyper = False #do not show the per-combination search results
save_model = True #write the best estimators to disk
category = 'OrgFe' #Original Features
baseline_models = get_classifier_score(x_train, y_train, classifiers, display_hyper, save_model, category)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [09:10<00:00, 68.87s/it]


Example File Name:  saved_models/OrgFe_KNeighborsClassifier_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |   LogisticRegression   | 0.8309027777777777 |
|  2  |     XGBClassifier      | 0.7920138888888889 |
|  3  | RandomForestClassifier | 0.7739583333333333 |
|  4  |          SVC           | 0.7552083333333333 |
|  5  |       GaussianNB       | 0.7430555555555556 |
|  6  |   StackingClassifier   | 0.6829861111111111 |
|  7  | DecisionTreeClassifier | 0.6690972222222221 |
|  8  |  KNeighborsClassifier  |      0.61875       |
+-----+------------------------+--------------------+


#### Kaggle Score(Public and Private Score)
<img src='https://i.imgur.com/kqZ8wq8.png' width=800 height=600 />

### Top 300 Engineered Feature Models

In [10]:
top = 300 #number of highest-ranked entries of feature_imp used by the cells of this section
feature_imp #last expression of the cell, shown as its output

Unnamed: 0,correlation,chi2_score,chi2_p_val,abs_correlation,harmonic_mean
0,0.118118,0.288789,0.590998,0.118118,0.061797
1,-0.062362,0.057894,0.809856,0.062362,0.013383
2,-0.016348,0.005082,0.943168,0.016348,0.001215
3,0.015924,0.004963,0.943835,0.015924,0.001186
4,-0.133754,0.295107,0.586967,0.133754,0.063896
...,...,...,...,...,...
exp_295,-0.153373,0.763597,0.382205,0.153373,0.145345
exp_296,-0.097177,0.302711,0.582188,0.097177,0.062715
exp_297,0.083976,0.221219,0.638113,0.083976,0.046902
exp_298,-0.020659,0.015240,0.901749,0.020659,0.003559


In [11]:
#Keeping the `top` highest `abs_correlation` entries of feature_imp; the index labels of this selection name the
#features / transformations that feature_engineering_perm rebuilds from x_train.
top_features = feature_imp['abs_correlation'].sort_values(ascending = False)[:top]
x_train_fe_1 = feature_engineering_perm(x_train, top_features)

classifiers = get_classifiers_pipeline() #returns all the classifier pipelines
display_hyper = False
save_model = True
category = 'EngFe_Cor'
baseline_models = get_classifier_score(x_train_fe_1, y_train, classifiers, display_hyper, save_model, category)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [09:04<00:00, 68.08s/it]


Example File Name:  saved_models/EngFe_Cor_GaussianNB_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |     XGBClassifier      | 0.8833333333333332 |
|  2  | RandomForestClassifier | 0.867013888888889  |
|  3  |   LogisticRegression   | 0.8666666666666668 |
|  4  |          SVC           | 0.8614583333333332 |
|  5  | DecisionTreeClassifier | 0.8614583333333332 |
|  6  |  KNeighborsClassifier  | 0.8553819444444445 |
|  7  |   StackingClassifier   |      0.828125      |
|  8  |       GaussianNB       | 0.826388888888889  |
+-----+------------------------+--------------------+


#### Kaggle Score(Public and Private Score)
<img src='https://i.imgur.com/tGmM3EF.png' width=800 height=600 />

In [12]:
#Keeping the `top` highest `chi2_score` entries of feature_imp; the index labels of this selection name the
#features / transformations that feature_engineering_perm rebuilds from x_train.
top_features = feature_imp['chi2_score'].sort_values(ascending=False)[: top]
x_train_fe_2 = feature_engineering_perm(x_train, top_features)

classifiers = get_classifiers_pipeline() #returns all the classifier pipelines
display_hyper = False
save_model = True
category = 'EngFe_Chi'
baseline_models = get_classifier_score(x_train_fe_2, y_train, classifiers, display_hyper, save_model, category)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [08:32<00:00, 64.09s/it]


Example File Name:  saved_models/EngFe_Chi_GaussianNB_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |     XGBClassifier      | 0.8934027777777777 |
|  2  | RandomForestClassifier | 0.8701388888888889 |
|  3  |          SVC           |      0.865625      |
|  4  |   LogisticRegression   | 0.8652777777777778 |
|  5  |  KNeighborsClassifier  | 0.8538194444444445 |
|  6  | DecisionTreeClassifier |     0.8453125      |
|  7  |   StackingClassifier   | 0.8288194444444447 |
|  8  |       GaussianNB       | 0.8166666666666667 |
+-----+------------------------+--------------------+


#### Kaggle Score(Public and Private Score)
<img src='https://i.imgur.com/rVhSc8j.png' width=800 height=600 />

In [13]:
#Keeping the `top` highest `harmonic_mean` entries of feature_imp; the index labels of this selection name the
#features / transformations that feature_engineering_perm rebuilds from x_train.
top_features = feature_imp['harmonic_mean'].sort_values(ascending=False)[: top]
x_train_fe_3 = feature_engineering_perm(x_train, top_features)

classifiers = get_classifiers_pipeline() #returns all the classifier pipelines
display_hyper = False
save_model = True
category = 'EngFe_Hmean'
baseline_models = get_classifier_score(x_train_fe_3, y_train, classifiers, display_hyper, save_model, category)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [08:57<00:00, 67.13s/it]


Example File Name:  saved_models/EngFe_Hmean_GaussianNB_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |     XGBClassifier      | 0.8909722222222222 |
|  2  | RandomForestClassifier | 0.8708333333333333 |
|  3  |   LogisticRegression   | 0.8628472222222221 |
|  4  |          SVC           | 0.8600694444444444 |
|  5  |  KNeighborsClassifier  | 0.8522569444444444 |
|  6  | DecisionTreeClassifier | 0.8520833333333334 |
|  7  |   StackingClassifier   |      0.81875       |
|  8  |       GaussianNB       | 0.8184027777777778 |
+-----+------------------------+--------------------+


#### Kaggle Score(Public and Private Score)
<img src='https://i.imgur.com/eCAiBal.png' width=800 height=600 />

### Top 100 Engineered Feature Models

In [14]:
top = 100 #number of highest-ranked entries of feature_imp used by the cells of this section

In [15]:
#Keeping the `top` highest `abs_correlation` entries of feature_imp; the index labels of this selection name the
#features / transformations that feature_engineering_perm rebuilds from x_train.
top_features = feature_imp['abs_correlation'].sort_values(ascending = False)[:top]
x_train_fe_1 = feature_engineering_perm(x_train, top_features)

classifiers = get_classifiers_pipeline() #returns all the classifier pipelines
display_hyper = False
save_model = True
category = 'EngFe100_Cor'
baseline_models = get_classifier_score(x_train_fe_1, y_train, classifiers, display_hyper, save_model, category)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [06:33<00:00, 49.22s/it]


Example File Name:  saved_models/EngFe100_Cor_StackingClassifier_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |   LogisticRegression   |       0.9125       |
|  2  |          SVC           | 0.8895833333333334 |
|  3  | RandomForestClassifier | 0.8854166666666667 |
|  4  |     XGBClassifier      |      0.884375      |
|  5  |  KNeighborsClassifier  | 0.8583333333333334 |
|  6  | DecisionTreeClassifier | 0.8560763888888889 |
|  7  |       GaussianNB       | 0.8392361111111111 |
|  8  |   StackingClassifier   | 0.8371527777777776 |
+-----+------------------------+--------------------+


#### Kaggle Score(Public and Private Score)
<img src='https://i.imgur.com/JpKiH7m.png' width=800 height=600 />

In [16]:
#Keeping the `top` highest `chi2_score` entries of feature_imp; the index labels of this selection name the
#features / transformations that feature_engineering_perm rebuilds from x_train.
top_features = feature_imp['chi2_score'].sort_values(ascending=False)[: top]
x_train_fe_2 = feature_engineering_perm(x_train, top_features)

classifiers = get_classifiers_pipeline() #returns all the classifier pipelines
display_hyper = False
save_model = True
category = 'EngFe100_Chi'
baseline_models = get_classifier_score(x_train_fe_2, y_train, classifiers, display_hyper, save_model, category)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [06:03<00:00, 45.42s/it]


Example File Name:  saved_models/EngFe100_Chi_GaussianNB_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |   LogisticRegression   | 0.9027777777777777 |
|  2  |     XGBClassifier      | 0.8909722222222223 |
|  3  |          SVC           | 0.8840277777777776 |
|  4  | RandomForestClassifier | 0.8809027777777778 |
|  5  |  KNeighborsClassifier  | 0.8647569444444445 |
|  6  | DecisionTreeClassifier | 0.8505208333333332 |
|  7  |   StackingClassifier   | 0.8428819444444444 |
|  8  |       GaussianNB       | 0.8274305555555556 |
+-----+------------------------+--------------------+


#### Kaggle Score(Public and Private Score)
<img src='https://i.imgur.com/ukukVdg.png' width=800 height=600 />

In [17]:
#Keeping the `top` highest `harmonic_mean` entries of feature_imp; the index labels of this selection name the
#features / transformations that feature_engineering_perm rebuilds from x_train.
top_features = feature_imp['harmonic_mean'].sort_values(ascending=False)[: top]
x_train_fe_3 = feature_engineering_perm(x_train, top_features)

classifiers = get_classifiers_pipeline() #returns all the classifier pipelines
display_hyper = False
save_model = True
category = 'EngFe100_Hmean'
baseline_models = get_classifier_score(x_train_fe_3, y_train, classifiers, display_hyper, save_model, category)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [06:19<00:00, 47.41s/it]


Example File Name:  saved_models/EngFe100_Hmean_GaussianNB_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |   LogisticRegression   | 0.8993055555555556 |
|  2  |     XGBClassifier      |      0.878125      |
|  3  | RandomForestClassifier | 0.8652777777777778 |
|  4  |          SVC           | 0.8611111111111113 |
|  5  |  KNeighborsClassifier  | 0.8520833333333334 |
|  6  | DecisionTreeClassifier | 0.8489583333333334 |
|  7  |   StackingClassifier   | 0.8329861111111111 |
|  8  |       GaussianNB       | 0.8184027777777778 |
+-----+------------------------+--------------------+


#### Kaggle Score(Public and Private Score)
<img src='https://i.imgur.com/BYYwk7S.png' width=800 height=600 />

### Top 50 Engineered Feature Models

In [18]:
top = 50 #number of highest-ranked entries of feature_imp used by the cells of this section

In [19]:
#Keeping the `top` highest `abs_correlation` entries of feature_imp; the index labels of this selection name the
#features / transformations that feature_engineering_perm rebuilds from x_train.
top_features = feature_imp['abs_correlation'].sort_values(ascending = False)[:top]
x_train_fe_1 = feature_engineering_perm(x_train, top_features)

classifiers = get_classifiers_pipeline() #returns all the classifier pipelines
display_hyper = False
save_model = True
category = 'EngFe50_Cor'
baseline_models = get_classifier_score(x_train_fe_1, y_train, classifiers, display_hyper, save_model, category)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [06:24<00:00, 48.11s/it]


Example File Name:  saved_models/EngFe50_Cor_StackingClassifier_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |   LogisticRegression   | 0.9222222222222222 |
|  2  |          SVC           | 0.920138888888889  |
|  3  |     XGBClassifier      |       0.875        |
|  4  | RandomForestClassifier | 0.8732638888888887 |
|  5  | DecisionTreeClassifier | 0.8579861111111111 |
|  6  |  KNeighborsClassifier  | 0.8501736111111111 |
|  7  |       GaussianNB       | 0.8291666666666666 |
|  8  |   StackingClassifier   | 0.8012152777777779 |
+-----+------------------------+--------------------+


#### Kaggle Score(Public and Private Score)
<img src='https://i.imgur.com/36Ugml1.png' width=800 height=600 />

In [20]:
#Keeping the `top` highest `chi2_score` entries of feature_imp; the index labels of this selection name the
#features / transformations that feature_engineering_perm rebuilds from x_train.
top_features = feature_imp['chi2_score'].sort_values(ascending=False)[: top]
x_train_fe_2 = feature_engineering_perm(x_train, top_features)

classifiers = get_classifiers_pipeline() #returns all the classifier pipelines
display_hyper = False
save_model = True
category = 'EngFe50_Chi'
baseline_models = get_classifier_score(x_train_fe_2, y_train, classifiers, display_hyper, save_model, category)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [06:27<00:00, 48.47s/it]


Example File Name:  saved_models/EngFe50_Chi_KNeighborsClassifier_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |   LogisticRegression   | 0.8996527777777779 |
|  2  |          SVC           | 0.8947916666666667 |
|  3  |     XGBClassifier      | 0.8371527777777776 |
|  4  | RandomForestClassifier | 0.8329861111111111 |
|  5  | DecisionTreeClassifier | 0.8237847222222223 |
|  6  |       GaussianNB       | 0.8079861111111111 |
|  7  |   StackingClassifier   | 0.7822916666666666 |
|  8  |  KNeighborsClassifier  | 0.7798611111111111 |
+-----+------------------------+--------------------+


#### Kaggle Score(Public and Private Score)
<img src='https://i.imgur.com/Zj5wTeW.png' width=800 height=600 />

In [21]:
#Keeping the `top` highest `harmonic_mean` entries of feature_imp; the index labels of this selection name the
#features / transformations that feature_engineering_perm rebuilds from x_train.
top_features = feature_imp['harmonic_mean'].sort_values(ascending=False)[: top]
x_train_fe_3 = feature_engineering_perm(x_train, top_features)

classifiers = get_classifiers_pipeline() #returns all the classifier pipelines
display_hyper = False
save_model = True
category = 'EngFe50_Hmean'
baseline_models = get_classifier_score(x_train_fe_3, y_train, classifiers, display_hyper, save_model, category)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [06:09<00:00, 46.19s/it]


Example File Name:  saved_models/EngFe50_Hmean_StackingClassifier_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |   LogisticRegression   |      0.903125      |
|  2  |          SVC           | 0.9024305555555554 |
|  3  |     XGBClassifier      | 0.8315972222222221 |
|  4  | RandomForestClassifier | 0.8305555555555555 |
|  5  |       GaussianNB       | 0.8074652777777779 |
|  6  | DecisionTreeClassifier | 0.8026041666666666 |
|  7  |  KNeighborsClassifier  | 0.7802083333333334 |
|  8  |   StackingClassifier   | 0.7744791666666667 |
+-----+------------------------+--------------------+


#### Kaggle Score(Public and Private Score)
<img src='https://i.imgur.com/SS9YKe1.png' width=800 height=600 />

### SMOTE Based Models

In [22]:
#using SMOTE to oversample; the sampler becomes the first step of every pipeline
#NOTE(review): no random_state is passed to the sampler - confirm whether the run has to be reproducible
classifiers = get_classifiers_pipeline(sampler = SMOTE()) #returns all the classifier pipelines with SMOTE
display_hyper = False
save_model = True
category = 'OrgFe' #Original Features
baseline_models = get_classifier_score(x_train, y_train, classifiers, display_hyper, save_model, category)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [10:52<00:00, 81.56s/it]


Example File Name:  saved_models/OrgFe_SMOTE_KNeighborsClassifier_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |   LogisticRegression   | 0.8260416666666666 |
|  2  |     XGBClassifier      | 0.7864583333333333 |
|  3  | RandomForestClassifier | 0.7663194444444444 |
|  4  |          SVC           | 0.7645833333333332 |
|  5  |       GaussianNB       | 0.7416666666666668 |
|  6  |   StackingClassifier   |     0.7234375      |
|  7  | DecisionTreeClassifier | 0.6880208333333334 |
|  8  |  KNeighborsClassifier  | 0.6682291666666667 |
+-----+------------------------+--------------------+


#### Kaggle Score(Public and Private Score)
<img src='https://i.imgur.com/VaKqz2p.png' width=800 height=600 />

In [23]:
# Build every classifier pipeline with a BorderlineSMOTE oversampling step.
classifiers = get_classifiers_pipeline(sampler=BorderlineSMOTE())

# Run settings passed positionally to get_classifier_score:
#   display_hyper - presumably toggles printing of tuned hyper-parameters (TODO confirm)
#   save_model    - the run output lists a file under saved_models/, so models are persisted
#   category      - prefix seen in the saved file names; 'OrgFe' = original features
display_hyper, save_model, category = False, True, 'OrgFe'

baseline_models = get_classifier_score(
    x_train, y_train, classifiers, display_hyper, save_model, category
)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [10:52<00:00, 81.59s/it]


Example File Name:  saved_models/OrgFe_BorderlineSMOTE_KNeighborsClassifier_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |   LogisticRegression   | 0.8322916666666667 |
|  2  |     XGBClassifier      | 0.7944444444444445 |
|  3  |          SVC           | 0.7680555555555555 |
|  4  | RandomForestClassifier | 0.763888888888889  |
|  5  |       GaussianNB       | 0.7177083333333333 |
|  6  |   StackingClassifier   |      0.70625       |
|  7  | DecisionTreeClassifier | 0.6928819444444444 |
|  8  |  KNeighborsClassifier  | 0.6444444444444445 |
+-----+------------------------+--------------------+


#### Kaggle Score (Public and Private Score)
<img src='https://i.imgur.com/eA8Sx8b.png' width=800 height=600 />

In [24]:
# Build every classifier pipeline with an SVMSMOTE oversampling step.
classifiers = get_classifiers_pipeline(sampler=SVMSMOTE())

# Run settings passed positionally to get_classifier_score:
#   display_hyper - presumably toggles printing of tuned hyper-parameters (TODO confirm)
#   save_model    - the run output lists a file under saved_models/, so models are persisted
#   category      - prefix seen in the saved file names; 'OrgFe' = original features
display_hyper, save_model, category = False, True, 'OrgFe'

baseline_models = get_classifier_score(
    x_train, y_train, classifiers, display_hyper, save_model, category
)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [11:00<00:00, 82.61s/it]


Example File Name:  saved_models/OrgFe_SVMSMOTE_KNeighborsClassifier_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |   LogisticRegression   | 0.8225694444444445 |
|  2  |     XGBClassifier      | 0.7930555555555555 |
|  3  | RandomForestClassifier | 0.7711805555555555 |
|  4  |          SVC           | 0.7597222222222223 |
|  5  |   StackingClassifier   | 0.7407986111111111 |
|  6  |       GaussianNB       | 0.7243055555555554 |
|  7  | DecisionTreeClassifier | 0.7050347222222222 |
|  8  |  KNeighborsClassifier  |     0.7015625      |
+-----+------------------------+--------------------+


#### Kaggle Score (Public and Private Score)
<img src='https://i.imgur.com/MFBjhKJ.png' width=800 height=600 />

In [25]:
# Build every classifier pipeline with an ADASYN oversampling step.
classifiers = get_classifiers_pipeline(sampler=ADASYN())

# Run settings passed positionally to get_classifier_score:
#   display_hyper - presumably toggles printing of tuned hyper-parameters (TODO confirm)
#   save_model    - the run output lists a file under saved_models/, so models are persisted
#   category      - prefix seen in the saved file names; 'OrgFe' = original features
display_hyper, save_model, category = False, True, 'OrgFe'

baseline_models = get_classifier_score(
    x_train, y_train, classifiers, display_hyper, save_model, category
)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [11:05<00:00, 83.23s/it]


Example File Name:  saved_models/OrgFe_ADASYN_KNeighborsClassifier_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |   LogisticRegression   | 0.826388888888889  |
|  2  |     XGBClassifier      | 0.7920138888888889 |
|  3  |          SVC           | 0.7642361111111111 |
|  4  | RandomForestClassifier |       0.7625       |
|  5  |       GaussianNB       | 0.7427083333333334 |
|  6  |   StackingClassifier   | 0.7272569444444444 |
|  7  | DecisionTreeClassifier | 0.6817708333333334 |
|  8  |  KNeighborsClassifier  | 0.6055555555555555 |
+-----+------------------------+--------------------+


#### Kaggle Score (Public and Private Score)
<img src='https://i.imgur.com/51L4jUv.png' width=800 height=600 />

### GAN Based Oversampling

In [36]:
# Load the data points that were generated with the GAN and preview the frame.
gan_generated = pd.read_csv(filepath_or_buffer='gan_generate_data.csv')
gan_generated

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-1.231536,-0.757058,0.323471,-0.373336,1.740396,-0.011642,0.787421,-0.554207,-1.111556,-1.101803,...,-1.539691,0.437288,-0.864773,-2.047273,-1.318439,-0.535834,-0.242430,-1.270979,0.167726,0.890323
1,0.527475,-2.009773,0.031520,-0.722841,1.272380,0.217252,0.162855,-0.504717,0.036916,-0.363956,...,0.066485,0.971897,0.824495,-0.193334,-0.845217,0.024121,-0.044567,-0.401558,-0.315309,0.729277
2,1.433272,-1.541269,0.154739,1.055611,-1.003305,1.193708,-0.841651,1.394856,2.288121,1.210096,...,-0.187716,-1.260288,0.528304,0.343590,-0.593913,-0.219812,0.704353,-1.886140,1.295608,-0.353314
3,1.243422,-3.354106,1.253618,-1.147332,0.882893,1.536356,-0.517656,-0.093137,-0.115199,-1.654826,...,1.827667,1.577450,2.854554,0.378508,0.476266,-0.366529,0.693955,-0.849632,-0.833875,0.253276
4,-0.522224,1.797188,1.197893,0.610273,0.527629,0.310325,-0.688651,-2.323620,-1.153003,0.520274,...,2.061646,0.202365,-0.287820,1.410335,-0.582540,0.138703,-1.518092,0.356950,-0.116612,0.742705
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,-2.124128,0.405036,0.221787,-1.100904,2.740901,0.826122,1.344687,-0.337886,-1.193999,-3.253947,...,0.025036,0.600713,0.208316,-0.197595,1.476042,-0.986735,0.611642,0.091400,-0.861588,1.382062
177,-0.738197,-0.814411,2.704579,-0.437952,0.984169,-0.098871,0.145119,0.387999,-0.788392,-1.045667,...,-0.446112,-0.758467,-2.876692,1.000746,-0.523397,-0.683659,0.869735,0.547007,0.553762,-1.778071
178,1.926204,-2.239471,-0.529206,-1.306862,-0.288111,0.499006,0.633132,-1.568048,0.912831,-0.157988,...,1.490367,1.116894,1.565116,2.473838,0.577597,-0.387297,-1.419720,-1.121611,-0.642613,0.302140
179,2.774865,-1.580524,-0.220344,-2.936759,1.112859,-0.326763,-1.337632,-1.818586,-0.433604,-0.924282,...,-0.528094,0.142860,1.301418,0.707005,0.757709,1.041660,-1.178949,-1.898698,-1.873588,-0.140372


In [37]:
# Append 70 GAN-generated data points to x_train to balance the dataset.
# - pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
#   removed in pandas 2.0 (the original call raises AttributeError there).
# - A fixed random_state makes the 70 sampled rows reproducible, so re-running
#   this cell yields the same augmented training set.
gan_generated_x = pd.concat(
    [x_train, gan_generated.sample(n=70, random_state=42)],
    ignore_index=True,  # renumber rows 0..n-1, as the original append did
)
gan_generated_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.098000,2.165000,0.681000,-0.614000,1.309000,-0.455000,-0.236000,0.276000,-2.246000,1.825000,...,0.867000,1.347000,0.504000,-0.649000,0.672000,-2.097000,1.051000,-0.414000,1.038000,-1.065000
1,1.081000,-0.973000,-0.383000,0.326000,-0.428000,0.317000,1.172000,0.352000,0.004000,-0.291000,...,-0.165000,-1.695000,-1.257000,1.359000,-0.808000,-1.624000,-0.458000,-1.099000,-0.936000,0.973000
2,-0.523000,-0.089000,-0.348000,0.148000,-0.022000,0.404000,-0.023000,-0.172000,0.137000,0.183000,...,0.013000,0.263000,-1.222000,0.726000,1.444000,-1.165000,-1.544000,0.004000,0.800000,-1.211000
3,0.067000,-0.021000,0.392000,-1.637000,-0.446000,-0.725000,-1.035000,0.834000,0.503000,0.274000,...,-0.404000,0.640000,-0.595000,-0.966000,0.900000,0.467000,-0.562000,-0.254000,-0.533000,0.238000
4,2.347000,-0.831000,0.511000,-0.021000,1.225000,1.594000,0.585000,1.509000,-0.012000,2.198000,...,0.898000,0.134000,2.415000,-0.996000,-1.006000,1.378000,1.246000,1.478000,0.428000,0.253000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,-0.056573,-0.908669,-0.764223,0.376006,-0.589402,-1.562916,1.041014,0.410800,0.792129,-0.334161,...,-1.728630,-0.007303,2.038653,0.829545,-2.129589,0.165105,1.469861,-1.030338,0.714158,-0.960065
316,-0.604192,0.959602,1.085638,-0.097572,0.094247,-0.993040,-0.388491,0.474882,-1.484817,-1.214495,...,1.047249,-0.503792,-0.197375,1.280109,-1.088370,-0.285495,0.231605,-0.629980,-0.101128,0.078099
317,-0.442997,0.185709,0.330809,0.922754,0.225768,-1.604853,-1.321189,0.659735,0.970101,-1.575665,...,-0.130702,-1.645922,-0.696468,0.696990,2.036208,0.700095,0.256176,0.355959,-0.062697,-0.526736
318,-0.164960,-1.249736,1.569788,-0.828402,0.828032,0.577683,0.578360,0.183324,0.663387,0.831685,...,-1.278772,-0.939224,-2.893024,2.037708,-0.201416,-2.337625,-0.068142,-0.586879,3.043970,-2.007598


In [38]:
# Labels for the augmented training set: the original y_train followed by 70
# zeros, one per GAN-generated row appended to the features.
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
gan_generated_y = pd.concat([y_train, pd.Series([0] * 70)], ignore_index=True)
# Check the resulting class balance.
gan_generated_y.value_counts()

0.0    160
1.0    160
dtype: int64

In [39]:
# Train the plain classifier pipelines (no scaler, sampler or dimensionality
# reduction passed) on the GAN-augmented training set.
classifiers = get_classifiers_pipeline()

# Run settings passed positionally to get_classifier_score:
#   display_hyper - presumably toggles printing of tuned hyper-parameters (TODO confirm)
#   save_model    - the run output lists a file under saved_models/, so models are persisted
#   category      - prefix seen in the saved file names; 'OrgFeGan' = original
#                   features plus the GAN-generated rows
display_hyper, save_model, category = False, True, 'OrgFeGan'

baseline_models = get_classifier_score(
    gan_generated_x, gan_generated_y, classifiers, display_hyper, save_model, category
)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [10:48<00:00, 81.10s/it]


Example File Name:  saved_models/OrgFeGan_DecisionTreeClassifier_8.sav
+-----+------------------------+---------------+
| No. |       Classifier       |  CV AUC Score |
+-----+------------------------+---------------+
|  1  |          SVC           |  0.889453125  |
|  2  | RandomForestClassifier |  0.876171875  |
|  3  |   LogisticRegression   |  0.8712890625 |
|  4  |     XGBClassifier      |  0.869921875  |
|  5  |  KNeighborsClassifier  |  0.851171875  |
|  6  |       GaussianNB       |  0.8498046875 |
|  7  |   StackingClassifier   |   0.83359375  |
|  8  | DecisionTreeClassifier | 0.72177734375 |
+-----+------------------------+---------------+


#### Kaggle Score (Public and Private Score)
<img src='https://i.imgur.com/kYtGvAS.png' width=800 height=600 />

### PCA

In [30]:
# Build every classifier pipeline with a PCA step that keeps 100 components.
classifiers = get_classifiers_pipeline(dimRed=PCA(n_components=100))

# Run settings passed positionally to get_classifier_score:
#   display_hyper - presumably toggles printing of tuned hyper-parameters (TODO confirm)
#   save_model    - the run output lists a file under saved_models/, so models are persisted
#   category      - prefix seen in the saved file names; 'OrgFe' = original features
display_hyper, save_model, category = False, True, 'OrgFe'

baseline_models = get_classifier_score(
    x_train, y_train, classifiers, display_hyper, save_model, category
)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [06:27<00:00, 48.49s/it]


Example File Name:  saved_models/OrgFe_PCA_DecisionTreeClassifier_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |   LogisticRegression   | 0.7510416666666667 |
|  2  |          SVC           | 0.7510416666666666 |
|  3  | RandomForestClassifier | 0.7336805555555557 |
|  4  |       GaussianNB       | 0.732638888888889  |
|  5  |     XGBClassifier      | 0.7100694444444444 |
|  6  |   StackingClassifier   | 0.6930555555555555 |
|  7  |  KNeighborsClassifier  | 0.6607638888888889 |
|  8  | DecisionTreeClassifier | 0.6399305555555556 |
+-----+------------------------+--------------------+


#### Kaggle Score (Public and Private Score)
<img src='https://i.imgur.com/OlAkAWo.png' width=800 height=600 />

### Scaling

In [31]:
# Build every classifier pipeline with a MinMaxScaler scaling step.
classifiers = get_classifiers_pipeline(scaler=MinMaxScaler())

# Run settings passed positionally to get_classifier_score:
#   display_hyper - presumably toggles printing of tuned hyper-parameters (TODO confirm)
#   save_model    - the run output lists a file under saved_models/, so models are persisted
#   category      - prefix seen in the saved file names; 'OrgFe' = original features
display_hyper, save_model, category = False, True, 'OrgFe'

baseline_models = get_classifier_score(
    x_train, y_train, classifiers, display_hyper, save_model, category
)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [08:27<00:00, 63.43s/it]


Example File Name:  saved_models/OrgFe_MinMaxScaler_KNeighborsClassifier_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |   LogisticRegression   | 0.8243055555555555 |
|  2  |     XGBClassifier      | 0.7989583333333334 |
|  3  | RandomForestClassifier | 0.7760416666666666 |
|  4  |          SVC           | 0.7579861111111112 |
|  5  |       GaussianNB       | 0.7381944444444445 |
|  6  |   StackingClassifier   | 0.6758680555555555 |
|  7  | DecisionTreeClassifier | 0.6666666666666666 |
|  8  |  KNeighborsClassifier  | 0.6427083333333333 |
+-----+------------------------+--------------------+


#### Kaggle Score (Public and Private Score)
<img src='https://i.imgur.com/9DA8cxz.png' width=800 height=600 />

In [32]:
# Build every classifier pipeline with a StandardScaler scaling step.
classifiers = get_classifiers_pipeline(scaler=StandardScaler())

# Run settings passed positionally to get_classifier_score:
#   display_hyper - presumably toggles printing of tuned hyper-parameters (TODO confirm)
#   save_model    - the run output lists a file under saved_models/, so models are persisted
#   category      - prefix seen in the saved file names; 'OrgFe' = original features
display_hyper, save_model, category = False, True, 'OrgFe'

baseline_models = get_classifier_score(
    x_train, y_train, classifiers, display_hyper, save_model, category
)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [08:26<00:00, 63.30s/it]


Example File Name:  saved_models/OrgFe_StandardScaler_KNeighborsClassifier_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |   LogisticRegression   | 0.836111111111111  |
|  2  |     XGBClassifier      | 0.7927083333333333 |
|  3  |          SVC           | 0.7541666666666667 |
|  4  | RandomForestClassifier |      0.753125      |
|  5  |       GaussianNB       | 0.7416666666666667 |
|  6  |   StackingClassifier   | 0.6815972222222222 |
|  7  | DecisionTreeClassifier | 0.6802083333333333 |
|  8  |  KNeighborsClassifier  | 0.6204861111111111 |
+-----+------------------------+--------------------+


#### Kaggle Score (Public and Private Score)
<img src='https://i.imgur.com/syJs71h.png' width=800 height=600 />

### Top 100 correlated Engineered Feature Models + MinMaxScaler

In [34]:
# How many of the highest-ranked features to keep.
top = 100
# Sort the 'abs_correlation' column in descending order and keep the first `top` entries.
top_features = (
    feature_imp['abs_correlation']
    .sort_values(ascending=False)
    [:top]
)
# Engineered training matrix built from the selected features
# (feature_engineering_perm is defined elsewhere in this notebook).
x_train_fe_1 = feature_engineering_perm(x_train, top_features)

# Build every classifier pipeline with a MinMaxScaler scaling step.
classifiers = get_classifiers_pipeline(scaler=MinMaxScaler())

# Run settings passed positionally to get_classifier_score:
#   display_hyper - presumably toggles printing of tuned hyper-parameters (TODO confirm)
#   save_model    - the run output lists a file under saved_models/, so models are persisted
#   category      - prefix seen in the saved file names
display_hyper, save_model, category = False, True, 'EngFe100_Cor'

baseline_models = get_classifier_score(
    x_train_fe_1, y_train, classifiers, display_hyper, save_model, category
)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [06:31<00:00, 48.99s/it]


Example File Name:  saved_models/EngFe100_Cor_MinMaxScaler_GaussianNB_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |   LogisticRegression   | 0.9118055555555555 |
|  2  |          SVC           | 0.8854166666666666 |
|  3  |     XGBClassifier      |      0.884375      |
|  4  | RandomForestClassifier | 0.8788194444444445 |
|  5  |  KNeighborsClassifier  | 0.8668402777777777 |
|  6  | DecisionTreeClassifier | 0.8647569444444445 |
|  7  |   StackingClassifier   | 0.8461805555555555 |
|  8  |       GaussianNB       | 0.8369791666666666 |
+-----+------------------------+--------------------+


#### Kaggle Score (Public and Private Score)
<img src='https://i.imgur.com/pZwCoQP.png' width=800 height=600 />

### Top 100 correlated Engineered Feature Models + MinMaxScaler + ADASYN

In [35]:
# How many of the highest-ranked features to keep.
top = 100
# Sort the 'abs_correlation' column in descending order and keep the first `top` entries.
top_features = (
    feature_imp['abs_correlation']
    .sort_values(ascending=False)
    [:top]
)
# Engineered training matrix built from the selected features
# (feature_engineering_perm is defined elsewhere in this notebook).
x_train_fe_1 = feature_engineering_perm(x_train, top_features)

# Build every classifier pipeline with a MinMaxScaler scaling step and ADASYN oversampling.
classifiers = get_classifiers_pipeline(scaler=MinMaxScaler(), sampler=ADASYN())

# Run settings passed positionally to get_classifier_score:
#   display_hyper - presumably toggles printing of tuned hyper-parameters (TODO confirm)
#   save_model    - the run output lists a file under saved_models/, so models are persisted
#   category      - prefix seen in the saved file names
display_hyper, save_model, category = False, True, 'EngFe100_Cor'

baseline_models = get_classifier_score(
    x_train_fe_1, y_train, classifiers, display_hyper, save_model, category
)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [07:30<00:00, 56.33s/it]


Example File Name:  saved_models/EngFe100_Cor_ADASYN_MinMaxScaler_GaussianNB_8.sav
+-----+------------------------+--------------------+
| No. |       Classifier       |    CV AUC Score    |
+-----+------------------------+--------------------+
|  1  |   LogisticRegression   |      0.915625      |
|  2  |     XGBClassifier      | 0.8927083333333334 |
|  3  |          SVC           | 0.888888888888889  |
|  4  | RandomForestClassifier | 0.8822916666666666 |
|  5  | DecisionTreeClassifier | 0.8737847222222221 |
|  6  |   StackingClassifier   | 0.8696180555555555 |
|  7  |  KNeighborsClassifier  |      0.85625       |
|  8  |       GaussianNB       | 0.8401041666666668 |
+-----+------------------------+--------------------+


#### Kaggle Score (Public and Private Score)
<img src='https://i.imgur.com/tDshmui.png' width=800 height=600 />

### Best Model
#### Kaggle Score (Public and Private Score)
<img src='https://i.imgur.com/vdApsyp.png' width=800 height=600 />

<br>
<br>
<br>
Private leaderboard - Top 21%<br>
Public leaderboard - Top 46%