In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as plty
import plotly.figure_factory as ff 
# from dash import Dash, dcc, html, Input, Output

In [2]:
# reading the dataset

dataset = pd.read_csv("train.csv")

In [3]:
dataset

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000,0
2,013f2bd269f5,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,...,7.709560,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,fd3dafe738fd,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,...,0.173229,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000,0
613,fd895603f071,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,...,10.223150,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340,0
614,fd8ef6377f76,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,...,0.173229,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000,0
615,fe1942975e40,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,...,9.256996,0.78764,0.670527,24.594488,72.611063,1965.343176,25.116750,37.155112,0.184622,0


In [4]:
# bar  chart of value
fig = plty.histogram(dataset, x="Class",  color="Class", marginal="box")

fig.show()

In [5]:
# from the above plot as we can see that class "0" is more than "1" hence the data is biased and we need to take care for those data : it can be either done by undersampling or by using keras weights

from copy import deepcopy as DC 

dataset2 = DC(dataset)

dataset2.drop(columns=["Id"], inplace = True)



#Categorical encoding

def encoding_categorical(x): #since unique values are ["A", "B"] also we can perform label encoding 
    if x == "A":
        return 0
    else:
        return 1
dataset2["EJ"] = dataset2["EJ"].apply(lambda x : encoding_categorical(x))

X = dataset2.iloc[:, :-1]
Y = dataset2.iloc[:, -1]

# checking the missing data 
def check_missing_value(flag):
    for j in dataset2.columns:
        if any(dataset2[j].isnull()) == True:
            print("missing data in column {}".format(j))

    if(flag == 1):
        print("-"*50)
        print("no missing data after knn impute")

print("checking for missing values before K-nearest-neighbour-imputation")
check_missing_value(flag=0)

from sklearn.impute import KNNImputer


dataset2 = pd.DataFrame(columns=dataset2.columns,data=KNNImputer().fit_transform(dataset2))

check_missing_value(flag = 1)  

checking for missing values before K-nearest-neighbour-imputation
missing data in column BQ
missing data in column CB
missing data in column CC
missing data in column DU
missing data in column EL
missing data in column FC
missing data in column FL
missing data in column FS
missing data in column GL
--------------------------------------------------
no missing data after knn impute


In [6]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def get_vif_(dataset2):
    vifs = [variance_inflation_factor(dataset2, i) for i in range(dataset2.shape[1])]
    vifs_df = pd.DataFrame(columns=dataset2.columns, data=np.array(vifs).reshape(1, dataset2.shape[1]))
    return vifs_df
 # variation iflation factor data pd series
vifs = get_vif_(dataset2)
columnss = []
for column in vifs.columns:
    if np.array(vifs[column]) > 10:
        columnss.append(column)
print("columns that are needed to be removed are {}".format(columnss))

dataset2.drop(columns= columnss, inplace=True)

columns that are needed to be removed are ['AH', 'AR', 'AX', 'BD ', 'BN', 'CC', 'CH', 'CL', 'CR', 'CS', 'CU', 'DA', 'DH', 'DL', 'DN', 'DV', 'EB', 'EH', 'EJ', 'EP', 'FD ', 'FI', 'GH', 'GL']


In [7]:
# since we have  imbalance data we need to undersample the data else the result are produced biased
from sklearn.utils import shuffle
class0 = dataset2[dataset2["Class"]== 0]
class1 = dataset2[dataset2["Class"]== 1]




def undersampling(class0, class1): # if len of class 1 > class 2 and vice versa
    train_subsets = []
    for i in range(4):
        train_subsets.append(shuffle(pd.concat([class1, class0.sample(n = class1.shape[0], random_state=i)])))
    
    return train_subsets

def oversampling():
    pass

def hybrid_sampling():
    pass
train_sets = undersampling(class0=class0, class1=class1)

In [8]:
Fdataset = pd.concat([train_sets[0],train_sets[1],train_sets[2],train_sets[3]])

In [9]:
# hyper parameter tuning for random forest using randomsearch and gridsearchcv

from sklearn.model_selection import RandomizedSearchCV
import numpy as np
# setting number of trees for the random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10 )]

# no of features to consider after every split
max_features = ["auto", "sqrt", "log2"]

#Minimum no of samples that is required to split a  node
min_sample_split = [2,5,10,14]

# Miniumum no of samples that is required in leaf node
min_sample_leaf = [1,2,4,6,8]

# create a random grid 

random_grid = { 

    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth" : [int(j) for j in np.linspace(10, 1000, 10)],
    "min_samples_leaf" : min_sample_split, 
    "min_samples_leaf" : min_sample_leaf,
    "criterion" : ["entropy", "gini"]
}

In [10]:
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier()

class Splitter(): 
    def __init__(self, algorithm, splits):
        self.algorithm = algorithm
        self.splits = splits
        self.best_score = None
        self.best_param = None

    def make_split(self, X, y, random_split):

        for random_state in random_split:

            if self.algorithm == "kf":

                kf = KFold(n_splits=self.splits, random_state = random_state, shuffle = True)

            else:

                kf = StratifiedKFold(n_splits = self.splits, random_state = random_state, shuffle = True)


            for (train_index, val_index) in kf.split(X, y):

                x_train, y_train = X.iloc[train_index], y.iloc[train_index]
                x_val, y_val = X.iloc[val_index], y.iloc[val_index]

                RF_hyper_tuning = RandomizedSearchCV(estimator=RF, param_distributions= random_grid,n_iter = 20, cv = 5, verbose=2, n_jobs=-1, random_state=100)

                RF_hyper_tuning.fit(x_train,y_train)

                if self.best_score is None or RF_hyper_tuning.best_score_ > self.best_score:
                    self.best_score = RF_hyper_tuning.best_score_
                    self.best_params = RF_hyper_tuning.best_params_




            return self.best_params, self.best_score


                ## calling the model 
        

In [11]:
#seperating the classes and preparing them
import random 
X = Fdataset.iloc[:, :-1]
Y = Fdataset.iloc[:, -1]

random_state_list = random.sample(range(9999), 5)
Split  = Splitter("kf", 5)
best_score, best_params = Split.make_split(X,Y, random_state_list)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.5s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.5s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=8, n_estimators=1000; total time=   2.8s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=8, n_estimators=1000; total time=   2.9s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=8, n_estimators=1000; total time=   3.1s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_le

  warn(
  warn(
  warn(


[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.7s
[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.7s


  warn(


[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.7s
[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.8s


  warn(


[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=200; total time=   0.8s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.1s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.1s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.2s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.2s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.0s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=200; total time=   0.5s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_l

  warn(
  warn(


[CV] END criterion=entropy, max_depth=120, max_features=log2, min_samples_leaf=6, n_estimators=200; total time=   0.8s


  warn(


[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   6.9s


  warn(


[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.1s


  warn(


[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.2s
[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   6.8s
[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.4s
[CV] END criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, n_estimators=2000; total time=   6.0s
[CV] END criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, n_estimators=2000; total time=   6.4s
[CV] END criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, n_estimators=2000; total time=   6.5s
[CV] END criterion=entropy, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.8s
[CV] END criterion=entropy, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.8s
[CV] END criterion=entropy, max_depth=890, max_feat

  warn(
  warn(
  warn(


[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=2, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=2000; total time=   6.2s
[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=2, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=2000; total time=   6.1s
[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=2, n_estimators=200; total time=   0.7s


  warn(
  warn(


[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.1s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.1s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.2s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=2000; total time=   5.9s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.1s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.0s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=2000; total time=   5.6s
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.5s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=8, n_estimators=1000; total time=   2.8s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=8, n_estimators=1000; total time=   2.9s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=8, n_estimators=1000; total time=   3.1s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_le

  warn(
  warn(


[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.7s
[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.8s
[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.8s


  warn(
  warn(
  warn(


[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.8s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   1.9s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   1.8s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.0s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   1.8s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_sample

  warn(


[CV] END criterion=entropy, max_depth=120, max_features=log2, min_samples_leaf=6, n_estimators=200; total time=   0.7s


  warn(


[CV] END criterion=entropy, max_depth=120, max_features=log2, min_samples_leaf=6, n_estimators=200; total time=   0.7s


  warn(


[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   6.9s


  warn(


[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.6s


  warn(


[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.3s
[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.3s
[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.3s
[CV] END criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, n_estimators=2000; total time=   6.7s
[CV] END criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, n_estimators=2000; total time=   6.6s
[CV] END criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, n_estimators=2000; total time=   6.7s
[CV] END criterion=entropy, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.7s
[CV] END criterion=entropy, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=120, max_feature

  warn(


[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=2000; total time=   5.7s
[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=2, n_estimators=200; total time=   0.7s
[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=2, n_estimators=200; total time=   0.7s
[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=2, n_estimators=200; total time=   0.7s


  warn(
  warn(
  warn(
  warn(


[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=2000; total time=   6.1s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.4s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.4s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.4s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.4s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.5s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=2000; total time=   5.9s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=2000; total time=   6.0s
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=8, n_estimators=1000; total time=   3.2s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=8, n_estimators=1000; total time=   3.2s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=8, n_estimators=1000; total time=   3.2s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_le

  warn(
  warn(


[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.7s
[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.8s


  warn(
  warn(


[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.9s
[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.9s


  warn(


[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=200; total time=   0.8s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.2s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.3s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.0s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.2s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.1s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=200; total time=   0.7s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_l

  warn(
  warn(


[CV] END criterion=entropy, max_depth=120, max_features=log2, min_samples_leaf=6, n_estimators=200; total time=   0.7s


  warn(


[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.1s


  warn(


[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.5s
[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.7s


  warn(


[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.2s
[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.2s
[CV] END criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, n_estimators=2000; total time=   6.2s
[CV] END criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, n_estimators=2000; total time=   6.9s
[CV] END criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, n_estimators=2000; total time=   7.1s
[CV] END criterion=entropy, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.7s
[CV] END criterion=entropy, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.7s
[CV] END criterion=entropy, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.7s
[CV] END criterion=gini, max_depth=120, max_features

  warn(
  warn(
  warn(


[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=2, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=2, n_estimators=200; total time=   0.7s


  warn(
  warn(


[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=2, n_estimators=200; total time=   0.8s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.3s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.2s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.1s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.3s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.2s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=2000; total time=   6.1s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=2000; total time=   5.9s
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=8, n_estimators=1000; total time=   3.0s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=8, n_estimators=1000; total time=   3.1s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=8, n_estimators=1000; total time=   3.1s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_le

  warn(
  warn(


[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.7s
[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.8s
[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.8s
[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.8s
[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.8s


  warn(
  warn(
  warn(


[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=200; total time=   0.7s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.1s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.2s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=200; total time=   0.8s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.2s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.3s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.2s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_l

  warn(
  warn(


[CV] END criterion=entropy, max_depth=120, max_features=log2, min_samples_leaf=6, n_estimators=200; total time=   0.7s


  warn(


[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.3s


  warn(


[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   8.0s
[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.6s


  warn(


[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.3s
[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.4s
[CV] END criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, n_estimators=2000; total time=   6.6s
[CV] END criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, n_estimators=2000; total time=   6.9s
[CV] END criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, n_estimators=2000; total time=   6.4s
[CV] END criterion=entropy, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.8s
[CV] END criterion=entropy, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.7s
[CV] END criterion=entropy, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=entropy, max_depth=890, max_featu

  warn(


[CV] END criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, n_estimators=2000; total time=   7.2s
[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=2, n_estimators=200; total time=   0.7s
[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=2, n_estimators=200; total time=   0.7s


  warn(
  warn(
  warn(


[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=2, n_estimators=200; total time=   0.7s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=2000; total time=   6.8s


  warn(


[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.2s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.3s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.4s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.5s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.4s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=2000; total time=   6.3s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=2000; total time=   6.7s
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=8, n_estimators=1000; total time=   3.0s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=8, n_estimators=1000; total time=   3.1s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=8, n_estimators=1000; total time=   3.1s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_le

  warn(
  warn(
  warn(


[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.8s
[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.7s
[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.7s
[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.8s


  warn(
  warn(


[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=1, n_estimators=200; total time=   0.8s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=200; total time=   0.8s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.0s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.3s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.3s
[CV] END criterion=gini, max_depth=340, max_features=auto, min_samples_leaf=4, n_estimators=600; total time=   2.1s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=200; total time=   0.8s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_sample

  warn(
  warn(
  warn(


[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   8.7s


  warn(


[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   8.6s
[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   8.6s


  warn(


[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   7.5s
[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=1, n_estimators=2000; total time=   8.4s
[CV] END criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, n_estimators=2000; total time=   7.1s
[CV] END criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, n_estimators=2000; total time=   8.0s
[CV] END criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=4, n_estimators=2000; total time=   8.1s
[CV] END criterion=entropy, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.7s
[CV] END criterion=entropy, max_depth=890, max_features=log2, min_samples_leaf=8, n_estimators=200; total time=   0.8s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=2000; total time=   6.4s
[CV] END criterion=gini, max_depth=120, max_features=a

  warn(
  warn(
  warn(


[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=2, n_estimators=200; total time=   0.6s
[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=2, n_estimators=200; total time=   0.7s
[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=2, n_estimators=200; total time=   0.7s


  warn(
  warn(


[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.3s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.3s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.3s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.0s
[CV] END criterion=gini, max_depth=10, max_features=auto, min_samples_leaf=6, n_estimators=400; total time=   1.1s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=2000; total time=   5.9s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=8, n_estimators=2000; total time=   5.7s


In [14]:
best_score

{'n_estimators': 2000,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 670,
 'criterion': 'entropy'}

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.2,random_state=100)
final_model = RandomForestClassifier(**best_score)
final_model.fit(x_train, y_train)



In [28]:
y_hat = final_model.predict(x_test)
print(accuracy_score(y_test, y_hat))

0.976878612716763
