# RandomForest Classifier Modelling

In [1]:
# Update sklearn to prevent version mismatches
#!pip install sklearn --upgrade
#! pip install shap
# Install XGBoost 
#!pip install lightgbm --upgrade

### Read the CSV and Perform Basic Data Cleaning

In [10]:
# Import depedencies
import pandas as pd
import numpy as np
import pickle
import os

In [11]:
# Load libraries
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.inspection import permutation_importance
import shap

In [12]:
# File path 
data_feature_file = os.path.join("","data","featureData","feature_dataframe.csv")
model_result = os.path.join("","data","results","RandomForest.csv")
model_file_name = os.path.join("","data","model",'randomForest_finalized_model.pickle')

In [13]:
# Read data
df = pd.read_csv(data_feature_file)

In [14]:
# Display Sample data
df.head()

Unnamed: 0,age,job,education,contact,duration,campaign,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,...,jun,mar,may,nov,oct,sep,mon,thu,tue,wed
0,56,4,2,0,261,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0
1,57,7,5,0,149,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0
2,37,7,5,0,226,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0
3,40,10,3,0,151,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0
4,56,7,5,0,307,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0


In [15]:
# Check data size
df.shape

(41188, 36)

In [16]:
# Check dependent value count to see if data is imbalanced
df['y'].value_counts()

0    36548
1     4640
Name: y, dtype: int64

#### Random Forest is a tree-based model and hence does not require feature scaling. 

#### Build base model 

In [9]:
# Let's seperate dependent and independent variables
X= df.drop(columns = ['y'])
y = df['y']

In [10]:
# Get feature list
feature_name = X.columns.tolist()

In [11]:
# Shape of independent variable or features
X.shape

(41188, 35)

In [12]:
# Split data to train and test and check size of train data.
# Using 70-/30 split with random state as 420 (hyper parameter)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=140, test_size=0.30)

# Print shape of train data
print(X_train.shape)

(28831, 35)


In [13]:
# Check the shape of features
print(X_train.shape)
print(X_test.shape)

(28831, 35)
(12357, 35)


In [14]:
# Model instance
model_base = RandomForestClassifier(n_estimators=200)

# Fit model
model_base.fit(X_train, y_train)

# Predict
pred_base= model_base.predict(X_test)

In [15]:
# Scores and accurancy
model_base_score = round(model_base.score(X_train, y_train)*100,3)
model_base_accuracy = round(model_base.score(X_test, y_test)*100,3)

In [16]:
print("Base model training model score: ",model_base_score)
print("Base model testing model score: ",model_base_accuracy)

Base model training model score:  100.0
Base model testing model score:  91.236


In [17]:
# Print matrix results
print("Accuracy score %s" %accuracy_score(y_test,pred_base))
print(confusion_matrix(y_test, pred_base))
print("Classification report  \n %s" %(classification_report(y_test, pred_base)))

Accuracy score 0.9123573682932751
[[10603   374]
 [  709   671]]
Classification report  
               precision    recall  f1-score   support

           0       0.94      0.97      0.95     10977
           1       0.64      0.49      0.55      1380

    accuracy                           0.91     12357
   macro avg       0.79      0.73      0.75     12357
weighted avg       0.90      0.91      0.91     12357



###### As data is imbalanced, F1 score is important. F1 score for class 1 value prediction is very low compared to class 0 prediction.

In [18]:
# Let' use different methods to handle imbalnced data and test model results

In [19]:
from sklearn.metrics import classification_report
def randForest(X_train, y_train, X_test, y_test):
    
    # Build model
    model = RandomForestClassifier(n_estimators = 200, oob_score = True, n_jobs = -1,random_state =50, 
                              max_features = "log2", min_samples_leaf = 100)
    
    # Fir model
    model.fit(X_train, y_train)
    
    # Scores and accurancy
    model_train_score = round(model.score(X_train, y_train)*100,3)
    model_test_score = round(model.score(X_test, y_test)*100,3)

    print("\nTraining model score: ",model_train_score)
    print("Testing model score: ",model_test_score)
    
    # Get accurancy
    accuracy = model.score(X_test, y_test)
    print("\nAccuracy", accuracy, "\n")
    
    # Get predictions
    y_pred = model.predict(X_test)

    # Get confusion matrix
    rnf_matri = classification_report(y_test,y_pred)
    print(rnf_matri)

In [20]:
# Get Model results
def getModelResult(X,y):
    
    # Split data into train-test -- statify y will insure there will be eqal represenation for each class
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)
    
    # Get model results 
    randForest(X_train, y_train, X_test, y_test)

In [21]:
def featureSelection(X,y,method):
    # Split data into train-test -- statify y will insure there will be eqal represenation for each class
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)
    
    # Get Random Forest Regressor instance and fit training data
    rf = RandomForestRegressor(n_estimators=200)
    rf.fit(X_train, y_train)
    
    if method == 1:
        # Using RF to select features
        sorted_idx = rf.feature_importances_.argsort()
        
        # Select X and y
        X=X[X.columns[rf.feature_importances_>0.001]]
        
        # Get results witout feature selection
        print("\n-- Model performace after Random Forest feature selection -- ")
        getModelResult(X,y)
    
    elif method==2:
        
        # Get feature importance
        perm_importance = permutation_importance(rf, X_test, y_test)

        # Get Index of feature
        sorted_idx = perm_importance.importances_mean.argsort()

        # Recreate dependent and independent data set using selected features
        X=X[X.columns[perm_importance.importances_mean>0]]

        # Get results witout feature selection
        print("\n-- Model performace after Permutation feature selection -- ")
        getModelResult(X,y)
    
    elif method==3:
        explainer = shap.TreeExplainer(rf)
        shap_values = explainer.shap_values(X_test)
        
        # Get feature names
        vals= np.abs(shap_values).mean(0)
        feature_importance = pd.DataFrame(list(zip(X.columns,vals)),columns=['col_name','feature_importance_vals'])
        feature_importance.sort_values(by=['feature_importance_vals'],ascending=False,inplace=True)
        feature_importance = feature_importance[feature_importance.feature_importance_vals > 0.002 ]
        
        # Recreate dependent and independent data set using selected features
        X= X[feature_importance.col_name]

        # Get results witout feature selection
        print("\n -- Model performace after SHAP feature selection -- ")
        getModelResult(X,y)


#### Method 1: Undersampling

In [22]:
# Class count
count_class_0, count_class_1 = df.y.value_counts()

# Divide by class
df_class_0 = df[df['y'] == 0]
df_class_1 = df[df['y'] == 1]

In [23]:
# Undersample 0-class and concat the DataFrames of both class
df_class_0_under = df_class_0.sample(count_class_1)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_test_under.y.value_counts())

Random under-sampling:
1    4640
0    4640
Name: y, dtype: int64


In [24]:
# Get features and dependent variable data 
X = df_test_under.drop('y',axis='columns')
y = df_test_under['y']

In [25]:
# Get results witout feature selection
print("-- Model performace without feature selection -- ")
getModelResult(X,y)

-- Model performace without feature selection -- 

Training model score:  86.692
Testing model score:  87.015

Accuracy 0.8701508620689655 

              precision    recall  f1-score   support

           0       0.91      0.82      0.86       928
           1       0.84      0.92      0.88       928

    accuracy                           0.87      1856
   macro avg       0.87      0.87      0.87      1856
weighted avg       0.87      0.87      0.87      1856



Model without feature selection with undersampling imporved overall F1 score. 

In [26]:
# Get Model performance for undersampled data using feature selection
featureSelection(X,y,1)
featureSelection(X,y,2)
featureSelection(X,y,3)


-- Model performace after Random Forest feature selection -- 

Training model score:  86.503
Testing model score:  87.015

Accuracy 0.8701508620689655 

              precision    recall  f1-score   support

           0       0.91      0.83      0.86       928
           1       0.84      0.91      0.88       928

    accuracy                           0.87      1856
   macro avg       0.87      0.87      0.87      1856
weighted avg       0.87      0.87      0.87      1856


-- Model performace after Permutation feature selection -- 

Training model score:  87.527
Testing model score:  87.608

Accuracy 0.8760775862068966 

              precision    recall  f1-score   support

           0       0.92      0.82      0.87       928
           1       0.84      0.93      0.88       928

    accuracy                           0.88      1856
   macro avg       0.88      0.88      0.88      1856
weighted avg       0.88      0.88      0.88      1856


 -- Model performace after SHAP feature

#### Under sampling gave much better F1 score comapred wth base model. Feature selection did not add much value in improving model accuracy.

#### Method 2: Over Sampling

In [27]:
# Oversample 1-class and concat the DataFrames of both classes
df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_test_over.y.value_counts())

Random over-sampling:
1    36548
0    36548
Name: y, dtype: int64


In [28]:
# Get features and dependent variable data 
X = df_test_over.drop('y',axis='columns')
y = df_test_over['y']

In [29]:
# Get model results for oversampled data
print("-- Model performace without feature selection -- ")
getModelResult(X,y)

-- Model performace without feature selection -- 

Training model score:  89.254
Testing model score:  89.295

Accuracy 0.8929548563611491 

              precision    recall  f1-score   support

           0       0.95      0.83      0.89      7310
           1       0.85      0.96      0.90      7310

    accuracy                           0.89     14620
   macro avg       0.90      0.89      0.89     14620
weighted avg       0.90      0.89      0.89     14620



Model results imporved slightly with data oversampling technique. 

In [30]:
# Get Model performance for oversampled data using feature selection
featureSelection(X,y,1)
featureSelection(X,y,2)
featureSelection(X,y,3)


-- Model performace after Random Forest feature selection -- 

Training model score:  89.226
Testing model score:  89.323

Accuracy 0.8932284541723666 

              precision    recall  f1-score   support

           0       0.95      0.83      0.89      7310
           1       0.85      0.96      0.90      7310

    accuracy                           0.89     14620
   macro avg       0.90      0.89      0.89     14620
weighted avg       0.90      0.89      0.89     14620


-- Model performace after Permutation feature selection -- 

Training model score:  89.055
Testing model score:  89.159

Accuracy 0.8915868673050615 

              precision    recall  f1-score   support

           0       0.95      0.83      0.88      7310
           1       0.85      0.96      0.90      7310

    accuracy                           0.89     14620
   macro avg       0.90      0.89      0.89     14620
weighted avg       0.90      0.89      0.89     14620


 -- Model performace after SHAP feature

#### Over sampling gave much better F1 score comapred wth base model. Feature selection did not add much value in improving model accuracy.

### Method 3: SMOTE

In [17]:
X = df.drop('y',axis='columns')
y = df['y']

In [18]:
from imblearn.over_sampling import SMOTE

# Get smote object
smote = SMOTE(sampling_strategy='minority')

# Fit data
X_smote, y_smote = smote.fit_sample(X, y)

# Check SMOTE results 
y_smote.value_counts()

1    36548
0    36548
Name: y, dtype: int64

In [33]:
# Get model results for oversampled data
print("-- Model performace without feature selection -- ")
getModelResult(X_smote, y_smote)

-- Model performace without feature selection -- 

Training model score:  92.009
Testing model score:  91.703

Accuracy 0.91703146374829 

              precision    recall  f1-score   support

           0       0.95      0.88      0.91      7310
           1       0.89      0.95      0.92      7310

    accuracy                           0.92     14620
   macro avg       0.92      0.92      0.92     14620
weighted avg       0.92      0.92      0.92     14620



Model imporved overall F1 score even wothout feature selection.

In [34]:
# Get Model performance for SMOTE data using feature selection
featureSelection(X_smote, y_smote,1)
featureSelection(X_smote, y_smote,2)
featureSelection(X_smote, y_smote,3)


-- Model performace after Random Forest feature selection -- 

Training model score:  91.961
Testing model score:  91.676

Accuracy 0.9167578659370725 

              precision    recall  f1-score   support

           0       0.95      0.88      0.91      7310
           1       0.89      0.95      0.92      7310

    accuracy                           0.92     14620
   macro avg       0.92      0.92      0.92     14620
weighted avg       0.92      0.92      0.92     14620


-- Model performace after Permutation feature selection -- 

Training model score:  92.132
Testing model score:  91.929

Accuracy 0.9192886456908345 

              precision    recall  f1-score   support

           0       0.95      0.88      0.92      7310
           1       0.89      0.96      0.92      7310

    accuracy                           0.92     14620
   macro avg       0.92      0.92      0.92     14620
weighted avg       0.92      0.92      0.92     14620


 -- Model performace after SHAP feature

#### SMOTE model gave much better F1 score comapred wth base model. Feature selection did not add much value in improving model accuracy.

#### Method 4: Use of Ensemble with undersampling

In [35]:
# Get features and dependent variable data 
X = df.drop('y',axis='columns')
y = df['y']

# Split data into train-test -- statify y will insure there will be eqal represenation for each class
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)

In [36]:
y_train.value_counts()

0    29238
1     3712
Name: y, dtype: int64

In [37]:
29238/3712  # We can build 8 models

7.876616379310345

In [38]:
25990+3712

29702

model1 --> class1(3712) + class0(0, 3712)

model2 --> class1(3712) + class0(3713, 7425)

model3 --> class1(3712) + class0(7425, 11137)

model4 --> class1(3712) + class0(11138, 14850)

model5 --> class1(3712) + class0(14851, 18563)

model6 --> class1(3712) + class0(18564, 22276)

model7 --> class1(3712) + class0(22277, 25989)

model8 --> class1(3712) + class0(25990, 29238)

In [39]:
def get_train_batch(df_majority, df_minority, start, end):
    df_train = pd.concat([df_majority[start:end], df_minority], axis=0)
    X_train = df_train.drop('y', axis='columns')
    y_train = df_train.y
    return X_train, y_train 

In [40]:
# Preapre df for training data set
df_train = X_train.copy()
df_train['y'] = y_train

In [41]:
# Get class level data
df_class0 = df_train[df_train.y==0]
df_class1 = df_train[df_train.y==1]

In [42]:
# Let's build 9 models

In [43]:
# Model 1---

# Train test split
X_train, y_train = get_train_batch(df_class0, df_class1, 0, 3712)

# Build model
model = RandomForestClassifier(n_estimators=200)

# Fit model
model.fit(X_train, y_train)

# Get predictions
y_pred1 = model.predict(X_test)

In [44]:
# Model 2---3713, 7425

# Train test split
X_train, y_train = get_train_batch(df_class0, df_class1, 3713, 7425)

# Build model
model = RandomForestClassifier(n_estimators=200)

# Fit model
model.fit(X_train, y_train)

# Get predictions
y_pred2 = model.predict(X_test)

In [45]:
# Model 3---7425, 11137

# Train test split
X_train, y_train = get_train_batch(df_class0, df_class1, 7425, 11137)

# Build model
model = RandomForestClassifier(n_estimators=200)

# Fit model
model.fit(X_train, y_train)

# Get predictions
y_pred3 = model.predict(X_test)

In [46]:
# Model 4---11138, 14850

# Train test split
X_train, y_train = get_train_batch(df_class0, df_class1, 11138, 14850)

# Build model
model = RandomForestClassifier(n_estimators=200)

# Fit model
model.fit(X_train, y_train)

# Get predictions
y_pred4 = model.predict(X_test)

In [47]:
# Model 5---14851, 18563

# Train test split
X_train, y_train = get_train_batch(df_class0, df_class1,14851, 18563)

# Build model
model = RandomForestClassifier(n_estimators=200)

# Fit model
model.fit(X_train, y_train)

# Get predictions
y_pred5 = model.predict(X_test)

In [48]:
# Model 6---18564, 22276

# Train test split
X_train, y_train = get_train_batch(df_class0, df_class1,18564, 22276)

# Build model
model = RandomForestClassifier(n_estimators=200)

# Fit model
model.fit(X_train, y_train)

# Get predictions
y_pred6 = model.predict(X_test)

In [49]:
# Model 7---18564, 22276

# Train test split
X_train, y_train = get_train_batch(df_class0, df_class1,22277, 25989)

# Build model
model = RandomForestClassifier(n_estimators=200)

# Fit model
model.fit(X_train, y_train)

# Get predictions
y_pred7 = model.predict(X_test)

In [50]:
# Model 8---25990, 29238

# Train test split
X_train, y_train = get_train_batch(df_class0, df_class1,25990, 29238)

# Build model
model = RandomForestClassifier(n_estimators=200)

# Fit model
model.fit(X_train, y_train)

# Get predictions
y_pred8 = model.predict(X_test)

In [51]:
# Get check all predictions
y_pred_final =y_pred1.copy()

# Loop through results
for i in range(len(y_pred1)):
    # Get results from all models
    n_ones = y_pred1[i] + y_pred2[i] + y_pred3[i]+ y_pred4[i] + y_pred5[i] + y_pred6[i]+ y_pred7[i] + y_pred8[i] 
    # If sum is more than 1 means there is misclassification else classified correctly
    if n_ones>1:
        y_pred_final[i] = 1
    else:
        y_pred_final[i] = 0

In [52]:
# Get classification report
rnf_matrix = classification_report(y_test, y_pred_final)
print(rnf_matrix)

              precision    recall  f1-score   support

           0       0.99      0.82      0.90      7310
           1       0.40      0.96      0.57       928

    accuracy                           0.84      8238
   macro avg       0.70      0.89      0.73      8238
weighted avg       0.93      0.84      0.86      8238



Looking at all models with feature selection, we will consider test_selectFeature_score =91.929 of RF model with permutation feature selection method of feature selection.

In [None]:
test_selectFeature_score = 91.929

Cross-validation is not necessary for random forest, because multiple bagging in process of training random forest prevents over-fitting.

In [19]:
# Let's use smote data for grid search
len(X_smote),len(y_smote)

(73096, 73096)

In [20]:
def train_test(X,y):
    # Split data into train-test -- statify y will insure there will be equal represenation for each class
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50, stratify=y)
    return X_train, X_test, y_train, y_test

In [21]:
# Split data into train-test -- statify y will insure there will be equal represenation for each class
X_train, X_test, y_train, y_test = train_test(X_smote, y_smote)

In [23]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [26]:
# Use the random grid to search for best hyperparameters

# First create the base model to tune
rf = RandomForestClassifier(n_estimators = 200, random_state = 42)

# Random search of parameters, using 5 fold cross validation, search across 100 different combinations 
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                              n_iter = 100, scoring='neg_mean_absolute_error', 
                              cv = 10, verbose=2, random_state=42, n_jobs=-1,
                              return_train_score=True)

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.


RandomizedSearchCV(cv=10,
                   estimator=RandomForestClassifier(n_estimators=200,
                                                    random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, return_train_score=True,
 

In [43]:
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

In [63]:
#Model with optimal parameters
grid_search=RandomForestClassifier(max_depth= None, max_features= 'sqrt', min_samples_leaf= 1, 
                            min_samples_split= 2, n_estimators= 400)

In [64]:
# Fit the data
grid_search.fit(X_train, y_train)

RandomForestClassifier(max_features='sqrt', n_estimators=400)

In [65]:
# Get prediction and test model accuracy
y_pred_tuned = grid_search.predict(X_test)

In [66]:
grid_search_training_score = round(grid_search.score(X_train, y_train)*100,3)
grid_search_tuned_accuracy = round(grid_search.score(X_test, y_test)*100,3)

In [67]:
# Print matrix results
print("Accuracy for Random Forest on CV data: %s" %accuracy_score(y_test,y_pred_tuned))
print("Classification report  \n %s" %(classification_report(y_test, y_pred_tuned)))

Accuracy for Random Forest on CV data: 0.9502735978112176
Classification report  
               precision    recall  f1-score   support

           0       0.96      0.94      0.95      7310
           1       0.94      0.96      0.95      7310

    accuracy                           0.95     14620
   macro avg       0.95      0.95      0.95     14620
weighted avg       0.95      0.95      0.95     14620



In [70]:
# Get train and test scores
training_scoreTuned = round(rf_random.score(X_train, y_train)*100,3)
test_scoreTuned = round(accuracy_score(y_test, y_pred_tuned)*100,3)
print("Testing model score (Tuned model): ",test_scoreTuned)

Testing model score (Tuned model):  95.027


#### Tuned model gave the best results for accuracy and F1 score. 

#### Model Evaluation

In [73]:
# Save results in csv file
evaluations = {'': ['Base Model', 'Select Features Model', 'Tuned Model'],
                    'Accuracy': [f"{model_base_accuracy}%", f"{test_selectFeature_score}%", f"{test_scoreTuned}%"]}

evaluations_df = pd.DataFrame(evaluations)
evaluations_df = evaluations_df.set_index('')

# Export model result data
evaluations_df.to_csv(model_result)

# Show model results
evaluations_df

Unnamed: 0,Accuracy
,
Base Model,91.236%
Select Features Model,91.929%
Tuned Model,95.027%


### Save Model

In [72]:
# Save the model 
pickle.dump(grid_search, open(model_file_name, 'wb'))

### Summary: 
- F1-Score is the weighted average of Precision and Recall used in all types of classification algorithms. Therefore, this score takes both false positives and false negatives into account. F1-Score is usually more useful than accuracy, especially if you have an uneven class distribution.
- Base model accuracy of  91.24% which is slighly imporved to 91.93% with feature selection and handling imbalnced data using SMOTE technique which was further improved to 95.027 with GridSearch.