<a href="https://colab.research.google.com/github/mazensomran/repo1/blob/main/Feature_importance_estimation_for_XGB_LGB_and_CB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import cv2
import pandas as pd
import numpy as np
import os
from skimage.filters import roberts, sobel, scharr, prewitt
from scipy import ndimage as nd
import time
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import PrecisionRecallDisplay
import pickle
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Dataset folders under the Google Drive mount used by this notebook.
# Each path ends with "/" because feature_extractor builds file paths by
# plain string concatenation (directory + file name).
train_path = "/content/drive/MyDrive/augmented_train1/"
# NOTE(review): test_path is not referenced by any cell visible here.
test_path = "/content/drive/MyDrive/augmented_test1/"
# Masks are looked up under the same file name as the image they belong to.
mask_path ="/content/drive/MyDrive/augmented_masks1/"

In [None]:
def _read_grayscale(file_path):
    """Read ``file_path`` with OpenCV and return it converted to grayscale.

    Raises:
        FileNotFoundError: if OpenCV cannot load the file (``cv2.imread``
            returns ``None`` for missing or unreadable images).
    """
    bgr = cv2.imread(file_path)
    if bgr is None:
        raise FileNotFoundError(f"Could not read image: {file_path}")
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)


def feature_extractor(path, mask_path, image_size=(200, 200), prefix='Def'):
    """Build a per-pixel feature table from the defect images in ``path``.

    Only files whose name starts with ``prefix`` are processed. For each of
    them the mask stored under the same file name in ``mask_path`` is
    binarised and used as the per-pixel label, and a set of filter responses
    of the grayscale image is computed as features.

    Args:
        path: Directory containing the images. File names are appended by
            string concatenation, so it must end with a path separator.
        mask_path: Directory containing the masks, same convention as ``path``.
        image_size: ``dsize`` passed to ``cv2.resize`` for both image and mask.
        prefix: File-name prefix selecting the images to process.

    Returns:
        A ``pandas.DataFrame`` with one row per pixel and the columns
        ``label``, ``Pixel_Value``, ``Roberts``, ``Sobel``, ``variance``,
        ``Gaussian3``, ``Scharr``, ``Prewitt``, ``Median3`` and ``Canny_Edge``.
        The index restarts at 0 for every image. An empty DataFrame is
        returned when no file matches ``prefix``.

    Raises:
        FileNotFoundError: if an image or its mask cannot be read.
    """
    # Sorted so that the row order of the result does not depend on the
    # arbitrary order in which os.listdir() returns directory entries.
    defect_files = sorted(name for name in os.listdir(path) if name.startswith(prefix))

    frames = []
    for count, image in enumerate(defect_files, start=1):
        # Label: binary mask, thresholded at 192 and scaled down to {0, 1}.
        mask_gray = _read_grayscale(mask_path + image)
        _, bi_mask = cv2.threshold(mask_gray, 192, 255, cv2.THRESH_BINARY)
        # NOTE(review): cv2.resize runs with its default interpolation here, so
        # mask-edge pixels blended below 255 become 0 after the floor division.
        # Confirm that is intended, or resize with cv2.INTER_NEAREST.
        mask = cv2.resize(bi_mask, image_size) // 255

        img = cv2.resize(_read_grayscale(path + image), image_size)

        # Request a float64 output: by default generic_filter writes into an
        # array of the input dtype (uint8 here), which cannot hold variances
        # larger than 255.
        variance_img = nd.generic_filter(img, np.var, size=3, output=np.float64)

        frames.append(pd.DataFrame({
            "label": mask.reshape(-1),
            'Pixel_Value': img.reshape(-1) / 255.0,   # pixel value itself as a feature
            'Roberts': roberts(img).reshape(-1),
            'Sobel': sobel(img).reshape(-1),
            'variance': variance_img.reshape(-1) / 255.0,   # variance, size=3
            'Gaussian3': nd.gaussian_filter(img, sigma=3).reshape(-1) / 255.0,
            'Scharr': scharr(img).reshape(-1),
            'Prewitt': prewitt(img).reshape(-1),
            'Median3': nd.median_filter(img, size=3).reshape(-1) / 255.0,   # median, size=3
            'Canny_Edge': cv2.Canny(img, 100, 200).reshape(-1) / 255.0,   # thresholds 100 / 200
        }))

        print(count)  # progress: number of images processed so far

    # Concatenate once at the end instead of re-copying the growing table on
    # every iteration.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
# Extract the per-pixel feature/label table for the training images.
Training_data = feature_extractor(path=train_path, mask_path=mask_path)

In [None]:
# Optional filter, currently disabled:
#   Training_data = Training_data[Training_data.label != 0]

# Feature matrix: every column of the table except the per-pixel label.
X_train = Training_data.drop(columns=['label'])
X_train.info()

# Optional scaling of the training set, currently disabled:
#   from sklearn import preprocessing
#   min_max_scaler = preprocessing.MinMaxScaler()
#   X_train = min_max_scaler.fit_transform(X_train)

# Label vector as a NumPy array.
y_train = Training_data['label'].to_numpy()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7680000 entries, 0 to 39999
Data columns (total 9 columns):
 #   Column       Dtype  
---  ------       -----  
 0   Pixel_Value  float64
 1   Roberts      float64
 2   Sobel        float64
 3   variance     float64
 4   Gaussian3    float64
 5   Scharr       float64
 6   Prewitt      float64
 7   Median3      float64
 8   Canny_Edge   float64
dtypes: float64(9)
memory usage: 585.9 MB


In [None]:
# Show the distinct values present in the training labels.
np.unique(y_train)

array([0., 1.])

In [None]:
import xgboost as xgb

# Settings forwarded as keyword arguments to the XGBoost classifier.
params = dict(max_depth=3, eta=0.1, objective='binary:logistic')
XGB_model = xgb.XGBClassifier(**params)

In [None]:
# Fit the XGBoost model and report how long the fit took, in seconds.
# perf_counter() is a monotonic clock, so the measured interval is not
# affected by system clock adjustments the way time.time() can be.
t0 = time.perf_counter()
XGB_model.fit(X_train, y_train)
Training_time = time.perf_counter() - t0
print("Training_time", Training_time)

Training_time 453.20412015914917


In [None]:
# Map each training column to its XGBoost importance score, rounded to 2 decimals.
Feature_importance = {
    name: round(score, 2)
    for score, name in zip(XGB_model.feature_importances_, X_train.columns)
}

# Same mapping, ordered from the highest score to the lowest.
sorted_Feature_importance = dict(
    sorted(Feature_importance.items(), key=lambda pair: pair[1], reverse=True)
)
sorted_Feature_importance

{'Gaussian3': 0.3,
 'Scharr': 0.16,
 'Pixel_Value': 0.12,
 'Roberts': 0.11,
 'Prewitt': 0.1,
 'Median3': 0.1,
 'variance': 0.08,
 'Canny_Edge': 0.02,
 'Sobel': 0.01}

In [None]:
import lightgbm as lgb

# Training parameters handed to lgb.train() in the next cell.
params = dict(
    objective='binary',
    metric='binary_logloss',
    boosting='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

# Wrap the feature matrix and labels in LightGBM's dataset container.
train_data = lgb.Dataset(X_train, label=y_train)

In [None]:
# Train the LightGBM booster for 100 rounds and report the elapsed seconds.
# perf_counter() is a monotonic clock, so the measured interval is not
# affected by system clock adjustments the way time.time() can be.
t0 = time.perf_counter()
LGB_model = lgb.train(params, train_data, num_boost_round=100)
Training_time = time.perf_counter() - t0
print("Training_time", Training_time)

You can set `force_col_wise=true` to remove the overhead.
Training_time 5.506487607955933


In [None]:
# Feature names and their importance values, as reported by the trained booster.
feature_names = LGB_model.feature_name()
feature_importance = LGB_model.feature_importance()

# Put both into one table and rank it with the largest importance first.
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

# Show the ranked table.
print(feature_importance_df)

       Feature  Importance
4    Gaussian3         750
0  Pixel_Value         592
7      Median3         520
1      Roberts         282
6      Prewitt         232
5       Scharr         223
3     variance         219
2        Sobel         107
8   Canny_Edge          75


In [None]:
!pip install catboost
from catboost import CatBoostClassifier

In [None]:
# Fit a CatBoost classifier with its default settings and report the elapsed
# seconds. perf_counter() is a monotonic clock, so the measured interval is
# not affected by system clock adjustments the way time.time() can be.
CB_model = CatBoostClassifier()
t0 = time.perf_counter()
CB_model.fit(X_train, y_train)
Training_time = time.perf_counter() - t0
print("Training_time", Training_time)

In [None]:
# Map each training column to its CatBoost importance score, rounded to 2 decimals.
Feature_importance = {
    name: round(score, 2)
    for score, name in zip(CB_model.feature_importances_, X_train.columns)
}

# Same mapping, ordered from the highest score to the lowest.
sorted_Feature_importance = dict(
    sorted(Feature_importance.items(), key=lambda pair: pair[1], reverse=True)
)
sorted_Feature_importance

{'Gaussian3': 30.81,
 'Median3': 19.73,
 'Pixel_Value': 12.97,
 'variance': 10.6,
 'Prewitt': 8.75,
 'Scharr': 8.28,
 'Sobel': 4.07,
 'Roberts': 3.92,
 'Canny_Edge': 0.88}