In [35]:
import pandas as pd
from boruta import BorutaPy
from pandas.plotting import scatter_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

# Read the feature matrix (X) and the label vector (y) from headerless CSV files
# whose first column is the row index.
# BorutaPy works on plain numpy arrays, so both are unwrapped with `.values`.

X = pd.read_csv('borutax_211105.csv', header=None, index_col=0).values

# The label file is read as a frame; flatten it to a 1-D vector in the same step.
y = pd.read_csv('borutay_25211105.csv', header=None, index_col=0).values.ravel()

print("X", X)
print("y", y)

X [[1. 1. 1. ... 1. 1. 0.]
 [1. 1. 1. ... 1. 1. 0.]
 [1. 1. 1. ... 1. 1. 0.]
 ...
 [1. 1. 1. ... 1. 0. 1.]
 [1. 1. 1. ... 1. 0. 1.]
 [1. 1. 1. ... 1. 1. 1.]]
y [2 1 2 ... 2 2 1]


In [36]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [37]:
# Baseline model: a random forest trained on every feature (fixed seed).
rf = RandomForestClassifier(max_depth=30, n_estimators=500, random_state=42)
rf.fit(train_x, train_y)

# Training-set diagnostics. Predict once and reuse the result for both the
# confusion matrix and the accuracy; rf.score(train_x, train_y) would call
# predict() a second time and return the same accuracy value.
train_pred = rf.predict(train_x)
print(rf.classes_)
print(confusion_matrix(train_y, train_pred, labels=rf.classes_))
trainaccuracy_random_forest = accuracy_score(train_y, train_pred)
print('TrainAccuracy: {}'.format(trainaccuracy_random_forest))

# Evaluation on the held-out split.
y_pred = rf.predict(test_x)
accuracy_random_forest = accuracy_score(test_y, y_pred)
print('Accuracy: {}'.format(accuracy_random_forest))

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(test_y, y_pred))

[1 2]
[[117  91]
 [ 56 445]]
TrainAccuracy: 0.7926657263751763
Accuracy: 0.7213114754098361
              precision    recall  f1-score   support

           1       0.55      0.44      0.49        93
           2       0.77      0.84      0.81       212

    accuracy                           0.72       305
   macro avg       0.66      0.64      0.65       305
weighted avg       0.71      0.72      0.71       305



In [38]:
# One row per feature, in the original column order (left unsorted), with the
# fitted forest's importance in a single column labelled 0.
df = pd.Series(rf.feature_importances_).to_frame()

df.head()

Unnamed: 0,0
0,0.0
1,0.0
2,0.000242
3,0.0
4,0.0


In [39]:
# Write the importance table to CSV without the row index.
# The two commented-out calls below are alternative output file names
# (presumably for other runs of this notebook -- TODO confirm); only the
# '_25' file is written here.
#df.to_csv('boruta211105_75.csv',index=False)
#df.to_csv('boruta211105_50.csv',index=False)
df.to_csv('boruta211105_25.csv',index=False)

In [40]:
# Base estimator handed to Boruta: a shallow random forest that runs on all
# CPU cores. class_weight='balanced' makes scikit-learn weight each class
# inversely to its frequency in y.
rf = RandomForestClassifier(
    max_depth=7,
    class_weight='balanced',
    n_jobs=-1,
)

# Boruta feature selector wrapped around that forest. 'auto' lets Boruta pick
# the number of trees itself; the seed fixes its random state.
feat_selector = BorutaPy(
    estimator=rf,
    n_estimators='auto',
    two_step=False,
    verbose=2,
    random_state=1,
)

In [41]:
# Run Boruta on the training split only to find all relevant features.
# How many features end up confirmed depends on the data; read
# feat_selector.n_features_ after fitting rather than assuming a fixed count.
feat_selector.fit(train_x, train_y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	9 / 100
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	10 / 100
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	11 / 100
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	12 / 100
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	13 / 100
Confirmed: 	0
Tentative: 	438
Rejected: 	0
Iteration: 	14 / 100
Confirmed: 	6
Tentative: 	38
Rejected: 	394
Iteration: 	15 / 100
Confirmed: 	6
Tentative: 	38
Rejected: 	394
Iteration: 	16 / 100
Confirmed: 	6
Tentative: 	

BorutaPy(estimator=RandomForestClassifier(class_weight='balanced', max_depth=7,
                                          n_estimators=60, n_jobs=-1,
                                          random_state=RandomState(MT19937) at 0x7FA2B0390B40),
         n_estimators='auto',
         random_state=RandomState(MT19937) at 0x7FA2B0390B40, two_step=False,
         verbose=2)

In [42]:
# Boolean column mask produced by the fitted selector; apply the same mask to
# both splits so they keep identical feature columns.
selected_mask = feat_selector.support_
train_x_selected = train_x[:, selected_mask]
test_x_selected = test_x[:, selected_mask]

print("train_x_selected", train_x_selected)
print("test_x_selected", test_x_selected)

train_x_selected [[1. 1. 1. 0. 1. 0.]
 [1. 1. 1. 1. 0. 0.]
 [1. 1. 1. 0. 0. 0.]
 ...
 [0. 0. 1. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0.]
 [1. 1. 0. 1. 0. 0.]]
test_x_selected [[1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 0. 0. 1.]
 [1. 1. 1. 1. 1. 1.]
 ...
 [0. 0. 1. 0. 1. 0.]
 [1. 1. 1. 1. 1. 0.]
 [1. 0. 1. 0. 1. 0.]]


In [43]:
# Same forest configuration as the baseline, refit on the Boruta-selected
# columns only, so the two runs can be compared directly.
rf = RandomForestClassifier(max_depth=30, n_estimators=500, random_state=42)
rf.fit(train_x_selected, train_y)

# Training-set diagnostics. Predict once and reuse the result for both the
# confusion matrix and the accuracy; rf.score(...) would call predict() a
# second time and return the same accuracy value.
train_pred = rf.predict(train_x_selected)
print(rf.classes_)
print(confusion_matrix(train_y, train_pred, labels=rf.classes_))
trainaccuracy_random_forest = accuracy_score(train_y, train_pred)
print('TrainAccuracy: {}'.format(trainaccuracy_random_forest))

# Evaluation on the held-out split (same selected columns).
y_pred = rf.predict(test_x_selected)
accuracy_random_forest = accuracy_score(test_y, y_pred)
print('Accuracy: {}'.format(accuracy_random_forest))

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(test_y, y_pred))

[1 2]
[[ 60 148]
 [ 25 476]]
TrainAccuracy: 0.7559943582510579
Accuracy: 0.7180327868852459
              precision    recall  f1-score   support

           1       0.59      0.25      0.35        93
           2       0.74      0.92      0.82       212

    accuracy                           0.72       305
   macro avg       0.66      0.59      0.58       305
weighted avg       0.69      0.72      0.68       305



In [44]:
# Boolean mask over the feature columns: True marks a feature Boruta selected.
# This is the same mask used to slice train_x / test_x into the *_selected arrays.
feat_selector.support_

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [45]:
# Boruta's per-feature ranking as a one-column frame (column label 0, one row
# per feature in original column order). In BorutaPy, rank 1 marks selected
# features and rank 2 tentative ones.
rank = pd.Series(feat_selector.ranking_).to_frame()

# Saved with the row index (the to_csv default), so each row keeps its feature position.
# Alternative output names kept for reference:
#   'boruta211105_75rank.csv', 'boruta211105_50rank.csv'
rank.to_csv('boruta211105_25rank.csv')

# Plain-text dump of the ranking table (equivalent raw array: feat_selector.ranking_).
print(rank)

       0
0    408
1    408
2    378
3    408
4    408
..   ...
433   40
434    2
435   53
436    3
437   74

[438 rows x 1 columns]


In [46]:
# Importances of whichever forest `rf` currently refers to. When the cells run
# top to bottom, that is the model refit on the Boruta-selected columns, so
# there is one value per selected feature.
rf.feature_importances_

array([0.14773271, 0.12487329, 0.20307271, 0.24268852, 0.1287447 ,
       0.15288807])

In [47]:
# Number of features the fitted Boruta selector marked as selected.
feat_selector.n_features_ 

6