### Importing libraries

In [2]:
from NiaPy.algorithms.basic import FireflyAlgorithm, BatAlgorithm, ParticleSwarmOptimization
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from imblearn.under_sampling import RandomUnderSampler

from preprocessing import get_train_test_data
from benchmark import ClassificationBenchmark
from optimizer import optimize
import warnings
warnings.filterwarnings("ignore")

### Loading and preparing training and testing data

In [3]:
%%time

X_train, X_test, y_train, y_test = get_train_test_data('train_transaction.csv', 'train_identity.csv')

CPU times: user 1min 53s, sys: 1min 37s, total: 3min 30s
Wall time: 3min 54s


In [4]:
X_train_nia, y_train_nia = RandomUnderSampler().fit_resample(X_train, y_train)
X_test_nia, y_test_nia = X_test, y_test

### Let's check the score of the algorithm without using nature inspired algorithms

In [5]:
clf = RandomForestClassifier()
clf.fit(X_train_nia, y_train_nia)
y_predict_nia = clf.predict(X_test_nia)
roc_auc_score(y_test_nia, y_predict_nia)

0.8478323189750383

### Optimizing columns using Firefly algorithm and Decision Tree Classifier

In [6]:
%%time

firefly_decision_tree_benchmark = ClassificationBenchmark(DecisionTreeClassifier,
                                                          roc_auc_score,
                                                          X_train_nia,
                                                          y_train_nia,
                                                          X_test_nia,
                                                          y_test_nia)

firefly_decision_tree_columns = optimize(firefly_decision_tree_benchmark, FireflyAlgorithm(), 100)

 20%|██        | 1/5 [29:02<1:56:11, 1742.88s/it]

--------------
Run 1
--------------
Score: 0.7967592589458548
Number of features selected: 107





 40%|████      | 2/5 [58:45<1:28:19, 1766.54s/it]

--------------
Run 2
--------------
Score: 0.7943020237861125
Number of features selected: 122





 60%|██████    | 3/5 [1:28:12<58:53, 1766.53s/it]

--------------
Run 3
--------------
Score: 0.7984324931890024
Number of features selected: 113





 80%|████████  | 4/5 [1:57:36<29:25, 1765.38s/it]

--------------
Run 4
--------------
Score: 0.7952610425488922
Number of features selected: 110





100%|██████████| 5/5 [2:27:31<00:00, 1770.33s/it]

--------------
Run 5
--------------
Score: 0.7922101319280683
Number of features selected: 122




Best score of 5 runs: 0.7984324931890024
Number of features selected: 113
CPU times: user 2h 16min, sys: 11min 10s, total: 2h 27min 11s
Wall time: 2h 27min 31s





### Optimizing columns using Firefly algorithm and Logistic Regression

In [7]:
%%time

firefly_logistic_regression_benchmark = ClassificationBenchmark(LogisticRegression,
                                                                roc_auc_score,
                                                                X_train_nia,
                                                                y_train_nia,
                                                                X_test_nia,
                                                                y_test_nia)

firefly_logistic_regression_columns = optimize(firefly_logistic_regression_benchmark, FireflyAlgorithm(), 100)

 20%|██        | 1/5 [18:44<1:14:57, 1124.49s/it]

--------------
Run 1
--------------
Score: 0.7576257153282225
Number of features selected: 113





 40%|████      | 2/5 [38:12<57:29, 1149.80s/it]  

--------------
Run 2
--------------
Score: 0.756563275005898
Number of features selected: 124





 60%|██████    | 3/5 [1:27:30<1:05:51, 1975.85s/it]

--------------
Run 3
--------------
Score: 0.7581794547681722
Number of features selected: 126





 80%|████████  | 4/5 [4:36:13<1:34:25, 5665.94s/it]

--------------
Run 4
--------------
Score: 0.7575462917676032
Number of features selected: 127





100%|██████████| 5/5 [5:35:46<00:00, 4029.34s/it]  

--------------
Run 5
--------------
Score: 0.7574781690982269
Number of features selected: 121




Best score of 5 runs: 0.7581794547681722
Number of features selected: 126
CPU times: user 6h 19min 44s, sys: 1h 18min 30s, total: 7h 38min 15s
Wall time: 5h 35min 46s





### Optimizing columns using Firefly algorithm and Random Forest Classifier

In [22]:
%%time

def rf_model():
    return RandomForestClassifier(n_estimators=20)


firefly_random_forest_benchmark = ClassificationBenchmark(rf_model,
                                                          roc_auc_score,
                                                          X_train_nia,
                                                          y_train_nia,
                                                          X_test_nia,
                                                          y_test_nia)

firefly_random_forest_columns = optimize(firefly_random_forest_benchmark, FireflyAlgorithm(), 100)

 20%|██        | 1/5 [1:00:02<4:00:08, 3602.06s/it]

--------------
Run 1
--------------
Score: 0.848983939449224
Number of features selected: 103





 40%|████      | 2/5 [1:58:54<2:58:03, 3561.31s/it]

--------------
Run 2
--------------
Score: 0.8463946306762121
Number of features selected: 110





 60%|██████    | 3/5 [2:57:41<1:58:11, 3545.55s/it]

--------------
Run 3
--------------
Score: 0.84779608927391
Number of features selected: 127





 80%|████████  | 4/5 [3:56:58<59:09, 3549.92s/it]  

--------------
Run 4
--------------
Score: 0.8470661515502402
Number of features selected: 123





100%|██████████| 5/5 [4:56:16<00:00, 3555.34s/it]

--------------
Run 5
--------------
Score: 0.8475517683208137
Number of features selected: 124




Best score of 5 runs: 0.848983939449224
Number of features selected: 103
CPU times: user 4h 38min 10s, sys: 16min 29s, total: 4h 54min 39s
Wall time: 4h 56min 16s





### Optimizing columns using Bat Algorithm and Random Forest Classifier

In [36]:
%%time

def rf_bat_model():
    return RandomForestClassifier(n_estimators=10, max_depth=10)

bat_random_forest_benchmark = ClassificationBenchmark(rf_bat_model,
                                                      roc_auc_score,
                                                      X_train_nia,
                                                      y_train_nia,
                                                      X_test_nia,
                                                      y_test_nia)

bat_random_forest_columns = optimize(bat_random_forest_benchmark, BatAlgorithm(), 100)

 20%|██        | 1/5 [41:43<2:46:54, 2503.68s/it]

--------------
Run 1
--------------
Score: 0.8110621932343244
Number of features selected: 118





 40%|████      | 2/5 [1:22:24<2:03:20, 2466.74s/it]

--------------
Run 2
--------------
Score: 0.809136992695334
Number of features selected: 121





 60%|██████    | 3/5 [2:01:39<1:20:31, 2415.64s/it]

--------------
Run 3
--------------
Score: 0.8100410343780643
Number of features selected: 98





 80%|████████  | 4/5 [2:41:00<39:54, 2394.25s/it]  

--------------
Run 4
--------------
Score: 0.8117540565587816
Number of features selected: 127





100%|██████████| 5/5 [3:52:53<00:00, 2794.68s/it]

--------------
Run 5
--------------
Score: 0.8090437085133325
Number of features selected: 122




Best score of 5 runs: 0.8117540565587816
Number of features selected: 127
CPU times: user 2h 49min 16s, sys: 31min 14s, total: 3h 20min 31s
Wall time: 3h 52min 53s





### Optimizing columns using Particle Swarm Optimization and Random Forest Classifier

In [38]:
%%time

particle_swarm_random_forest_benchmark = ClassificationBenchmark(rf_model,
                                                                 roc_auc_score,
                                                                 X_train_nia,
                                                                 y_train_nia,
                                                                 X_test_nia,
                                                                 y_test_nia)

particle_swarm_forest_columns = optimize(particle_swarm_random_forest_benchmark, ParticleSwarmOptimization(), 100)

 20%|██        | 1/5 [1:13:18<4:53:12, 4398.03s/it]

--------------
Run 1
--------------
Score: 0.8547472247448139
Number of features selected: 110





 40%|████      | 2/5 [2:28:36<3:43:26, 4468.67s/it]

--------------
Run 2
--------------
Score: 0.8607576403502151
Number of features selected: 121





 60%|██████    | 3/5 [6:15:36<4:48:14, 8647.38s/it]

--------------
Run 3
--------------
Score: 0.8468617666182758
Number of features selected: 122





 80%|████████  | 4/5 [7:34:30<1:58:22, 7102.70s/it]

--------------
Run 4
--------------
Score: 0.8562549142587715
Number of features selected: 121





100%|██████████| 5/5 [8:51:41<00:00, 6380.37s/it]  

--------------
Run 5
--------------
Score: 0.854956229883906
Number of features selected: 112




Best score of 5 runs: 0.8607576403502151
Number of features selected: 121
CPU times: user 6h 4min 48s, sys: 21min 22s, total: 6h 26min 10s
Wall time: 8h 51min 41s





### Finally, let's check roc_acc score with the best nature inspired and machine learning algorithms for our dataset

In [45]:
%%time

clf = RandomForestClassifier(verbose=100)
clf.fit(X_train[particle_swarm_forest_columns], y_train)
y_predict = clf.predict(X_test[particle_swarm_forest_columns])
roc_auc_score(y_test, y_predict)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
building tree 1 of 100
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.6s remaining:    0.0s
building tree 2 of 100
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.9s remaining:    0.0s
building tree 3 of 100
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    7.5s remaining:    0.0s
building tree 4 of 100
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    9.8s remaining:    0.0s
building tree 5 of 100
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.4s remaining:    0.0s
building tree 6 of 100
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   15.3s remaining:    0.0s
building tree 7 of 100
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   18.5s remaining:    0.0s
building tree 8 of 100
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   21.0s remaining:    0.0s
building tree 9 of 100
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   23.4s remaining: 

0.7474658840859421

### Let's also check the results without feature engineering

In [46]:
clf = RandomForestClassifier(verbose=100)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
roc_auc_score(y_test, y_predict)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
building tree 1 of 100
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.5s remaining:    0.0s
building tree 2 of 100
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.1s remaining:    0.0s
building tree 3 of 100
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   11.3s remaining:    0.0s
building tree 4 of 100
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   15.1s remaining:    0.0s
building tree 5 of 100
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   18.3s remaining:    0.0s
building tree 6 of 100
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   22.1s remaining:    0.0s
building tree 7 of 100
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   25.6s remaining:    0.0s
building tree 8 of 100
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   29.8s remaining:    0.0s
building tree 9 of 100
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   33.4s remaining: 

0.7264743707125577