In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders.woe import WOEEncoder
from sklearn.compose import ColumnTransformer
from NiaPy.task import StoppingTask, OptimizationType
from NiaPy.algorithms.basic import GreyWolfOptimizer, FireflyAlgorithm, GeneticAlgorithm, \
    BatAlgorithm, ParticleSwarmOptimization

In [None]:
from preprocessing import preprocess

# Load the train/test features and labels from the project's `preprocessing` module.
# NOTE(review): `preprocess()` is defined outside this notebook; how it splits,
# scales or encodes the data cannot be confirmed from here.
X_train, X_test, y_train, y_test = preprocess()

In [1]:
from xgboost import XGBRFClassifier
from sklearn.metrics import f1_score

# Baseline: fit an XGBRFClassifier with default hyper-parameters on the training
# data and report the F1 score of its predictions on the test data.
# NOTE(review): the recorded output of this cell is a NameError for `X_train`,
# i.e. it was last executed before the data-loading cell — re-run top to bottom.
clf = XGBRFClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score(y_test, y_pred)

NameError: name 'X_train' is not defined

In [136]:
from sklearn.metrics import accuracy_score

# Accuracy of the baseline predictions on the test labels.
# Arguments are given in scikit-learn's (y_true, y_pred) order, consistent with
# the f1_score call in the baseline cell.
accuracy_score(y_test, y_pred)

0.9718059436118872

In [137]:
from imblearn.under_sampling import RandomUnderSampler


def random_undersample(x, y):
    """
    Randomly drop majority-class samples from the given data.

    :param x: train data features.
    :param y: train data labels.
    :return: x, y after undersampling, exactly as returned by
        ``RandomUnderSampler.fit_resample``.
    """
    # fixed seed so the same subset is drawn on every call
    sampler = RandomUnderSampler(random_state=42)
    resampled = sampler.fit_resample(x, y)
    return resampled

In [138]:
# Undersample the training data only; the test data is not resampled.
# NOTE: this rebinds X_train / y_train, so re-running the cell resamples the
# already-undersampled data instead of the original split.
X_train, y_train = random_undersample(X_train, y_train)

In [139]:
# Wrap the resampled training features in a DataFrame: ClassificationBenchmark
# reads `x_train.columns` to size and decode the solution vector.
X_train = pd.DataFrame(X_train)

In [140]:
# Wrap the test features in a DataFrame so ClassificationBenchmark can index
# them by the selected column labels.
X_test = pd.DataFrame(X_test)

In [141]:
def model_fn():
    """
    Factory for the model that is trained on each candidate feature subset.

    :return: a new, unfitted DecisionTreeClassifier with ``random_state=42``.
    """
    classifier = DecisionTreeClassifier(random_state=42)
    return classifier

In [142]:
from sklearn.metrics import accuracy_score

In [143]:
from NiaPy.benchmarks import Benchmark


class ClassificationBenchmark(Benchmark):
    """
    NiaPy benchmark for classification task.

    A candidate solution is a numeric vector with one entry per column of
    ``x_train``; entries ``>= 0.5`` mark the column as selected. The fitness of
    a solution is ``1 - eval_fn(y_test, y_pred)`` for a model from ``model_fn``
    trained on the selected columns (lower is better).

    :param model_fn: function which returns sklearn model.
    :param eval_fn: function(y_test, y_pred) which evaluates predictions
         and returns a scalar.
    :param x_train: train data (must expose ``.columns`` and label indexing,
         e.g. a pandas DataFrame).
    :param y_train: train labels.
    :param x_test: test data, indexable by the same column labels.
    :param y_test: test labels.
    """
    def __init__(self, model_fn, eval_fn, x_train, y_train, x_test, y_test):
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.model_fn = model_fn
        self.eval_fn = eval_fn

        # 0 and 1 are handed to the NiaPy base class as the search-space
        # lower and upper bounds; select_columns thresholds them at 0.5
        Benchmark.__init__(self, 0, 1)

    def get_length(self):
        """
        Get length of the vector which is being optimized.

        :return: length of the vector which is being optimized
            (one component per column of ``x_train``).
        """
        return len(self.x_train.columns)

    def select_columns(self, solution_vec):
        """
        Select columns based on the solution vector.

        :param solution_vec: solution of the problem as a vector
            (ndarray, list or tuple of numbers).
        :return: list of column names based on the solution vector.
        """
        # np.asarray makes the comparison element-wise for plain lists/tuples
        # too, which would otherwise raise TypeError on `list >= float`
        mask = np.asarray(solution_vec) >= 0.5
        return self.x_train.columns[mask].tolist()

    def function(self):
        """
        Build the fitness function evaluated by the optimizer.

        :return: callable taking an ignored first argument and the solution
            vector, returning ``1 - score``; returns 1 when no column is
            selected.
        """
        def evaluate(_, solution_vec):
            selected_columns = self.select_columns(solution_vec)

            # fix of incorrect serialization when using multi threading module:
            # unwrap a selection that arrived as one nested list of labels.
            # Only a nested list is unwrapped. A lone non-string label (e.g. an
            # integer column name) has to stay inside its list, otherwise
            # x_train[label] yields a 1-D Series that sklearn models reject.
            if len(selected_columns) == 1 and \
                    isinstance(selected_columns[0], list):
                selected_columns = selected_columns[0]

            if len(selected_columns) < 1:
                # nothing selected -> worst fitness
                # inverted score, since the optimizer minimizes the task
                return 1

            clf = self.model_fn()
            clf = clf.fit(self.x_train[selected_columns], self.y_train)

            y_pred = clf.predict(self.x_test[selected_columns])
            score = self.eval_fn(self.y_test, y_pred)

            # inverted score, since the optimizer minimizes the task
            return 1 - score

        return evaluate

In [145]:
from sklearn.tree import DecisionTreeClassifier

# Feature-selection benchmark: models from model_fn, scored with accuracy_score
# on the test data.
benchmark = ClassificationBenchmark(
    model_fn=model_fn,
    eval_fn=accuracy_score,
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
)

In [146]:
def optimize(benchmark, algorithm, nGEN, num_runs=5):
    """
    Optimize task with provided algorithm.

    Runs the algorithm ``num_runs`` times, prints a short report for every run
    and keeps the column selection of the run with the highest score.

    :param benchmark: NiaPy.Benchmark to optimize.
    :param algorithm: algorithm object to use for optimization task.
    :param nGEN: number of generations.
    :param num_runs: number of algorithm runs (defaults to 5).
    :return: list of column names selected in the best run
        (empty list when ``num_runs`` is 0).
    """
    best_columns = []
    # None means "no run recorded yet". Starting from 0 with a strict `>` left
    # best_columns unset whenever every run scored 0, and the summary crashed.
    best_score = None

    for i in tqdm(range(num_runs)):
        # when using OptimizationType.MAXIMIZATION, the library will fail
        # we use OptimizationType.MINIMIZATION instead and invert the score
        task = StoppingTask(
            D=benchmark.get_length(),
            nGEN=nGEN,
            optType=OptimizationType.MINIMIZATION,
            benchmark=benchmark
        )

        solution_vec, score = algorithm.run(task=task)
        # invert the score
        score = 1 - score
        columns = benchmark.select_columns(solution_vec)

        print('--------------')
        print(f'Run {i + 1}')
        print('--------------')
        print(f'Score: {score}')
        print(f'Number of features selected: {len(columns)}\n')
        print('\n')

        # the first run always becomes the incumbent; later runs must beat it
        if best_score is None or score > best_score:
            best_score = score
            best_columns = columns

    print(f'\nBest score of {num_runs} runs: {best_score}')
    print(f'Number of features selected: {len(best_columns)}')

    return best_columns

In [147]:
# Inspect the training feature frame that was handed to the benchmark
# (a bare last expression is rendered by the notebook).
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,222,223,224,225,226,227,228,229,230,231
0,-0.375099,0.797244,0.570168,-0.035788,-0.057743,6.410245,1.372399,-0.045217,-0.052667,4.149713,...,0.0,-0.554788,-0.439415,-0.484877,-0.358965,0.011083,-0.147243,-0.499915,-0.383702,0.226201
1,-0.370550,-0.098583,-0.092111,-0.035788,-0.057743,-0.217004,-0.113986,-0.045217,-0.052667,-0.210538,...,0.0,-0.554788,0.000000,-0.496415,-0.358965,0.011083,-0.147243,-0.474585,-0.383702,-0.446561
2,1.345121,-0.098583,-0.092111,-0.035788,-0.057743,-0.217004,-0.113986,-0.045217,-0.052667,-0.210538,...,0.0,-0.554788,0.682728,-0.496415,-0.358965,0.011083,-0.147243,-0.184240,-0.383702,-0.490869
3,0.976858,-0.098583,-0.092111,-0.035788,-0.057743,-0.217004,-0.113986,-0.045217,-0.052667,-0.210538,...,0.0,-0.554788,-0.528362,-0.429678,-0.358965,0.011083,-0.147243,-0.239920,-0.383702,-0.446561
4,-0.149592,-0.098583,-0.092111,-0.035788,-0.042562,-0.217004,-0.113986,-0.045217,-0.041771,-0.270267,...,1.0,0.477773,0.335557,0.367303,-0.358965,-0.046083,0.642275,-0.474585,-0.383702,0.226201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8219,-0.413875,-0.075213,-0.045283,-0.035788,-0.057743,-0.140388,0.031738,-0.045217,-0.052667,-0.091079,...,0.0,-0.554788,2.340956,0.579292,-0.358965,0.011083,-0.147243,0.119928,-0.383702,0.226201
8220,-0.437556,1.420428,1.687346,-0.035788,0.731668,-0.217004,0.629206,0.829217,1.385665,-0.270267,...,1.0,1.284766,3.167634,2.015533,1.414037,-0.046083,0.059207,1.289880,1.289880,0.505240
8221,0.396303,-0.098583,-0.092111,-0.035788,-0.057743,-0.217004,-0.113986,-0.045217,-0.052667,-0.210538,...,0.0,-0.554788,-0.415885,-0.599558,-0.358965,-0.046083,0.059207,-0.184240,-0.383702,-0.446561
8222,-0.149592,-0.098583,-0.092111,-0.035788,-0.042562,-0.217004,-0.113986,-0.045217,-0.052667,-0.270267,...,1.0,-0.028327,0.647636,0.301360,-0.358965,0.011083,-0.147243,1.935491,-0.383702,0.226201


In [None]:
%%time

# Feature selection with a default-configured FireflyAlgorithm, nGEN=100
# (num_runs is left at optimize's default).
columns = optimize(benchmark=benchmark, algorithm=FireflyAlgorithm(), nGEN=100)

In [1]:
from preprocessing import preprocess

# NOTE(review): this repeats the data-loading cell from the top of the notebook.
# The recorded output below ends in a KeyError (a list of columns "not in index")
# raised during this call — confirm the input data and the column lists used by
# `preprocess()` before relying on this cell.
X_train, X_test, y_train, y_test = preprocess()

use_cols:


100%|██████████| 435/435 [00:00<00:00, 200717.52it/s]


cat_features:


100%|██████████| 49/49 [00:00<00:00, 87344.20it/s]


num_features:


100%|██████████| 383/383 [00:00<00:00, 332110.49it/s]


KeyError: "['dist1', 'dist2', 'D5', 'D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14', 'V138', 'V139', 'V140', 'V141', 'V142', 'V143', 'V144', 'V145', 'V146', 'V147', 'V148', 'V149', 'V150', 'V151', 'V152', 'V153', 'V154', 'V155', 'V156', 'V157', 'V158', 'V159', 'V160', 'V161', 'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V168', 'V169', 'V170', 'V171', 'V172', 'V173', 'V174', 'V175', 'V176', 'V177', 'V178', 'V179', 'V180', 'V181', 'V182', 'V183', 'V184', 'V185', 'V186', 'V187', 'V188', 'V189', 'V190', 'V191', 'V192', 'V193', 'V194', 'V195', 'V196', 'V197', 'V198', 'V199', 'V200', 'V201', 'V202', 'V203', 'V204', 'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V211', 'V212', 'V213', 'V214', 'V215', 'V216', 'V217', 'V218', 'V219', 'V220', 'V221', 'V222', 'V223', 'V224', 'V225', 'V226', 'V227', 'V228', 'V229', 'V230', 'V231', 'V232', 'V233', 'V234', 'V235', 'V236', 'V237', 'V238', 'V239', 'V240', 'V241', 'V242', 'V243', 'V244', 'V245', 'V246', 'V247', 'V248', 'V249', 'V250', 'V251', 'V252', 'V253', 'V254', 'V255', 'V256', 'V257', 'V258', 'V259', 'V260', 'V261', 'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V269', 'V270', 'V271', 'V272', 'V273', 'V274', 'V275', 'V276', 'V277', 'V278', 'V322', 'V323', 'V324', 'V325', 'V326', 'V327', 'V328', 'V329', 'V330', 'V331', 'V332', 'V333', 'V334', 'V335', 'V336', 'V337', 'V338', 'V339', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08', 'id_09', 'id_10', 'id_11'] not in index"