In [1]:
import os
import time
import random

import numpy as np
import pandas as pd

from tqdm import trange, tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

np.random.seed(666)

In [2]:
def prettify_duration(duration_seconds):
    s = duration_seconds % 60
    duration_in_minutes = (duration_seconds - s) // 60
    m = duration_in_minutes % 60
    h = (duration_in_minutes - m) // 60

    return f"{h}h {m}m {s}s"

In [3]:
X_train_df = pd.read_table(os.path.join(os.getcwd(), "data", "x_train.txt"), header=None, sep=" ")
y_train_df = pd.read_table(os.path.join(os.getcwd(), "data", "y_train.txt"), header=None)
X_test_df = pd.read_table(os.path.join(os.getcwd(), "data", "x_test.txt"), header=None, sep=" ")

In [4]:
X_train = X_train_df.values
X_scaled = StandardScaler().fit_transform(X_train)
y_train = y_train_df.values.flatten()
X_test = X_test_df.values

In [5]:
class Lasso:
    def __init__(self, X, y):
        self.X = StandardScaler().fit_transform(X)
        self.y = y

        return

    def run(self):
        best_C = None
        best_value = float("-inf")
        best_features = None
        selected_features = np.arange(self.X.shape[1])
        
        for C in tqdm(np.linspace(1, 0.001, 1000)):
            model = LogisticRegression(penalty="l1", solver="liblinear", C=C).fit(self.X[:, selected_features], self.y)
        
            probabilities_of_label_1 = model.predict_proba(self.X[:, selected_features])[:, 1]
            s = np.argsort(probabilities_of_label_1)
        
            selected_features = np.argwhere(model.coef_.flatten() != 0).flatten()
        
            reward = (np.round(probabilities_of_label_1[s[-1000:]]) == self.y[s[-1000:]]).sum() * 10
            cost = len(selected_features) * 200
        
            value = reward - cost
            if best_value < value:
                best_C = C
                best_value = value
                best_features = selected_features

        return best_C, best_value, best_features

In [6]:
lasso = Lasso(X_train, y_train)
a, b, c = lasso.run()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [06:51<00:00,  2.43it/s]


In [7]:
a

np.float64(0.09599999999999997)

In [8]:
b

np.int64(6870)

In [9]:
c

array([1, 2])

In [10]:
class GA:
    def __init__(self, X, y):
        self.X = StandardScaler().fit_transform(X)
        self.y = y
        self.target_indexes = np.argwhere(self.y == 1)
        
        return

    def generate_population(self, population_size):
        return [np.random.choice(self.X.shape[1], np.random.randint(1, self.X.shape[1] + 1), False) for _ in range(population_size)]

    def evaluate_features(self, features):
        model = LogisticRegression(penalty=None).fit(self.X[:, features], self.y)

        probabilities_of_label_1 = model.predict_proba(self.X[:, features])[:, 1]
        s = np.argsort(probabilities_of_label_1)
    
        reward = (np.round(probabilities_of_label_1[s[-1000:]]) == self.y[s[-1000:]]).sum() * 10
        cost = len(features) * 200
        
        return -(reward - cost)

    def cross_over(self, parent_a, parent_b):
        all_indexes = np.unique(np.concatenate((parent_a,parent_b)))
        
        return (
            np.random.choice(all_indexes, np.random.randint(1, len(all_indexes) + 1), False),
            np.random.choice(all_indexes, np.random.randint(1, len(all_indexes) + 1), False)
        )

    def mutate(self, representative):
        to_remove = np.random.choice(self.X.shape[1], np.random.randint(1, 5 + 1), False)
        to_add = np.random.choice(self.X.shape[1], np.random.randint(1, 5 + 1), False)
        
        return np.unique(np.union1d(np.setdiff1d(representative, to_remove), to_add))

    def run(self, population_size=100, number_of_generations=100, cross_over_prob=0.2, mutation_prob=0.2):
        population = self.generate_population(population_size)
    
        for generation in tqdm(range(number_of_generations)):
            ## cross-over
            # calculate values of function to optimise
            proximities = np.array([self.evaluate_features(features) for features in population])
            # and transform them into probabilities
            probabilities = self.proximities2probabilities(proximities)
            children = []
            # try to perform cross-over population_size times
            for _ in range(population_size):
                # if cross-over chance is successful
                if random.random() < cross_over_prob:
                    # select two parents for cross-over
                    parent_indexes = np.random.choice(len(population), 2, False, probabilities)
                    # retrieve parents
                    parent_a, parent_b = population[parent_indexes[0]], population[parent_indexes[1]]
                    # perform cross-over
                    child_a, child_b = self.cross_over(parent_a, parent_b)
                    children.append(child_a)
                    children.append(child_b)
            
            population.extend(children)
            
            ## mutation
            # calculate values of function to optimise
            proximities = np.array([self.evaluate_features(features) for features in population])
            # and transform them into probabilities
            probabilities = self.proximities2probabilities(proximities)
            mutated = []
            # try to perform mutation population_size times
            for _ in range(population_size):
                # if mutation chance is successful
                if random.random() < mutation_prob:
                    # select one representative to be mutated
                    index = np.random.choice(len(population), 1, False, probabilities)
                    # retrieve representative
                    representative = population[index[0]]
                    # perform mutation
                    mutant = self.mutate(representative)
                    mutated.append(mutant)
                    
            population.extend(mutated)
    
            ## selection
            # calculate values of function to optimise
            proximities = np.array([self.evaluate_features(features) for features in population])
            # and transform them into probabilities
            probabilities = self.proximities2probabilities(proximities)
            # evaluate, how much is 10% of population_size
            top_10_best_len = int(0.1 * population_size)
            # get indexes of models from population sorted by values of optimised function
            s = np.argsort(proximities)
            # top 10% of population_size models are advancing to new generation by default
            new_population = [population[idx] for idx in s[:top_10_best_len]]
            # rest indexes select randomly from current population
            pr = probabilities[s[top_10_best_len:]]
            rest_indexes = np.random.choice(s[top_10_best_len:], population_size - top_10_best_len, False, pr / pr.sum())
            # fill new_population up to population_size elements
            new_population.extend([population[idx] for idx in rest_indexes])
            # replace current population with new one
            population = new_population
    
        return population

    @staticmethod
    def proximities2probabilities(proximities):
        proximities = np.clip(proximities, -50, 50)
        mods = np.exp(-proximities)
    
        return mods / mods.sum()

In [11]:
ga = GA(X_train, y_train)
population = ga.run()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [04:10<00:00,  2.51s/it]


In [12]:
population[0]

array([3], dtype=int32)

In [13]:
-ga.evaluate_features(population[0])

np.int64(7200)