In [1]:
import os
import time
import random

import numpy as np
import pandas as pd

from tqdm import trange, tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

np.random.seed(666)

In [2]:
def prettify_duration(duration_seconds):
    s = duration_seconds % 60
    duration_in_minutes = (duration_seconds - s) // 60
    m = duration_in_minutes % 60
    h = (duration_in_minutes - m) // 60

    return f"{h}h {m}m {s}s"

In [3]:
X_train_df = pd.read_table(os.path.join(os.getcwd(), "data", "x_train.txt"), header=None, sep=" ")
y_train_df = pd.read_table(os.path.join(os.getcwd(), "data", "y_train.txt"), header=None)
X_test_df = pd.read_table(os.path.join(os.getcwd(), "data", "x_test.txt"), header=None, sep=" ")

In [4]:
X_train = X_train_df.values
X_scaled = StandardScaler().fit_transform(X_train)
y_train = y_train_df.values.flatten()
X_test = X_test_df.values

In [5]:
class Lasso:
    def __init__(self, X, y):
        self.stdsc = StandardScaler().fit(X)
        
        self.X = self.stdsc.transform(X)
        self.y = y
        
        self.best_C = None
        self.best_features = None

        return

    def fit(self):
        best_value = float("-inf")
        selected_features = np.arange(self.X.shape[1])
        
        for C in tqdm(np.linspace(1, 0.001, 1000)):
            model = LogisticRegression(penalty="l1", solver="liblinear", C=C).fit(self.X[:, selected_features], self.y)
        
            probabilities_of_label_1 = model.predict_proba(self.X[:, selected_features])[:, 1]
            s = np.argsort(probabilities_of_label_1)
        
            selected_features = np.argwhere(model.coef_.flatten() != 0).flatten()
        
            reward = (np.round(probabilities_of_label_1[s[-1000:]]) == self.y[s[-1000:]]).sum() * 10
            cost = len(selected_features) * 200
        
            value = reward - cost
            if best_value < value:
                best_value = value
                
                self.best_C = C
                self.best_features = selected_features

        return self

    def predict(self, X):
        X = self.stdsc.transform(X)
        model = LogisticRegression(penalty="l1", solver="liblinear", C=self.best_C).fit(self.X[:, self.best_features], self.y)

        probabilities_of_label_1 = model.predict_proba(X[:, self.best_features])[:, 1]
        s = np.argsort(probabilities_of_label_1)

        return s[-1000:]

In [6]:
lasso = Lasso(X_train, y_train).fit()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [06:56<00:00,  2.40it/s]


In [7]:
lasso.best_C

np.float64(0.09599999999999997)

In [8]:
lasso.best_features

array([1, 2])

In [9]:
pd.DataFrame(lasso.best_features).to_csv("solutions/lasso_features_selected.csv")

In [10]:
lasso.predict(X_test)

array([1927, 4032, 2423, 1402, 2369,  631, 3581, 4640, 1668,  255,  386,
       4419, 4153, 4187, 1910, 2394,  996, 1537, 2982, 4097, 3836,  800,
       2667, 3205, 1761, 2126,  296, 2652,  889, 1212, 3595, 3873, 4403,
       2419, 1101, 1114, 3361, 4136, 2780, 4673, 1704, 2869, 3941, 1338,
       3330,  452, 2651, 3544, 4648, 2483, 1786, 3343, 2121, 2362,  311,
       2189, 1782, 3174,  592, 4373,  364, 3767, 2857, 3810, 2257, 1980,
       3240, 1060, 4086, 4070, 1878,  347, 3935, 3351, 1374, 4757, 1971,
       3294, 3322, 4039, 2471, 1584, 3951, 1076, 1041, 1896, 3277, 2263,
       1544, 2587, 4963, 3113,  470, 4990,   82,   33, 1226, 2855, 2361,
       4929,  872, 2521,  335, 4299, 3846, 2447, 4707, 3871, 4949, 1746,
       2961, 2681,  113, 1838, 4790, 2765, 1152, 2062,  790, 1488,  531,
       1667,  632,  807, 1506, 2909, 1426, 3429, 3538, 2316, 2781,   94,
        587, 2410,  602, 3893, 1570, 4714,  900, 2960, 3656, 2091, 2192,
       3902, 1068, 1970, 2465, 2747, 3048, 3675, 13

In [11]:
pd.DataFrame(lasso.predict(X_test)).to_csv("solutions/lasso_observations_predicted.csv")

In [12]:
class GA:
    def __init__(self, X, y):
        self.stdsc = StandardScaler().fit(X)
        
        self.X = self.stdsc.transform(X)
        self.y = y
        
        self.target_indexes = np.argwhere(self.y == 1)
        
        return

    def generate_population(self, population_size):
        return [np.random.choice(self.X.shape[1], np.random.randint(1, self.X.shape[1] + 1), False) for _ in range(population_size)]

    def evaluate_features(self, features):
        model = LogisticRegression(penalty=None).fit(self.X[:, features], self.y)

        probabilities_of_label_1 = model.predict_proba(self.X[:, features])[:, 1]
        s = np.argsort(probabilities_of_label_1)
    
        reward = (np.round(probabilities_of_label_1[s[-1000:]]) == self.y[s[-1000:]]).sum() * 10
        cost = len(features) * 200
        
        return -(reward - cost), s[-1000:]

    def cross_over(self, parent_a, parent_b):
        all_indexes = np.unique(np.concatenate((parent_a,parent_b)))
        
        return (
            np.random.choice(all_indexes, np.random.randint(1, len(all_indexes) + 1), False),
            np.random.choice(all_indexes, np.random.randint(1, len(all_indexes) + 1), False)
        )

    def mutate(self, representative):
        to_remove = np.random.choice(self.X.shape[1], np.random.randint(1, 5 + 1), False)
        to_add = np.random.choice(self.X.shape[1], np.random.randint(1, 5 + 1), False)
        
        return np.unique(np.union1d(np.setdiff1d(representative, to_remove), to_add))

    def run(self, population_size=100, number_of_generations=100, cross_over_prob=0.2, mutation_prob=0.2):
        population = self.generate_population(population_size)
    
        for generation in tqdm(range(number_of_generations)):
            ## cross-over
            # calculate values of function to optimise
            proximities = np.array([self.evaluate_features(features)[0] for features in population])
            # and transform them into probabilities
            probabilities = self.proximities2probabilities(proximities)
            children = []
            # try to perform cross-over population_size times
            for _ in range(population_size):
                # if cross-over chance is successful
                if random.random() < cross_over_prob:
                    # select two parents for cross-over
                    parent_indexes = np.random.choice(len(population), 2, False, probabilities)
                    # retrieve parents
                    parent_a, parent_b = population[parent_indexes[0]], population[parent_indexes[1]]
                    # perform cross-over
                    child_a, child_b = self.cross_over(parent_a, parent_b)
                    children.append(child_a)
                    children.append(child_b)
            
            population.extend(children)
            
            ## mutation
            # calculate values of function to optimise
            proximities = np.array([self.evaluate_features(features)[0] for features in population])
            # and transform them into probabilities
            probabilities = self.proximities2probabilities(proximities)
            mutated = []
            # try to perform mutation population_size times
            for _ in range(population_size):
                # if mutation chance is successful
                if random.random() < mutation_prob:
                    # select one representative to be mutated
                    index = np.random.choice(len(population), 1, False, probabilities)
                    # retrieve representative
                    representative = population[index[0]]
                    # perform mutation
                    mutant = self.mutate(representative)
                    mutated.append(mutant)
                    
            population.extend(mutated)
    
            ## selection
            # calculate values of function to optimise
            proximities = np.array([self.evaluate_features(features)[0] for features in population])
            # and transform them into probabilities
            probabilities = self.proximities2probabilities(proximities)
            # evaluate, how much is 10% of population_size
            top_10_best_len = int(0.1 * population_size)
            # get indexes of models from population sorted by values of optimised function
            s = np.argsort(proximities)
            # top 10% of population_size models are advancing to new generation by default
            new_population = [population[idx] for idx in s[:top_10_best_len]]
            # rest indexes select randomly from current population
            pr = probabilities[s[top_10_best_len:]]
            rest_indexes = np.random.choice(s[top_10_best_len:], population_size - top_10_best_len, False, pr / pr.sum())
            # fill new_population up to population_size elements
            new_population.extend([population[idx] for idx in rest_indexes])
            # replace current population with new one
            population = new_population
    
        return population

    def predict(self, X, features):
        X = self.stdsc.transform(X)
        model = LogisticRegression(penalty=None).fit(self.X[:, features], self.y)
        
        probabilities_of_label_1 = model.predict_proba(X[:, features])[:, 1]
        s = np.argsort(probabilities_of_label_1)
        
        return s[-1000:]

    @staticmethod
    def proximities2probabilities(proximities):
        proximities = np.clip(proximities, -50, 50)
        mods = np.exp(-proximities)
    
        return mods / mods.sum()

In [13]:
ga = GA(X_train, y_train)
population = ga.run()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [04:23<00:00,  2.64s/it]


In [14]:
population[0]

array([2], dtype=int32)

In [15]:
pd.DataFrame(population[0]).to_csv("solutions/ga_features_selected.csv")

In [16]:
ga.predict(X_test, population[0])

array([2961, 1517, 3174, 1612, 2957, 2600, 2857, 3698,  104, 3544, 3551,
       2126, 3361,  940, 1060, 2566, 3865, 2901, 1589, 3517, 2485, 2022,
       1896, 3322, 3712, 2855, 1584, 4688, 4472, 3871, 4929, 3343, 3581,
        452, 3530, 2488, 1878, 3602,  879, 4876, 4438, 2982,  962, 3873,
        311, 3941,  103, 4714, 1374, 3586,  442, 2521, 1014, 2631, 2349,
        274,  296,  591, 1360, 1668, 2667, 4869, 2063,  335, 1910, 1007,
       3810, 3384, 1761, 4673, 2681, 2189, 3802, 4963, 1030,  364, 1226,
       3372, 2651, 4388, 3685,  347, 3962, 4078, 2869,   65, 4136,  566,
        592, 4070, 1402, 1576, 2471, 4097, 2462, 1338, 1992, 4233, 4893,
       1786,  554, 2047,  694, 3027, 4124,  854,  632, 1746, 4086, 1838,
       4707, 3689,  231,  171, 2737,  955,  470, 3616,  953, 1101, 2483,
       1076,  996, 2519, 1966, 3252, 1185,  889, 1068, 4403, 2143, 2747,
        872, 2465, 2263, 3538,   33, 1549, 3294, 2419, 2980, 4790, 3846,
       1971, 2909, 3351, 2587, 2192, 2165, 3747, 19

In [17]:
pd.DataFrame(ga.predict(X_test, population[0])).to_csv("solutions/ga_observations_predicted.csv")