In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from random import randint, random

In [4]:
# load dataset

def load_data(person):
    """Load the feature table of one participant from its CSV file.

    The file ``Features_os<person>.csv`` is read from the current working
    directory as comma-separated numeric data. The first row of the file
    holds column names (electrode names), so it is removed from the result.

    Parameters
    ----------
    person : int or str
        Participant identifier that is inserted into the file name.

    Returns
    -------
    numpy.ndarray
        The parsed array without its first (header) row.
    """
    csv_name = 'Features_os' + str(person) + '.csv'
    parsed = np.genfromtxt(csv_name, delimiter=',')
    # Row 0 comes from the header line with electrode names; drop it.
    return np.delete(parsed, 0, axis=0)

In [13]:
def split_data(df):
    """Split the data into a class-balanced hold-out set and a class-balanced remainder.

    The last column of ``df`` is treated as the class label (0 = interface
    error, 1 = successful click); every other column is a feature.

    Parameters
    ----------
    df : numpy.ndarray
        2-D array whose last column holds the 0/1 class label.

    Returns
    -------
    tuple
        ``(X_rest, y_rest, X_end_test, y_end_test)`` where the ``*_rest``
        arrays are meant for cross-validation and the ``*_end_test`` arrays
        for the final test. In each array the error samples come first,
        followed by the successful-click samples.
    """
    labels = df[:, -1]
    error_rows = df[labels == 0]
    success_rows = df[labels == 1]

    # Four groups: features and classes for errors and for successful clicks.
    X_err, y_err = error_rows[:, :-1], error_rows[:, -1]
    X_succ, y_succ = success_rows[:, :-1], success_rows[:, -1]

    # 20% of the errors go to the final test.
    X_err_rest, X_err_end_test, y_err_rest, y_err_end_test = train_test_split(
        X_err, y_err, test_size=0.2, random_state=3)

    # Draw exactly as many successful clicks as there are errors in each part,
    # so that both classes have the same number of cases.
    n_end_test = y_err_end_test.shape[0]
    n_rest = y_err_rest.shape[0]
    X_succ_rest, X_succ_end_test, y_succ_rest, y_succ_end_test = train_test_split(
        X_succ, y_succ, test_size=n_end_test, train_size=n_rest, random_state=3)

    # At this point the final-test data come out sorted: interface errors first,
    # then successful clicks. For k-NN this does not matter, but what about
    # other classifiers?
    X_end_test = np.concatenate((X_err_end_test, X_succ_end_test))
    y_end_test = np.concatenate((y_err_end_test, y_succ_end_test))

    # Glue together the remaining 80% that will be used for cross-validation.
    X_rest = np.concatenate((X_err_rest, X_succ_rest))
    y_rest = np.concatenate((y_err_rest, y_succ_rest))

    return X_rest, y_rest, X_end_test, y_end_test

In [24]:
# Create the first generation.
# Ones mark the signal features that are passed to the classifier.

def first_gen(generation_size, n_features=136):
    """Create a random initial population of binary feature masks.

    Parameters
    ----------
    generation_size : int
        Number of individuals (rows) to create.
    n_features : int, optional
        Number of genes per individual (columns). Defaults to 136, the
        chromosome length that was previously hard-coded here.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(generation_size, n_features)`` filled with
        0/1 values drawn uniformly at random. A 1 marks a feature that is
        passed to the classifier.
    """
    # randint(2) draws from {0, 1}; the upper bound is exclusive.
    return np.random.randint(2, size=(generation_size, n_features))

In [46]:
# Fitness function for a single individual

def eval(individual, X_train, X_test, y_train, y_test):
    """Compute the fitness of one individual (binary feature mask).

    A 3-nearest-neighbours classifier is fitted on the columns of
    ``X_train`` selected by the mask and scored on the same columns of
    ``X_test``. The fitness is that accuracy multiplied by the number of
    zeros in the mask, so masks that use fewer features are rewarded.

    NOTE: this function shadows the built-in ``eval``; the name is kept
    because other cells call it.

    Parameters
    ----------
    individual : numpy.ndarray
        1-D array of 0/1 genes, one per feature column.
    X_train, X_test : numpy.ndarray
        Feature matrices for fitting and scoring.
    y_train, y_test : numpy.ndarray
        Class labels matching ``X_train`` / ``X_test``.

    Returns
    -------
    float
        ``accuracy * number_of_zero_genes``; ``0.0`` when the mask selects
        no feature at all.
    """
    selected = np.asarray(individual) == 1
    # A mask without any selected feature cannot be classified on: fitting
    # k-NN on zero columns raises. Give such an individual zero fitness.
    if not selected.any():
        return 0.0
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X_train[:, selected], y_train)
    accuracy = neigh.score(X_test[:, selected], y_test)
    return accuracy * np.count_nonzero(~selected)

In [45]:
# Evaluate the fitness of a whole generation

def evaluation(X_train, X_test, y_train, y_test, generation):
    """Score every individual of a generation and report the best one.

    Parameters
    ----------
    X_train, X_test : numpy.ndarray
        Feature matrices handed to the per-individual fitness function.
    y_train, y_test : numpy.ndarray
        Class labels matching ``X_train`` / ``X_test``.
    generation : numpy.ndarray
        2-D array with one binary feature mask per row.

    Returns
    -------
    tuple
        ``(gen_fitness, n_of_zeros, best_fit, best_accuracy, best_ind)``:
        the fitness of every row, the number of zero genes of the best row,
        its fitness, its accuracy (fitness divided by the number of zeros)
        and the best row itself (a view into ``generation``).
    """
    gen_fitness = np.apply_along_axis(eval, 1, generation, X_train, X_test, y_train, y_test)
    best_pos = np.argmax(gen_fitness)
    best_ind = generation[best_pos]
    best_fit = gen_fitness[best_pos]
    # Callers bind this value to ``local_n_of_zeros``, so count the zeros
    # (unused features), not the ones.
    n_of_zeros = np.count_nonzero(best_ind == 0)
    # Fitness is accuracy * n_of_zeros; with no zeros the accuracy cannot be
    # recovered by division, so report NaN instead of dividing by zero.
    best_accuracy = best_fit / n_of_zeros if n_of_zeros else float('nan')
    return gen_fitness, n_of_zeros, best_fit, best_accuracy, best_ind

In [52]:
# Roulette-wheel selection

def roulette(generation, gen_fitness, generation_size):
    """Select survivors by roulette-wheel (fitness-proportionate) sampling.

    Parameters
    ----------
    generation : numpy.ndarray
        2-D array with one individual per row.
    gen_fitness : array-like
        Fitness of every individual; it is not modified.
    generation_size : int
        Number of individuals in ``generation`` and number of survivors drawn.

    Returns
    -------
    numpy.ndarray
        New array of ``generation_size`` rows sampled with replacement from
        ``generation``; each row is picked with probability proportional to
        its fitness.
    """
    # Work on a float copy so the caller's fitness array is left untouched
    # and integer fitness values can be normalised as well.
    weights = np.asarray(gen_fitness, dtype=float)
    total = weights.sum()
    # When the fitness values do not sum to a positive number they cannot be
    # normalised into probabilities; fall back to a uniform draw (p=None).
    probabilities = weights / total if total > 0 else None
    survivors = np.random.choice(generation_size, generation_size, p=probabilities)
    return generation[survivors, :]

In [60]:
# Crossover of individuals

def crossover(survivors, generation_size, crossover_prob):
    """Apply one-point crossover to consecutive pairs of individuals, in place.

    Rows ``(0, 1)``, ``(2, 3)``, ... are treated as parent pairs. For each
    pair, with probability ``crossover_prob``, a cut point is drawn and the
    genes before the cut are swapped between the two rows. With an odd
    ``generation_size`` the last row has no partner and is left unchanged.

    Parameters
    ----------
    survivors : numpy.ndarray
        2-D array with one individual per row; it is modified in place.
    generation_size : int
        Number of rows of ``survivors`` to process.
    crossover_prob : float
        Probability of crossing a given pair.

    Returns
    -------
    numpy.ndarray
        The same ``survivors`` array after crossover.
    """
    n_genes = survivors.shape[1]
    # With fewer than two genes there is no interior cut point.
    if n_genes < 2:
        return survivors
    # TODO(original author's question): can this for-loop be avoided?
    # Stop at generation_size - 1 so that i + 1 is always a valid row.
    for i in range(0, generation_size - 1, 2):
        if np.random.rand() <= crossover_prob:
            # Cut in 1..n_genes-1 (randint's upper bound is exclusive), so
            # each child keeps at least one gene from each parent.
            cut = np.random.randint(1, n_genes)
            head = survivors[i, :cut].copy()
            survivors[i, :cut] = survivors[i + 1, :cut]
            survivors[i + 1, :cut] = head
    return survivors

In [66]:
# Mutate selected genes of selected individuals in a generation

def mutation(individual, mutation_prob):
    """Flip one randomly chosen gene of an individual, in place.

    With probability ``mutation_prob`` a single locus is drawn uniformly
    from the whole chromosome and its 0/1 value is inverted.

    Parameters
    ----------
    individual : numpy.ndarray
        1-D array of 0/1 genes; it is modified in place.
    mutation_prob : float
        Probability that the individual is mutated.

    Returns
    -------
    numpy.ndarray
        The same ``individual`` object, returned for convenience.
    """
    n_genes = len(individual)
    if n_genes and np.random.rand() <= mutation_prob:
        # randint's upper bound is exclusive, so use the chromosome length
        # itself; otherwise the last gene could never be mutated.
        locus = np.random.randint(0, n_genes)
        individual[locus] = 1 - individual[locus]
    return individual

In [67]:
def gen_alg(X_train, X_test, y_train, y_test, generation_size, num_of_generations, mutation_prob, crossover_prob):
    """Run the genetic algorithm for feature selection on one train/test split.

    A random first generation is scored, then ``num_of_generations`` rounds
    of roulette selection, crossover, mutation and evaluation follow. The
    individual with the highest fitness seen in any generation is tracked.

    Parameters
    ----------
    X_train, X_test : numpy.ndarray
        Feature matrices used by the fitness evaluation.
    y_train, y_test : numpy.ndarray
        Class labels matching ``X_train`` / ``X_test``.
    generation_size : int
        Number of individuals per generation.
    num_of_generations : int
        Number of generations produced after the initial one.
    mutation_prob : float
        Per-individual mutation probability.
    crossover_prob : float
        Per-pair crossover probability.

    Returns
    -------
    tuple
        ``(global_best_fitness, global_best_accuracy, best_individual)``.
    """
    # Initialise the population (first generation) and evaluate its fitness.
    population = first_gen(generation_size)
    fitness, _, best_fitness, best_accuracy, best_individual = evaluation(
        X_train, X_test, y_train, y_test, population)

    # Main loop of the program.
    for _ in range(num_of_generations):
        # Roulette-wheel selection.
        parents = roulette(population, fitness, generation_size)
        # Crossover.
        offspring = crossover(parents, generation_size, crossover_prob)
        # Mutations: every row is a view, so it is modified in place.
        for chromosome in offspring:
            mutation(chromosome, mutation_prob)
        # Evaluation of the new generation.
        fitness, _, round_fitness, round_accuracy, round_individual = evaluation(
            X_train, X_test, y_train, y_test, offspring)
        # Keep the best individual seen so far across all generations.
        if round_fitness > best_fitness:
            best_fitness = round_fitness
            best_accuracy = round_accuracy
            best_individual = round_individual
        population = offspring

    return best_fitness, best_accuracy, best_individual

In [79]:
# From the remaining 80%, used for training:
# split into 5 groups (folds)

def train(X_rest, y_rest, generation_size, num_of_generations, crossover_prob, mutation_prob):
    """Run the genetic algorithm on each of 5 folds and keep the fittest individual.

    The data are split with a shuffled 5-fold ``KFold`` (fixed random state).
    For every fold the genetic algorithm is run and its best fitness,
    accuracy and individual are printed. The individual with the highest
    fitness over all folds is returned.

    Parameters
    ----------
    X_rest, y_rest : numpy.ndarray
        Features and class labels used for cross-validation.
    generation_size : int
        Number of individuals per generation.
    num_of_generations : int
        Number of generations per genetic-algorithm run.
    crossover_prob : float
        Per-pair crossover probability.
    mutation_prob : float
        Per-individual mutation probability.

    Returns
    -------
    numpy.ndarray or None
        The best feature mask found across the folds; ``None`` only if no
        fold produced a comparable fitness value.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=7)
    # Start below any reachable fitness so the first fold always registers,
    # and define the result up front so it can never be unbound.
    alltime_best_fitness = float('-inf')
    alltime_best_individual = None

    for train_index, test_index in kf.split(X_rest):
        X_train, X_test = X_rest[train_index], X_rest[test_index]
        y_train, y_test = y_rest[train_index], y_rest[test_index]
        global_best_fitness, global_best_accuracy, best_individual = gen_alg(X_train, X_test, y_train, y_test,
                                                                             generation_size, num_of_generations,
                                                                             mutation_prob, crossover_prob)
        print('fitness: ',global_best_fitness,"\naccuracy: ", global_best_accuracy, "\nindividual: ", best_individual)
        if global_best_fitness > alltime_best_fitness:
            # The running best must be updated here; otherwise every fold is
            # compared against the initial value and the last fold always wins.
            alltime_best_fitness = global_best_fitness
            alltime_best_individual = best_individual
    return alltime_best_individual

In [80]:
def end_test(X_end_test, y_end_test, alltime_best_individual):
    """Score the selected feature subset on the hold-out data and print the result.

    A 3-nearest-neighbours classifier is cross-validated on the hold-out
    samples, restricted to the columns whose gene equals 1. The number of
    folds equals the number of samples, so each fold tests a single sample.

    NOTE(review): the classifier is fitted on the hold-out samples
    themselves (inside the cross-validation), not on the training data.

    Parameters
    ----------
    X_end_test, y_end_test : numpy.ndarray
        Hold-out features and class labels.
    alltime_best_individual : numpy.ndarray
        Binary feature mask; a 1 selects the corresponding column.
    """
    chosen_columns = alltime_best_individual == 1
    n_samples = X_end_test.shape[0]
    scores = cross_val_score(
        KNeighborsClassifier(n_neighbors=3),
        X_end_test[:, chosen_columns],
        y_end_test,
        cv=KFold(n_splits=n_samples),
    )
    print('end test score: ', scores)
    print('score mean: ', scores.mean())

In [81]:
def main(person, generation_size, num_of_generations, crossover_prob, mutation_prob):
    """Run the whole pipeline for one participant.

    Loads the participant's data, splits it into a cross-validation part and
    a hold-out part, searches for the best feature mask with the genetic
    algorithm and finally reports the hold-out score.

    Parameters
    ----------
    person : int or str
        Participant identifier used to locate the CSV file.
    generation_size : int
        Number of individuals per generation.
    num_of_generations : int
        Number of generations per genetic-algorithm run.
    crossover_prob : float
        Per-pair crossover probability.
    mutation_prob : float
        Per-individual mutation probability.
    """
    data = load_data(person)
    X_rest, y_rest, X_end_test, y_end_test = split_data(data)
    best_mask = train(X_rest, y_rest, generation_size, num_of_generations,
                      crossover_prob, mutation_prob)
    end_test(X_end_test, y_end_test, best_mask)

In [6]:
# interface errors make up only about 7% of the data

# things to try
# when creating the first generation, give the individuals only about 10% (a dozen or so) ones from the start
# other fitness functions

In [95]:
# Run the full pipeline for participant 13 with 30 individuals per generation and
# 100 generations. Both probabilities are 1, so every pair is crossed and every
# individual is mutated in each generation.
main(person=13, generation_size=30, num_of_generations=100, crossover_prob=1, mutation_prob=1)

fitness:  82.0 
accuracy:  1.0 
individual:  [1 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 1 1 1 0 1 0 0 1 0 1 1 0 1 1 1
 0 0 0 1 0 1 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0 1 1 1 1 1 0 0 1 0 0 0
 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1 1 0 0 1 1 0 1 0 0 0 1 1 0 0 0 0
 0 0 1 0 0 1 0 1 1 1 0 0 0 0 0 1 0 0 1 1 1 1 0 0 0]
fitness:  90.0 
accuracy:  1.0 
individual:  [0 1 0 0 1 0 1 0 1 1 0 1 0 1 0 1 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0
 0 1 1 1 1 0 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 1 1
 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 1
 0 1 0 1 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0 0 1 1]
fitness:  66.4 
accuracy:  0.8 
individual:  [1 1 0 0 0 1 1 0 1 1 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0
 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 0 0 1 1 1 1 1 0 1 0
 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 1 0 0 1 0 0
 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 1 0 0 1 0 1]
fitness:  88.0 
accuracy:  1.0 
in

#### TRASH (discarded experiments)

In [2]:
# # load dataset
# df = pd.read_csv("Features_os1.csv")
# df

Unnamed: 0,FCz_1,FCz_2,FCz_3,FCz_4,FCz_5,FCz_6,FCz_7,FCz_8,FCz_9,FCz_10,...,F4_9,F4_10,F4_11,F4_12,F4_13,F4_14,F4_15,F4_16,F4_17,successful
0,249,33.265792,7.485167,33.265792,7.485167,1560.168609,-1419.023003,141.145606,141.145606,2979.191612,...,350.610402,2846.535297,223.371236,57.426836,69,0.832273,1,0.014493,14,0
1,135,25.850247,5.222387,25.850247,5.222387,1282.497651,-1643.422393,-360.924742,360.924742,2925.920044,...,516.316241,2693.821001,207.643873,62.534716,-45,-1.389660,1,-0.022222,10,0
2,222,24.838694,8.937668,24.838694,8.937668,1540.706563,-1408.406943,132.299620,132.299620,2949.113506,...,383.951421,2258.338796,232.536540,48.686147,45,1.081914,1,0.022222,10,0
3,117,33.657467,3.476197,33.657467,3.476197,1368.569698,-1203.463508,165.106190,165.106190,2572.033206,...,205.362222,2470.569462,194.146729,62.700484,40,1.567512,1,0.025000,16,0
4,68,22.586078,3.010704,22.586078,3.010704,1452.402138,-1738.529504,-286.127366,286.127366,3190.931643,...,123.035255,3568.434665,264.743769,71.916111,61,1.178953,1,0.016393,13,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,162,17.198370,9.419497,17.198370,9.419497,743.599905,-958.782772,-215.182866,215.182866,1702.382677,...,215.182866,1702.382677,154.308560,32.271389,71,0.454527,3,0.042254,16,1
352,106,11.326947,9.358214,11.326947,9.358214,498.254718,-730.594543,-232.339826,232.339826,1228.849261,...,232.339826,1228.849261,122.915928,23.004816,-16,-1.437801,1,-0.062500,16,1
353,208,12.165435,17.097621,12.165435,17.097621,644.952105,-440.480973,204.471131,204.471131,1085.433078,...,204.471131,1085.433078,112.507580,20.079171,38,0.528399,1,0.026316,15,1
354,0,11.016174,0.000000,11.016174,0.000000,380.784415,-373.491731,7.292684,7.292684,754.276145,...,7.292684,754.276145,129.378139,21.803610,-93,-0.234447,5,-0.053763,16,1


In [4]:
# # split data to 2 groups
# err_df = df[df['successful'] == 0]
# succ_df = df[df['successful'] == 1]

# # create features array and class array for errors
# X_err = np.array(err_df.drop(['successful'],1))
# y_err = np.array(err_df['successful'])

# # create features array and class array for successful clicks
# X_succ = np.array(succ_df.drop(['successful'],1))
# y_succ = np.array(succ_df['successful'])

In [8]:
# funkcja tworząca pierwsze pokolenie
# jedynki oznaczają cechy sygnału przekazywane do klasyfikatora

# def first_gen(generation_size):
#     first_gen = []
#     for one in range(generation_size):
#         ind = []
#         for i in range(136):
#             ind.append(randint(0, 1))
#         first_gen.append(ind)
#     return first_gen

In [10]:
# # funkcja oceniająca przystosowanie danego osobnika

# def eval(individual):
#     features = []
#     n_of_ones = 0
#     for i in range(136):
#         if individual[i] == 1:
#             features.append(i)
#     neigh = KNeighborsClassifier(n_neighbors=3)
#     neigh.fit(X_train[:, features], y_train)
#     accuracy = neigh.score(X_test[:, features], y_test)
#     return 136-len(features), accuracy, accuracy * (136-len(features)) - 50

In [12]:
# # funkcja oceniająca całe pokolenie

# def evaluation(generation):
#     local_best_fitness = -100
#     local_best_accuracy = 0
#     local_n_of_zeros = 0
#     best_individual = []
#     gen_fitness = np.array([0])
#     for one in generation:
#         n_of_zeros, accuracy, fitness = eval(one)
#         fitness = eval(one)
#         if fitness > local_best_fitness:
#             local_best_fitness = fitness
#             local_best_accuracy = accuracy
#             local_n_of_zeros = n_of_zeros
#             best_individual = one
#         print(fitness)
#         np.append(gen_fitness, fitness)
# #     return gen_fitness, local_best_fitness, best_individual
#     return gen_fitness, local_n_of_zeros, local_best_fitness, local_best_accuracy, best_individual

# f, lf, bi = evaluation(g)
# print(f)

In [14]:
# # ruletka

# def roulette(generation, gen_fitness):
#     roulette_tab = []
#     prev = 0
#     for one in gen_fitness:
#         prev = one/sum(gen_fitness) + prev
#         roulette_tab.append(prev)
#     roulette_tab[-1] = 1
#     survivors = []
#     for i in range(generation_size):
#         random_num = random()
#         winner = 0
#         while random_num > roulette_tab[winner]:
#             winner += 1
#         survivors.append(generation[winner])
#     return survivors

In [17]:
# def cross2ind(ind):
#     if np.random.rand() <= crossover_prob:
#         cut = np.random.randint(1,135)
#         print(cut)
#         print(ind)
#         idn = ind.reshape(2,136)
#         print('reshaped: ',ind)
#         temp = ind[1,:cut].copy()
#         ind[1,:cut] = ind[0,:cut]
#         ind[0,:cut] = temp
#         idn = ind.reshape(2,136)

In [18]:
# # funkcja krzyżująca osobniki

# # def crossover(survivors):
# s = s.reshape(generation_size//2,272)
# # print(s)
# d = np.apply_along_axis(cross2ind, 1, s)

In [19]:
# # funkcja mutująca wybrane geny wybranych osobników z pokolenia

# def mutation(descendants, mutation_prob):
#     for one in descendants:
#         if random() <= mutation_prob:
#             locus = randint(0,135)
# #             zamiana genu
#             one[locus] = 1 - one[locus]
#     return descendants

In [27]:
# g = first_gen(generation_size)
# gf, ones, bf, ba, bi = evaluation(g)
# print(gf)
# print(ones)
# print(bf)
# print(ba)
# print(bi)
# s = roulette(g, gf)
# print(s)
# d = crossover(s)
# d = np.apply_along_axis(mutation, 1, d)
# d

In [90]:
# print(alltime_best_fitness,"\n", alltime_best_accuracy, "\n", alltime_best_individual)