In [1]:
import numpy as np
import pandas as pd
from operator import attrgetter
from sklearn.linear_model import LogisticRegression
import random

In [2]:
Data = pd.read_excel('./Midterm_Data/Data.xlsx') # training data
Test1 = pd.read_excel('./Midterm_Data/Test1.xlsx') # validation data
Test2 = pd.read_excel('./Midterm_Data/Test2.xlsx') # testing data

Data.fillna(Data.mean(), inplace=True)

# data preprocessing

- 1 way ratio: 0 ~ 94 (xi)
- 2 way ratio: 95 ~ 4559 (xi * xj)

In [3]:
def data_preprocessing(data):
    
    # Data_bio_ratio
    x = pd.DataFrame(index=range(data.shape[0]),columns=range(95))
    for i in range(19):
        for j in range(5):
            if i < 9:
                x[i * 5 + j] = data[f'F0{i+1}'] / data[f'R0{j+1}']
            else:
                x[i * 5 + j] = data[f'F{i+1}'] / data[f'R0{j+1}']

    one_way = x.to_numpy()
    two_way = np.zeros((data.shape[0], 95 * 94 // 2))
    count = 0
    for i in range(one_way.shape[1]):
        for j in range(i+1, one_way.shape[1]):
            two_way[:, count] = one_way[:, i] * one_way[:, j]
            count += 1
    
    return np.hstack((one_way, two_way)), data[['C01','C02','C03','C04','C05']].to_numpy().T

In [4]:
x_train, y_train = data_preprocessing(Data)
print("x_train.shape: ", x_train.shape)
print("y_train.shape: ", y_train.shape)

x_val, y_val = data_preprocessing(Test1)
print("x_val.shape: ", x_val.shape)
print("y_val.shape: ", y_val.shape)

x_test, y_test = data_preprocessing(Test2)
print("x_test.shape: ", x_test.shape)
print("y_test.shape: ", y_test.shape)

x_train.shape:  (4240, 4560)
y_train.shape:  (5, 4240)
x_val.shape:  (475, 4560)
y_val.shape:  (5, 475)
x_test.shape:  (987, 4560)
y_test.shape:  (5, 987)


# NSGA-II

In [5]:
class individual:
    def __init__(self, lst):
        self.chromosome = lst
        self.fitness = []

In [6]:
def calculate_population_fitness(population):
    summation = [0, 0, 0, 0]
    
    for j in range(len(population)):
        
        for d_num in [0, 1, 3]:
        
            clf = LogisticRegression(max_iter=400, tol=1, solver='saga').fit(x_train[:, population[j].chromosome], y_train[d_num])
            tmp = clf.score(x_val[:, population[j].chromosome], y_val[d_num])
            population[j].fitness.append(tmp)
            
            summation[d_num] += tmp
        
    print(summation)
    
    return summation
            

In [7]:
# type of a, b are both "individual"
# return true if a pareto dominate b

def pareto_dominate(a, b):
    return ((a.fitness[0] >= b.fitness[0]) and (a.fitness[1] >= b.fitness[1]) and (a.fitness[2] >= b.fitness[2]))

In [8]:
def find_pareto_front(pop):

    pop_size = len(pop)
    pareto_front = []
    
    for i in range(pop_size):
        
        tmp_front = pareto_front.copy()
        pareto_front.append(pop[i])
        
        for j in tmp_front:
            
            if pareto_dominate(j, pop[i]):
                pareto_front.remove(pop[i])
                break
                
            elif pareto_dominate(pop[i], j):
                pareto_front.remove(j)
    return pareto_front

In [9]:
# len(parentssss) == len(offspringsss)
def breed(parentssss):
    
    offspringsss = []
    chromosome_length = len(parentssss[0].chromosome)
    
    for i in range(len(parentssss)):
        parent1 = random.choice(parentssss)
        parent2 = random.choice(parentssss)
    

        # uniform crossover
        which_parent = np.random.uniform(size = chromosome_length)

        offspring1 = np.zeros(chromosome_length, dtype=int)
        offspring2 = np.zeros(chromosome_length, dtype=int)

        
        for k in range(chromosome_length):
            if which_parent[k] > 0.5:
                offspring1[k] = parent1.chromosome[k]
                offspring2[k] = parent2.chromosome[k]
            else:
                offspring1[k] = parent2.chromosome[k]
                offspring2[k] = parent1.chromosome[k]
                
                
        # mutation
        prob = np.random.uniform()
        mutation_gene_index = np.random.randint(chromosome_length)
        mutation_gene_value = np.random.choice(x_train.shape[1])
        if prob < 0.5:
            offspring1[mutation_gene_index] = mutation_gene_value
        elif prob > 0.5:
            offspring2[mutation_gene_index] = mutation_gene_value
                
                
        offspringsss.append(individual(list(offspring1)))
        offspringsss.append(individual(list(offspring2)))
    
    
    return offspringsss

In [10]:
population = []
population_size = 100

archive = []
archive_size = int(population_size/2)

chromosome_length = 10
generation = 1000

pop_avg_fitness_lst = []

# initailize first generation population

for i in range(population_size):
    population.append(individual(np.random.choice(x_train.shape[1], chromosome_length, replace=False)))


for iteration in range(generation):

    print(f'gen: {iteration}')
    # calcualate population fitness
    pop_avg_fitness_lst.append(calculate_population_fitness(population))
        

    # Population = Population U Archive
    population = population + archive


    # Archive <-- {}
    archive = []

    while True:

        # Find Pareto Front from population
        p_front = find_pareto_front(population)
        
        
        # if A.size + (R_i).size >= desired archive size
        if len(archive) + len(p_front) > archive_size:
            
            if len(archive) < archive_size:
            
            # A <-- A U sparsest a - A.size individuals in R_i
            # find sparer individual in p_front and add to archive
                archive = archive + random.sample(p_front, (archive_size - len(archive)))

                
            
            # break from for loop
            break

        # if A.size + (R_i).size < desired archive size
        else:

            # A <-- A U R_i
            archive = archive + p_front
            
        # remvoe Pareto Front from population
        for p in p_front:
            population.remove(p)

    # while loop end

    # P <-- Breed(A)
    population = breed(archive)
    


gen: 0
[85.77263157894741, 84.25052631578957, 0, 83.7115789473685]
gen: 1


KeyboardInterrupt: 

In [35]:
calculate_population_fitness(population)

population = population + archive
p_front = find_pareto_front(population)
p_front.sort(key=lambda x: sum(x.fitness), reverse=True)

print(p_front[0].chromosome)
print(pop_avg_fitness_lst)

[89.71157894736844, 88.15578947368424, 0, 86.09473684210528]
[1909, 2223, 1191, 3481, 1588, 1766, 10, 2056, 1501, 3935]
[[84.84421052631588, 83.96631578947377, 0, 83.58315789473696], [86.3305263157895, 84.66526315789483, 0, 84.12000000000009], [87.9705263157895, 85.14526315789477, 0, 84.73473684210532], [89.19789473684214, 86.03578947368425, 0, 85.7326315789474], [89.06315789473689, 87.58736842105269, 0, 86.08000000000007]]


# print best chromosome, best acc, and predict test2.xls

In [37]:
with open('./nsga_pop_avg_fitness_lst.txt', 'w') as f:
    for i in pop_avg_fitness_lst:
        f.write("%s\n" % i)
        
    f.write("best fitness: %s\n" % p_front[0].fitness)
    

In [11]:
def two_ratio(num):
    i = 1
    k = 94
    num = num - 95
    while num > k:
        
        num = num - k
        i += 1
        k -= 1
        
    return i, num + i

In [16]:
best_biomarkers = open('./result/nsga_pop_avg_fitness_lst.txt', 'r').readlines()[-2][18:-2].split(',')
best_biomarkers = list(map(int, best_biomarkers))
print(best_biomarkers)

for i in best_biomarkers:
    print(two_ratio(i))

[4210, 3146, 1146, 1641, 2426, 3825, 3340, 2409, 2289, 1790]
(69, 70)
(42, 59)
(12, 84)
(19, 26)
(30, 41)
(57, 63)
(46, 51)
(29, 89)
(28, 35)
(21, 26)


In [17]:
def predict(chromosome, d_num):
    clf = LogisticRegression(max_iter=400, tol=0.1, solver='saga').fit(x_train[:, chromosome], y_train[d_num])
    return clf.predict(x_test[:, chromosome])

In [18]:
y_test = []
for i in range(5):
    Test2[f'C0{i+1}'] = predict(best_biomarkers, i)
    
Test2.to_excel("./Midterm_Data/Test2_nsga_result.xlsx", index=False) 