In [1]:
import numpy as np
import pandas as pd
from operator import attrgetter
from sklearn.linear_model import LogisticRegression
import random

In [2]:
Data = pd.read_excel('./Midterm_Data/Data.xlsx') # training data
Test1 = pd.read_excel('./Midterm_Data/Test1.xlsx') # validation data
Test2 = pd.read_excel('./Midterm_Data/Test2.xlsx') # testing data

Data.fillna(Data.mean(), inplace=True)
# Data # training data

# data preprocessing

- 1 way ratio: 0 ~ 94 (xi)
- 2 way ratio: 95 ~ 4559 (xi * xj)

In [3]:
def data_preprocessing(data):
    
    # Data_bio_ratio
    x = pd.DataFrame(index=range(data.shape[0]),columns=range(95))
    for i in range(19):
        for j in range(5):
            if i < 9:
                x[i * 5 + j] = data[f'F0{i+1}'] / data[f'R0{j+1}']
            else:
                x[i * 5 + j] = data[f'F{i+1}'] / data[f'R0{j+1}']

    one_way = x.to_numpy()
    two_way = np.zeros((data.shape[0], 95 * 94 // 2))
    count = 0
    for i in range(one_way.shape[1]):
        for j in range(i+1, one_way.shape[1]):
            two_way[:, count] = one_way[:, i] * one_way[:, j]
            count += 1
    
    return np.hstack((one_way, two_way)), data[['C01','C02','C03','C04','C05']].to_numpy().T

In [4]:
x_train, y_train = data_preprocessing(Data)
print("x_train.shape: ", x_train.shape)
print("y_train.shape: ", y_train.shape)

x_val, y_val = data_preprocessing(Test1)
print("x_val.shape: ", x_val.shape)
print("y_val.shape: ", y_val.shape)

x_test, y_test = data_preprocessing(Test2)
print("x_test.shape: ", x_test.shape)
print("y_test.shape: ", y_test.shape)

x_train.shape:  (4240, 4560)
y_train.shape:  (5, 4240)
x_val.shape:  (475, 4560)
y_val.shape:  (5, 475)
x_test.shape:  (987, 4560)
y_test.shape:  (5, 987)


# Competitive Coevolution

In [5]:
class biomarkers_individual:
    def __init__(self, lst):
        self.chromosome = lst
        self.internal_fitness = -1
        self.external_fitness = -1

    def get_internal_fitness():
        return self.internal_fitness
        
class test_patients_individual:
    def __init__(self, lst):
        self.chromosome = lst
        self.internal_fitness = -1
        
    def get_internal_fitness():
        return self.internal_fitness

In [6]:
def Access_Internal_Fitness_PQ(biomarkers_pop, test_patients_pop):
    
    k_fold = 5 # k-fold
    
    for i in range(len(biomarkers_pop)):

        acc_sum = 0
        for diagnosis_num in range(y_train.shape[0]):
            clf = LogisticRegression(max_iter=400, tol=1, solver='saga').fit(x_train[:, biomarkers_pop[i].chromosome], y_train[diagnosis_num])

            for k in range(k_fold):
                test_patients = random.choice(test_patients_pop)
                acc_sum += clf.score(x_val[test_patients.chromosome][:,biomarkers_pop[i].chromosome], y_val[diagnosis_num, test_patients.chromosome])
        
        biomarkers_pop[i].internal_fitness = acc_sum
        

In [7]:
def Access_Internal_Fitness_QP(test_patients_pop, biomarkers_pop):
    
    k_fold = 5 # k-fold
    
    for i in range(len(test_patients_pop)):
        
        acc_sum = 0
        for k in range(k_fold):
            biomarkers = random.choice(biomarkers_pop)
            
            for diagnosis_num in range(y_train.shape[0]):
                clf = LogisticRegression(max_iter=400, tol=1, solver='saga').fit(x_train[:, biomarkers.chromosome], y_train[diagnosis_num])
                acc_sum += clf.score(x_val[test_patients_pop[i].chromosome][:,biomarkers.chromosome], y_val[diagnosis_num, test_patients_pop[i].chromosome])
            
        test_patients_pop[i].internal_fitness = acc_sum
        

In [8]:
def Access_External_Fitness(biomarkers_pop):

    for i in range(len(biomarkers_pop)):

        acc_sum = 0
        for j in range(y_train.shape[0]):
            clf = LogisticRegression(max_iter=400, tol=1, solver='saga').fit(x_train[:, biomarkers_pop[i].chromosome], y_train[j])
            acc_sum += clf.score(x_val[:, biomarkers_pop[i].chromosome], y_val[j])
        
        biomarkers_pop[i].external_fitness = acc_sum
            

In [9]:
def calculate_population_fitness(population, d_num):
    fitness_list = []
    for j in range(len(population)):
        acc_sum = 0
        clf = LogisticRegression(max_iter=400, tol=0.1, solver='saga').fit(x_train[:, population[j]], y_train[d_num])
        acc_sum += clf.score(x_val[:, population[j]], y_val[d_num])
        fitness_list.append(acc_sum)
    return fitness_list

In [10]:
# len(parentssss) == len(offspringsss)
def breed_biomarkers(parentssss):
    
    offspringsss = []
    chromosome_length = len(parentssss[0].chromosome)
    
    for i in range(int(len(parentssss) / 2)):
        parent1 = random.choice(parentssss)
        parent2 = random.choice(parentssss)

        # uniform crossover
        which_parent = np.random.uniform(size = chromosome_length)

        offspring1 = np.zeros(chromosome_length, dtype=int)
        offspring2 = np.zeros(chromosome_length, dtype=int)

        for k in range(chromosome_length):
            if which_parent[k] > 0.5:
                offspring1[k] = parent1.chromosome[k]
                offspring2[k] = parent2.chromosome[k]
            else:
                offspring1[k] = parent2.chromosome[k]
                offspring2[k] = parent1.chromosome[k]
        
        offspringsss.append(biomarkers_individual(list(offspring1)))
        offspringsss.append(biomarkers_individual(list(offspring2)))
    
    
    return offspringsss

In [11]:
# len(parentssss) == len(offspringsss)
def breed_test_patients(parentssss):
    
    offspringsss = []
    chromosome_length = len(parentssss[0].chromosome)
    
    for i in range(int(len(parentssss) / 2)):
        parent1 = random.choice(parentssss)
        parent2 = random.choice(parentssss)
        
        # uniform crossover
        which_parent = np.random.uniform(size = chromosome_length)

        offspring1 = np.zeros(chromosome_length, dtype=int)
        offspring2 = np.zeros(chromosome_length, dtype=int)

        for k in range(chromosome_length):
            if which_parent[k] > 0.5:
                offspring1[k] = parent1.chromosome[k]
                offspring2[k] = parent2.chromosome[k]
            else:
                offspring1[k] = parent2.chromosome[k]
                offspring2[k] = parent1.chromosome[k]
                
        offspringsss.append(test_patients_individual(list(offspring1)))
        offspringsss.append(test_patients_individual(list(offspring2)))
    
    
    return offspringsss

In [12]:
pop_size = 100
generation = 1001

biomarkers_ell = 10
test_patients_ell = 25

pop_avg_fitness_lst = []

# P <- Build Initial Population   (biomarkers)

P = []
for i in range(pop_size):
    P.append(biomarkers_individual(np.random.choice(x_train.shape[0], biomarkers_ell, replace=False)))
    

# Q <- Build Initial Alternative Population   (best 25 test patients in Test1.xls.)
Q = []
for i in range(pop_size):
    Q.append(test_patients_individual(np.random.choice(x_val.shape[0], test_patients_ell, replace=False)))


# Best solution <- Empty Set   (biomarkers)
best = biomarkers_individual([])


# Access Internal Fitness(P, Q)
Access_Internal_Fitness_PQ(P, Q)



# Access Internal Fitness(Q, P)
Access_Internal_Fitness_QP(Q, P)



# Access External Fitness(P)
Access_External_Fitness(P)


print(f'best.external_fitness: {best.external_fitness}')

# for each individual in P
for i in range(pop_size):

# do
    # if (Best == Empty_Set) OR (External Fitness(P_i) > External Fitness(Best))
    if P[i].external_fitness > best.external_fitness:
        
        # then Best = P_i
        best.chromosome = P[i].chromosome.copy()
        best.internal_fitness = P[i].internal_fitness
        best.external_fitness = P[i].external_fitness
    
    
# Repeat Untill Max Evaluation Reach
for iteration in range(generation):
    print(f'iteration: {iteration}')

    # P' <- Join(P, Breed(P))
    PPP = P + breed_biomarkers(P)
    
    # Q' <- Join(Q, Breed(Q))
    QQQ = Q + breed_test_patients(Q)
    
    
    # Access Internal Fitness(P', Q), do selection too !!
    Access_Internal_Fitness_PQ(PPP, Q)


    # do selection too !!
    PPP.sort(key=lambda x: x.internal_fitness, reverse=True)

    
    PPP = PPP[:len(PPP)//2]

    

    # Access Internal Fitness(Q', P), do selection too !!
    Access_Internal_Fitness_QP(QQQ, P)
    

    # do selection too !!
    QQQ.sort(key=lambda x: x.internal_fitness, reverse=False)
    
    
        
    QQQ = QQQ[:len(QQQ)//2]
    

    # Access External Fitness(P')
    Access_External_Fitness(PPP)
    
    
    
    # for each individual in P'
    
    print(f'best.external_fitness: {best.external_fitness}')
    
    summation = 0
    for i in range(pop_size):

        summation += PPP[i].external_fitness
        
    # do
        # if ( External Fitness(P'_i) > External Fitness(Best) )
        if PPP[i].external_fitness > best.external_fitness:

            # then Best = P_i
            best.chromosome = PPP[i].chromosome.copy()
            best.internal_fitness = PPP[i].internal_fitness
            best.external_fitness = PPP[i].external_fitness
    
    
    print(f'{iteration}th population average fitness: {summation/pop_size}')
    pop_avg_fitness_lst.append(summation/pop_size)
    
    # P <- P'
    P = PPP
    
    # Q <- Q'
    Q = QQQ
    
# "best" is the solution !!



best.external_fitness: -1
iteration: 0
best.external_fitness: 4.501052631578948
0th population average fitness: 4.363305263157897
iteration: 1
best.external_fitness: 4.528421052631579
1th population average fitness: 4.38604210526316
iteration: 2
best.external_fitness: 4.543157894736842
2th population average fitness: 4.404589473684211


# print best chromosome, best acc, and predict test2.xls

In [14]:
with open('./competitive_coevolution_pop_avg_fitness.txt', 'w') as f:
    for i in pop_avg_fitness_lst:
        f.write("%s\n" % i)

    f.write("best chromosome: %s\n" % best.chromosome)
    f.write("best fitness: %s\n" % best.external_fitness)
        

In [12]:
def two_ratio(num):
    i = 1
    k = 94
    num = num - 95
    while num > k:
        
        num = num - k
        i += 1
        k -= 1
        
    return i, num + i

In [13]:
best_biomarkers = open('./result/competitive_coevolution_pop_avg_fitness.txt', 'r').readlines()[-2][18:-2].split(',')
best_biomarkers = list(map(int, best_biomarkers))
print(best_biomarkers)

for i in best_biomarkers:
    print(two_ratio(i))

[2076, 3328, 2408, 4161, 3953, 1865, 2548, 2187, 3556, 1636]
(25, 26)
(45, 88)
(29, 88)
(67, 74)
(60, 83)
(22, 28)
(32, 36)
(26, 68)
(50, 81)
(19, 21)


In [16]:
def predict(chromosome, d_num):
    clf = LogisticRegression(max_iter=400, tol=0.1, solver='saga').fit(x_train[:, chromosome], y_train[d_num])
    return clf.predict(x_test[:, chromosome])

In [17]:
y_test = []
for i in range(5):
    Test2[f'C0{i+1}'] = predict(best_biomarkers, i)
    
Test2.to_excel("./Midterm_Data/Test2_comp_result.xlsx", index=False) 