In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Load the raw dataset and turn every feature into a small set of discrete values
# so that the number of distinct values per column can be counted.
df = pd.read_csv('./extention_of_Z-Alizadeh_sani_dataset.csv')

# Each float64 column is cut into 3 bins labelled 0, 1, 2 (left-closed intervals).
float_columns = df.select_dtypes('float64').columns
for name in float_columns:
    df[name] = pd.cut(df[name], bins=3, labels=range(3), right=False)

# int64 columns are kept as they are, but re-typed as categoricals.
int_columns = df.select_dtypes('int64').columns
df[int_columns] = df[int_columns].astype('category')

# 'Cath' is the column the rules predict, so it is excluded from the feature counts.
df = df.drop(columns='Cath')

# Number of distinct values per remaining column; this drives the chromosome layout.
dataset_infos = df.nunique()

dataset_infos


In [None]:
# --- Reproducibility -------------------------------------------------------
# Every random draw in this notebook goes through NumPy's global generator
# (the GA operators call np.random.* directly, and DataFrame.sample falls back
# to it when no random_state is given), so seeding it once here makes a
# "Restart & Run All" reproduce the same split and the same rules.
SEED = 42
np.random.seed(SEED)

# --- Genetic-algorithm hyper-parameters ------------------------------------
POPULATION_SIZE = 250   # chromosomes per generation
MAX_GEN = 200           # generations per genetic() run
CO_PROB = 0.9           # probability that a selected pair is crossed over
M_PROB = 0.15           # per-child probability of being mutated
REP_NO = 5              # worst population members replaced by the best children each generation

# --- Fitness weights: fitness = W_CONFIDENCE * confidence + W_SUPPORT * support
W_CONFIDENCE = 0.8
W_SUPPORT = 0.2

# Number of (shuffled) rows used as the training split; also the support denominator.
DATASET_SIZE = 212

# Cache of already computed fitness values, keyed by the chromosome's bit-string.
fit_dic = {}

# --- Chromosome layout derived from the per-column distinct-value counts ----
DATASET_KEYS = dataset_infos.keys()
DATASET_VALUES = dataset_infos.values
# One gene per distinct value of every feature column.
CHROMOSOME_SIZE = DATASET_VALUES.sum()


In [None]:
# Load the binarized dataset, shuffle its rows and renumber them from 0.
df = pd.read_csv('./binarized.csv')
df = df.sample(frac=1, axis=0).reset_index(drop=True)

# The first DATASET_SIZE shuffled rows are the training split, the rest the test split.
train_data = df.iloc[:DATASET_SIZE]
test_data = df.iloc[DATASET_SIZE:]

# Second name bound to the very same training frame (an alias, not a copy).
train_data_copy = train_data

train_data.head()

In [None]:
# Column labels of the current `df` (the frame read from './binarized.csv' when the
# notebook is run top to bottom); extract_condition() indexes into this to name the
# column each gene is compared against.
COLUMNS = df.columns

In [None]:
class Rule:
    """A discovered classification rule together with its quality measures.

    Attributes:
        conditions: the rule's condition, stored exactly as passed in.
        confidence: confidence value supplied by the caller.
        support: support value supplied by the caller.
        chromosome: the chromosome the rule was derived from.
        output: predicted class label; starts as an empty string and is
            filled in by the caller after construction.
    """

    def __init__(self, conditions, confidence, support, chromosome):
        # What the rule tests and where it came from.
        self.chromosome = chromosome
        self.conditions = conditions
        # How good the rule is.
        self.support = support
        self.confidence = confidence
        # Not known at construction time; assigned afterwards.
        self.output = ''


In [None]:
def init_pop():
    """Create a random initial population.

    Returns:
        An integer array of shape (POPULATION_SIZE, CHROMOSOME_SIZE) whose
        entries are drawn uniformly from {0, 1}; each row is one chromosome.
    """
    shape = (POPULATION_SIZE, CHROMOSOME_SIZE)
    return np.random.randint(low=0, high=2, size=shape)


In [None]:
def cross_over(chrom1, chrom2):
    """Two-point crossover of two chromosomes.

    Two cut positions are drawn from [1, CHROMOSOME_SIZE) and the segment
    between them is exchanged between the parents.

    Args:
        chrom1: first parent chromosome.
        chrom2: second parent chromosome.

    Returns:
        A pair of new arrays: (chrom1 with chrom2's middle segment,
        chrom2 with chrom1's middle segment). The parents are not modified.
    """
    cut_a, cut_b = np.sort(np.random.randint(1, CHROMOSOME_SIZE, 2))

    def splice(outer, inner):
        # Keep `outer` outside the cut window and take `inner` inside it.
        return np.concatenate([outer[:cut_a], inner[cut_a:cut_b], outer[cut_b:]])

    return splice(chrom1, chrom2), splice(chrom2, chrom1)


In [None]:
def mutation(chrom):
    """Flip between 1 and 6 randomly chosen genes of a chromosome, in place.

    Positions are drawn independently, so the same gene may be picked (and
    flipped back) more than once.

    Args:
        chrom: chromosome to mutate; it is modified in place.

    Returns:
        The same `chrom` object, after mutation.
    """
    flips = np.random.randint(1, 7)
    for _ in range(flips):
        locus = np.random.randint(0, CHROMOSOME_SIZE)
        # A 0 becomes 1; anything else becomes 0.
        chrom[locus] = 1 if chrom[locus] == 0 else 0

    return chrom


In [None]:
def extract_condition(chrom):
    """Translate a chromosome into a boolean pandas expression string.

    The chromosome is read as consecutive gene groups, one per entry of
    DATASET_VALUES (group i has DATASET_VALUES[i] genes). Gene number p of
    the chromosome is paired with column COLUMNS[p].

    A group whose genes are all 0 or all 1 contributes nothing. Every other
    group becomes a parenthesised clause that ORs one test per gene,
    `(df_copy["<column>"] == <gene value>)`; the clauses are ANDed together.

    The returned text refers to a frame called `df_copy`, so whoever
    evaluates it must have a variable of that name in scope.

    Args:
        chrom: sequence of gene values of length CHROMOSOME_SIZE.

    Returns:
        The condition string, or '' when no group contributes a clause.
    """
    clauses = []
    offset = 0  # index of the current group's first gene (and first column)

    for width in DATASET_VALUES:
        first = offset
        genes = np.array(chrom[first:first + width])
        offset += width

        # All-zero and all-one groups place no constraint on the rule.
        ones = genes.sum()
        if ones == 0 or ones == width:
            continue

        tests = [
            f'(df_copy["{COLUMNS[first + j]}"] == {genes[j]})'
            for j in range(width)
        ]
        clauses.append('(' + ' | '.join(tests) + ')')

    return ' & '.join(clauses)


In [None]:
def fitness(chrom):
    """Score a chromosome by the rule it encodes, with memoisation in fit_dic.

    The rule's condition is applied to `train_data`; among the matching rows
    the "CAD" and "Normal" labels in column 'Cath' are counted and combined as

        W_CONFIDENCE * (majority count / matched count)
        + W_SUPPORT  * (matched count / DATASET_SIZE)

    A rule that matches no "CAD"/"Normal" row scores 0. Results are cached in
    `fit_dic` under the chromosome's concatenated gene string.

    Args:
        chrom: chromosome to evaluate.

    Returns:
        The fitness value (0 when nothing is matched).
    """
    cache_key = "".join(map(str, chrom))
    if cache_key in fit_dic:
        return fit_dic[cache_key]

    # The local name `df_copy` is load-bearing: the string produced by
    # extract_condition() refers to a frame with exactly this name.
    df_copy = train_data
    condition = extract_condition(chrom)
    if condition:
        df_copy = df_copy[eval(condition)]

    # Matched rows per class label.
    z = np.array([len(df_copy[df_copy['Cath'] == label])
                  for label in ("CAD", "Normal")])
    sum_z = z.sum()

    if sum_z == 0:
        fit_dic[cache_key] = 0
        return 0

    f = W_CONFIDENCE * (z.max() / sum_z) + W_SUPPORT * (sum_z / DATASET_SIZE)
    fit_dic[cache_key] = f
    return f

In [None]:
def tournament_selection(sorted_pop):
    """Pick POPULATION_SIZE parents by binary tournament on position.

    For every slot two indices are drawn from [0, POPULATION_SIZE) and the
    row at the larger index wins, so rows placed later in `sorted_pop` are
    favoured (the caller passes the population sorted by ascending fitness).

    Args:
        sorted_pop: 2-D array of chromosomes, one per row.

    Returns:
        A new array holding copies of the winning rows.
    """
    winners = []
    for _ in range(POPULATION_SIZE):
        contenders = np.random.randint(0, POPULATION_SIZE, 2)
        winners.append(np.copy(sorted_pop[contenders.max()]))

    return np.array(winners)


In [None]:
def genetic():
    """Run one genetic-algorithm search and return its best chromosome.

    Each generation:
      1. scores the population with fitness();
      2. if more than half of the generations have passed and the best
         fitness is still below 0.1, restarts once from a fresh random
         population ("second chance") and raises the mutation rate by 0.2
         for the rest of the run;
      3. selects parents by tournament, produces POPULATION_SIZE children via
         crossover (probability CO_PROB) and mutation;
      4. overwrites the REP_NO least-fit population members with the REP_NO
         fittest children.

    Returns:
        (ma, mi, me, best): three lists with the per-generation maximum,
        minimum and mean population fitness, and the highest-fitness
        chromosome of the final population.
    """
    population = init_pop()
    second_chance = False
    ma = []
    mi = []
    me = []

    for i in range(MAX_GEN):
        pop_fitnesses = np.array([fitness(member) for member in population])
        if (i > MAX_GEN / 2) and pop_fitnesses.max() < 0.1 and not second_chance:
            print('second chance')
            population = init_pop()
            second_chance = True
            # Fix: the scores computed above belong to the discarded
            # population. Re-score so that the statistics and the sort below
            # describe the population that is actually in use.
            pop_fitnesses = np.array([fitness(member) for member in population])

        ma.append(pop_fitnesses.max())
        mi.append(pop_fitnesses.min())
        me.append(pop_fitnesses.mean())

        # Ascending fitness: worst member first, best member last.
        sorted_pop = population[np.argsort(pop_fitnesses)]
        parents = tournament_selection(sorted_pop)

        children = []
        for j in range(POPULATION_SIZE // 2):
            co_i = np.random.randint(0, POPULATION_SIZE, 2)
            c1 = parents[co_i[0]]
            c2 = parents[co_i[1]]

            if np.random.uniform(0, 1) < CO_PROB:
                c1, c2 = cross_over(c1, c2)
            children.append(c1)
            children.append(c2)

        children = np.array(children)

        # After a restart the mutation probability is increased by 0.2.
        mutation_rate = (M_PROB + 0.2) if second_chance else M_PROB
        mutation_indices = np.random.uniform(0, 1, POPULATION_SIZE) < mutation_rate

        # Boolean indexing yields a copy, so mutation() edits that copy and the
        # mutated rows are then stitched back together with the untouched ones.
        mutate_children = children[mutation_indices]
        if len(mutate_children) > 0:
            mutated = [mutation(x) for x in mutate_children]
            children = np.concatenate([mutated, children[np.invert(mutation_indices)]])

        ch_fitnesses = [fitness(member) for member in children]
        sorted_ch = children[np.argsort(ch_fitnesses)]

        # Replace the REP_NO worst members with the REP_NO best children.
        for j in range(REP_NO):
            sorted_pop[j] = sorted_ch[POPULATION_SIZE - j - 1]

        population = sorted_pop

    fitnesses = [fitness(member) for member in population]
    s = population[np.argsort(fitnesses)]

    return ma, mi, me, s[POPULATION_SIZE - 1]
    

In [None]:
# Rule discovery: run the GA repeatedly, each time removing from `df_copy` the
# training rows matched by the rule just found. Stop when no rows are left or
# when the latest rule removed nothing.
#
# NOTE(review): fitness() scores chromosomes against `train_data`, not against
# the shrinking `df_copy` used here, so the rows already removed still influence
# later GA runs. Confirm that this is the intended behaviour.
rules = []
dropped_rows = 1
# This frame must be called `df_copy`: the condition strings evaluated below
# refer to that name.
df_copy = train_data

while len(df_copy) > 0 and dropped_rows > 0:
    fit_dic.clear()
    dropped_rows = 0

    max_hist, min_hist, mean_hist, best_chrom = genetic()
    rules.append(best_chrom)

    condition = extract_condition(best_chrom)
    print(condition)

    if condition:
        rows_before = len(df_copy)
        covered_index = df_copy[eval(condition)].index
        df_copy = df_copy.drop(index=covered_index, errors='ignore')
        dropped_rows = rows_before - len(df_copy)

    # Fitness curves of the GA run that produced this rule.
    generations = range(MAX_GEN)
    fig, ax = plt.subplots(1)
    ax.plot(generations, max_hist, 'r', label='Maximum')
    ax.plot(generations, mean_hist, 'b', label='Mean')
    ax.plot(generations, min_hist, 'g', label='Minimum')
    ax.set_ylabel('Fitness')
    ax.set_xlabel('i')
    ax.legend()
    plt.show()



In [None]:
# Re-score every discovered chromosome on the full training split and wrap the
# ones that match at least one row in a Rule object.
final_rules = []
for k, chromosome in enumerate(rules):
    # The frame must be called `df_copy` because the condition string refers to it.
    df_copy = train_data_copy

    condition = extract_condition(chromosome)
    if condition:
        df_copy = df_copy[eval(condition)]

    matched = len(df_copy)
    if matched == 0:
        continue

    # Matched rows per class label: z[0] is "CAD", z[1] is "Normal".
    z = np.array([len(df_copy[df_copy['Cath'] == label])
                  for label in ("CAD", "Normal")])

    newRule = Rule(conditions=condition,
                   confidence=z.max() / matched,
                   support=matched / DATASET_SIZE,
                   chromosome=chromosome)
    # The rule predicts the majority label; ties go to 'CAD'.
    newRule.output = 'CAD' if z[0] >= z[1] else 'Normal'
    final_rules.append(newRule)

    print(f'Rule {k}: Confidence = {newRule.confidence} and Support = {newRule.support} and output = {newRule.output}')


In [None]:
# Order the rules best-first: highest confidence, ties broken by highest support.
final_rules.sort(key=lambda rule: (rule.confidence, rule.support), reverse=True)

for ranked in final_rules:
    print(f'Confidence = {ranked.confidence} and Support = {ranked.support} and output = {ranked.output}')


In [None]:
# Apply the rules to the test split in their sorted order. Every row is judged
# by the first rule that matches it and is then removed, so later rules only
# see rows that are still unclaimed.
test_data_copy = test_data.reset_index()
correct = 0
wrong = 0

for rule in final_rules:
    # The stored condition text targets `df_copy`; retarget it at the test frame.
    rule.conditions = rule.conditions.replace('df_copy', 'test_data_copy')

    includes = test_data_copy.loc[eval(rule.conditions)]
    true_inclusions = len(includes[includes['Cath'] == rule.output])
    correct += true_inclusions
    wrong += len(includes) - true_inclusions

    # Remove the rows this rule has just classified.
    test_data_copy = test_data_copy.drop(index=includes.index, errors='ignore')


In [None]:
# Rows still left in test_data_copy were matched by no rule; they are counted in
# the denominator, so they lower the accuracy without counting as correct.
unmatched = len(test_data_copy)
accuracy = correct / (correct + wrong + unmatched)
accuracy


In [None]:
# Persist the final rules: one line per rule, each line being the rule's
# chromosome written as a string of concatenated gene values.
with open('results.txt', 'w') as f:
    f.writelines("".join(map(str, rule.chromosome)) + '\n'
                 for rule in final_rules)
