In [4]:

# %%
import argparse
import itertools
import operator
import random
import dill as pickle
import numpy as np
import pandas as pd
from deap import base, tools, gp, creator

from digen import Benchmark
import digen

def safeDiv(left, right):
    try:
        return left / right
    except ZeroDivisionError:
        return 1

columns = 10
pset = gp.PrimitiveSetTyped("MAIN", itertools.repeat(float, int(columns)), float, "X")

pset.addPrimitive(operator.add, [float, float], float)
pset.addPrimitive(operator.sub, [float, float], float)
pset.addPrimitive(operator.mul, [float, float], float)
pset.addPrimitive(safeDiv, [float, float], float)

pset.addPrimitive(operator.eq, [float, float], float)
pset.addPrimitive(operator.ne, [float, float], float)
pset.addPrimitive(operator.ge, [float, float], float)
pset.addPrimitive(operator.gt, [float, float], float)
pset.addPrimitive(operator.le, [float, float], float)
pset.addPrimitive(operator.lt, [float, float], float)

pset.addPrimitive(min, [float, float], float)
pset.addPrimitive(max, [float, float], float)

randval = "rand" + str(random.random())[2:]
pset.addEphemeralConstant(randval, lambda: random.random() * 100, float)
pset.addTerminal(0.0, float)
pset.addTerminal(1.0, float)

creator.create("FitnessMax", base.Fitness, weights=(1.0, 1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMax)
toolbox = base.Toolbox()
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=2, max_=5)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("compile", gp.compile, pset=pset)

toolbox.register("mate", gp.cxOnePoint)
toolbox.register("expr_mut", gp.genFull, pset=pset, min_=0, max_=2)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)  # Returns a tuple of one tree.
ref_points = tools.uniform_reference_points(nobj=2, p=12)
toolbox.register("select", tools.selNSGA3, ref_points=ref_points)
toolbox.register("map", map)  # Overload the map function

def get_random_data(rows, cols, seed):
    """ return randomly generated data is shape passed in """

    np.random.seed(seed)
    random.seed(seed)
    #    data = np.random.randint(0,3,size=(rows,cols))
    data = np.random.normal(size=(rows, cols))
    X = pd.DataFrame(data)
    X.columns = list(map(lambda k: 'X' + str(k), range(X.shape[1])))
    return X

def apply_equation(X, func):
    # Dimensionality reduction.
    y = np.zeros(X.shape[0])
    for j in range(X.shape[0]):  # For every row. Every row is passed through a tree.
        y[j] = func(*X.iloc[j]) if isinstance(X, pd.DataFrame) else func(*X[j, :])  # X
    
    return y

# %%
def regenerate(dataset=None):

    benchmark = Benchmark()
    # assign id in DIGEN to default seed
    seedmap = dict(map(lambda x: (x.split('_')[0], x.split('_')[1]), benchmark.list_datasets()))
    # assign id in DIGEN to equation
    datamap = dict(map(lambda x, y: (x.split('_')[0], y), benchmark.get_models().keys(), benchmark.get_models().values()))

    
    key = dataset.split('_')[0]
    equation = datamap[key]

    seed = int(seedmap[key])

    random.seed(seed)
    np.random.seed(seed)
    
    X = get_random_data(1000, 10, seed)


    individual = gp.PrimitiveTree.from_string(equation, pset)
    func = toolbox.compile(expr=individual)
    
    original_y = apply_equation(X, func)


    y, thresh, median_random_assignment = reclassify(original_y, 0.5)

    return X, y, original_y, func, thresh, median_random_assignment


def reclassify(y, p):

    order = np.argsort(y)

    second_half = y[order][int(len(y) * .5):]
    first_half = y[order][:int(len(y) * .5)]
    
    median = np.median(y)

    medians_in_first_half = (first_half == median).sum() 
    medians_in_second_half = (second_half == median).sum()

    number_of_medians = medians_in_first_half + medians_in_second_half

    if number_of_medians == 0:
        threshold_prob_0 = 0
    else:
        threshold_prob_0 = medians_in_first_half/number_of_medians

    new_y = np.zeros(len(y))
    new_y[order[int(len(y) * p):]] = 1
    return new_y.astype(int), y[order[int(len(y) * p)]], threshold_prob_0

def reclassify_with_old_threshold(y, threshold, threshold_prob_0):
    binary_y = np.zeros_like(y, dtype=int)
    binary_y[y > threshold] = 1

    threshold_indices = np.where(y == threshold)[0]
    num_threshold_indices = len(threshold_indices)
    num_threshold_0 = int(threshold_prob_0 * num_threshold_indices)
    num_threshold_1 = num_threshold_indices - num_threshold_0
    if num_threshold_0 > 0:        
        binary_y[threshold_indices[:num_threshold_0]] = 0
    if num_threshold_1 > 0:
        binary_y[threshold_indices[num_threshold_0:]] = 1
    
    noise = min(num_threshold_0, num_threshold_1)/len(y)
    return binary_y, noise


def reclassify_with_old_threshold_all_to_max(y, threshold, threshold_prob_0):
    binary_y = np.zeros_like(y, dtype=int)
    binary_y[y > threshold] = 1

    threshold_indices = np.where(y == threshold)[0]
    num_threshold_indices = len(threshold_indices)

    num_threshold_0 = int(threshold_prob_0 * num_threshold_indices)
    num_threshold_1 = num_threshold_indices - num_threshold_0
    
    if threshold_prob_0>=0.5:
        binary_y[threshold_indices] = 0
    else:
        binary_y[threshold_indices] = 1
    
    noise = min(num_threshold_0, num_threshold_1)/len(y)
    return binary_y, noise


local_cache_dir = '/home/ribeirop/common/Projects/tpot_digen_paper1/Datasets'

benchmark = digen.Benchmark()

In [24]:
#create new test set with same noise

all_new_test_sets = {}
all_new_test_sets_no_noise = {}
threshold_prob_0_list = []
for i, dset in enumerate(benchmark.dataset_names):
    X_digen, y_digen, y_digen_original, func, threshold, threshold_prob_0 = regenerate(dataset=dset)
    X, y = benchmark.load_dataset(dset, separate_target=True, local_cache_dir=local_cache_dir)

    newX = get_random_data(10000, 10, i)
    newy_original = apply_equation(newX, func)
    newy, noise = reclassify_with_old_threshold(newy_original, threshold, threshold_prob_0)
    newy_no_noise, _ = reclassify_with_old_threshold_all_to_max(newy_original, threshold, threshold_prob_0)
    
    all_new_test_sets[dset] = {'X': newX, 'y': newy, 'noise': noise, 'y_no_noise': newy_no_noise}
    threshold_prob_0_list.append(threshold_prob_0)

pickle.dump(all_new_test_sets, open('all_new_test_sets.pkl', 'wb'))

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statistics
import digen
local_cache_dir = '/home/ribeirop/common/Projects/tpot_digen_paper1/Datasets'

benchmark = digen.Benchmark()
df_dict = {}

for d in benchmark.dataset_names:
    digen_X, digen_y = benchmark.load_dataset(d,separate_target=True, local_cache_dir=local_cache_dir)
    X, y, original_y, fun, thresh, threshold_prob_0 = regenerate(dataset=d)

    if sum(np.sum(abs(X - digen_X))) > 0.00001 or np.sum(abs(y - digen_y)) > 0.00001:
        print("GENERATION NOT ACCURATE")

    second_half = original_y[np.argsort(original_y)][int(len(y) * .5):]
    first_half = original_y[np.argsort(original_y)][:int(len(y) * .5)]
    
    median = statistics.median(original_y)

    medians_in_first_half = (first_half == median).sum() 
    medians_in_second_half = (second_half == median).sum()

    if medians_in_first_half == 0 and medians_in_second_half == 0:
        df_dict[d] = {  "median" : median, 
                        "medians_in_first_half" : medians_in_first_half, 
                        "medians_in_second_half" : medians_in_second_half, 
                        "min incorrectly labeled" : 0, 
                        "percent error": 0/1000,
                        "equation to make min error" : f"y> {thresh} or y>= {thresh}", 
                        "error direction" : "0 flipped labels",
                         }

    if medians_in_second_half > medians_in_first_half:
        df_dict[d] = {  "median" : median, 
                        "medians_in_first_half" : medians_in_first_half, 
                        "medians_in_second_half" : medians_in_second_half, 
                        "min incorrectly labeled" : medians_in_first_half, 
                        "percent error": medians_in_first_half/1000,
                        "equation to make min error" : f" y>= {thresh}", 
                        "error direction" : f"{medians_in_first_half} zeros set to one", 
                        }
        
    
    elif medians_in_second_half < medians_in_first_half:
        df_dict[d] = {  "median" : median, 
                        "medians_in_first_half" : medians_in_first_half, 
                        "medians_in_second_half" : medians_in_second_half, 
                        "min incorrectly labeled" : medians_in_second_half, 
                        "percent error": medians_in_second_half/1000,
                        "equation to make min error" : f" y> {thresh}", 
                        "error direction" :f"{medians_in_second_half} ones set to zero",
                        }
        
        
    elif medians_in_first_half == medians_in_second_half:
        df_dict[d] = {  "median" : median, 
                        "medians_in_first_half" : medians_in_first_half, 
                        "medians_in_second_half" : medians_in_second_half, 
                        "min incorrectly labeled" : medians_in_second_half, 
                        "percent error": medians_in_second_half/1000,
                        "equation to make min error" : f"y> {thresh} or y>= {thresh} depending on which side you want flipped", 
                        "error direction" :f"{medians_in_first_half} zeros set to one OR {medians_in_second_half} ones set to zero",
                        }
    else:
        print("something went wrong")

In [6]:
df = pd.DataFrame.from_dict(df_dict, orient='index')
df.to_csv("digen_noise.csv")
df

Unnamed: 0,median,medians_in_first_half,medians_in_second_half,min incorrectly labeled,percent error,equation to make min error,error direction
digen8_4426,-0.065211,0,0,0,0.0,y> -0.059973504797118375 or y>= -0.05997350479...,0 zeros set to one OR 0 ones set to zero
digen32_5191,0.000213,0,0,0,0.0,y> 0.0003804312756615266 or y>= 0.000380431275...,0 zeros set to one OR 0 ones set to zero
digen39_5578,0.0,500,1,1,0.001,y> 0.0,1 ones set to zero
digen36_466,0.0,500,37,37,0.037,y> 0.0,37 ones set to zero
digen13_769,0.0,500,11,11,0.011,y> 0.0,11 ones set to zero
digen35_4426,0.062639,0,0,0,0.0,y> 0.0661354086536567 or y>= 0.066135408653656...,0 zeros set to one OR 0 ones set to zero
digen1_6265,0.0,500,21,21,0.021,y> 0.0,21 ones set to zero
digen10_8322,1.0,4,500,4,0.004,y>= 1.0,4 zeros set to one
digen23_5191,-0.008089,0,0,0,0.0,y> -0.007339878624249983 or y>= -0.00733987862...,0 zeros set to one OR 0 ones set to zero
digen22_2433,1.0,8,500,8,0.008,y>= 1.0,8 zeros set to one


In [7]:
df.sort_values(by=['percent error'], ascending=False)

Unnamed: 0,median,medians_in_first_half,medians_in_second_half,min incorrectly labeled,percent error,equation to make min error,error direction
digen29_8322,1.0,191,500,191,0.191,y>= 1.0,191 zeros set to one
digen15_5311,0.0,500,49,49,0.049,y> 0.0,49 ones set to zero
digen5_6949,1.0,39,500,39,0.039,y>= 1.0,39 zeros set to one
digen36_466,0.0,500,37,37,0.037,y> 0.0,37 ones set to zero
digen1_6265,0.0,500,21,21,0.021,y> 0.0,21 ones set to zero
digen3_769,0.0,500,13,13,0.013,y> 0.0,13 ones set to zero
digen13_769,0.0,500,11,11,0.011,y> 0.0,11 ones set to zero
digen16_5390,1.0,10,500,10,0.01,y>= 1.0,10 zeros set to one
digen27_860,1.0,9,500,9,0.009,y>= 1.0,9 zeros set to one
digen22_2433,1.0,8,500,8,0.008,y>= 1.0,8 zeros set to one
