# ENaN

In [244]:
import pandas as pd
from sklearn.neighbors import KDTree

def natural_neighbour(df) -> pd.DataFrame:
    """Find the natural (mutual) neighbours of every row of ``df``.

    Two rows are natural neighbours when each one appears among the other's
    ``r`` nearest neighbours (Euclidean distance, KDTree search).  The search
    radius ``r`` starts at 1 and grows by one per round; neighbours found in
    earlier rounds are kept.  The search stops once the number of rows without
    any natural neighbour has stayed unchanged for two consecutive rounds.

    Args:
        df: Numeric DataFrame with one sample per row.

    Returns:
        DataFrame with one row per row of ``df`` (same positional order) and
        the columns ``natural_neighbours`` (sorted list of row positions) and
        ``num_neighbours`` (length of that list).
    """
    column_names = ['natural_neighbours', 'num_neighbours']
    n_samples = len(df)
    if n_samples == 0:
        # A KDTree cannot be built from zero points; nothing to search.
        return pd.DataFrame(columns=column_names)

    points = df.values
    kdt = KDTree(points, leaf_size=30, metric='euclidean')
    # One accumulator per row, addressed by position (not by index label).
    mutual = [set() for _ in range(n_samples)]

    r = 1
    previous_isolated = 0
    last_iter = False
    stop = False
    while not stop:
        if last_iter:
            # The previous round left the isolated count unchanged; this is
            # the confirmation round unless the count moves again below.
            stop = True

        # k counts the query point itself, and may never exceed the number of
        # points stored in the tree.
        k = min(r + 1, n_samples)
        nearest = kdt.query(points, k=k, return_distance=False)
        nearest_sets = [set(row.tolist()) for row in nearest]
        for i, candidates in enumerate(nearest_sets):
            for j in candidates:
                if j != i and i in nearest_sets[j]:
                    mutual[i].add(j)

        isolated = sum(1 for found in mutual if not found)
        if isolated == previous_isolated:
            last_iter = True
        else:
            last_iter = False
            stop = False
        previous_isolated = isolated
        r += 1

    neighbour_lists = [sorted(found) for found in mutual]
    return pd.DataFrame({
        'natural_neighbours': neighbour_lists,
        'num_neighbours': [len(found) for found in neighbour_lists],
    })

def select_prototypes(df, nan, labels):
    """Keep the rows whose nearest natural neighbour shares their label.

    Args:
        df: Numeric DataFrame with one sample per row.
        nan: Table with a ``natural_neighbours`` column holding, for each row
            position of ``df``, the list of positions of its natural
            neighbours (as produced by ``natural_neighbour``).
        labels: Series of class labels, positionally aligned with ``df``.

    Returns:
        The subset of ``df`` (original index and columns preserved) made of
        the rows that have at least one natural neighbour and whose closest
        natural neighbour (Euclidean distance) carries the same label.
    """
    points = df.values.astype(float)
    kept_positions = []

    for i, neighbours in enumerate(nan['natural_neighbours']):
        neighbours = list(neighbours)
        if not neighbours:
            continue
        # Squared distances are enough to locate the closest neighbour, so no
        # per-row spatial index is needed for these few candidates.
        offsets = points[neighbours] - points[i]
        closest = neighbours[(offsets ** 2).sum(axis=1).argmin()]
        if labels.iloc[closest] == labels.iloc[i]:
            kept_positions.append(i)

    # A single positional selection replaces row-by-row DataFrame.append,
    # which was removed in pandas 2.0.
    return df.iloc[kept_positions]

def run_pre_proc(df, labels):
    """Run the ENaN pre-processing: natural-neighbour search, then filtering.

    Returns whatever ``select_prototypes`` returns for ``df``, ``labels`` and
    the neighbour table computed by ``natural_neighbour(df)``.
    """
    neighbour_table = natural_neighbour(df)
    return select_prototypes(df=df, nan=neighbour_table, labels=labels)

# ATISA1

In [331]:
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
import math

# nearest neighbour of @data whose class differs from @target
def differentClass(data, dataset, classes, target):
    """Find the entry of ``dataset`` closest to ``data`` with a class other than ``target``.

    Entries at distance exactly 0 from ``data`` are ignored.

    Returns:
        ``(index, distance)`` of the closest qualifying entry, or
        ``(0, inf)`` when no entry qualifies.

    Raises:
        Exception: if ``dataset`` and ``classes`` differ in length.
    """
    total = len(dataset)
    if total != len(classes):
        raise Exception(
            'Lista de classes deve ter o mesmo tamanho da lista de dados.')

    best_index, best_distance = 0, float('inf')
    for position in range(total):
        if classes[position] != target:
            candidate = euclidianDistance(dataset[position], data)
            if candidate != 0 and candidate < best_distance:
                best_index, best_distance = position, candidate
    return (best_index, best_distance)

# nearest neighbour of @data whose class is @target
def withTargetClass(data, dataset, classes, target):
    """Find the entry of ``dataset`` closest to ``data`` whose class equals ``target``.

    Entries at distance exactly 0 from ``data`` are ignored.

    Returns:
        ``(index, distance)`` of the closest qualifying entry, or
        ``(0, inf)`` when no entry qualifies.

    Raises:
        Exception: if ``dataset`` and ``classes`` differ in length.
    """
    total = len(dataset)
    if total != len(classes):
        raise Exception(
            'Lista de classes deve ter o mesmo tamanho da lista de dados.')

    best_index, best_distance = 0, float('inf')
    for position in range(total):
        if classes[position] == target:
            candidate = euclidianDistance(dataset[position], data)
            if candidate != 0 and candidate < best_distance:
                best_index, best_distance = position, candidate
    return (best_index, best_distance)

def nearestNeighbour(data, dataset, classes):
    """Find the entry of ``dataset`` closest to ``data``, whatever its class.

    Entries at distance exactly 0 from ``data`` are ignored.  ``classes`` is
    only used for the length check.

    Returns:
        ``(index, distance)`` of the closest entry, or ``(0, inf)`` when no
        entry qualifies.

    Raises:
        Exception: if ``dataset`` and ``classes`` differ in length.
    """
    total = len(dataset)
    if total != len(classes):
        raise Exception(
            'Lista de classes deve ter o mesmo tamanho da lista de dados.')

    best_index, best_distance = 0, float('inf')
    for position in range(total):
        candidate = euclidianDistance(dataset[position], data)
        if candidate != 0 and candidate < best_distance:
            best_index, best_distance = position, candidate
    return (best_index, best_distance)

def threeNN(data, data_class, dataset, classes):
    """Classify ``data`` with its (up to) three nearest neighbours in ``dataset``.

    Entries at distance exactly 0 from ``data`` are ignored, as in the other
    neighbour helpers of this file.

    Args:
        data: The pattern to classify.
        data_class: The class ``data`` is expected to have.
        dataset: Candidate neighbours.
        classes: Class of each entry of ``dataset`` (same order).

    Returns:
        ``(agrees, avg_distance)`` where ``agrees`` is True when strictly more
        of the selected neighbours have class ``data_class`` than not, and
        ``avg_distance`` is the mean distance to the selected neighbours.
        When no neighbour is available the result is ``(False, inf)``.
    """
    import heapq  # local import: only this helper needs it

    candidates = []
    for position in range(len(dataset)):
        dist = euclidianDistance(data, dataset[position])
        if dist != 0:
            candidates.append((dist, position))

    # Every candidate competes for a slot; ties on distance resolve to the
    # lower position because the tuples are compared element-wise.
    nearest = heapq.nsmallest(3, candidates)
    if not nearest:
        return (False, float('inf'))

    # Vote with the neighbours' classes, not with their positions.
    same = sum(1 for _, position in nearest if classes[position] == data_class)
    dif = len(nearest) - same
    # Average over the neighbours actually found, which may be fewer than 3.
    avg_distance = sum(dist for dist, _ in nearest) / len(nearest)
    return (same > dif, avg_distance)

def euclidianDistance(data1, data2):
    """Return the Euclidean distance between two equally long patterns.

    Raises:
        Exception: if ``data1`` and ``data2`` differ in length.
    """
    size = len(data1)
    if size != len(data2):
        raise Exception(
            'Tentando calcular distância entre dois padrões de tamanhos diferentes.')

    squared_total = 0
    for position in range(size):
        delta = data1[position] - data2[position]
        squared_total += delta * delta
    return math.sqrt(squared_total)

def enn(dataset, classes):
    """Keep the patterns whose class matches that of their listed neighbour.

    A 2-neighbour ball-tree search is run on ``dataset`` against itself and
    each pattern is compared with the entry in column 1 of the result
    (presumably column 0 is the pattern itself — confirm for duplicates).

    Returns:
        ``(subset_data, subset_classes)`` as two parallel lists.

    Raises:
        Exception: if ``dataset`` and ``classes`` differ in length.
    """
    total = len(dataset)
    if total != len(classes):
        raise Exception(
            'Lista de classes deve ter o mesmo tamanho da lista de dados.')

    model = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')
    model.fit(dataset)
    _, indexes = model.kneighbors(dataset)

    subset_data, subset_classes = [], []
    for position in range(total):
        neighbour = indexes[position][1]
        if classes[position] == classes[neighbour]:
            subset_data.append(dataset[position])
            subset_classes.append(classes[position])
    return (subset_data, subset_classes)

def calculateThresholdDistances(dataset, classes):
    """Return, for every pattern, the distance to its nearest other-class pattern.

    The distances come from ``differentClass`` and are listed in dataset
    order.

    Raises:
        Exception: if ``dataset`` and ``classes`` differ in length.
    """
    total = len(dataset)
    if total != len(classes):
        raise Exception(
            'Lista de classes deve ter o mesmo tamanho da lista de dados.')

    return [
        differentClass(dataset[position], dataset, classes, classes[position])[1]
        for position in range(total)
    ]

def subset(dataset, classes, thresholdDistances):
    """Grow a subset of ``dataset`` incrementally, starting from its first pattern.

    Each later pattern is evaluated with ``threeNN`` against the subset built
    so far.  It is added when ``threeNN`` reports agreement, or when the
    returned distance exceeds that pattern's threshold.

    NOTE(review): the pattern is accepted when ``threeNN`` agrees with its
    class; confirm this is the intended direction of the test.

    Returns:
        ``(subset_data, subset_classes)`` as two parallel lists.
    """
    total = len(dataset)
    subset_data = [dataset[0]]
    subset_classes = [classes[0]]

    for position in range(1, total):
        classified, distance = threeNN(
            dataset[position], classes[position], subset_data, subset_classes)
        if classified or distance > thresholdDistances[position]:
            subset_classes.append(classes[position])
            subset_data.append(dataset[position])
    return (subset_data, subset_classes)

def runATISA1(dataset, classes):
    """Run ATISA1: compute per-pattern thresholds, then build the subset.

    The ENN pre-processing step (``enn``) is not applied here.

    Returns:
        ``(subset_data, subset_classes)`` as returned by ``subset``.
    """
    thresholds = calculateThresholdDistances(dataset, classes)
    selected_data, selected_classes = subset(dataset, classes, thresholds)
    return (selected_data, selected_classes)


# TRKNN

In [346]:
def buildChain(data_index, dataset, classes):
    """Build the alternating nearest-neighbour chain that starts at ``data_index``.

    The chain starts with ``(data_index, -1)``.  Even steps look up the
    nearest pattern of a class different from the starting pattern's class
    (``differentClass``); odd steps look up the nearest pattern of that same
    class (``withTargetClass``).  Each lookup starts from the pattern found by
    the previous step.  The chain ends when a lookup returns the entry that
    was added one step before the current pattern.

    Returns:
        List of ``(index, distance)`` tuples.

    Raises:
        Exception: if ``dataset`` and ``classes`` differ in length.
    """
    if len(dataset) != len(classes):
        raise Exception(
            'Lista com classes deve ter o mesmo tamanho da lista de dados.')

    chain = [(data_index, -1)]
    current = dataset[data_index]
    target = classes[data_index]

    step = 0
    while True:
        # Before any link exists, the starting pattern is the reference.
        previous_index = chain[step - 1][0] if step > 0 else data_index

        finder = differentClass if step % 2 == 0 else withTargetClass
        found_index, found_distance = finder(current, dataset, classes, target)
        current = dataset[found_index]

        if found_index == previous_index:
            break
        chain.append((found_index, found_distance))
        step += 1

    return chain

# marks for removal the patterns that can be removed


def mark(data_index, chain, alpha, markedForRemoval):
    """Flag chain entries for removal, in place, in ``markedForRemoval``.

    The chain is scanned at even positions ``p`` that have two successors.
    The pattern at ``p`` is flagged when the distance stored at ``p + 1``
    exceeds ``alpha`` times the distance stored at ``p + 2``.  Patterns that
    are already flagged are skipped.  ``data_index`` is not used by the body.
    """
    for start in range(0, len(chain) - 2, 2):
        head_index = chain[start][0]
        if markedForRemoval[head_index]:
            continue
        if chain[start + 1][1] > alpha * chain[start + 2][1]:
            markedForRemoval[head_index] = True

# returns the subset of data and classes produced by TRKNN


def runTRKNN(dataset, classes, alpha):
    """Run TRKNN: build a chain per pattern, flag removals, return the rest.

    Returns:
        ``(subset_data, subset_classes)`` holding, in dataset order, the
        patterns that were not flagged by ``mark``.

    Raises:
        Exception: if ``dataset`` and ``classes`` differ in length.
    """
    total = len(dataset)
    if total != len(classes):
        raise Exception(
            'Array com classes deve ter o mesmo tamanho do Array de dados.')

    flagged = [False] * total
    for position in range(total):
        chain = buildChain(position, dataset, classes)
        mark(position, chain, alpha, flagged)

    kept = [position for position in range(total) if not flagged[position]]
    subset_data = [dataset[position] for position in kept]
    subset_classes = [classes[position] for position in kept]
    return (subset_data, subset_classes)

# Crossvalidation

In [332]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

def cross_validation(df, labels):
    """Score a 1-nearest-neighbour classifier on ``df``/``labels`` with ``cv=10``.

    Returns the array of scores produced by ``cross_val_score``.
    """
    return cross_val_score(
        KNeighborsClassifier(n_neighbors=1), df, labels, cv=10)

# Test for ATISA

In [367]:
import time

def testATISA1(data, labels):
    """Benchmark ENaN pre-processing followed by ATISA1 on one dataset.

    Returns:
        Series with ``accuracy`` (mean cross-validation score on the selected
        prototypes), ``time`` (seconds spent on selection only) and
        ``reduction`` (fraction of the rows of ``data`` that were removed).
    """
    started = time.time()
    pre_proc_df = run_pre_proc(df=data, labels=labels)
    kept_labels = labels.iloc[pre_proc_df.index.tolist()].values
    prototype_data, prototype_classes = runATISA1(pre_proc_df.values, kept_labels)
    finished = time.time()

    scores = cross_validation(df=prototype_data, labels=prototype_classes)
    removed = len(data) - len(prototype_data)
    return pd.Series({
        "accuracy": scores.mean(),
        "time": finished - started,
        "reduction": removed / len(data)
    })

# Test for TRKNN

In [410]:
def testTRKNN(data, labels):
    """Benchmark ENaN pre-processing followed by TRKNN (alpha = 1.2) on one dataset.

    Returns:
        Series with ``accuracy`` (mean cross-validation score on the selected
        prototypes), ``time`` (seconds spent on selection only) and
        ``reduction`` (fraction of the rows of ``data`` that were removed).
    """
    start = time.time()
    pre_proc_df = run_pre_proc(df=data, labels=labels)
    selected_prototypes = runTRKNN(pre_proc_df.values, labels.iloc[pre_proc_df.index.tolist()].values, 1.2)
    end = time.time()
    accuracy = cross_validation(df=selected_prototypes[0], labels=selected_prototypes[1]).mean()
    # Measure against the dataset actually passed in (not the global `wine`)
    # and report the removed fraction, as testATISA1 does.
    reduction = (len(data) - len(selected_prototypes[0])) / len(data)
    return pd.Series({
        "accuracy": accuracy,
        "time": end - start,
        "reduction": reduction
    })

# Test for ENaN

In [390]:
def testENaN(data, labels):
    """Benchmark the ENaN pre-processing alone on one dataset.

    Returns:
        Series with ``accuracy`` (mean cross-validation score on the rows kept
        by ENaN), ``time`` (seconds spent on selection only) and
        ``reduction`` (fraction of the rows of ``data`` that were removed).
    """
    start = time.time()
    pre_proc_df = run_pre_proc(df=data, labels=labels)
    end = time.time()
    accuracy = cross_validation(df=pre_proc_df, labels=labels.iloc[pre_proc_df.index.tolist()]).mean()
    # The rows kept by ENaN are pre_proc_df itself; no other selection step
    # runs in this benchmark.
    reduction = (len(data) - len(pre_proc_df)) / len(data)
    return pd.Series({
        "accuracy": accuracy,
        "time": end - start,
        "reduction": reduction
    })

# Load datasets

In [412]:
# Benchmark datasets, read from paths relative to the working directory.
# Files read with header=None get integer column names, which the benchmark
# cells below use to pick the label column.
wine = pd.read_csv("ENaN/datasets/wine.txt", header=None)
liver = pd.read_csv("ENaN/datasets/liver.txt", header=None)
glass = pd.read_csv("ENaN/datasets/glass.txt", header=None)
haberman = pd.read_csv("ENaN/datasets/haberman.txt", header=None)
# transfusion is read with the default header handling; the benchmark cells
# address its label column by name.
transfusion = pd.read_csv("ENaN/datasets/transfusion.txt")
hepatitis = pd.read_csv("ENaN/datasets/hepatitis.txt", header=None)
spambase = pd.read_csv("ENaN/datasets/spambase.txt", header=None)

# Test ATISA1

In [None]:
# ATISA1 benchmark: one row per dataset, one column per metric.
# Each result is keyed by its dataset name so that row labels cannot drift
# away from the calls that produced them.  glass is left out of this run.
resultsATISA1 = pd.DataFrame({
    'wine': testATISA1(wine.drop([0], axis='columns'), wine[0]),
    'liver': testATISA1(liver.drop([6], axis='columns'), liver[6]),
    'haberman': testATISA1(haberman.drop([3], axis='columns'), haberman[3]),
    'transfusion': testATISA1(
        transfusion.drop(['whether he/she donated blood in March 2007'], axis='columns'),
        transfusion['whether he/she donated blood in March 2007']),
    # Was computed with testTRKNN, which put a TRKNN result in this table.
    'spambase': testATISA1(spambase.drop([57], axis='columns'), spambase[57]),
}).T
resultsATISA1

In [382]:
# Write the ATISA1 benchmark table to a CSV file in the working directory.
resultsATISA1.to_csv('./MATISA.csv')

# Test TRKNN

In [None]:
# TRKNN benchmark: one row per dataset, one column per metric.
# Each result is keyed by its dataset name so that row labels cannot drift
# away from the calls that produced them.  glass is left out of this run.
resultsTRKNN = pd.DataFrame({
    'wine': testTRKNN(wine.drop([0], axis='columns'), wine[0]),
    'liver': testTRKNN(liver.drop([6], axis='columns'), liver[6]),
    'haberman': testTRKNN(haberman.drop([3], axis='columns'), haberman[3]),
    'transfusion': testTRKNN(
        transfusion.drop(['whether he/she donated blood in March 2007'], axis='columns'),
        transfusion['whether he/she donated blood in March 2007']),
    # Was assigned to resultsATISA1, which dropped this row from the TRKNN
    # table and overwrote the ATISA1 results.
    'spambase': testTRKNN(spambase.drop([57], axis='columns'), spambase[57]),
}).T
resultsTRKNN

In [383]:
# Write the TRKNN benchmark table to a CSV file in the working directory.
resultsTRKNN.to_csv('./MTRKNN.csv')

# Test ENaN

In [None]:
# ENaN benchmark: one row per dataset, one column per metric.
# Each result is keyed by its dataset name.  The previous positional labels
# skipped glass, so every row after liver carried the wrong dataset name.
resultsENaN = pd.DataFrame({
    'wine': testENaN(wine.drop([0], axis='columns'), wine[0]),
    'liver': testENaN(liver.drop([6], axis='columns'), liver[6]),
    'glass': testENaN(glass.drop([10], axis='columns'), glass[10]),
    'haberman': testENaN(haberman.drop([3], axis='columns'), haberman[3]),
    'transfusion': testENaN(
        transfusion.drop(['whether he/she donated blood in March 2007'], axis='columns'),
        transfusion['whether he/she donated blood in March 2007']),
    # These two were assigned to resultsATISA1, which dropped them from the
    # ENaN table and overwrote the ATISA1 results.
    'hepatitis': testENaN(hepatitis.drop([19], axis='columns'), hepatitis[19]),
    'spambase': testENaN(spambase.drop([57], axis='columns'), spambase[57]),
}).T
resultsENaN

In [394]:
# Write the ENaN benchmark table to a CSV file in the working directory.
resultsENaN.to_csv('./ENaN.csv')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.000,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.000,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.000,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.000,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.000,0.135,0.000,0.000,3.537,40,191,1
5,0.00,0.00,0.00,0.0,1.85,0.00,0.00,1.85,0.00,0.00,...,0.000,0.223,0.000,0.000,0.000,0.000,3.000,15,54,1
6,0.00,0.00,0.00,0.0,1.92,0.00,0.00,0.00,0.00,0.64,...,0.000,0.054,0.000,0.164,0.054,0.000,1.671,4,112,1
7,0.00,0.00,0.00,0.0,1.88,0.00,0.00,1.88,0.00,0.00,...,0.000,0.206,0.000,0.000,0.000,0.000,2.450,11,49,1
8,0.15,0.00,0.46,0.0,0.61,0.00,0.30,0.00,0.92,0.76,...,0.000,0.271,0.000,0.181,0.203,0.022,9.744,445,1257,1
9,0.06,0.12,0.77,0.0,0.19,0.32,0.38,0.00,0.06,0.00,...,0.040,0.030,0.000,0.244,0.081,0.000,1.729,43,749,1


In [None]:
# Run the TRKNN benchmark on spambase alone, using column 57 as the labels.
testTRKNN(spambase.drop([57], axis='columns'), spambase[57])