In [1]:
import raha
import json
import numpy
import math
import os
import sklearn
from sklearn.semi_supervised import LabelPropagation
import time
import random
import copy
import matplotlib.pyplot as plt
import statistics

In [None]:
def run_test(dd,lb,cf,mode,opt1,opt2,prop):
    """Run one end-to-end detection experiment and print/return its scores.

    Parameters
    ----------
    dd : dict
        Dataset descriptor handed to ``Detection.initialize_dataset`` and
        ``raha.dataset.Dataset`` (the notebook uses the keys ``name``,
        ``path`` and ``clean_path``).
    lb : int
        Labeling budget. ``0`` keeps the detector's own ``LABELING_BUDGET``.
    cf, mode, opt1, opt2 : str
        Overrides for ``CLASSIFICATION_MODEL``, ``COMPARE_MODE``,
        ``COMPARE_SIMILARITY`` and ``COMPARE_DISTANCE``. An empty string
        keeps the detector's current value.
    prop : str
        Propagation/prediction variant: ``"normal"``, ``"test"`` or ``"test1"``.

    Returns
    -------
    tuple
        ``(d, precision, recall, f1)`` where ``d`` is the dataset object the
        detector worked on.

    Raises
    ------
    ValueError
        If ``prop`` is not one of the supported variants.
    """
    known_props = ("normal", "test", "test1")
    if prop not in known_props:
        # Fail before the expensive strategy run instead of silently skipping
        # propagation and evaluating whatever ``d.detected_cells`` happens to be.
        raise ValueError("unknown propagation variant {!r}; expected one of {}".format(prop, known_props))

    det = raha.detection.Detection()
    d = det.initialize_dataset(dd)
    if lb != 0: det.LABELING_BUDGET = lb
    if cf != "": det.CLASSIFICATION_MODEL = cf
    if mode != "": det.COMPARE_MODE = mode
    if opt1 != "": det.COMPARE_SIMILARITY = opt1
    if opt2 != "": det.COMPARE_DISTANCE = opt2
    det.run_strategies(d)
    det.generate_features(d)
    det.build_clusters(d)
    # Loop against the detector's effective budget: with ``lb == 0`` the old
    # ``< lb`` condition never labeled a single tuple even though the default
    # budget was meant to apply.
    while len(d.labeled_tuples) < det.LABELING_BUDGET:
        det.sample_tuple(d)
        if d.has_ground_truth:
            det.label_with_ground_truth(d)
        # NOTE(review): without ground truth nothing in this loop labels the
        # sampled tuple, so the loop presumably never terminates - confirm.
    if prop == "normal":
        det.propagate_labels(d)
        det.predict_labels(d)
    elif prop == "test":
        det.propagate_weighted_labels7(d)
        det.propagate_weighted_heterogene_labels4(d)
        det.predict_weighted_labels2(d)
    elif prop == "test1":
        det.propagate_weighted_labels8(d)
        det.propagate_weighted_heterogene_labels6(d)
        det.predict_weighted_labels2(d)
    data = raha.dataset.Dataset(dd)
    p, r, f = data.get_data_cleaning_evaluation(d.detected_cells)[:3]
    print("Clusters:",det.COUNT_CLUSTER)
    print("Raha's performance on {}:\nPrecision = {:.2f}\nRecall = {:.2f}\nF1 = {:.2f}".format(data.name, p, r, f))
    return d, p, r, f

In [None]:
# Pairs of integers, presumably (row, column) positions of known errors - verify.
# Not referenced anywhere else in the visible notebook.
error_cells = [(3,1),(3,2),(7,2),(8,1),(10,1),(11,2)]

# Dataset descriptors: display name, dirty CSV path and clean (ground truth) CSV path.
data_dict_1 = dict(name="hospital", path="hospital_dirty.csv", clean_path="hospital_clean.csv")
data_dict_2 = dict(name="test", path="dirty_test.csv", clean_path="clean_test.csv")
data_dict_3 = dict(name="flights", path="flights_dirty.csv", clean_path="flights_clean.csv")
data_dict_4 = dict(name="movies", path="movies_dirty.csv", clean_path="movies_clean.csv")
data_dict_5 = dict(name="rayyan", path="rayyan_dirty.csv", clean_path="rayyan_clean.csv")
data_dict_6 = dict(name="toy", path="toy_dirty.csv", clean_path="toy_clean.csv")

# Single reference run on the hospital dataset; later cells reuse `d_test`.
d_test, _, _, f = run_test(
    data_dict_1, 20, "GBC", "distance", "", "euclidean", "test1"
)

#print(d_test.labeled_cells_j_c)
# One list of F1 scores per compared configuration (filled by the disabled loop below).
stats = [[], [], []]
# for i in range(0):
#     _,p1,r1,f1 = run_test(data_dict_3,20,"GBC","","distance","euclidean","test1")
#     _,p2,r2,f2 = run_test(data_dict_3,20,"GBC","","distance","euclidean","test2")
#     _,p3,r3,f3 = run_test(data_dict_3,20,"GBC","","","","normal")
#     stats[0].append(f1)
#     stats[1].append(f2)
#     stats[2].append(f3)

In [None]:
def evaluate(runs,dd,labeling_budget=20):
    """Compare propagation variants over several independent labeling runs.

    Strategies and features are computed once. Every run deep-copies that
    state, builds clusters, labels ``labeling_budget`` tuples with the ground
    truth, and then evaluates each propagation variant on its own deep copy
    so the variants of one run share the same clusters and labels.

    Parameters
    ----------
    runs : int
        Number of independent runs.
    dd : dict
        Dataset descriptor passed to the detector and to ``raha.dataset.Dataset``.
    labeling_budget : int, optional
        Number of tuples to label per run (default 20, the previously
        hard-coded value).

    Returns
    -------
    tuple
        ``(stats, clusters)``. ``stats`` is a flat list with one
        ``[precision, recall, f1, config]`` entry per evaluated variant, in
        evaluation order; ``config`` is ``"normal"`` or a
        ``(prop, variant, classifier)`` tuple. ``clusters`` collects
        ``COUNT_CLUSTER`` after every ``"test3"`` propagation.
    """
    det = raha.detection.Detection()
    d = det.initialize_dataset(dd)
    det.run_strategies(d)
    det.generate_features(d)

    classifier = ["GBC"]
    prop_config = ["normal","test"]
    prop_test = ["test3","test6"]
    stats = []
    clusters = []

    def _score(dataset_state):
        # Precision, recall and F1 of the detected cells against the clean data.
        data = raha.dataset.Dataset(dd)
        return data.get_data_cleaning_evaluation(dataset_state.detected_cells)[:3]

    for run in range(runs):
        print("Run:",run)
        det_run = copy.deepcopy(det)
        d_run = copy.deepcopy(d)
        det_run.build_clusters(d_run)
        while len(d_run.labeled_tuples) < labeling_budget:
            det_run.sample_tuple(d_run)
            if d_run.has_ground_truth:
                det_run.label_with_ground_truth(d_run)
        for prop in prop_config:
            if prop == "normal":
                print(prop)
                det_variant = copy.deepcopy(det_run)
                d_variant = copy.deepcopy(d_run)
                det_variant.propagate_labels(d_variant)
                det_variant.predict_labels(d_variant)
                # Distinct names: the old ``p, r, f`` unpacking rebound the
                # run counter ``r`` inside its own loop.
                precision, recall, f1 = _score(d_variant)
                stats.append([precision,recall,f1,prop])
            if prop == "test":
                for pr in prop_test:
                    for cl in classifier:
                        print(prop,pr,cl)
                        det_variant = copy.deepcopy(det_run)
                        det_variant.CLASSIFICATION_MODEL = cl
                        d_variant = copy.deepcopy(d_run)
                        if pr == "test0":
                            det_variant.propagate_weighted_labels4(d_variant) # like normal
                        elif pr == "test1":
                            det_variant.propagate_weighted_labels5(d_variant) # distance to nearest labeled cells
                        elif pr == "test2":
                            det_variant.propagate_weighted_labels6(d_variant) # only labeled cells
                        elif pr == "test3":
                            det_variant.propagate_weighted_labels7(d_variant) # only dense cells around labeled cells
                            clusters.append(det_variant.COUNT_CLUSTER)
                        elif pr == "test4":
                            # heterogeneous cells as well, no DBSCAN inside heterogeneous clusters
                            det_variant.LABEL_PROPAGATION_METHOD = "heterogeneity"
                            det_variant.propagate_weighted_labels7(d_variant)
                            det_variant.propagate_weighted_heterogene_labels4(d_variant)
                        elif pr == "test5":
                            # heterogeneous cells as well, DBSCAN inside heterogeneous clusters
                            det_variant.LABEL_PROPAGATION_METHOD = "heterogeneity"
                            det_variant.propagate_weighted_labels7(d_variant)
                            det_variant.propagate_weighted_heterogene_labels5(d_variant)
                        elif pr == "test6":
                            det_variant.propagate_weighted_labels8(d_variant)
                        det_variant.predict_weighted_labels2(d_variant)
                        precision, recall, f1 = _score(d_variant)
                        stats.append([precision,recall,f1,(prop,pr,cl)])
    return stats,clusters
# 20 evaluation runs on the hospital dataset: `st` is the flat list of
# [precision, recall, f1, config] entries, `cc` the cluster counts recorded
# for the "test3" variant.
st,cc = evaluate(20,data_dict_1)
cc

In [None]:
# Plot the F1 score (index 2 of every stats entry) of each configuration over
# all runs, then print the mean F1 per configuration. `st` stores `configs`
# consecutive entries per run, so entry run*configs+config belongs to `config`.
runs = 20
configs = 3
clr = ["y-","r-","b-","g-","y-","y-"]
run_axis = list(range(runs))
plt.figure(1)
for config in range(configs):
    f1_per_run = [st[run*configs+config][2] for run in run_axis]
    plt.plot(run_axis, f1_per_run, clr[config])
plt.show()
for config, line_style in enumerate(clr[:configs]):
    f1_total = sum(st[run*configs+config][2] for run in range(runs))
    print(line_style, st[config][3], f1_total/runs)
# print("y:",st[0][0][3],sum([st[0][i*configs][2] for i in range(runs)]))
# print("r:",st[0][1][3],sum([st[0][i*configs+1][2] for i in range(runs)]))
# print("b:",st[0][2][3],sum([st[0][i*configs+2][2] for i in range(runs)]))
# print("g:",st[0][3][3],sum([st[0][i*configs+3][2] for i in range(runs)]))

In [None]:
# Experiment: for every cluster whose user labels disagree ("heterogene"),
# split it with KMeans into 2, 3, ... sub-clusters until no sub-cluster
# contains conflicting labels, then use DBSCAN inside the labeled sub-clusters
# to hand weighted labels to unlabeled cells. Works on a deep copy of `d_test`.
# NOTE(review): `get_distance` is defined in a later cell of this notebook, so
# this cell fails on a fresh top-to-bottom run.
# NOTE(review): `d.labeled_cells_j_c[j][c]` is read as [labeled, unlabeled]
# dicts whose values are indexed with [1] for the label - presumably
# (weight, label) tuples written by the detector; confirm.
d = copy.deepcopy(d_test)
k = len(d.labeled_tuples) + 2 - 1
for j in range(d.dataframe.shape[1]):
    for c in d.clusters_k_j_c_ce[k][j]:
        # Only clusters that have labels and whose labels are mixed (neither all 0 nor all 1).
        if (sum(d.labels_per_cluster[(j, c)].values()) not in [0, len(d.labels_per_cluster[(j, c)])]) and len(d.labels_per_cluster[(j, c)]) != 0:
            # Feature vectors and cell coordinates of the cluster, both in row order.
            x = [d.column_features[j][i] for i in range(d.dataframe.shape[0]) if (i,j) in d.clusters_k_j_c_ce[k][j][c]]
            cells = [(i,j) for i in range(d.dataframe.shape[0]) if (i,j) in d.clusters_k_j_c_ce[k][j][c]]
            # Try 2..21 KMeans sub-clusters; the `break` at the bottom stops at the first clean split.
            for sub_cluster in range(20):
                km = sklearn.cluster.KMeans(n_clusters=(sub_cluster+2)).fit(x)
                sub_clusters = {sub_c: [] for sub_c in range(sub_cluster+2)}
                # NOTE(review): `labeled_sub_clusters` is never used below.
                labeled_sub_clusters = []
                for i,cell in enumerate(cells):
                    #print(cell,km.labels_[i])
                    sub_clusters[km.labels_[i]].append(cell)
                # The split is "homogene" when no sub-cluster holds labeled cells with different labels.
                homogene = True
                for sub_c in sub_clusters:
                    sub_labeled = {}
                    for cell in sub_clusters[sub_c]:
                        if cell in d.labeled_cells_j_c[j][c][0]: sub_labeled[cell] = d.labeled_cells_j_c[j][c][0][cell][1]
                    if len(sub_labeled) != 0 and sum(sub_labeled.values()) not in [0,len(sub_labeled.values())]: homogene = False
                if homogene:
                    # (sub-cluster id, label) for every sub-cluster that contains a labeled cell.
                    sub_cluster_number = set()
                    for cell in d.labeled_cells_j_c[j][c][0]:
                        for sub_c in sub_clusters:
                            if cell in sub_clusters[sub_c]:sub_cluster_number.add((sub_c,d.labeled_cells_j_c[j][c][0][cell][1]))
                    print(sub_cluster_number)

                    for n in sub_cluster_number:
                        sub_c = n[0]
                        # Features / cells restricted to this labeled sub-cluster.
                        x_tmp = [x[i] for i,cell in enumerate(cells) if cell in sub_clusters[sub_c]]
                        cells_tmp = [cell for cell in cells if cell in sub_clusters[sub_c]]
                        if len(x_tmp) > 1:
                            # NOTE(review): the neighbour search is fitted on the whole cluster `x`,
                            # while DBSCAN below runs on `x_tmp`; the `len(x_tmp) > 1` guard suggests
                            # `x_tmp` was intended here - confirm.
                            nbrs = sklearn.neighbors.NearestNeighbors(n_neighbors=2).fit(x)
                            distances, indices = nbrs.kneighbors(x)
                            # Column 1 after sorting each column: ascending distances to the second
                            # returned neighbour (presumably the nearest other point).
                            distances = numpy.sort(distances, axis=0)[:,1]
                            # Choose eps as the first distance whose increase over its predecessor
                            # is more than 1% larger than the previous increase.
                            eps = distances[0]
                            v1 = 0
                            for x1 in range(len(distances)-1):
                                v2 = distances[x1+1]-distances[x1]
                                if 100*v2 > 101*v1:
                                    eps = distances[x1+1]
                                    break
                                v1 = v2
                            label = n[1]
                            if eps == 0:
                                # No usable radius: every unlabeled cell of the sub-cluster gets full weight.
                                for cell in sub_clusters[sub_c]:
                                    if cell in d.labeled_cells_j_c[j][c][1]: d.labeled_cells_j_c[j][c][1][cell] = (1.0,label)
                            else:
                                db = sklearn.cluster.DBSCAN(eps=eps).fit(x_tmp)
                                # DBSCAN cluster ids (noise excluded) that contain at least one labeled cell.
                                db_cluster = []
                                # NOTE(review): `closest_dist` is never used.
                                closest_dist = {}
                                for i,cell in enumerate(cells_tmp):
                                    if cell in d.labeled_cells_j_c[j][c][0] and db.labels_[i] != -1:
                                        db_cluster.append(db.labels_[i])
                                for i,cell in enumerate(cells_tmp):
                                    if db.labels_[i] in db_cluster and cell in d.labeled_cells_j_c[j][c][1]:
                                        x_cell = x[cells.index(cell)]
                                        # Weight is 1 minus the LARGEST distance to any labeled cell of the
                                        # whole cluster `c` (not only of this sub-cluster).
                                        max_dist = 0
                                        for cell2 in d.labeled_cells_j_c[j][c][0]:
                                            x_cell2 = x[cells.index(cell2)]
                                            dist = get_distance(x_cell,x_cell2,"euclidean")
                                            if dist > max_dist: max_dist = dist
                                        d.labeled_cells_j_c[j][c][1][cell] = (1-max_dist,label)
                                print(d.labeled_cells_j_c[j][c])
                    # First homogeneous split found: stop increasing the number of sub-clusters.
                    break


In [50]:
def get_similarity(vec1,vec2,config):
    """Return a binary similarity coefficient between two 0/1 vectors.

    The vectors are compared position by position over ``len(vec1)`` entries
    and summarised in a 2x2 contingency table: ``a`` (1/1), ``b`` (1/0),
    ``c`` (0/1) and ``d`` (0/0). Positions holding other values are ignored.

    Parameters
    ----------
    vec1, vec2 : sequence
        Binary feature vectors; ``vec2`` must be at least as long as ``vec1``.
    config : str
        One of ``"matching"``, ``"jaccard"``, ``"dice"``, ``"antidice"``,
        ``"sneath"`` or ``"rogers"``.

    Returns
    -------
    float
        The coefficient in ``[0, 1]``. When the coefficient's denominator is
        zero (no counted position disagrees, e.g. two all-zero vectors for
        Jaccard) the vectors are treated as identical and ``1.0`` is returned
        instead of raising ``ZeroDivisionError``.

    Raises
    ------
    ValueError
        If ``config`` is not a supported coefficient name.
    """
    a = b = c = d = 0
    for i in range(len(vec1)):
        first, second = vec1[i], vec2[i]
        if first == 1 and second == 1: a += 1
        elif first == 1 and second == 0: b += 1
        elif first == 0 and second == 1: c += 1
        elif first == 0 and second == 0: d += 1

    if config == "matching":
        numerator, denominator = a + d, a + b + c + d
    elif config == "jaccard":
        numerator, denominator = a, a + b + c
    elif config == "dice":
        numerator, denominator = 2*a, 2*a + b + c
    elif config == "antidice":
        numerator, denominator = a, a + 2*b + 2*c
    elif config == "sneath":
        numerator, denominator = 2*a + 2*d, 2*a + b + c + 2*d
    elif config == "rogers":
        numerator, denominator = a + d, a + 2*b + 2*c + d
    else:
        raise ValueError("unknown similarity coefficient {!r}".format(config))

    if denominator == 0:
        # Every term of the denominator is zero, so nothing disagrees.
        return 1.0
    return numerator / denominator

def get_distance(vec1,vec2,config):
    """Return a distance (or, for two modes, a similarity) between two vectors.

    Parameters
    ----------
    vec1, vec2 : array-like
        Numeric vectors of equal length.
    config : str
        * ``"euclidean"`` - L2 norm of the difference divided by
          ``sqrt(len(vec1))``, i.e. scaled to ``[0, 1]`` for 0/1 vectors.
        * ``"hamming"`` - L1 norm of the difference divided by ``len(vec1)``.
        * ``"normal"`` - plain (unscaled) L2 norm of the difference.
        * ``"cosine"`` - cosine *similarity*; ``0`` if either vector has zero
          length.
        * ``"dot"`` - dot product divided by ``len(vec1)`` (a similarity).

    Returns
    -------
    float
        The requested measure.

    Raises
    ------
    ValueError
        If ``config`` is not a supported mode.
    """
    if config == "euclidean":
        # norm(ones(n)) == sqrt(n): the largest possible distance between 0/1 vectors.
        return numpy.linalg.norm(numpy.subtract(vec1,vec2),ord=2) / numpy.sqrt(len(vec1))
    if config == "hamming":
        return numpy.linalg.norm(numpy.subtract(vec1,vec2),ord=1) / len(vec1)
    if config == "normal":
        return numpy.linalg.norm(numpy.subtract(vec1,vec2),ord=2)
    if config == "cosine":
        norm1 = numpy.linalg.norm(vec1)
        norm2 = numpy.linalg.norm(vec2)
        # Test the norms, not the element sums: a vector such as [1, -1] sums
        # to zero but is not the zero vector.
        if norm1 == 0 or norm2 == 0:
            return 0
        return numpy.dot(vec1,vec2)/(norm1*norm2)
    if config == "dot":
        return numpy.dot(vec1,vec2)/len(vec1)
    raise ValueError("unknown distance mode {!r}".format(config))

In [55]:
# Sanity check: "cosine" returns a similarity, so identical vectors give ~1
# (0.9999999999999998 in the recorded output because of floating-point rounding).
get_distance([1,1],[1,1],"cosine")

0.9999999999999998

In [None]:
def propagate_weigthed_labels2(d):
    """Split every cluster's cells into labeled/unlabeled and weight the latter.

    For each column and cluster a two-element list is built:
    index 0 maps labeled cells to their label (``d.labeled_cells[cell][0]``),
    index 1 maps the remaining cells to a ``(weight, label)`` tuple that
    starts as ``(1, 1)``. In clusters whose labels all agree, a
    ``LabelPropagation`` model is fitted on the cluster's feature vectors and
    each unlabeled cell receives ``(highest class probability, propagated label)``.

    Returns the nested dict ``{column: {cluster: [labeled, unlabeled]}}``.
    """
    row_count, column_count = d.dataframe.shape[0], d.dataframe.shape[1]
    k = len(d.labeled_tuples) + 1
    result = {
        column: {cluster_id: [{}, {}] for cluster_id in range(k)}
        for column in range(column_count)
    }
    for column in range(column_count):
        column_clusters = d.clusters_k_j_c_ce[k][column]
        for cluster_id in column_clusters:
            members = column_clusters[cluster_id]
            known, unknown = result[column][cluster_id]
            for cell in members:
                if cell in d.labeled_cells:
                    known[cell] = d.labeled_cells[cell][0]
                else:
                    # (weight, label) placeholder
                    unknown[cell] = (1, 1)

            # Only clusters with at least one label and no disagreement are propagated.
            if len(known) == 0 or sum(known.values()) not in [0, len(known)]:
                continue

            member_rows = [row for row in range(row_count) if (row, column) in members]
            member_cells = [(row, column) for row in member_rows]
            member_features = [d.column_features[column][row] for row in member_rows]
            # -1 marks a sample as unlabeled for LabelPropagation.
            targets = [known[cell] if cell in known else -1 for cell in member_cells]

            model = LabelPropagation().fit(member_features, targets)
            for position, cell in enumerate(member_cells):
                if cell in unknown:
                    confidence = max(model.label_distributions_[position])
                    unknown[cell] = (confidence, model.transduction_[position])
    return result
# Labeled values in this `lc` are plain labels; unlabeled values are (weight, label) tuples.
lc = propagate_weigthed_labels2(d_test)   

In [None]:
def propagate(d):
    """Propagate labels inside clusters whose user labels all agree.

    Builds ``{column: {cluster: [labeled, unlabeled]}}`` where both dicts map
    cells to ``(weight, label)``. Labeled cells get weight 1 and their own
    label; unlabeled cells start as ``(1, 1)`` and, when
    ``d.labels_per_cluster`` of their cluster is non-empty and unanimous, are
    overwritten with ``(1, <that label>)``. Prints ``d.labeled_tuples``.
    """
    k = len(d.labeled_tuples) + 1
    column_count = d.dataframe.shape[1]
    result = {
        column: {cluster_id: [{}, {}] for cluster_id in range(k)}
        for column in range(column_count)
    }
    print(d.labeled_tuples)
    for column in range(column_count):
        column_clusters = d.clusters_k_j_c_ce[k][column]
        for cluster_id in column_clusters:
            known, unknown = result[column][cluster_id]
            for cell in column_clusters[cluster_id]:
                if cell in d.labeled_cells:
                    known[cell] = (1, d.labeled_cells[cell][0])
                else:
                    # (weight, label) placeholder so weights and y_train line up later
                    unknown[cell] = (1, 1)

            votes = d.labels_per_cluster[(column, cluster_id)]
            unanimous = len(votes) != 0 and sum(votes.values()) in [0, len(votes)]
            if unanimous:
                shared_label = next(iter(votes.values()))
                for cell in list(unknown):
                    unknown[cell] = (1, shared_label)
    return result

In [None]:
# Rebuild `lc` with the cluster-level propagation; here both the labeled and
# the unlabeled entries are (weight, label) tuples.
lc = propagate(d_test)
# One empty list per column; overwritten by later cells.
lsc = {j: [] for j in range(d_test.dataframe.shape[1])}


In [None]:
def propagate_heterogene_3(d,labeled_cells_j_c):
    """Propagate labels inside clusters whose user labels disagree.

    For every cluster with mixed labels a ``LabelPropagation`` model is
    fitted on the cluster's feature vectors. Each unlabeled cell receives
    ``(confidence * 0.9 ** distance, propagated label)`` where ``confidence``
    is the highest class probability and ``distance`` the unscaled Euclidean
    distance to the closest labeled cell of the cluster.

    Parameters
    ----------
    d : dataset object
        Provides ``labeled_tuples``, ``dataframe``, ``clusters_k_j_c_ce``
        and ``column_features``.
    labeled_cells_j_c : dict
        ``{column: {cluster: [labeled, unlabeled]}}``. Labeled values may be
        plain labels (as produced by ``propagate_weigthed_labels2``) or
        ``(weight, label)`` tuples (as produced by ``propagate``).

    Returns
    -------
    dict
        The same ``labeled_cells_j_c`` object, updated in place.
    """
    def _label(value):
        # Both producers in this notebook are accepted: bare label or (weight, label).
        return value[1] if isinstance(value, (tuple, list)) else value

    k = len(d.labeled_tuples) + 2 - 1
    row_count = d.dataframe.shape[0]
    for j in range(d.dataframe.shape[1]):
        for c in labeled_cells_j_c[j]:
            labeled, unlabeled = labeled_cells_j_c[j][c]
            labels = {cell: _label(value) for cell, value in labeled.items()}
            # Skip unlabeled clusters and clusters whose labels all agree.
            if sum(labels.values()) in [0, len(labels)]:
                continue
            print(labeled)

            cluster = d.clusters_k_j_c_ce[k][j][c]
            cells = [(i, j) for i in range(row_count) if (i, j) in cluster]
            features = [d.column_features[j][i] for i, _ in cells]
            # Build the targets from `cells` so they always align with
            # `features`; -1 marks a sample as unlabeled for LabelPropagation.
            targets = [labels.get(cell, -1) for cell in cells]

            anchors = [feature for cell, feature in zip(cells, features) if cell in labels]
            closest_labeled_point = []
            for feature in features:
                # Distance between the all-zero and all-one vector is the
                # starting upper bound used for the minimum search.
                upper_bound = get_distance(numpy.zeros(len(feature)),numpy.ones(len(feature)),"normal")
                candidates = [get_distance(feature,anchor,"normal") for anchor in anchors]
                closest_labeled_point.append(min([upper_bound] + candidates))

            label_prop = LabelPropagation().fit(features,targets)
            for i, cell in enumerate(cells):
                if cell in unlabeled:
                    confidence = max(label_prop.label_distributions_[i])
                    unlabeled[cell] = (confidence*(0.9**closest_labeled_point[i]),label_prop.transduction_[i])
    return labeled_cells_j_c
# The function updates and returns the dict it was given, so `lc` is modified in place.
lc = propagate_heterogene_3(d_test,lc)

In [None]:
def propagate_heterogene_2(d,labeled_cells_j_c,mode="similarity",similarity="antidice",distance="euclidean"):
    """Label unlabeled cells of mixed-label clusters from their best labeled match.

    In every cluster whose user labels disagree, each unlabeled cell copies
    the label of the most similar (``mode="similarity"``) or closest
    (``mode="distance"``) labeled cell of the same cluster and is weighted
    with that similarity, respectively with ``1 - distance``.

    Parameters
    ----------
    d : dataset object
        Provides ``dataframe`` and ``column_features``.
    labeled_cells_j_c : dict
        ``{column: {cluster: [labeled, unlabeled]}}``. Labeled values may be
        plain labels or ``(weight, label)`` tuples.
    mode : str, optional
        ``"similarity"`` (default, the previously hard-coded choice) or
        ``"distance"``.
    similarity : str, optional
        Coefficient name passed to ``get_similarity`` (default ``"antidice"``).
    distance : str, optional
        Mode passed to ``get_distance`` (default ``"euclidean"``).

    Returns
    -------
    dict
        The same ``labeled_cells_j_c`` object, updated in place.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``"similarity"`` nor ``"distance"``.
    """
    if mode not in ("similarity", "distance"):
        raise ValueError("unknown mode {!r}; expected 'similarity' or 'distance'".format(mode))

    def _label(value):
        # Accept a bare label as well as a (weight, label) tuple.
        return value[1] if isinstance(value, (tuple, list)) else value

    for j in range(d.dataframe.shape[1]):
        features = None
        for c in labeled_cells_j_c[j]:
            labeled, unlabeled = labeled_cells_j_c[j][c]
            labels = {cell: _label(value) for cell, value in labeled.items()}
            # Skip unlabeled clusters and clusters whose labels all agree.
            if sum(labels.values()) in [0, len(labels)]:
                continue
            print(labeled)
            if features is None:
                features = d.column_features[j]
            for cell1 in unlabeled:
                if mode == "distance":
                    # (distance, label); a labeled cell only wins when it is closer than 1.
                    best = (1, 1)
                    for cell2, label in labels.items():
                        dist = get_distance(features[cell1[0]],features[cell2[0]],distance)
                        if best[0] > dist:
                            best = (dist, label)
                    unlabeled[cell1] = (1-best[0], best[1])
                else:
                    # (similarity, label); a labeled cell only wins when similarity is above 0.
                    best = (0, 1)
                    for cell2, label in labels.items():
                        sim = get_similarity(features[cell1[0]],features[cell2[0]],similarity)
                        if best[0] < sim:
                            best = (sim, label)
                    unlabeled[cell1] = best
    return labeled_cells_j_c
# The function updates and returns the dict it was given, so `lc` is modified in place.
lc = propagate_heterogene_2(d_test,lc)
# Reset `lsc` to one empty list per column.
lsc = {j: [] for j in range(d_test.dataframe.shape[1])}

In [None]:
_WEIGHTED_CLASSIFIERS = ("GNB", "LIR", "LGR", "RFR", "ABC", "GBC", "SGDC")


def _build_weighted_model(classifier):
    """Instantiate the scikit-learn estimator behind a classifier code."""
    # Explicit submodule imports: a bare `import sklearn` does not guarantee
    # that these submodules are loaded.
    import sklearn.ensemble
    import sklearn.linear_model
    import sklearn.naive_bayes
    if classifier == "GNB":
        return sklearn.naive_bayes.GaussianNB()
    if classifier == "LIR":
        return sklearn.linear_model.LinearRegression()
    if classifier == "LGR":
        return sklearn.linear_model.LogisticRegression()
    if classifier == "RFR":
        return sklearn.ensemble.RandomForestRegressor()
    if classifier == "ABC":
        return sklearn.ensemble.AdaBoostClassifier(n_estimators=50)
    if classifier == "GBC":
        return sklearn.ensemble.GradientBoostingClassifier()
    return sklearn.linear_model.SGDClassifier(loss="hinge", penalty="l2")


def predict_weighted_features(d,labeled_cells_j_c,classifier="GBC",heterogene=False):
    """Train one sample-weighted model per column and collect the detected cells.

    The training set of a column consists of all labeled cells plus the
    unlabeled cells of clusters that contain a labeled cell and (unless
    ``heterogene`` is true) whose ``d.labels_per_cluster`` entry is
    unanimous. The first tuple element is used as sample weight, the second
    as target.

    Parameters
    ----------
    d : dataset object
        Provides ``dataframe``, ``column_features``, ``labeled_tuples``,
        ``labeled_cells`` and ``labels_per_cluster``.
    labeled_cells_j_c : dict
        ``{column: {cluster: [labeled, unlabeled]}}`` with ``(weight, label)``
        values; a bare label is treated as ``(1, label)``.
    classifier : str, optional
        One of ``"GNB"``, ``"LIR"``, ``"LGR"``, ``"RFR"``, ``"ABC"``,
        ``"GBC"`` (default, the previously hard-coded choice) or ``"SGDC"``.
    heterogene : bool, optional
        When true, unlabeled cells of mixed-label clusters are used for
        training as well (default ``False``, as before).

    Returns
    -------
    dict
        ``{(row, column): "Just a dummy value"}`` for every detected cell:
        rows in ``d.labeled_tuples`` use their user label, all other rows
        the model prediction.

    Raises
    ------
    ValueError
        If ``classifier`` is not a supported code.
    """
    if classifier not in _WEIGHTED_CLASSIFIERS:
        raise ValueError("unknown classifier {!r}; expected one of {}".format(classifier, _WEIGHTED_CLASSIFIERS))

    def _weighted(value):
        # Accept a bare label as well as a (weight, label) tuple.
        return tuple(value[:2]) if isinstance(value, (tuple, list)) else (1, value)

    row_count, column_count = d.dataframe.shape[0], d.dataframe.shape[1]
    extended_labeled_cells = {}
    for j in range(column_count):
        for c in labeled_cells_j_c[j]:
            labeled, unlabeled = labeled_cells_j_c[j][c]
            if len(labeled) == 0:
                continue
            extended_labeled_cells.update({cell: _weighted(v) for cell, v in labeled.items()})
            votes = None if heterogene else d.labels_per_cluster[(j, c)]
            if heterogene or sum(votes.values()) in [0, len(votes)]:
                extended_labeled_cells.update({cell: _weighted(v) for cell, v in unlabeled.items()})

    detected_cells_dictionary = {}
    for j in range(column_count):
        feature_vectors = d.column_features[j]
        training_rows = [i for i in range(row_count) if (i, j) in extended_labeled_cells]
        x_train = [feature_vectors[i] for i in training_rows]
        y_train = [extended_labeled_cells[(i, j)][1] for i in training_rows]
        weights = [extended_labeled_cells[(i, j)][0] for i in training_rows]
        print("Len x:",len(x_train))
        if sum(y_train) == len(y_train):
            # All targets are 1 (also reached with an empty training set).
            y_pred = numpy.ones(row_count)
        elif sum(y_train) == 0 or len(x_train[0]) == 0:
            y_pred = numpy.zeros(row_count)
        else:
            model = _build_weighted_model(classifier)
            model.fit(x_train,y_train,sample_weight=weights)
            y_pred = model.predict(feature_vectors)
        for i, y in enumerate(y_pred):
            if (i in d.labeled_tuples and d.labeled_cells[(i,j)][0]) or (i not in d.labeled_tuples and y):
                detected_cells_dictionary[(i,j)] = "Just a dummy value"
    return detected_cells_dictionary
# `lc` layout per column: {cluster_i: [labeled cells in cluster_i, remaining cells in cluster_i]}
detected_cells = predict_weighted_features(d_test,lc)
# Evaluate against the dataset `d_test` was built from (data_dict_1); the
# previous data_dict_3 compared hospital detections with the flights ground truth.
data_test = raha.dataset.Dataset(data_dict_1)
p,r,f = data_test.get_data_cleaning_evaluation(detected_cells)[:3]
print(p,r,f)

In [None]:
def propagate_heterogene(d,labeled_cells_j_c,mode="distance",discount=0.8,max_sub_clusters=20):
    """Propagate labels in mixed-label clusters via homogeneous KMeans sub-clusters.

    A cluster whose user labels disagree is split with KMeans into
    2, 3, ... sub-clusters until no sub-cluster contains conflicting labels.
    In that split every unlabeled cell that shares a sub-cluster with labeled
    cells takes their label, weighted by its closeness to the nearest
    (``mode="distance"``) or most similar (``mode="similarity"``) labeled
    cell and multiplied by ``discount``. If no such split is found the
    cluster is left unchanged.

    Parameters
    ----------
    d : dataset object
        Provides ``labeled_tuples``, ``dataframe``, ``clusters_k_j_c_ce``
        and ``column_features``.
    labeled_cells_j_c : dict
        ``{column: {cluster: [labeled, unlabeled]}}``. Labeled values may be
        plain labels or ``(weight, label)`` tuples.
    mode : str, optional
        ``"distance"`` (default, the previously hard-coded choice) or
        ``"similarity"``.
    discount : float, optional
        Factor applied to every propagated weight (default 0.8).
    max_sub_clusters : int, optional
        Sub-cluster counts ``2 .. max_sub_clusters - 1`` are tried
        (default 20), never more than the number of cells in the cluster.

    Returns
    -------
    dict
        The same ``labeled_cells_j_c`` object, updated in place.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``"distance"`` nor ``"similarity"``.
    """
    if mode not in ("distance", "similarity"):
        raise ValueError("unknown mode {!r}; expected 'distance' or 'similarity'".format(mode))

    def _label(value):
        # Accept a bare label as well as a (weight, label) tuple.
        return value[1] if isinstance(value, (tuple, list)) else value

    def _agree(values):
        # True for an empty list or when all 0/1 labels are equal.
        return sum(values) in [0, len(values)]

    k = len(d.labeled_tuples) + 2 - 1
    row_count = d.dataframe.shape[0]
    for j in range(d.dataframe.shape[1]):
        for c in labeled_cells_j_c[j]:
            labeled, unlabeled = labeled_cells_j_c[j][c]
            labels = {cell: _label(value) for cell, value in labeled.items()}
            if _agree(list(labels.values())):
                continue  # nothing labeled, or labels already homogeneous

            # Imported here so that the module is only needed when a split is attempted.
            import sklearn.cluster
            features = d.column_features[j]
            cluster = d.clusters_k_j_c_ce[k][j][c]
            cells = [(i, j) for i in range(row_count) if (i, j) in cluster]
            train = [features[i] for i, _ in cells]

            # KMeans cannot produce more clusters than there are samples.
            upper = min(max_sub_clusters, len(cells) + 1)
            for n_clusters in range(2, upper):
                print("Iteration",n_clusters-2)
                # NOTE(review): no random_state is set, so splits differ between runs.
                km = sklearn.cluster.KMeans(n_clusters=n_clusters).fit(train)
                anchors_by_sub = {sub_c: [] for sub_c in range(n_clusters)}
                targets_by_sub = {sub_c: [] for sub_c in range(n_clusters)}
                for cell, sub_c in zip(cells, km.labels_):
                    if cell in labels:
                        anchors_by_sub[sub_c].append(cell)
                    else:
                        targets_by_sub[sub_c].append(cell)

                if not all(_agree([labels[cell] for cell in anchors]) for anchors in anchors_by_sub.values()):
                    continue

                for sub_c, anchors in anchors_by_sub.items():
                    if not anchors:
                        continue
                    # All anchors of a homogeneous sub-cluster carry the same label.
                    label = labels[anchors[0]]
                    for cell in targets_by_sub[sub_c]:
                        if mode == "distance":
                            # Use the smallest distance (capped at 1), not the last one computed.
                            nearest = min([1] + [get_distance(features[cell[0]],features[a[0]],"euclidean") for a in anchors])
                            closeness = 1 - nearest
                        else:
                            # Use the largest similarity, not the last one computed.
                            closeness = max([0] + [get_similarity(features[cell[0]],features[a[0]],"matching") for a in anchors])
                        unlabeled[cell] = (closeness*discount, label)
                break
    return labeled_cells_j_c
# The function updates and returns the dict it was given, so `lsc` and `lc`
# refer to the same object afterwards.
lsc = propagate_heterogene(d_test,lc)
lsc

In [None]:
def cluster_test(dd,lb):
    """Count labeled clusters with unanimous versus conflicting labels.

    Runs strategies, feature generation and clustering on dataset ``dd``,
    labels tuples with the ground truth until ``lb`` tuples are labeled, and
    then inspects ``labels_per_cluster`` of every cluster at the final
    clustering level. Clusters without any label are ignored.

    Returns ``(hom, het)``: the number of clusters whose labels all agree and
    the number whose labels are mixed.
    """
    detector = raha.detection.Detection()
    dataset = detector.initialize_dataset(dd)
    detector.run_strategies(dataset)
    detector.generate_features(dataset)
    detector.build_clusters(dataset)
    while len(dataset.labeled_tuples) < lb:
        detector.sample_tuple(dataset)
        if dataset.has_ground_truth:
            detector.label_with_ground_truth(dataset)

    level = len(dataset.labeled_tuples) + 1
    hom = 0
    het = 0
    for column in range(dataset.dataframe.shape[1]):
        for cluster_id in dataset.clusters_k_j_c_ce[level][column]:
            votes = dataset.labels_per_cluster[(column, cluster_id)].values()
            if len(votes) == 0:
                continue
            if sum(votes) in [0, len(votes)]:
                hom += 1
            else:
                het += 1
    return hom, het

In [None]:
# Scratch check: `+=` on tuples concatenates, so `t` becomes (0, 0, 1, 1)
# rather than the element-wise sum (1, 1).
t = (0,0)
t += (1,1)
t