In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import collections
import itertools
import math
import random
import warnings
warnings.filterwarnings('ignore')
import time

import sys
sys.path.append('./')
import holoclean
from detect import NullDetector, ViolationDetector
from repair.featurize import *

import numpy as np
import math

def random_combination(iterable, r):
    "Random selection from itertools.combinations(iterable, r)"
    pool = tuple(iterable)
    n = len(pool)
    indices = sorted(random.Random(time.time()).sample(range(n), r))
    return tuple(pool[i] for i in indices)

# Holoclean Functions
### #TODO: Create an abstract "CleaningAlgorithm" class and make holoclean and naiveAlgo objects inherit from them

In [2]:
def naive2(df, constraints):
    df_c = df.copy()           
    if 3 in constraints:
        for i, r in df_c.iterrows():
            if r.City != "NULL" and r.County != "NULL":
                errors = df_c[(df_c.City == r.City) & (df_c.County != r.County) & (df_c.County != "NULL")]
                if errors.shape[0] > 0:
                    c = collections.Counter(df_c[df_c.City == r.City].County)
                    if "NULL" in c:
                        c.pop("NULL")
                    if len(c) > 0:
                        if c.most_common(1)[0][1] > 1:
                            df_c["County"][i] = c.most_common(1)[0][0]
                        

    return df_c

In [3]:
##### HoloClean #####

def create_constraints_file(constraints_path, relevant_attr):
    '''
    For Holoclean every contraint should only include real columns from the table. 
    So there is a need to create a custom constraints file for every table.
    '''
    with open(constraints_path) as fr:
        attributes_to_keep = set()
        constraints_to_keep = set()
        for line in fr:
            for attr in relevant_attr:
                attributes_to_keep.add(attr)
                if attr in line:
                    constraints_to_keep.add(line)
                    for item in line.split("t2.")[1:]:
                        if "IQ" in item and attr in item:
                            attributes_to_keep.add(item.split(")")[0])
    with open('./temp_constraints.txt', "w+") as fw:
        for c in constraints_to_keep:
            if "\n" in c:
                fw.write(c)
            else:
                fw.write(c + "\n")
    return attributes_to_keep


def holoclean_init(data_df):
    '''
    #TODO: Find optimal parameters. Maybe function of something?
    '''
    hc = holoclean.HoloClean(
        db_name='holo',
        domain_thresh_1=0.1,
        domain_thresh_2=0,
        weak_label_thresh=0.5,
        max_domain=10000,
        cor_strength=0.05,
        nb_cor_strength=0.3,
        epochs=20,
        weight_decay=0.01,
        learning_rate=0.05,
        threads=1,
        batch_size=1,
        verbose=False,
        timeout=1*60000,
        feature_norm=False,
        weight_norm=False,
        print_fw=False
    ).session

    hc.load_data('Name', data_df)
    hc.load_dcs('./temp_constraints.txt')
    hc.ds.set_constraints(hc.get_dcs())
    hc.setup_domain(list(data_df.columns))
    return hc

In [4]:
def partition (list_in, n):
    random.Random(time.time()).shuffle(list_in)
    n = int(n)
    return [list_in[i::n] for i in range(n)]

def holoclean_detect(hc):
    detectors = [NullDetector(), ViolationDetector()]
    featurizers = [
        InitAttrFeaturizer(),
        OccurAttrFeaturizer(),
        FreqFeaturizer(),
        ConstraintFeaturizer(),
    ]
    
    hc.detect_errors(detectors)
    hc.repair_errors(featurizers)

    return hc

def run_holoclean(df, constraints_path, columns):
    relevant_attributes = create_constraints_file(constraints_path, columns)
    print(relevant_attributes)
    df_in = df.copy()
    df_in = df_in[relevant_attributes]
    hc = holoclean_init(df_in)
    hc = holoclean_detect(hc)
    return hc.ds.repaired_data.df    # This is only for holoclean


In [47]:
def naive(df, constraints):
    df_c = df.copy()
    if 1 in constraints:
        for i, r in df_c.iterrows():
            if r.Team != "NULL" and r.City != "NULL":
                errors = df_c[(df_c.Team == r.Team) & (df_c.City != r.City) & (df_c.City != "NULL")]
                if errors.shape[0] > 0:
                    c = collections.Counter(df_c[df_c.Team == r.Team].City)
                    if "NULL" in c:
                        c.pop("NULL")
                    if len(c) > 0:
                        if c.most_common(1)[0][1] > 1:
                            df_c["City"][i] = c.most_common(1)[0][0] 

    if 2 in constraints:
        for i, r in df_c.iterrows():
            if r.League != "NULL" and r.Country != "NULL":
                errors = df_c[(df_c.League == r.League) & (df_c.Country != r.Country) & (df_c.Country != "NULL")]
                if errors.shape[0] > 0:
                    c = collections.Counter(df_c[df_c.League == r.League].Country)
                    if "NULL" in c:
                        c.pop("NULL")
                    if len(c) > 0:
                        if c.most_common(1)[0][1] > 1:
                            df_c["Country"][i] = c.most_common(1)[0][0]
                                   
                        
    if 3 in constraints:
        for i, r in df_c.iterrows():
            if r.City != "NULL" and r.County != "NULL":
                errors = df_c[(df_c.City == r.City) & (df_c.County != r.County) & (df_c.County != "NULL")]
                if errors.shape[0] > 0:
                    c = collections.Counter(df_c[df_c.City == r.City].County)
                    if "NULL" in c:
                        c.pop("NULL")
                    if len(c) > 0:
                        if c.most_common(1)[0][1] > 1:
                            df_c["County"][i] = c.most_common(1)[0][0]


    
    return df_c

In [52]:
import operator as op
from functools import reduce

def ncr(n, r):
    r = min(r, n-r)
    numer = reduce(op.mul, range(n, n-r, -1), 1)
    denom = reduce(op.mul, range(1, r+1), 1)
    return numer // denom  # or / in Python 2

def nCr(n,r):
    f = math.factorial
    return f(n) / f(r) / f(n-r)

In [87]:
m = 50
df = pd.read_csv('./testdata/hospital_100_2.csv')
constraints_path = './testdata/hospital_constraints.txt'
constraints_for_alg = [1, 2, 3]
start = time.time()
# df = pd.read_csv('./testdata/La_liga2.csv')
# constraints_path = './testdata/La_liga_constraints.txt'

# data = [
#     ["Barcelona F.C.", "La Liga", "Barcelona", "Spain"],
#     ["Real", "La Liga", "Madrid", "Espana"],
#     ["Athletico", "La Liga", "Madrid", "Spain"],
#     ["Athletico", "Spanish_League", "Madrid", "Spain"],
#     ["Real", "La Liga", "Madrid", "Spain"],
# ]


# df = pd.DataFrame(data, columns=["Team", "League", "City", "Country"])

df_copy = df.copy()

cell_repair = (40, "County")
row_repair, col_repair = cell_repair[0], cell_repair[1]
column = cell_repair[1]

before_fix = df_copy[column][cell_repair[0]]


#######              1           ################
# df_repair = run_holoclean(df_copy, constraints_path, [column])
df_repair = naive2(df_copy, [1, 2, 3])
#######              1           ################


after_fix = df_repair[column][cell_repair[0]]

print(after_fix)
print(df_copy[column][cell_repair[0]])

relevant_rows = df_repair[df_repair[column]==df_repair[column][cell_repair[0]]]
relevant_attributes = create_constraints_file(constraints_path, [column])
# relevant_attributes = df.columns
cells = list(itertools.product(relevant_rows.index, relevant_attributes))

cells.remove(cell_repair)
cells_copy = cells.copy()

memo_dict = {}
for cell in cells:
    memo_dict[str(cell)] = {"is_fix_with_cell" : {}, "is_fix_without_cell" : {}}

weights = []
params = []
    
# start = time.time()
# r = np.random.binomial(len(cells),0.5)
# p = 0.0
# r = int(len(cells) * p)

for i in range(m):
    p = random.uniform(0.2, 0.8)
#     p = 1.0
    r = np.random.binomial(len(cells),p)
    if 1:
        comb = set(random_combination(cells, r))
        comb_bar = set(cells) - comb
        df_c = df.copy()
        
        S = r
        T = len(cells)

        for cell in comb:
            df_c[cell[1]][cell[0]] = str(cell)

        #######              2           ################
        # df_repair = run_holoclean(df_copy, constraints_path, [column])
        df_repair = naive2(df_copy, [1, 2, 3])
        #######              2           ################

        is_repair_without = (df_repair[col_repair][row_repair] == after_fix)
#         print("Was it fixed with comb hided?   :   " + str(is_repair_without))
        
        for cell in comb:
            memo_dict[str(cell)]["is_fix_without_cell"][i] = is_repair_without
        for cell in comb_bar:
            memo_dict[str(cell)]["is_fix_with_cell"][i] = is_repair_without
        
        if not is_repair_without:
            factor = pow(p,S-1)*pow(1-p, T-S+1)*T*ncr(T-1,S-1)
            for cell in comb:
                df_c[cell[1]][cell[0]] = df[cell[1]][cell[0]] 
                
                #######              3           ################
                # df_repair = run_holoclean(df_copy, constraints_path, [column])
                df_repair = naive2(df_copy, [1, 2, 3])
                #######              3           ################

                is_repair_with = (df_repair2[col_repair][row_repair] == after_fix)
                memo_dict[str(cell)]["is_fix_with_cell"][i] = is_repair_with
                df_c[cell[1]][cell[0]] = str(cell)
            for cell in comb_bar:
                memo_dict[str(cell)]["is_fix_without_cell"][i] = True
#             r = r - math.ceil(len(cells)*0.05)
#             p = p - 0.05
        else:
            factor = pow(p,S+1)*pow(1-p, T-S-1)*T*ncr(T-1,S+1)
            for cell in comb_bar:
                df_c[cell[1]][cell[0]] = str(cell)
                
                #######              4           ################
                # df_repair = run_holoclean(df_copy, constraints_path, [column])
                df_repair = naive2(df_copy, [1, 2, 3])
                #######              4           ################


                is_repair_with = (df_repair2[col_repair][row_repair] == after_fix)
                memo_dict[str(cell)]["is_fix_without_cell"][i] = is_repair_with
                df_c[cell[1]][cell[0]] = df[cell[1]][cell[0]]
            for cell in comb:
                memo_dict[str(cell)]["is_fix_with_cell"][i] = False
#             r = r + math.ceil(len(cells)*0.05)
#             p = p + 0.05
    
        weights.append(1 / factor)
        params.append((S, T, p))
#     except:
#         print("!!!")
#         pass       
            
end = time.time()
print("Time took for repeat {} is: ".format(m) + str(end - start))
print(r)
        

jefferson
jxffxrson
Time took for repeat 50 is: 165.14221453666687
6


In [90]:
results= {}
std_list = []
for cell in cells:
    results[str(cell)] = 0
    results[str(cell) + "!"] = 0

for j in range(50):
    std_count = 0
    for cell in cells:
        try:
            if not memo_dict[str(cell)]["is_fix_without_cell"][j] and memo_dict[str(cell)]["is_fix_with_cell"][j]:
#                 results[str(cell)] += weights[j]
                results[str(cell) + "!"] += 1
                std_count += 1
        except:
            pass
    std_list.append(std_count)
        
results
{k: v for k, v in sorted(results.items(), key=lambda item: -item[1])}


{"(40, 'City')!": 21,
 "(1, 'City')!": 8,
 "(42, 'City')!": 7,
 "(41, 'City')!": 6,
 "(42, 'County')!": 6,
 "(1, 'County')!": 5,
 "(41, 'County')!": 5,
 "(2, 'City')!": 4,
 "(2, 'County')!": 3,
 "(0, 'County')": 0,
 "(0, 'County')!": 0,
 "(0, 'City')": 0,
 "(0, 'City')!": 0,
 "(1, 'County')": 0,
 "(1, 'City')": 0,
 "(2, 'County')": 0,
 "(2, 'City')": 0,
 "(13, 'County')": 0,
 "(13, 'County')!": 0,
 "(13, 'City')": 0,
 "(13, 'City')!": 0,
 "(27, 'County')": 0,
 "(27, 'County')!": 0,
 "(27, 'City')": 0,
 "(27, 'City')!": 0,
 "(40, 'City')": 0,
 "(41, 'County')": 0,
 "(41, 'City')": 0,
 "(42, 'County')": 0,
 "(42, 'City')": 0,
 "(63, 'County')": 0,
 "(63, 'County')!": 0,
 "(63, 'City')": 0,
 "(63, 'City')!": 0}