In [None]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Make the local kb-data-cleaning checkout importable from this notebook.
# NOTE(review): this is an absolute, user-specific path, so it only resolves on a
# machine with the same layout; consider an environment variable or installing
# the package instead.
PROJECT_SRC = "/nas/home/minhpham/workspace/kb-data-cleaning/"
if PROJECT_SRC not in sys.path:
    sys.path.insert(0, PROJECT_SRC)

# Move three directory levels up so that the relative paths used by later cells
# ("config", "data/test/ed2/") resolve (presumably to the project root).
# The kernel's starting directory is remembered on first execution so that
# re-running this cell lands in the same place instead of climbing three more
# levels every time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, "..", "..", ".."))

In [None]:
from pathlib import Path

import pandas as pd

# Per-dataset tables, keyed by CSV file name.
name2raw = {}
name2cleaned = {}
# Per-dataset boolean frame: True where the raw cell equals the cleaned cell.
name2groundtruth = {}

data_path = Path("data/test/ed2/")

# iterdir() yields entries in arbitrary order; sorting keeps the dict order (and
# therefore any index-based lookup into these dicts) stable between runs.
for file_path in sorted((data_path / "raw").iterdir()):
    # Skip anything that is not a regular file (e.g. a ".ipynb_checkpoints"
    # directory), which read_csv cannot open.
    if not file_path.is_file():
        continue

    # dtype=str + keep_default_na=False: every cell is read as a string and
    # pandas' default NA markers are not converted to NaN, so the raw and
    # cleaned frames are compared on their string content.
    name2raw[file_path.name] = pd.read_csv(file_path, keep_default_na=False, dtype=str)
    name2cleaned[file_path.name] = pd.read_csv(
        data_path / "cleaned" / file_path.name, keep_default_na=False, dtype=str
    )
    # Element-wise comparison; requires both frames to carry identical labels.
    name2groundtruth[file_path.name] = (
        name2raw[file_path.name] == name2cleaned[file_path.name]
    )

In [None]:
from kbclean.cleaning.detection.deep import DeepUnDetector
from kbclean.utils.inout import load_config

# Load the project configuration identified by "config".
# NOTE(review): presumably a path resolved against the current working
# directory — confirm against kbclean.utils.inout.load_config.
configs = load_config("config")

# Detector built from the `deep_clean` section of the configuration; later cells
# read its `hparams.batch_size` and call its `lm_model.encode`.
deep_detector = DeepUnDetector(configs.deep_clean)

In [None]:
import regex as re
# from nltk.util import trigrams

def _to_regex(x):
    try:
        if x is None:
            return ""
        x = re.sub(r"[A-Z]", "A", x)
        x = re.sub(r"[0-9]", "0", x)
        x = re.sub(r"[a-z]", "a", x)
        return x
    except Exception as e:
        print(e)
        return x


# def ngram_featurize(str_):
#     feature_dict = {}
#     if len(str_) < 3:
#         str_ += "|" * (3 - len(str_))
#     for trigram in trigrams(str_):
#         feature_dict[f"{''.join(trigram)}"] = 1

#     for trigram in trigrams(_to_regex(str_)):
#         feature_dict[f"pattern_{''.join(trigram)}"] = 1

#     return feature_dict

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
import torch

def detect_values(raw_data, groundtruth, test_size=0.5, random_state=42, n_jobs=64):
    """Train and evaluate a random forest on encoded value patterns.

    Each value is reduced to its character-class pattern with `_to_regex`,
    encoded in batches by `deep_detector.lm_model.encode`, and the resulting
    vectors are split into train/test halves on which a
    `RandomForestClassifier` is fitted and evaluated.

    Args:
        raw_data: sequence of cell values.
        groundtruth: labels aligned with `raw_data`, one per value.
        test_size: fraction (or count) of samples held out, forwarded to
            `train_test_split`.
        random_state: seed used for both the split and the forest, so repeated
            runs give the same predictions.
        n_jobs: number of parallel jobs used by the forest.

    Returns:
        `(y_predict, y_test)`: predictions for the held-out samples and their
        true labels, in that order.
    """
    patterns = [_to_regex(value) for value in raw_data]
    batch_size = deep_detector.hparams.batch_size

    # The encodings are only used as fixed features, so skip autograd tracking
    # while producing them.
    encoded_batches = []
    with torch.no_grad():
        for start in range(0, len(patterns), batch_size):
            encoded_batches.append(
                deep_detector.lm_model.encode(patterns[start : start + batch_size])
            )
    features = torch.cat(encoded_batches, dim=0).detach().cpu().numpy()

    X_train, X_test, y_train, y_test = train_test_split(
        features, groundtruth, test_size=test_size, random_state=random_state
    )

    # Seed the forest as well: a seeded split alone still leaves the
    # predictions varying from run to run.
    random_forest = RandomForestClassifier(n_jobs=n_jobs, random_state=random_state)
    random_forest.fit(X_train, y_train)

    return random_forest.predict(X_test), y_test

In [None]:
def detect(raw_data, groundtruth):
    """Run `detect_values` on every column and pool the results.

    Args:
        raw_data: DataFrame of cell values.
        groundtruth: DataFrame of labels with the same column names.

    Returns:
        `(combined_y_test, combined_y_predict)`: the true labels and the
        predictions gathered over all columns, in column order. Note that
        this order is the reverse of what `detect_values` returns.
    """
    all_truth, all_predicted = [], []
    for col_name in raw_data.columns:
        col_values = raw_data[col_name].values.tolist()
        col_labels = groundtruth[col_name].values.tolist()
        predicted, truth = detect_values(col_values, col_labels)
        all_predicted.extend(predicted)
        all_truth.extend(truth)
    return all_truth, all_predicted

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Classification metrics per dataset, keyed by file name.
name2report = {}

for name, raw_data in name2raw.items():
    ground_truth = name2groundtruth[name]
    combined_y_test, combined_y_predict = detect(raw_data, ground_truth)
    report_dict = classification_report(
        combined_y_test, combined_y_predict, output_dict=True
    )
    # Transposed so each top-level entry of the report becomes a row.
    name2report[name] = pd.DataFrame(report_dict).T

In [None]:
from labext.prelude import A, M, W

# NOTE(review): presumably enables labext's DataTable rendering for the
# DataFrames displayed below — confirm against the labext documentation.
M.DataTable.register()

def render(index):
    """Display the dataset name and its report stored at position `index` of `name2report`."""
    name, report = list(name2report.items())[index]
    display(name, report)


# Slider bound to `render`; the highest selectable value is the last valid
# position in `name2report`.
A.slider(render, max=len(name2report) - 1)