# Error Detection - Training LSTM model (char + word embedding)

## Setting up environment (PyTorch + pandas + NumPy)

In [1]:
%load_ext autoreload
%autoreload 2

import collections
import sys

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

# Put the kbclean source directory on the module search path (only once, so
# re-running the cell does not add duplicate entries).
_kbclean_dir = r"../../../kb-data-cleaning/kbclean"
if not any(entry == _kbclean_dir for entry in sys.path):
    sys.path.append(_kbclean_dir)

In [2]:
from utils.config import load_hparams

# Hyperparameters for the whole notebook come from the YAML config file.
hparams = load_hparams("../../config/hparams.yaml")

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

## Loading data using TorchText

In [3]:
import regex as re


def preprocess(str1):
    """Reduce a sequence of characters to its character-class shape.

    The items of ``str1`` are joined into a single string, then every
    character in ``A-Z`` is replaced by ``"A"``, every character in ``a-z``
    by ``"a"`` and every character in ``0-9`` by ``"0"``. All other
    characters are kept unchanged.

    Args:
        str1: A string or an iterable of strings (e.g. a list of characters).

    Returns:
        list[str]: The masked string, split into single characters.
    """
    shape = "".join(str1)
    substitutions = (("[A-Z]", "A"), ("[a-z]", "a"), ("[0-9]", "0"))
    for char_class, placeholder in substitutions:
        shape = re.sub(char_class, placeholder, shape)
    return list(shape)


# Coarse token categories and the regex a *whole* token must satisfy to belong
# to each one. Order matters: `mask_word` uses the first category that fits,
# so the narrower classes are listed before ALPHABET / ALPHANUM.
type_to_regex = {
    "UPPERCASE": "[A-Z]+",
    "LOWERCASE": "[a-z]+",
    "DIGIT": "[0-9]+",
    "ALPHABET": "[A-Za-z]+",
    "ALPHANUM": "[A-Za-z0-9]+",
}


def mask_word(tokens):
    """Replace each token by the name of the first category it fully matches.

    Categories are tried in the insertion order of ``type_to_regex``. A token
    that matches no category (punctuation, mixed symbols, the empty string,
    ...) is kept as it is.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list[str]: One entry per input token, either a category name or the
        original token.
    """
    masked_tokens = []
    for token in tokens:
        # fullmatch instead of match(f"^{regex}$"): `$` also matches just
        # before a trailing newline, which would classify "ABC\n" as UPPERCASE.
        matched_type = next(
            (
                type_
                for type_, regex in type_to_regex.items()
                if re.fullmatch(regex, token)
            ),
            None,
        )
        masked_tokens.append(token if matched_type is None else matched_type)
    return masked_tokens

In [4]:
from torch.utils.data import DataLoader, random_split
from torchtext.data import Field, LabelField, NestedField, TabularDataset

# Character-level view of a single word: the word is split into characters,
# each character is masked by `preprocess`, and the sequence is wrapped in
# <w> ... </w> with <cpad> padding up to hparams.max_char_length.
nesting_field = Field(
    batch_first=True,
    tokenize=list,
    preprocessing=preprocess,
    init_token="<w>",
    eos_token="</w>",
    pad_token="<cpad>",
    fix_length=hparams.max_char_length,
)

# Sentence-level wrapper around the per-word character field.
char1w_field = NestedField(
    nesting_field,
    include_lengths=True,
    pad_token="<wpad>",
)

# Word-level view of the same text (case is preserved).
word_field = Field(
    batch_first=True,
    include_lengths=True,
    lower=False,
    pad_token="<wpad>",
)

label = LabelField()

# Each text column feeds both the word-level and the character-level field.
train_fields = {
    "str1": [("src_word", word_field), ("src_char", char1w_field)],
    "str2": [("trg_word", word_field), ("trg_char", char1w_field)],
    "sim": [("lbl", label)],
}

dataset = TabularDataset(
    path="../../data/train/train_500000c.csv",
    format="csv",
    fields=train_fields,
)

## Building language vocabulary from data

In [5]:
from torchtext import vocab
from pathlib import Path
from torchtext.vocab import GloVe

# Vocabularies are built over both sides (source and target) of the pairs.
word_field.build_vocab(dataset.src_word, dataset.trg_word)
char1w_field.build_vocab(dataset.src_char, dataset.trg_char)

label.build_vocab(dataset.lbl)
# Pin the label ids ("True" -> 0, "False" -> 1). Overriding only `stoi` would
# leave `itos` in whatever order build_vocab produced, so id -> label lookups
# could disagree with label -> id; set both directions together.
# NOTE(review): assumes the "sim" column only holds "True"/"False" - confirm.
label_names = ["True", "False"]
label.vocab.itos = label_names
label.vocab.stoi = {name: idx for idx, name in enumerate(label_names)}

hparams.word_vocab_size = len(word_field.vocab)
hparams.char_vocab_size = len(char1w_field.vocab)

hparams.char_vocab_size, hparams.word_vocab_size

(126, 33014)

## Building the LSTM model and loading trained weights

In [6]:
from pytorch_lightning import Trainer

from ml.base import CharCNN2L, MultiCharCNN
from ml.nets import CharCNNLSTM


# 300-dimensional GloVe "6B" word vectors, passed to the model as pretrained
# word embeddings.
glove = vocab.GloVe(name="6B", dim=300)

char_cnn = MultiCharCNN(
    char_vocab_size=hparams.char_vocab_size,
    char_embedding_size=hparams.char_embedding_size,
)

lstm = CharCNNLSTM(
    char_cnn,
    word_vocab_size=hparams.word_vocab_size,
    embedding_size=hparams.embedding_size,
    hidden_size=hparams.hidden_size,
    pretrained_embeddings=glove.vectors,
)

# map_location=device: `device` falls back to the CPU when CUDA is missing,
# but torch.load would otherwise try to restore tensors onto the device they
# were saved from and fail on a CPU-only machine.
checkpoint = torch.load(
    "../../checkpoints/lightning_logs/version_6/checkpoints/epoch=2.ckpt",
    map_location=device,
)
lstm.load_state_dict(checkpoint["state_dict"])
# NOTE(review): the model is only used for inference below; consider
# lstm.eval() here unless SVMDetector already switches modes - confirm.
lstm = lstm.to(device)

In [7]:
from er.detector import SVMDetector
import spacy

# Large English spaCy pipeline; it is loaded here but not referenced again in
# the cells visible in this notebook.
nlp_model = spacy.load("en_core_web_lg")

# Wrap the trained network in the project's detector.
detector = SVMDetector(
    lstm,
    device=device,
    hparams=hparams,
)

In [8]:
from collections import defaultdict
from torchtext.data import BucketIterator
from sklearn.metrics import classification_report, f1_score
from sklearn.manifold import TSNE
from sklearn.mixture import BayesianGaussianMixture
from pyod.models.knn import KNN
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from scipy import stats

import warnings

# Silence all warnings for the rest of the session (global side effect).
warnings.filterwarnings("ignore")

fn_to_errors = defaultdict(list)

fn_to_true_count = {}
fn_to_acc = {}
fn_to_dataset = {}  # file stem -> batch iterator over that file
fn_to_df = {}  # file stem -> raw DataFrame of that file

tsne = TSNE()

# Load every labelled CSV twice: as a torchtext dataset (model input) and as a
# DataFrame (labels, later extended with predictions).
for file_path in Path("../../data/test/labeled_ijcai").iterdir():
    if file_path.suffix == ".csv":
        dataset = TabularDataset(
            path=str(file_path),
            format="csv",
            fields={
                "reviewText": [("src_word", word_field), ("src_char", char1w_field)],
            },
        )
        fn_to_df[file_path.stem] = pd.read_csv(file_path)
        # The encoded vectors are written back into the DataFrame by
        # position, so batches must come out in file order. BucketIterator
        # defaults to train=True, which shuffles; turn shuffling and sorting
        # off explicitly.
        data_iterator = BucketIterator(
            dataset,
            batch_size=hparams.batch_size,
            device=device,
            shuffle=False,
            sort=False,
        )
        fn_to_dataset[file_path.stem] = data_iterator

fn_to_reports = {}  # file stem -> classification_report dict
f1s = []  # F1 of the outlier class, one entry per file

for file, data_iterator in fn_to_dataset.items():
    df = fn_to_df[file]

    # Normalise the ground-truth column to booleans.
    df["outlier"] = df["outlier"] == 1
    # NOTE(review): assumes encoded_vecs has one row per DataFrame row, in the
    # same order - confirm the iterator does not reorder examples.
    encoded_vecs = detector.encode(data_iterator)
    # Only the first two components are kept for plotting.
    df["x"] = encoded_vecs[:, 0]
    df["y"] = encoded_vecs[:, 1]
    model = AgglomerativeClustering(n_clusters=None, distance_threshold=2)
    model.fit(encoded_vecs)

    # Everything outside the most populated cluster is flagged as an outlier.
    y_pred = model.labels_
    mode = stats.mode(y_pred)
    df["cluster"] = y_pred
    df["predictions"] = y_pred != mode[0]

    df["result"] = df["outlier"] == df["predictions"]

    report = classification_report(df["outlier"], df["predictions"], output_dict=True)
    f1 = f1_score(df["outlier"], df["predictions"], pos_label=1)

    # Keep the metrics: previously they were computed and then dropped, so
    # fn_to_reports and f1s stayed empty.
    fn_to_reports[file] = report
    f1s.append(f1)

In [9]:
from utils.widgets import init_datatable_mode
from ipywidgets import interact
import ipywidgets as widgets

init_datatable_mode()


def fn(x):
    """Display text, label, prediction, correctness and cluster for file ``x``."""
    shown_columns = ["reviewText", "outlier", "predictions", "result", "cluster"]
    display(fn_to_df[x][shown_columns])


# Dropdown listing every loaded file; "bd1" is preselected.
file_dropdown = widgets.widgets.Dropdown(
    options=fn_to_df.keys(),
    value="bd1",
    description="File:",
    disabled=False,
)
w = interact(fn, x=file_dropdown)

<IPython.core.display.Javascript object>

interactive(children=(Dropdown(description='File:', index=7, options=('bd3', 'birth', 'dimensions', 'avg_year'…

In [10]:
# def fn(x):
#     df = fn_to_reports[x]
#     display(df)

# interact(fn, x=widgets.widgets.Dropdown(
#     options=fn_to_reports.keys(),
#     value='bd1',
#     description='File:',
#     disabled=False,
# ))

In [11]:
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.io import show, output_notebook
from bokeh.layouts import row

import copy

output_notebook()


def fn(x):
    """Show two side-by-side scatter plots of the ``x``/``y`` columns of file ``x``.

    The left plot is coloured by the model's ``predictions`` and the right
    plot by the ground-truth ``outlier`` column (red = True, blue = False).
    The stored DataFrame is not modified; a deep copy receives the colour
    columns.
    """
    frame = copy.deepcopy(fn_to_df[x])
    flag_to_color = {True: "red", False: "blue"}
    frame["colors1"] = [flag_to_color[flag] for flag in frame["predictions"]]
    frame["colors2"] = [flag_to_color[flag] for flag in frame["outlier"]]
    points = ColumnDataSource(frame)

    tooltip_spec = [
        ("text", "@reviewText"),
        ("(x,y)", "(@x, @y)"),
        ("desc", "@desc"),
    ]

    # One panel per colour column, sharing the same data source and tooltips.
    panels = []
    for color_column in ("colors1", "colors2"):
        panel = figure(tooltips=tooltip_spec)
        panel.scatter(x="x", y="y", source=points, color=color_column)
        panels.append(panel)

    show(row(*panels), notebook_handle=True)


# List box with every loaded file; picking one re-renders the plots via `fn`.
file_selector = widgets.widgets.Select(
    options=fn_to_df.keys(),
    value="bd1",
    description="File:",
    disabled=False,
)
w = interact(fn, x=file_selector)

interactive(children=(Select(description='File:', index=7, options=('bd3', 'birth', 'dimensions', 'avg_year', …

In [12]:
from torchtext.data import Example


def sim(str1, str2, fields):
    """Run the model on two inputs after encoding them with each field.

    For every field in ``fields`` both inputs are passed through the field's
    ``preprocess`` and ``process`` steps (tensors are placed on ``device``).
    The model is then called with all encodings of ``str1`` followed by all
    encodings of ``str2``, in field order.

    Returns:
        Whatever ``lstm`` returns for the encoded pair.
    """
    encoded_pairs = [
        (
            field.process(field.preprocess(str1), device=device),
            field.process(field.preprocess(str2), device=device),
        )
        for field in fields
    ]
    first_inputs = [pair[0] for pair in encoded_pairs]
    second_inputs = [pair[1] for pair in encoded_pairs]
    return lstm(*first_inputs, *second_inputs)

In [13]:
# Example call: run the model on two dimension-like strings, encoded with both
# the word-level and the character-level field.
sim(['29"" x 25""'], ['25.5"" x 25.5""'], [word_field, char1w_field])

tensor([[0.0085]], device='cuda:0', grad_fn=<RsubBackward1>)