# Setup Models

This code block downloads Sentence-BERT models and loads them.

In [None]:
from sentence_transformers import SentenceTransformer, util

# Every entry pairs a Sentence-BERT checkpoint name with the value that is
# assigned to the loaded model's `max_seq_length`.
_MODEL_SPECS = [
    # Official models
    ("all-mpnet-base-v2", 384),
    ("multi-qa-mpnet-base-dot-v1", 512),
    ("all-distilroberta-v1", 512),
    ("all-MiniLM-L12-v2", 256),
    ("multi-qa-distilbert-cos-v1", 512),
    ("all-MiniLM-L6-v2", 256),
    ("multi-qa-MiniLM-L6-cos-v1", 512),
    ("paraphrase-multilingual-mpnet-base-v2", 128),
    ("paraphrase-albert-small-v2", 256),
    ("paraphrase-multilingual-MiniLM-L12-v2", 128),
    ("paraphrase-MiniLM-L3-v2", 128),
    ("distiluse-base-multilingual-cased-v1", 128),
    ("distiluse-base-multilingual-cased-v2", 128),
    # Third-party models
    ("nikcheerla/nooks-amd-detection-v2-full", 512),
    ("jhgan/ko-sroberta-multitask", 512),
    ("ceggian/sbert_pt_reddit_softmax_512", 512),
    ("BlueAvenir/sti_security_class_model", 512),
    ("jhgan/ko-sbert-sts", 512),
    ("nikcheerla/nooks-amd-detection-realtime", 512),
    ("sentence-transformers/LaBSE", 512),
    ("kwoncho/ko-sroberta-multitask-suspicious", 512),
]

# Parallel lists: index i of both lists describes the same model.
model_names = [spec_name for spec_name, _ in _MODEL_SPECS]
model_max_seqs = [spec_max_seq for _, spec_max_seq in _MODEL_SPECS]

def makeModel(model_name, max_seq_len):
    """Load one SentenceTransformer and set its maximum sequence length.

    Args:
        model_name: Model identifier handed to ``SentenceTransformer``.
        max_seq_len: Value stored in the model's ``max_seq_length`` attribute.

    Returns:
        The loaded model with ``max_seq_length`` set.
    """
    # Echo the name first so progress is visible while the model loads.
    print(model_name)
    encoder = SentenceTransformer(model_name)
    encoder.max_seq_length = max_seq_len
    return encoder

# The two configuration lists are consumed pairwise, so verify that they line
# up before loading anything; a mismatch would otherwise surface as an
# IndexError part-way through loading, or go unnoticed when there are surplus
# max-length entries.
if len(model_names) != len(model_max_seqs):
    raise ValueError(
        "model_names has {} entries but model_max_seqs has {}".format(
            len(model_names), len(model_max_seqs)
        )
    )

# Each entry of `models` is a two-item list: [model name, loaded model].
models = [
    [name, makeModel(name, max_seq)]
    for name, max_seq in zip(model_names, model_max_seqs)
]

print(models)

# Gather Data

This code block loads necessary data from disk.

In [None]:
import csv

def readData(file_name):
    """Read a comma-separated file and split it into data rows and header.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        A ``(data, header)`` tuple. ``data`` is a list of rows (each a list of
        strings) without the first row; ``header`` is that first row, or
        ``None`` when the file is empty.
    """
    # newline="" leaves newline handling to the csv module, as its
    # documentation requires, so line breaks inside quoted fields are parsed
    # as part of the field.
    # NOTE(review): the text encoding is still the platform default.
    with open(file_name, "r", newline="") as f:
        csv_reader = csv.reader(f, delimiter=",", quotechar="\"", quoting=csv.QUOTE_MINIMAL)
        # A default of None avoids StopIteration on an empty file.
        header = next(csv_reader, None)
        data = list(csv_reader)

    return data, header

# Input files, listed in the order they are unpacked below.
data_files = [
    "data/human_errors.csv",            # Human Errors from GitHub and Interviews
    "data/these_descriptions.csv",      # THESE Categories (S01, S02, ...)
    "data/these_type_descriptions.csv", # THESE Types (Slip, Lapse, Mistake)
    # To run this experiment with improved THESE descriptions, comment out the
    # 2 lines above and uncomment the 2 lines below.
    # "data/these_descriptions_v4.csv",
    # "data/these_type_descriptions_v4.csv",
]
errors_file, categories_file, types_file = data_files

errors, errors_header = readData(errors_file)
categories, categories_header = readData(categories_file)
types, types_header = readData(types_file)

errors_noaps = errors[202:]  # Remove Apologies (keeps rows from index 202 on)
del categories[-1]           # Remove "Other" (the last category row)

# Row counts: expecting 368, 166, 31 and 3, in this order.
for loaded_rows in (errors, errors_noaps, categories, types):
    print(len(loaded_rows))

# Compute Embeddings

This code block computes embeddings for our data. Change `cleantext=True` to `cleantext=False` to run the experiment without preprocessing. This takes about 10 minutes to run with 16 CPUs.

In [None]:
%%time

import re

# Patterns applied, in this order, by cleanup(); every match becomes one space.
RE_PUNCT = re.compile(r"[\.,:;?!]")
RE_WHITESPACE = re.compile(r"[\n\r\t\v\f]") # everything but regular spaces
RE_DUPLICATE_SPACES = re.compile(r"[\s]+")

# Placeholders; they are reassigned once the embeddings are computed.
error_embeddings = []
category_embeddings = []
type_embeddings = []

def cleanup(sentence):
    """Lower-case a sentence and normalise its punctuation and whitespace.

    Punctuation and non-space whitespace characters are replaced by spaces,
    then every run of whitespace is collapsed into a single space. The result
    is not stripped, so a leading or trailing space can remain.

    Args:
        sentence: Text to normalise.

    Returns:
        The normalised text.
    """
    text = sentence.lower()
    for pattern in (RE_PUNCT, RE_WHITESPACE, RE_DUPLICATE_SPACES):
        text = pattern.sub(" ", text)
    return text

def computeEmbeddings(models, data, comment_index, cleantext=True):
    """Encode one column of `data` with every model.

    Args:
        models: List of ``[name, model]`` entries; ``model.encode`` is called
            on the second item of each entry.
        data: Rows (indexable sequences) holding the text to embed.
        comment_index: Index of the text column inside each row.
        cleantext: When true, every sentence is passed through ``cleanup``
            before encoding.

    Returns:
        ``(embeddings, sentences)``: one ``encode`` result per model, in the
        order of `models`, and the list of sentences that was encoded.
    """
    raw_texts = [row[comment_index] for row in data]
    sentences = [cleanup(text) for text in raw_texts] if cleantext else raw_texts
    embeddings = [
        entry[1].encode(sentences, convert_to_tensor=True) for entry in models
    ]

    return embeddings, sentences

# Embed every dataset with every loaded model. The integer argument is the
# column that holds the text in each CSV row.
error_embeddings, error_sentences = computeEmbeddings(models, errors, 1)
error_noaps_embeddings, error_noaps_sentences = computeEmbeddings(models, errors_noaps, 1)
category_embeddings, category_sentences = computeEmbeddings(models, categories, 3)
type_embeddings, type_sentences = computeEmbeddings(models, types, 1)

embedding_sets = (
    error_embeddings,
    error_noaps_embeddings,
    category_embeddings,
    type_embeddings,
)

# One entry per model in every set: expecting len(models) each time, which is
# 21 for the 21 names listed in model_names.
for per_model_embeddings in embedding_sets:
    print(len(per_model_embeddings))

# Rows embedded by the first model: expecting 368, 166, 31 and 3, in this order.
for per_model_embeddings in embedding_sets:
    print(len(per_model_embeddings[0]))

# Multi-Class

This section runs multi-class classification.

## Cosine Similarity

In [None]:
# Actual label of every error row (column 2 of the errors file).
error_labels = [error_row[2] for error_row in errors]
error_noaps_labels = [error_row[2] for error_row in errors_noaps]

# Identifier of every candidate label (column 0 of the description files).
category_labels = [category_row[0] for category_row in categories]
type_labels = [type_row[0] for type_row in types]

# Empty result holders.
# NOTE(review): nothing visible in this notebook fills these lists.
category_assigned = []
category_type_assigned = []
type_assigned = []

def compareLabels(model_index, err_embeddings, err_sentences, label_embeddings, label_sentences, labels):
    """Assign to every error the label whose embedding is most similar.

    Args:
        model_index: Index of the model whose embeddings are compared.
        err_embeddings: Per-model error embeddings.
        err_sentences: Unused by this function.
        label_embeddings: Per-model label-description embeddings.
        label_sentences: Unused by this function.
        labels: Label names, index-aligned with the label embeddings.

    Returns:
        A list with one assigned label per error embedding. When several
        labels tie for the highest score, the last one wins.
    """
    error_vectors = err_embeddings[model_index]
    assigned_labels = []
    for i in range(len(error_vectors)):
        label_vectors = label_embeddings[model_index]
        # Cosine similarity between this error and every label description
        scores = [
            util.cos_sim(error_vectors[i], label_vectors[j])
            for j in range(len(label_vectors))
        ]

        # Manual argmax; `>=` keeps the last of several equal scores
        best_index = -999
        best_score = -999
        for k, score in enumerate(scores):
            if score >= best_score:
                best_index, best_score = k, score

        assigned_labels.append(labels[best_index])

    return assigned_labels

## Confusion Matrix w/ Apologies

In [None]:
from pprint import pprint

def confusionMatrix(assigned_labels, error_labels):
    """Build the 3x3 Slip/Lapse/Mistake confusion matrix.

    Only the first character of a label is inspected ("S", "L" or "M").
    Rows are indexed by the assigned label and columns by the actual label,
    both in the order Slip, Lapse, Mistake. Pairs in which either label
    starts with another character are not counted.

    Args:
        assigned_labels: Predicted labels, index-aligned with `error_labels`.
        error_labels: Actual labels.

    Returns:
        A list of three rows with three counts each.
    """
    position = {"S": 0, "L": 1, "M": 2}
    conf_mat = [[0, 0, 0] for _ in range(3)]
    for i, actual in enumerate(error_labels):
        column = position.get(actual[0])
        if column is None:
            continue  # unknown actual label: the assigned label is not read
        row = position.get(assigned_labels[i][0])
        if row is not None:
            conf_mat[row][column] += 1

    return conf_mat
    
def calcTP(conf_mat, label):
    """Return the true positives of `label`: its diagonal cell.

    Args:
        conf_mat: 3x3 matrix, rows = assigned and columns = actual label, in
            the order Slip, Lapse, Mistake.
        label: "SLIP", "LAPSE" or "MISTAKE".

    Returns:
        The count, or ``None`` for any other label.
    """
    diagonal = {"SLIP": 0, "LAPSE": 1, "MISTAKE": 2}
    if label not in diagonal:
        return None
    k = diagonal[label]
    return conf_mat[k][k]

def calcTN(conf_mat, label):
    """Return the true negatives of `label`.

    These are the cells that lie neither in the label's row (assigned) nor in
    its column (actual).

    Args:
        conf_mat: 3x3 matrix, rows = assigned and columns = actual label, in
            the order Slip, Lapse, Mistake.
        label: "SLIP", "LAPSE" or "MISTAKE".

    Returns:
        The count, or ``None`` for any other label.
    """
    order = ("SLIP", "LAPSE", "MISTAKE")
    if label not in order:
        return None
    others = [k for k in range(3) if order[k] != label]
    return sum(conf_mat[row][column] for row in others for column in others)

def calcFP(conf_mat, label):
    """Return the false positives of `label`.

    These are the cells of the label's row (assigned) outside the diagonal.

    Args:
        conf_mat: 3x3 matrix, rows = assigned and columns = actual label, in
            the order Slip, Lapse, Mistake.
        label: "SLIP", "LAPSE" or "MISTAKE".

    Returns:
        The count, or ``None`` for any other label.
    """
    order = ("SLIP", "LAPSE", "MISTAKE")
    if label not in order:
        return None
    k = order.index(label)
    return sum(conf_mat[k][column] for column in range(3) if column != k)

def calcFN(conf_mat, label):
    """Return the false negatives of `label`.

    These are the cells of the label's column (actual) outside the diagonal.

    Args:
        conf_mat: 3x3 matrix, rows = assigned and columns = actual label, in
            the order Slip, Lapse, Mistake.
        label: "SLIP", "LAPSE" or "MISTAKE".

    Returns:
        The count, or ``None`` for any other label.
    """
    order = ("SLIP", "LAPSE", "MISTAKE")
    if label not in order:
        return None
    k = order.index(label)
    return sum(conf_mat[row][k] for row in range(3) if row != k)
    
def calcPrecision(tp, fp):
    """Return precision, ``tp / (tp + fp)``.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.

    Returns:
        The precision, or ``0.0`` when nothing was predicted positive
        (``tp + fp == 0``) instead of raising ``ZeroDivisionError``.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        # A model that never assigns this label must not abort the report.
        return 0.0
    return tp / predicted_positive

def calcRecall(tp, fn):
    """Return recall, ``tp / (tp + fn)``.

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.

    Returns:
        The recall, or ``0.0`` when there are no actual positives
        (``tp + fn == 0``) instead of raising ``ZeroDivisionError``.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        # A dataset without samples of this label must not abort the report.
        return 0.0
    return tp / actual_positive

def calcF1(tp, fp, fn):
    """Return the F1 score, ``2*tp / (2*tp + fp + fn)``.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        The F1 score, or ``0.0`` when all three counts are zero instead of
        raising ``ZeroDivisionError``.
    """
    denominator = (2 * tp) + fp + fn
    if denominator == 0:
        return 0.0
    return (2 * tp) / denominator
    
# Classify every error once per model and keep the confusion matrices; the
# three per-type tables below are all derived from them, so the classification
# does not have to be repeated for each table.
conf_mats = []
for model_index in range(len(models)):
    assigned_labels = compareLabels(
        model_index,
        error_embeddings,
        error_sentences,
        type_embeddings,
        type_sentences,
        type_labels
    )
    conf_mats.append(confusionMatrix(assigned_labels, error_labels))

# One table per error type (Slips, Lapses, Mistakes), one row per model in the
# order of `models`. Columns: FP, TP, FN, TN, precision, recall, F1.
for table_index, label in enumerate(("SLIP", "LAPSE", "MISTAKE")):
    if table_index > 0:
        print("\n")  # separator between two tables
    for conf_mat in conf_mats:
        tp = calcTP(conf_mat, label)
        fp = calcFP(conf_mat, label)
        tn = calcTN(conf_mat, label)
        fn = calcFN(conf_mat, label)
        precision = round(calcPrecision(tp, fp), 3)
        recall = round(calcRecall(tp, fn), 3)
        f1 = round(calcF1(tp, fp, fn), 3)
        print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fp,tp,fn,tn,precision,recall,f1))

## Confusion Matrix w/o Apologies

In [None]:
# Classify every non-apology error once per model and keep the confusion
# matrices; the three per-type tables below are all derived from them, so the
# classification does not have to be repeated for each table.
conf_mats = []
for model_index in range(len(models)):
    assigned_labels = compareLabels(
        model_index,
        error_noaps_embeddings,
        error_noaps_sentences,
        type_embeddings,
        type_sentences,
        type_labels
    )
    conf_mats.append(confusionMatrix(assigned_labels, error_noaps_labels))

# One table per error type (Slips, Lapses, Mistakes), one row per model in the
# order of `models`. Columns: FP, TP, FN, TN, precision, recall, F1.
for table_index, label in enumerate(("SLIP", "LAPSE", "MISTAKE")):
    if table_index > 0:
        print("\n")  # separator between two tables
    for conf_mat in conf_mats:
        tp = calcTP(conf_mat, label)
        fp = calcFP(conf_mat, label)
        tn = calcTN(conf_mat, label)
        fn = calcFN(conf_mat, label)
        precision = round(calcPrecision(tp, fp), 3)
        recall = round(calcRecall(tp, fn), 3)
        f1 = round(calcF1(tp, fp, fn), 3)
        print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fp,tp,fn,tn,precision,recall,f1))

# Single Class

This section runs single-class (one-vs-rest) classification: each error type is classified against the other two types combined.

## Compute Embeddings & Cosine Similarity

This code block builds the one-vs-rest labels and computes embeddings for the combined type descriptions. Change `cleantext=True` to `cleantext=False` to run the experiment without preprocessing. This takes a few minutes to run with 16 CPUs.

In [None]:
def computeEmbeddings(models, data, cleantext=True):
    """Encode a list of sentences with every model.

    This definition replaces the earlier ``computeEmbeddings`` that took a
    ``comment_index``: here `data` already holds plain strings, and only the
    embeddings are returned.

    Args:
        models: List of ``[name, model]`` entries; ``model.encode`` is called
            on the second item of each entry.
        data: Sentences to embed.
        cleantext: When true, every sentence is passed through ``cleanup``
            before encoding.

    Returns:
        One ``encode`` result per model, in the order of `models`.
    """
    sentences = [cleanup(text) for text in data] if cleantext else data
    return [
        entry[1].encode(sentences, convert_to_tensor=True) for entry in models
    ]

def _oneVsRest(rows, prefix, target):
    """Label a row `target` when its column-2 value starts with `prefix`, else "NOPE"."""
    return [target if row[2].startswith(prefix) else "NOPE" for row in rows]

# Description text of each type.
# NOTE(review): assumes the types file lists Slip, Lapse, Mistake in this order.
slip_text = type_sentences[0]
lapse_text = type_sentences[1]
mistake_text = type_sentences[2]

# Slip vs. the combined Lapse and Mistake descriptions
error_labels_slips = _oneVsRest(errors, "S", "SLIP")
type_labels_slips = ["SLIP", "NOPE"]
type_sents_slips = [slip_text, " ".join((lapse_text, mistake_text))]
type_embeddings_slips = computeEmbeddings(models, type_sents_slips)

# Lapse vs. the combined Slip and Mistake descriptions
error_labels_lapses = _oneVsRest(errors, "L", "LAPSE")
type_labels_lapses = ["LAPSE", "NOPE"]
type_sents_lapses = [lapse_text, " ".join((slip_text, mistake_text))]
type_embeddings_lapses = computeEmbeddings(models, type_sents_lapses)

# Mistake vs. the combined Slip and Lapse descriptions
error_labels_mistakes = _oneVsRest(errors, "M", "MISTAKE")
type_labels_mistakes = ["MISTAKE", "NOPE"]
type_sents_mistakes = [mistake_text, " ".join((slip_text, lapse_text))]
type_embeddings_mistakes = computeEmbeddings(models, type_sents_mistakes)

# The same one-vs-rest labels for the errors without apologies
error_noaps_labels_slips = _oneVsRest(errors_noaps, "S", "SLIP")
error_noaps_labels_lapses = _oneVsRest(errors_noaps, "L", "LAPSE")
error_noaps_labels_mistakes = _oneVsRest(errors_noaps, "M", "MISTAKE")

def compareLabels(model_index, err_embeddings, err_sentences, label_embeddings, label_sentences, labels):
    """Assign to every error the label whose embedding is most similar.

    Args:
        model_index: Index of the model whose embeddings are compared.
        err_embeddings: Per-model error embeddings.
        err_sentences: Unused by this function.
        label_embeddings: Per-model label-description embeddings.
        label_sentences: Unused by this function.
        labels: Label names, index-aligned with the label embeddings.

    Returns:
        A list with one assigned label per error embedding. When several
        labels tie for the highest score, the last one wins.
    """
    error_vectors = err_embeddings[model_index]
    assigned_labels = []
    for i in range(len(error_vectors)):
        label_vectors = label_embeddings[model_index]
        # Cosine similarity between this error and every label description
        scores = [
            util.cos_sim(error_vectors[i], label_vectors[j])
            for j in range(len(label_vectors))
        ]

        # Manual argmax; `>=` keeps the last of several equal scores
        best_index = -999
        best_score = -999
        for k, score in enumerate(scores):
            if score >= best_score:
                best_index, best_score = k, score

        assigned_labels.append(labels[best_index])

    return assigned_labels

## Confusion Matrix w/ Apologies

In [None]:
from pprint import pprint
import math

def confusionMatrix(assigned_labels, err_labels):
    """Build the 2x2 one-vs-rest confusion matrix.

    Layout of the result::

        [[TN, FP],   # actual label is "NOPE"
         [FN, TP]]   # actual label is "SLIP", "LAPSE" or "MISTAKE"

    A prediction counts as correct when it equals the actual label. Entries
    whose actual label is none of the four known values are skipped.

    Args:
        assigned_labels: Predicted labels, index-aligned with `err_labels`.
        err_labels: Actual labels.

    Returns:
        The 2x2 matrix as a list of two rows.
    """
    targets = ("SLIP", "LAPSE", "MISTAKE")
    conf_mat = [[0, 0], [0, 0]]
    for i, actual in enumerate(err_labels):
        if actual in targets:
            row = 1
        elif actual == "NOPE":
            row = 0
        else:
            continue
        # Correct predictions land on the diagonal (TN or TP), wrong ones
        # in the other cell of the same row (FP or FN).
        column = row if assigned_labels[i] == actual else 1 - row
        conf_mat[row][column] += 1

    return conf_mat
    
def calcTP(conf_mat):
    """Return the true-positive count: row 1, column 1 of the 2x2 matrix."""
    target_row = conf_mat[1]
    return target_row[1]

def calcTN(conf_mat):
    """Return the true-negative count: row 0, column 0 of the 2x2 matrix."""
    nope_row = conf_mat[0]
    return nope_row[0]

def calcFP(conf_mat):
    """Return the false-positive count: row 0, column 1 of the 2x2 matrix."""
    nope_row = conf_mat[0]
    return nope_row[1]

def calcFN(conf_mat):
    """Return the false-negative count: row 1, column 0 of the 2x2 matrix."""
    target_row = conf_mat[1]
    return target_row[0]
    
def calcPrecision(tp, fp):
    """Return precision, ``tp / (tp + fp)``.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.

    Returns:
        The precision, or ``0.0`` when nothing was predicted positive
        (``tp + fp == 0``) instead of raising ``ZeroDivisionError``.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        # A model that never assigns the target label must not abort the report.
        return 0.0
    return tp / predicted_positive

def calcRecall(tp, fn):
    """Return recall, ``tp / (tp + fn)``.

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.

    Returns:
        The recall, or ``0.0`` when there are no actual positives
        (``tp + fn == 0``) instead of raising ``ZeroDivisionError``.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        # A dataset without samples of the target label must not abort the report.
        return 0.0
    return tp / actual_positive

def calcF1(tp, fp, fn):
    """Return the F1 score, ``2*tp / (2*tp + fp + fn)``.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        The F1 score, or ``0.0`` when all three counts are zero instead of
        raising ``ZeroDivisionError``.
    """
    denominator = (2 * tp) + fp + fn
    if denominator == 0:
        return 0.0
    return (2 * tp) / denominator

def printConfusion(type_labels, err_labels, type_embs, err_embs=None):
    """Print one tab-separated metrics row per model for a one-vs-rest run.

    Every row holds ``FP, TP, FN, TN, precision, recall, F1`` for one model,
    in the order of ``models``.

    Args:
        type_labels: The two label names, index-aligned with the rows of
            `type_embs`.
        err_labels: Actual label of every error that is scored.
        type_embs: Per-model embeddings of the two type descriptions.
        err_embs: Optional per-model error embeddings to classify. When
            omitted, the module-level set (``error_embeddings`` or
            ``error_noaps_embeddings``) whose row count equals
            ``len(err_labels)`` is used.

    Raises:
        ValueError: If `err_embs` is omitted and neither module-level
            embedding set has as many rows as `err_labels`.
    """
    err_sents = error_sentences
    if err_embs is None:
        err_embs = error_embeddings
        if models:
            # Predictions and actual labels are compared position by position,
            # so the classified embeddings must come from the same rows as
            # `err_labels`. Always using `error_embeddings` would score the
            # no-apology labels against predictions for unrelated rows.
            known_sets = (
                (error_embeddings, error_sentences),
                (error_noaps_embeddings, error_noaps_sentences),
            )
            for candidate_embs, candidate_sents in known_sets:
                if len(candidate_embs[0]) == len(err_labels):
                    err_embs, err_sents = candidate_embs, candidate_sents
                    break
            else:
                raise ValueError(
                    "no error embedding set has {} rows; pass err_embs explicitly".format(
                        len(err_labels)
                    )
                )

    for model_index in range(len(models)):
        assigned_labels = compareLabels(
            model_index,
            err_embs,
            err_sents,
            type_embs,
            type_sentences,
            type_labels
        )
        conf_mat = confusionMatrix(assigned_labels, err_labels)
        tp = calcTP(conf_mat)
        fp = calcFP(conf_mat)
        tn = calcTN(conf_mat)
        fn = calcFN(conf_mat)
        precision = round(calcPrecision(tp, fp), 3)
        recall = round(calcRecall(tp, fn), 3)
        f1 = round(calcF1(tp, fp, fn), 3)
        print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(fp,tp,fn,tn,precision,recall,f1))

# One table per target type (Slips, Lapses, Mistakes) over all errors; tables
# are separated by blank lines.
single_class_runs = (
    (type_labels_slips, error_labels_slips, type_embeddings_slips),
    (type_labels_lapses, error_labels_lapses, type_embeddings_lapses),
    (type_labels_mistakes, error_labels_mistakes, type_embeddings_mistakes),
)
for run_index, run_args in enumerate(single_class_runs):
    if run_index > 0:
        print("\n")
    printConfusion(*run_args)

## Confusion Matrix w/o Apologies

In [None]:
# One table per target type (Slips, Lapses, Mistakes) over the errors without
# apologies; tables are separated by blank lines.
# NOTE(review): these label lists are built from errors_noaps; confirm that
# printConfusion classifies the matching no-apology embeddings.
single_class_noaps_runs = (
    (type_labels_slips, error_noaps_labels_slips, type_embeddings_slips),
    (type_labels_lapses, error_noaps_labels_lapses, type_embeddings_lapses),
    (type_labels_mistakes, error_noaps_labels_mistakes, type_embeddings_mistakes),
)
for run_index, run_args in enumerate(single_class_noaps_runs):
    if run_index > 0:
        print("\n")
    printConfusion(*run_args)