In [1]:
import pandas as pd
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings
import numpy as np
import regex #dev
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.meteor_score import meteor_score
from langchain_community.embeddings import GPT4AllEmbeddings


In [2]:

def score_bleu(pred: "Iterable[str]", y: "Iterable[str]", type=2):
    """Average character-level BLEU over paired texts.

    The two collections are walked in lockstep with ``zip`` (extra items in
    the longer one are ignored). Every string is split into single characters
    with ``list(...)``, so the n-grams are character n-grams, not word n-grams.
    For each pair the element of ``pred`` is passed to ``sentence_bleu`` as the
    only reference and the element of ``y`` as the hypothesis.

    :param pred: collection of strings used as references.
    :param y: collection of strings used as hypotheses.
    :param type: highest n-gram order to score; must be 2, 3 or 4.
    :return: ``np.average`` of the per-pair BLEU scores (``nan`` when there
        are no pairs).
    :raises ValueError: if ``type`` is not one of the supported orders.
    """
    # NOTE: the order-3 weights are 0.33 each (sum 0.99) rather than 1/3; they
    # are kept as-is so scores stay comparable with previously stored thresholds.
    weights_by_order = {
        2: [0.5, 0.5],
        3: [0.33, 0.33, 0.33],
        4: [0.25, 0.25, 0.25, 0.25],
    }
    if type not in weights_by_order:
        # Previously an unsupported order left `weights` unbound and the call
        # died with an UnboundLocalError inside the loop.
        raise ValueError(
            f"Unsupported BLEU order {type!r}; expected one of {sorted(weights_by_order)}"
        )
    weights = weights_by_order[type]

    scores = [
        sentence_bleu([list(xe)], list(ye), weights)
        for xe, ye in zip(pred, y)
    ]
    return np.average(scores)
    
def _embedding_cosine_similarity(x:str, y:str, embedding_function) -> float:
    """Cosine similarity between the embeddings of two texts.

    Both texts are embedded with ``embedding_function.embed_query`` (``x``
    first, then ``y``) and the two vectors are handed to scikit-learn's
    ``cosine_similarity`` as single-row matrices.

    :param x: first text.
    :param y: second text.
    :param embedding_function: object exposing ``embed_query(text)``.
    :return: whatever ``cosine_similarity`` returns for two single-row inputs;
        NOTE(review): that is a pairwise matrix rather than a bare float,
        despite the annotation.
    """
    vec_x, vec_y = (embedding_function.embed_query(text) for text in (x, y))
    return cosine_similarity([vec_x], [vec_y])

def embedding_cosine_similarity(x:str, y:str, embedding_function) -> float:
    """Mean embedding cosine similarity over paired texts.

    ``x`` and ``y`` are iterated together with ``zip``; each pair is scored by
    ``_embedding_cosine_similarity`` and the scores are averaged.

    :param x: collection of texts (first side of each pair).
    :param y: collection of texts (second side of each pair).
    :param embedding_function: forwarded unchanged to the per-pair helper.
    :return: ``np.average`` of the per-pair similarities.
    """
    pair_scores = [
        _embedding_cosine_similarity(left, right, embedding_function)
        for left, right in zip(x, y)
    ]
    return np.average(pair_scores)

def score_meteor(pred:str, y:str) -> float:
    """Mean METEOR score over paired texts.

    The collections are zipped; each text is tokenised with ``str.split()``.
    The element of ``pred`` is passed to ``meteor_score`` as the single
    reference and the element of ``y`` as the hypothesis.

    :param pred: collection of strings used as references.
    :param y: collection of strings used as hypotheses.
    :return: ``np.average`` of the per-pair METEOR scores.
    """
    scores = [
        meteor_score([reference.split()], hypothesis.split())
        for reference, hypothesis in zip(pred, y)
    ]
    return np.average(scores)

In [3]:
from math import ceil
import random


def calculate_threasholds(comp_samples:np.ndarray, coverage_procentage:float=0.1, embedding_function=None, info:str=""):
    """Estimate baseline similarity scores between random pairs of samples.

    ``ceil(len(comp_samples) * coverage_procentage)`` random index pairs are
    drawn (with replacement) and the paired samples are scored with BLEU-2/3/4,
    METEOR and embedding cosine similarity. Progress is printed after each
    metric.

    :param comp_samples: array of texts; it is indexed with lists of integer
        positions, so it must support that (e.g. a numpy array).
    :param coverage_procentage: number of comparisons as a fraction of the
        number of samples (values above 1 are allowed).
    :param embedding_function: object exposing ``embed_query``; when ``None``
        a ``GPT4AllEmbeddings()`` instance is created for this call.
    :param info: free-form label stored in the result under ``"info"``.
    :return: dict with keys ``bleu_2``, ``bleu_3``, ``bleu_4``, ``meteor``,
        ``cosine_sim`` and ``info``.
    """
    # Built lazily: a `GPT4AllEmbeddings()` default argument is evaluated once,
    # when the `def` runs, which loads the model even if the caller passes
    # their own embedding function.
    if embedding_function is None:
        embedding_function = GPT4AllEmbeddings()

    samples_c = len(comp_samples)

    c_range = ceil(samples_c*coverage_procentage)
    print(f"Making {c_range} comparisons...")#dev
    # randrange's stop is exclusive, so `samples_c` (not `samples_c-1`) is the
    # bound that lets the last sample be drawn and works for a single sample.
    comp_ids_1 = [random.randrange(samples_c) for _ in range(c_range)]
    comp_ids_2 = [random.randrange(samples_c) for _ in range(c_range)]

    # Materialise both sides once instead of re-indexing for every metric.
    left = comp_samples[comp_ids_1]
    right = comp_samples[comp_ids_2]

    bleu_2 = score_bleu(left, right, 2)
    print("bleu_2")
    bleu_3 = score_bleu(left, right, 3)
    print("bleu_3")
    bleu_4 = score_bleu(left, right, 4)
    print("bleu_4")
    meteor = score_meteor(left, right)
    print("meteor")
    cosine_sim = embedding_cosine_similarity(left, right, embedding_function)
    print("cosine_sim")

    adder_metrics_threasholds = {"bleu_2":bleu_2, "bleu_3":bleu_3, "bleu_4":bleu_4, "meteor":meteor, "cosine_sim":cosine_sim, "info":info}

    return adder_metrics_threasholds

bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


In [4]:
# Base name (without extension) of the CSV holding the reference examples.
DATABASE_FILE = "_cbr_database"

# Load the reference table and keep its `steps` column as a numpy array;
# `samples` is the corpus that generated texts are compared against.
df = pd.read_csv(f"{DATABASE_FILE}.csv")
samples = df["steps"].to_numpy()


In [5]:
# calculate_threasholds(samples, coverage_procentage=3, info="3 coverage")

In [6]:
# Threshold sets recorded from earlier `calculate_threasholds` runs.
# Both are kept under their own names; previously the first dict was assigned
# to `metrics_threasholds` and immediately overwritten by the second.

# 1000 (presumably the number of comparisons of that run — TODO confirm)
METRICS_THREASHOLDS_1000 = {'bleu_2': 0.4854855502611209,
 'bleu_3': 0.3674074412576021,
 'bleu_4': 0.2722650963284384,
 'meteor': 0.1246965969044918,
 'cosine_sim': 0.42583562424830085,
 'info': ''}

# 30 k (the run labelled "3 coverage")
METRICS_THREASHOLDS_30K = {'bleu_2': 0.48610981609361903,
 'bleu_3': 0.36762298708981883,
 'bleu_4': 0.2725605103366232,
 'meteor': 0.12422512973849278,
 'cosine_sim': 0.4289302823367131,
 'info': '3 coverage'}

# Thresholds actually used by the validation cells below.
metrics_threasholds = METRICS_THREASHOLDS_30K

In [7]:

def _validate_example(text:str, db_examples:np.ndarray, adder_metrics_threasholds, ic:int, ez:float, embedding_function) -> bool:
    """Check one text against a random subset of the reference examples.

    ``ic`` reference examples are drawn at random (with replacement) and the
    text is scored against them with BLEU-2, BLEU-3, BLEU-4, METEOR and
    embedding cosine similarity, in that order. Each average score must reach
    the matching threshold multiplied by ``ez``; the first metric that falls
    short ends the check, so later (more expensive) metrics are skipped.

    :param text: candidate text.
    :param db_examples: reference texts; indexed with a list of integer
        positions, so it must support that (the caller passes a numpy array).
    :param adder_metrics_threasholds: mapping with the keys ``bleu_2``,
        ``bleu_3``, ``bleu_4``, ``meteor`` and ``cosine_sim``.
    :param ic: number of random reference examples to compare against.
    :param ez: multiplier applied to every threshold before comparing.
    :param embedding_function: object exposing ``embed_query``.
    :return: ``True`` if every metric reaches its scaled threshold.
    """
    input_texts = [text] * ic

    samples_c = len(db_examples)
    # randrange's stop is exclusive: `samples_c-1` never drew the last example
    # and raised for a single-example database.
    comp_ids = [random.randrange(samples_c) for _ in range(ic)]
    # Index once; every metric is scored on the same drawn examples.
    drawn_examples = db_examples[comp_ids]

    # Ordered cheapest-first; the lambdas keep each metric lazy so a failure
    # short-circuits before the remaining metrics are computed.
    checks = (
        ("bleu_2", lambda: score_bleu(drawn_examples, input_texts, 2)),
        ("bleu_3", lambda: score_bleu(drawn_examples, input_texts, 3)),
        ("bleu_4", lambda: score_bleu(drawn_examples, input_texts, 4)),
        ("meteor", lambda: score_meteor(drawn_examples, input_texts)),
        ("cosine_sim", lambda: embedding_cosine_similarity(drawn_examples, input_texts, embedding_function)),
    )
    for metric_name, compute_score in checks:
        if compute_score() < adder_metrics_threasholds[metric_name]*ez:
            return False

    return True
    

def validate_examples(texts, db_examples:np.ndarray, adder_metrics_threasholds, ic:int=100, ez=0.9, embedding_function=None, attempts:int=4):
    """Build a keep/reject mask for candidate texts.

    Every text is checked with ``_validate_example``, which compares it with a
    random draw of reference examples. Because the draw is random, a text gets
    up to ``attempts`` independent tries and is accepted as soon as one passes.

    :param texts: iterable of candidate texts.
    :param db_examples: reference texts, forwarded to ``_validate_example``.
    :param adder_metrics_threasholds: per-metric thresholds, forwarded as-is.
    :param ic: number of random reference examples per try.
    :param ez: multiplier applied to every threshold.
    :param embedding_function: object exposing ``embed_query``; when ``None``
        a ``GPT4AllEmbeddings()`` instance is created for this call.
    :param attempts: maximum number of tries per text (default 4, matching the
        previous hard-coded retry depth).
    :return: list of booleans, one per text, in input order.
    """
    # Built lazily: a `GPT4AllEmbeddings()` default argument is evaluated when
    # the `def` runs, loading the model even when the caller supplies one.
    if embedding_function is None:
        embedding_function = GPT4AllEmbeddings()

    res_mask = []
    for txt in texts:
        # any() stops at the first passing try, like the former nested ifs.
        accepted = any(
            _validate_example(
                txt,
                db_examples,
                adder_metrics_threasholds=adder_metrics_threasholds,
                ic=ic,
                ez=ez,
                embedding_function=embedding_function,
            )
            for _ in range(attempts)
        )
        res_mask.append(accepted)

    return res_mask

bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


In [8]:
# Single embedding model instance, passed explicitly to the validation calls.
embedding_function = GPT4AllEmbeddings()

bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


In [9]:
from collections import Counter

# Base name (without extension) of the generated-results CSV to filter.
DFF = "cbr_augmentation_12_gpt2_results_p_ix"

# Load the generated rows; the texts to validate are in the `steps` column.
gen_1 = pd.read_csv(f"{DFF}.csv")
gen_comp = gen_1
comp = gen_comp["steps"].to_numpy()

# One boolean per row: True when the text passed validation.
res = validate_examples(
    comp,
    samples,
    metrics_threasholds,
    ic=10,
    ez=0.65,
    embedding_function=embedding_function,
)
nres = [not accepted for accepted in res]

print(Counter(res))
# Accepted rows go to "<DFF>_f.csv", rejected rows to "<DFF>_nf.csv".
gen_comp[res].to_csv(f"{DFF}_f.csv", index=False)
gen_comp[nres].to_csv(f"{DFF}_nf.csv", index=False)


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Counter({True: 8866, False: 1760})


In [None]:
# Counter({True: 8866, False: 1760})

In [10]:
# Deliberate stop: a bare `raise` outside an `except` block fails with
# "RuntimeError: No active exception to reraise", which halts a "Run All"
# here so the cells below are only executed manually.
raise

RuntimeError: No active exception to reraise

In [None]:
from collections import Counter

# Base name (without extension) of the generated-results CSV to filter.
DFF = "cbr_augmentation_1_gpt2_results_p"

# Load the generated rows; the texts to validate are in the `result` column.
gen_1 = pd.read_csv(f"{DFF}.csv")
gen_comp = gen_1
comp = gen_comp["result"].to_numpy()

# One boolean per row: True when the text passed validation.
res = validate_examples(
    comp,
    samples,
    metrics_threasholds,
    ic=10,
    ez=0.65,
    embedding_function=embedding_function,
)
nres = [not accepted for accepted in res]

print(Counter(res))
# Only the accepted rows are written out for this file.
gen_comp[res].to_csv(f"{DFF}_f.csv", index=False)


In [None]:
from collections import Counter

# Base name (without extension) of the generated-results CSV to filter.
DFF = "cbr_augmentation_2_gpt2_results_p"

# Load the generated rows; the texts to validate are in the `result` column.
gen_1 = pd.read_csv(f"{DFF}.csv")
gen_comp = gen_1
comp = gen_comp["result"].to_numpy()

# One boolean per row: True when the text passed validation.
res2 = validate_examples(
    comp,
    samples,
    metrics_threasholds,
    ic=10,
    ez=0.65,
    embedding_function=embedding_function,
)
nres2 = [not accepted for accepted in res2]

print(Counter(res2))
# Only the accepted rows are written out for this file.
gen_comp[res2].to_csv(f"{DFF}_f.csv", index=False)
