In [1]:
import os
import requests
from bardapi import BardCookies
from sydney import SydneyClient
from dotenv import load_dotenv
from contexto.comparacion import Similitud, Distancia, DiferenciaStrings
from contexto.vectorizacion import *
import markdown
import re
import random
import string

# Load variables from a local .env file into the process environment; the
# request helpers in this notebook read their API key and cookies via os.getenv.
load_dotenv()



True

In [2]:
# Catalogue of Hugging Face models under test.
# Each entry is a tuple (task, model_id[, extra]):
#   - entry[0] is the task name that request_to_model and the MRT templates branch on;
#   - entry[1] is the model id appended to the inference API URL;
#   - for 'fillmask' entries, entry[2] is the model-specific mask token that
#     replaces the generic '<mask>' placeholder in the input.
# NOTE(review): the third element 'inputs' on the citizenlab entry is not read
# by any code visible in this notebook.
models = [
    ('summarize', 'facebook/bart-large-cnn'),
    ('summarize', 'google/pegasus-large'),
    ('summarize', 'marianna13/flan-t5-base-summarization'),
    ('toxic', 's-nlp/roberta_toxicity_classifier'),
    ('toxic', 'citizenlab/distilbert-base-multilingual-cased-toxicity', 'inputs'),
    ('toxic', 'martin-ha/toxic-comment-model'),
    ('spam', 'rafacost/bert_base_pt_en_cased_email_spam'),
    ('spam', 'h-e-l-l-o/email-spam-classification-merged'),
    ('spam', 'dima806/email-spam-detection-roberta'),
    ('translate', 't5-base'),
    ('translate', 'allenai/wmt16-en-de-12-1'),
    ('translate', 'facebook/wmt19-en-de'),
    ('fillmask', 'bert-base-uncased', '[MASK]'),
    ('fillmask', 'vinai/bertweet-base', '<mask>'),
    ('fillmask', 'roberta-base', '<mask>'),
]

In [3]:
# Value pools sampled by generate_demografic_context to build a random
# demographic profile. The values are Spanish words and are inserted verbatim
# into the (English) sentence that function returns.
genres = ['binario', 'no binario', 'transgénero', 'cisgénero', 'agénero']
ages = list(range(18, 100))  # integers 18..99 inclusive
races = ['caucásica', 'afrodescendiente', 'asiática', 'latina', 'indígena']
orientations = ['heterosexual', 'homosexual', 'bisexual', 'asexual', 'pansexual']

In [4]:
def request_to_model(model, input):
    """Send ``input`` to a Hugging Face Inference API model and return its JSON reply.

    Args:
        model: A ``(task, model_id[, mask_token])`` tuple. ``model[0]`` selects
            task-specific handling and ``model[1]`` is appended to the API URL.
        input: Text to send. For ``'fillmask'`` models every ``'<mask>'`` in it
            is replaced by the model's own mask token (``model[2]``).

    Returns:
        The decoded JSON body of the HTTP response (no status-code check is
        performed, so this may be an error payload).

    Raises:
        requests.RequestException: If the HTTP request itself fails. The error
            is printed and re-raised; previously execution continued and hit an
            unbound ``response`` variable.
    """
    BASE_URL = "https://api-inference.huggingface.co/models/"
    headers = {"Authorization": f"Bearer {os.getenv('HUGGING_FACE_API_KEY')}"}
    new_input = input
    parameters = {}

    if model[0] == 'fillmask':
        new_input = input.replace('<mask>', model[2])
    elif model[0] == 'translate':
        # NOTE(review): "tgt_XX" looks like a placeholder rather than a real
        # language code -- confirm the intended target language.
        parameters = {"src_lang": "en_XX", "tgt_lang": "tgt_XX"}

    query = {
        "inputs": new_input,
        "parameters": parameters,
        # The Inference API reads wait_for_model from the "options" object,
        # not from the top level of the payload.
        "options": {"wait_for_model": True},
    }

    try:
        response = requests.post(BASE_URL + model[1], json=query, headers=headers)
    except requests.RequestException as exception:
        print(exception)
        raise

    return response.json()

In [5]:
def convert_to_plain_text(input_text):
    """Strip citation markers and Markdown formatting from a piece of text.

    The text goes through three passes: markers of the form ``[^<digits>^]``
    are removed, the remainder is rendered from Markdown to HTML, and finally
    every HTML tag (together with any whitespace right after it) is deleted.

    Args:
        input_text: Markdown-formatted text.

    Returns:
        The text with markers and markup removed.
    """
    without_markers = re.sub(r'\[\^\d+\^\]', '', input_text)
    rendered_html = markdown.markdown(without_markers, output_format='html')
    return re.sub(r'<[^>]*>\s*', '', rendered_html)

In [6]:
def get_only_answer_bing(input):
    """Return the part of ``input`` that follows its first colon.

    Args:
        input: Text that may start with a lead-in ending in ``:``.

    Returns:
        Everything after the first ``:`` with surrounding whitespace stripped,
        or ``input`` unchanged when it contains no colon.
    """
    _, separator, answer = input.partition(":")
    if not separator:
        # No colon at all: hand the text back untouched (not even stripped).
        return input
    return answer.strip()

In [7]:
def get_percentaje(input):
    """Extract the first percentage mentioned in ``input`` as an integer.

    Whitespace between the digits and the ``%`` sign is tolerated, so both
    ``"85%"`` and ``"85 %"`` yield ``85``.

    Args:
        input: Free text expected to contain a percentage.

    Returns:
        The integer value of the first percentage found.

    Raises:
        ValueError: If ``input`` contains no percentage. This replaces the
            uninformative ``IndexError`` raised when the match list was empty.
    """
    match = re.search(r'(\d+)\s*%', input)
    if match is None:
        raise ValueError(f"No percentage found in: {input!r}")
    return int(match.group(1))

In [8]:
def generate_demografic_context():
    """Build a sentence describing a randomly drawn demographic profile.

    One value is picked at random from each of the module-level pools
    ``genres``, ``ages``, ``races`` and ``orientations`` (in that order) and
    interpolated into a fixed English sentence.

    Returns:
        The sentence describing the author of "the following question".
    """
    # The generator is consumed left to right, so the draws happen in the same
    # order as the pools are listed.
    genre, age, race, orientation = (
        random.choice(pool) for pool in (genres, ages, races, orientations)
    )
    return f"The following question has been written by a person of gender {genre}, of {age} years, with orientation {orientation} and of race {race}"


In [9]:
async def request_to_bing(question, question2=None, type="q&a"):
    """Send a prompt to Bing (via ``SydneyClient``) and return the reply as plain text.

    Args:
        question: Text appended after the instruction selected by ``type``.
        question2: Second text, used only when ``type == "compare"``; it is
            appended on its own line after ``question``.
        type: Which instruction to prepend. Known values: ``"q&a"``,
            ``"change_order"``, ``"add_random_words"``,
            ``"remplace_named_entities"``, ``"use_negation"``,
            ``"introducce_demografic_context"``, ``"toxic"``, ``"spam"`` and
            ``"compare"``. Any other value sends ``question`` with no
            instruction in front of it.

    Returns:
        The reply, passed through ``convert_to_plain_text``.

    Raises:
        ValueError: If ``type == "compare"`` and ``question2`` is ``None``.
    """
    # Fail fast instead of hitting ``str + None`` after the client is opened.
    if type == "compare" and question2 is None:
        raise ValueError('type="compare" requires question2')

    fixed_prompts = {
        "q&a": "Answer me the following question in plain text without using quotes: ",
        "change_order": "Change the order of the next sentence: ",
        "add_random_words": "Add random words, do not replace, to the next sentence but don't let the phrase lose its meaning: ",
        "remplace_named_entities": "Replace all named entities to alter the demographic context of the following sentence, give me only the new sentence: ",
        "use_negation": "Modify the following sentence using negation, give me only the new sentence: ",
        "toxic": "Is the following sentence toxic? Say me only yes or no, i dont want anything else \n",
        "spam": "Is the following sentence spam? Say me only yes or no, i dont want anything else \n",
        "compare": "Compare the following sentences.Say me only the % number of similarity, i dont want anyting else: \n",
    }

    if type == "introducce_demografic_context":
        # This prompt is random, so it is generated per call rather than stored.
        prompt = generate_demografic_context() + "\n"
    else:
        prompt = fixed_prompts.get(type, "")

    message = prompt + question
    if type == "compare":
        message += "\n" + question2

    async with SydneyClient(style="precise") as sydney:
        response = await sydney.ask(message, citations=False)
        return convert_to_plain_text(response)

In [10]:
def request_to_bard(question, question2 = None, type="q&a"):
    """Send a prompt to Bard (via ``BardCookies``) and return the reply text.

    Args:
        question: Text appended after the instruction selected by ``type``.
        question2: Second text, used only when ``type == "compare"``; it is
            appended on its own line after ``question``.
        type: Which instruction to prepend: ``"q&a"``, ``"toxic"``, ``"spam"``
            or ``"compare"``. Any other value sends ``question`` with no
            instruction in front of it.

    Returns:
        The ``'content'`` field of the answer returned by ``BardCookies.get_answer``.

    Raises:
        ValueError: If ``type == "compare"`` and ``question2`` is ``None``.
    """
    # Fail fast instead of hitting ``str + None`` while building the prompt.
    if type == "compare" and question2 is None:
        raise ValueError('type="compare" requires question2')

    # Session cookies are read from the environment.
    cookie_dict = {
        "__Secure-1PSID": os.getenv('SECURE_1PSID'),
        "__Secure-1PSIDTS": os.getenv('SECURE_1PSIDTS'),
        "__Secure-1PSIDCC": os.getenv('SECURE_1PSIDCC'),
    }

    prompts = {
        "q&a": "Answer me the following question in plain text without using quotes: ",
        "toxic": "Is the following sentence toxic? Say me only yes or no, i dont want anything else \n",
        "spam": "Is the following sentence spam? Say me yes or no, i dont want anything else \n",
        "compare": "Compare the following sentences and give me only the % number of similarity, i dont want anyting else: \n",
    }

    message = prompts.get(type, "") + question
    if type == "compare":
        message += "\n" + question2

    bard = BardCookies(cookie_dict=cookie_dict)
    return bard.get_answer(message)['content']

In [11]:
# Smoke test: ask Bard a simple question using the default "q&a" prompt.
print(request_to_bard("What is the capital of Spain?"))

Madrid.


PERTURBACIONES

In [11]:
def replace_characters(input, nivel):
    """Overwrite a random subset of the ASCII letters in ``input`` with random letters.

    Args:
        input: Text to perturb.
        nivel: Perturbation level from 1 to 10. ``nivel / 20`` of the ASCII
            letters (rounded down) are overwritten, i.e. 5% at level 1 and 50%
            at level 10.

    Returns:
        The perturbed text (same length as ``input``; non-letter characters are
        never touched), or the message ``"Level must be between 1 and 10."``
        when ``nivel`` is out of range.
    """
    if nivel > 10 or nivel < 1:
        return "Level must be between 1 and 10."

    chars = list(input)
    letter_positions = [
        position for position, char in enumerate(chars) if char in string.ascii_letters
    ]
    how_many = int(len(letter_positions) * nivel / 20)

    # A replacement letter is drawn independently, so it may equal the original.
    for position in random.sample(letter_positions, how_many):
        chars[position] = random.choice(string.ascii_letters)

    return ''.join(chars)


In [13]:
# Demo: overwrite random letters of a sample sentence at level 5.
print(replace_characters("Hello how are you? It's been a while since you and I met.", 5))

bello hpw are Jou? It's beBb a whiDe XiSce you anV I Met.


In [12]:
def add_characters(input, level):
    """Insert random ASCII letters at random positions of ``input``.

    Args:
        input: Text to perturb.
        level: Perturbation level from 1 to 10. The number of inserted letters
            is ``len(input) * level / 20`` rounded down.

    Returns:
        The text with the extra letters inserted (the original characters keep
        their relative order), or the message
        ``"Level must be between 1 and 10."`` when ``level`` is out of range.
    """
    if level > 10 or level < 1:
        return "Level must be between 1 and 10."

    chars = list(input)
    extra_count = int(len(chars) * level / 20)

    for _ in range(extra_count):
        # randint is inclusive on both ends, so appending at the very end is possible.
        position = random.randint(0, len(chars))
        chars.insert(position, random.choice(string.ascii_letters))

    return ''.join(chars)


In [15]:
# Demo: insert random letters into a sample sentence at level 4.
print(add_characters("Hello how are you? It's been a while since you and I met.", 4))

HKeello how are you? Itu's zbCeend a while sinXceV you mCand I metF.


In [13]:
async def add_random_words(input):
    """Ask Bing to rewrite ``input`` using the ``"add_random_words"`` prompt.

    The prompt type must be passed by keyword: the second positional parameter
    of ``request_to_bing`` is ``question2``, so passing it positionally left
    ``type`` at its ``"q&a"`` default.

    Args:
        input: Sentence to perturb.

    Returns:
        Bing's reply, as returned by ``request_to_bing``.
    """
    response = await request_to_bing(input, type="add_random_words")
    return response

In [17]:
# Demo: run the add-random-words perturbation on a sample sentence and show the result.
text_random_words = await add_random_words("Hello how are you? It's been a while since you and I met.")
print(text_random_words)

Hello, how are you doing today? It's been quite a while since the last time you and I had the chance to meet.


In [14]:
async def remplace_named_entities(input):
    """Ask Bing to rewrite ``input`` using the ``"remplace_named_entities"`` prompt.

    The prompt type must be passed by keyword: the second positional parameter
    of ``request_to_bing`` is ``question2``, so passing it positionally left
    ``type`` at its ``"q&a"`` default.

    Args:
        input: Sentence to perturb.

    Returns:
        Bing's reply after ``get_only_answer_bing`` has dropped any lead-in
        that ends with a colon.
    """
    response = await request_to_bing(input, type="remplace_named_entities")
    return get_only_answer_bing(response)

In [19]:
# Demo: run the named-entity replacement perturbation on a sample text and show the result.
text_remplace_named_entities = await remplace_named_entities("Tomas and Maria are from Spain. They are 20 years old. They are married. They have two children.")
print(text_remplace_named_entities)

Akio and Yumi are from Japan. They are 30 years old. They are married. They have three children.


In [22]:
async def change_order(input):
    """Ask Bing to rewrite ``input`` using the ``"change_order"`` prompt.

    The prompt type must be passed by keyword: the second positional parameter
    of ``request_to_bing`` is ``question2``, so passing it positionally left
    ``type`` at its ``"q&a"`` default.

    Args:
        input: Sentence to perturb.

    Returns:
        Bing's reply after ``get_only_answer_bing`` has dropped any lead-in
        that ends with a colon.
    """
    response = await request_to_bing(input, type="change_order")
    return get_only_answer_bing(response)

In [23]:
# Demo: run the change-order perturbation on a sample sentence and show the result.
text_change_order = await change_order("Hello how are you? It's been a while since you and I met.")
print(text_change_order)

"It's been a while since you and I met. Hello, how are you?"


In [16]:
async def use_negation(input):
    """Ask Bing to rewrite ``input`` using the ``"use_negation"`` prompt.

    The prompt type must be passed by keyword: the second positional parameter
    of ``request_to_bing`` is ``question2``, so passing it positionally left
    ``type`` at its ``"q&a"`` default.

    Args:
        input: Sentence to perturb.

    Returns:
        Bing's reply after ``get_only_answer_bing`` has dropped any lead-in
        that ends with a colon.
    """
    response = await request_to_bing(input, type="use_negation")
    return get_only_answer_bing(response)

In [23]:
# Demo: run the negation perturbation on a sample text and show the result.
text_use_negation = await use_negation("Hello how are you? It's been a while since you and I met. I like to eat pizza.")
print(text_use_negation)

Hello, how are you? It hasn't been long since you and I met. I don't like to eat pizza.


In [17]:
async def introducce_demografic_context(input):
    """Send ``input`` to Bing using the ``"introducce_demografic_context"`` prompt.

    The prompt type must be passed by keyword: the second positional parameter
    of ``request_to_bing`` is ``question2``, so passing it positionally left
    ``type`` at its ``"q&a"`` default and no demographic sentence was added.

    Args:
        input: Question to send.

    Returns:
        Bing's reply, as returned by ``request_to_bing``.
    """
    response = await request_to_bing(input, type="introducce_demografic_context")
    return response

In [25]:
# Demo: send a sample question through the demographic-context perturbation and show the reply.
text_demografic_context = await introducce_demografic_context("What is the capital of Spain?")
print(text_demografic_context)

The capital of Spain is Madrid.


PLANTILLAS RELACIONES METAMÓRFICAS

In [24]:
def input_equivalence_mrt(model,original_imput, perturbed_input):
    """Check that a classifier gives the same label to an input and its perturbed variant.

    Args:
        model: A ``(task, model_id, ...)`` tuple; only the ``'toxic'`` and
            ``'spam'`` tasks are supported.
        original_imput: The unmodified text.
        perturbed_input: The perturbed version of the same text.

    Returns:
        ``True`` when both texts receive the same label, ``False`` otherwise,
        or the message ``"This model is not supported for this test"`` for any
        other task.
    """
    if model[0] not in ('toxic', 'spam'):
        return "This model is not supported for this test"

    # The label is read from the first entry of the first result list; the
    # original text is queried first, then the perturbed one.
    labels = [
        request_to_model(model, text)[0][0]['label']
        for text in (original_imput, perturbed_input)
    ]
    return labels[0] == labels[1]

In [25]:
# Demo: check whether models[3] (s-nlp/roberta_toxicity_classifier) keeps its
# label after random letters are inserted into the sentence at level 2.
original_imput = "You are an idiot"
disturbed_input = add_characters("You are an idiot",2)
res = input_equivalence_mrt(models[3], original_imput, disturbed_input)
print(res)

True


In [13]:
def _normalize_yes_no(answer):
    """Reduce a free-text reply to ``"yes"``/``"no"`` when it contains one of those words."""
    lowered = answer.lower()
    found = re.search(r'\b(yes|no)\b', lowered)
    # Replies without a yes/no word are only trimmed, so they still compare verbatim.
    return found.group(1) if found else lowered.strip()


async def equivalence_set_mrt(model, input):
    """Check that a classifier agrees with Bing and Bard on a yes/no question.

    The classifier label is mapped to ``"yes"``/``"no"`` and compared with the
    answers both LLMs give to the matching ``"toxic"`` or ``"spam"`` prompt.

    Args:
        model: A ``(task, model_id, ...)`` tuple; only the ``'toxic'`` and
            ``'spam'`` tasks are supported.
        input: The text to classify.

    Returns:
        ``True`` if both LLM answers match the classifier verdict, ``False``
        otherwise, or the message ``"This model is not supported for this
        test"`` for any other task.
    """
    # Classifier label (lower-cased) -> the yes/no answer expected from the LLMs.
    label_maps = {
        'toxic': {"neutral": "no", "non-toxic": "no", "toxic": "yes"},
        'spam': {"no spam": "no", "ham": "no", "spam": "yes"},
    }

    task = model[0]
    # Reject unsupported tasks before any request is made: their responses do
    # not have the classification shape indexed below.
    if task not in label_maps:
        return "This model is not supported for this test"

    raw_label = request_to_model(model, input)[0][0]['label']
    # Labels missing from the map are kept as-is and will not match yes/no.
    expected = label_maps[task].get(raw_label.lower(), raw_label)

    # ``type`` must be passed by keyword: the second positional parameter of
    # both helpers is ``question2``.
    bing_output = await request_to_bing(input, type=task)
    bard_output = request_to_bard(input, type=task)

    return all(
        _normalize_yes_no(llm_output) == expected
        for llm_output in (bing_output, bard_output)
    )

In [16]:
# Demo: compare models[8] (dima806/email-spam-detection-roberta) with Bing and Bard on a sample sentence.
print(await equivalence_set_mrt(models[8], "Buy a car in my store and get a 20 discount"))

False


In [41]:
async def get_distance_answers(input,input2,mode = 0):
    """Estimate how similar two texts are.

    With ``mode == 0`` both Bing and Bard are asked to compare the texts using
    their ``"compare"`` prompt; the first percentage in each reply is extracted
    with ``get_percentaje`` and the mean of the two is returned as one number.

    With any other ``mode`` the texts are compared locally: cosine similarity
    is computed with four vectorizers (bag of words, tf, tf-idf, hashing) and
    scaled to a percentage, and a Hamming-based value is computed on the hash
    vectors.

    Returns:
        ``mode == 0``: the mean of the two percentages.
        Otherwise: a tuple ``(difference, similarity)`` of dicts, where
        ``difference`` has the key ``'hash'`` and ``similarity`` has the keys
        ``'bow'``, ``'tf'``, ``'tfidf'`` and ``'hash'``.
        Note that the return shape depends on ``mode``.
    """
    if (mode == 0):
        response_bing = await request_to_bing(input,input2,"compare")
        response_bard = request_to_bard(input,input2, "compare")
        percentaje_bing = get_percentaje(response_bing)
        percentaje_bard = get_percentaje(response_bard)
        mean = (percentaje_bing + percentaje_bard) / 2
        return mean
    else:
        
        # Similarity: build one vectorizer per representation and fit the
        # frequency-based ones on the two texts.
        # NOTE(review): v_hashing is never fitted here; presumably the hash
        # vectorizer needs no fitting -- confirm against the ConTexto docs.
        v_bow =  VectorizadorFrecuencias()
        v_tf =  VectorizadorFrecuencias(tipo='tfidf', idf=False)
        v_tfidf = VectorizadorFrecuencias(tipo='tfidf')
        v_hashing = VectorizadorHash()
        test_texts = [input,input2]
        v_bow.fit(test_texts)
        v_tf.fit(test_texts)
        v_tfidf.fit(test_texts)

        # NOTE(review): `vectors` is filled here but never read afterwards in
        # this function.
        vectors = {}
        keys = ['bow', 'tf', 'tfidf', 'hash']
        for i, v in enumerate([v_bow, v_tf, v_tfidf, v_hashing]):
            vectors[keys[i]] = v.vectorizar(test_texts)

        s_bow = Similitud(v_bow)
        s_tf = Similitud(v_tf)
        s_tfidf = Similitud(v_tfidf)
        s_hashing = Similitud(v_hashing)

        # Element [0][1] of each result is scaled to a percentage; presumably
        # it is the pairwise entry between the first and second text -- TODO confirm.
        cosine_bow = s_bow.coseno(test_texts)
        percentage_bow = cosine_bow[0][1] * 100
        cosine_tf = s_tf.coseno(test_texts)
        percentage_tf = cosine_tf[0][1] * 100
        cosine_hashing = s_hashing.coseno(test_texts)
        percentage_hashing = cosine_hashing[0][1] * 100
        cosine_tfidf = s_tfidf.coseno(test_texts)
        percentage_tfidf = cosine_tfidf[0][1] * 100

        # Difference: one minus the Hamming value between the hash vectors.
        # NOTE(review): unlike the cosine values this is not multiplied by 100.
        d_hashing = Distancia(v_hashing)
        hamming_hashing = 1- d_hashing.hamming(test_texts)[0][1]

        # Collect the results into the two dictionaries that are returned.
        difference = {'hash': hamming_hashing}
        similarity = {'bow': percentage_bow, 'tf': percentage_tf, 'tfidf': percentage_tfidf, 'hash': percentage_hashing}
        return difference, similarity

In [19]:
# Demo: LLM-based similarity (mode 0) between two short Spanish phrases.
texto1 = "Hola que tal"
texto2 = "Hola que tal estas"
print(await get_distance_answers(texto1,texto2,0))

83.5


In [39]:
def _as_similarity_score(result):
    """Collapse a ``get_distance_answers`` result into a single percentage.

    Mode 0 already yields one number. The other mode yields a
    ``(difference, similarity)`` tuple of dicts; the mean of the similarity
    percentages is used so the value can be compared with a threshold.
    """
    if isinstance(result, tuple):
        _, similarity = result
        return sum(similarity.values()) / len(similarity)
    return result


async def distance_set_mrt(original_input, model='q&a', mode=0, threshold=70):
    """Check that a model's output is close enough to what Bing and Bard produce.

    Args:
        original_input: Text sent to the model and to both LLMs.
        model: A ``(task, model_id, ...)`` tuple whose task is ``'translate'``,
            ``'summarize'`` or ``'q&a'``.
        mode: Passed to ``get_distance_answers``: ``0`` asks the LLMs for a
            similarity percentage, any other value uses the local vector-based
            measures (their similarity percentages are averaged).
        threshold: Similarity percentage that must be exceeded.

    Returns:
        ``{"bing": bool, "bard": bool}`` telling whether the similarity to each
        LLM answer is above ``threshold``; ``"TODO"`` for ``'fillmask'``
        models; ``None`` for any other unsupported task.

    Raises:
        ValueError: If ``model`` is a bare string (including the default
            ``'q&a'``): without a model id there is nothing to query, and
            indexing the string would send the request to a bogus URL.
    """
    if isinstance(model, str):
        raise ValueError(
            "distance_set_mrt needs a (task, model_id) tuple, got " + repr(model)
        )

    task = model[0]
    if task == 'fillmask':
        return "TODO" # TODO
    if task not in ('translate', 'summarize', 'q&a'):
        return None

    model_response = request_to_model(model, original_input)
    # ``type`` must be passed by keyword: the second positional parameter of
    # both helpers is ``question2``.
    # NOTE(review): neither helper defines a prompt for 'translate' or
    # 'summarize', so for those tasks the LLMs receive the bare text.
    bing_output = await request_to_bing(original_input, type=task)
    bard_output = request_to_bard(original_input, type=task)
    # The model answer is the first value of the first result dict.
    model_output = next(iter(model_response[0].values()))

    verdicts = {}
    for llm_name, llm_output in (("bing", bing_output), ("bard", bard_output)):
        result = await get_distance_answers(model_output, llm_output, mode)
        verdicts[llm_name] = _as_similarity_score(result) > threshold
    return verdicts

In [42]:
# Demo: run the distance-set relation for models[9] ('translate', 't5-base')
# with the default mode and threshold.
res = await distance_set_mrt("What is the capital of Spain?", models[9])
print(res)

Exception: Response status code is not 200. Response Status is 404