In [31]:
import torch
from transformers import AutoTokenizer, RobertaForMaskedLM, RobertaForTokenClassification, MT5ForConditionalGeneration
from Levenshtein import distance as levenshtein_distance
from sklearn.metrics.pairwise import cosine_similarity
import re
import json

In [32]:
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', model_max_length=128)
custom_tokens = [f"<extra_id_{i}>" for i in range(100)]
tokenizer.add_tokens(custom_tokens)

model_masked = RobertaForMaskedLM.from_pretrained('xlm-roberta-large')

mt5_model_directory = "models/mt5-base-finetuned-unmask/checkpoint-28000"
mt5_model= MT5ForConditionalGeneration.from_pretrained(mt5_model_directory)
mt5_tokenizer = AutoTokenizer.from_pretrained(mt5_model_directory)

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


In [33]:
add_model= RobertaForTokenClassification.from_pretrained("models/add-roberta", local_files_only=True)
mod_model= RobertaForTokenClassification.from_pretrained("models/delete-modify-roberta", local_files_only=True)

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


In [65]:
add_model= RobertaForTokenClassification.from_pretrained("models/add-roberta-100k", local_files_only=True)
mod_model= RobertaForTokenClassification.from_pretrained("models/delete-modify-roberta-100k", local_files_only=True)

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


In [66]:
print(tokenizer.decode(250002))

<extra_id_0>


In [67]:
print(tokenizer("Salut ce faci? Cum o mai duci"))
print(tokenizer("Salut faci tu? Cum o mai druci"))

{'input_ids': [0, 48721, 405, 10965, 32, 7140, 36, 409, 115, 318, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [0, 48721, 10965, 370, 32, 7140, 36, 409, 26223, 318, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [26]:
tokens1 = [0, 48721, 405, 10965, 32, 7140, 36, 409, 115, 318, 2]

In [27]:
tokens2 = [0, 48721, 10965, 370, 32, 7140, 36, 409, 26223, 318, 2]

In [30]:
for token in tokens1:
    print(str(token) + '=' + tokenizer.decode(token), end =" ")

for token in tokens2:
    print(str(token) + '=' + tokenizer.decode(token), end =" ")

print("")

for token in tokens1:
    print(tokenizer.decode(token), end =" ")

for token in tokens2:
    print(tokenizer.decode(token), end =" ")

0=<s> 48721=Salut 405=ce 10965=faci 32=? 7140=Cum 36=o 409=mai 115=du 318=ci 2=</s> 0=<s> 48721=Salut 10965=faci 370=tu 32=? 7140=Cum 36=o 409=mai 26223=dru 318=ci 2=</s> 
<s> Salut ce faci ? Cum o mai du ci </s> <s> Salut faci tu ? Cum o mai dru ci </s> 

In [45]:
def generate_del_mod(inputs):
    with torch.no_grad():
        logits = mod_model(**inputs).logits

    predicted_token_class_ids = logits.argmax(-1)
    return predicted_token_class_ids[0].numpy().tolist()

def generate_add(inputs):
    with torch.no_grad():
        logits = add_model(**inputs).logits

    predicted_token_class_ids = logits.argmax(-1)
    return predicted_token_class_ids[0].numpy().tolist()

def generate_mt5(input_text):
  input_ids = mt5_tokenizer(input_text, return_tensors="pt").input_ids.to("cpu")
  output = mt5_model.generate(input_ids, max_length=200)
  return mt5_tokenizer.decode(output[0], skip_special_tokens=True)


def procentage_similarity(Q, Mi):
  levDis = levenshtein_distance(Q, Mi)
  bigger = max(len(Q), len(Mi))
  return (bigger - levDis) / bigger

def combine_add(original, to_replace, nr_of_tokens):
    for i in range(nr_of_tokens):
        result = re.search(custom_tokens[i]+'(.*)'+custom_tokens[i+1], to_replace)
        original = original.replace(custom_tokens[i], result.group(1))
    
    return original

def correct_mod(inputs):
    outputs = model_masked(**inputs)
    predictions = outputs[0]
    _, sorted_idx = predictions[0].sort(dim=-1, descending=True)
    predicted_index = [sorted_idx[i, 0].item() for i in range(0,len(sorted_idx))]
    return predicted_index


def score(input_ids):
    """Calculate the perplexity of a batch of tokenized sentences."""
    with torch.no_grad():
        loss = model_masked(input_ids, labels=input_ids).loss
    return torch.exp(loss).tolist()  # Perplexity is the exponential of the loss

def calculate_similarity(embedding1, embedding2):
    return cosine_similarity([embedding1], [embedding2])[0][0]

def correct(text, fill_mask, mode):
    inputs = tokenizer(text, return_tensors="pt")
    if len(inputs['input_ids'] > 128):
        pass

    out_mod = generate_del_mod(inputs)
    out_add = generate_add(inputs)

    #if 1 in out_add:
    #    print("WE HAVE ONE")

    mod_i = 0
    to_mod = inputs
    to_mod = to_mod['input_ids'][0]

    while mod_i != len(out_mod):
        if out_mod[mod_i] == 2:
            to_mod = torch.cat((to_mod[:mod_i], to_mod[mod_i+1:]))
            out_mod.pop(mod_i)

            if mode == 'mt5':
                add = out_add[mod_i]
                out_add.pop(mod_i)
                if add != 0:
                    #print("WE HAVE ONE")
                    #print(text)
                    if mod_i != len(out_add):
                        out_add[mod_i] += add
                    else:
                        out_add[mod_i - 1] += add
            continue
        mod_i += 1

    print("OUT MOD")
    print(out_mod)

    print("OUT ADD")
    print(out_add)

    print(len(out_mod))
    print(len(to_mod))


    mod_i = 0

    while mod_i != len(out_mod):

        if out_mod[mod_i] == 1:
            t2 = torch.tensor([250001])
            to_mod = torch.cat((to_mod[:mod_i], t2, to_mod[mod_i+1:]))

        mod_i += 1
    to_mod_inter = inputs
    to_mod_inter['input_ids'] = to_mod.unsqueeze(0)
    to_mod_inter['attention_mask'] = torch.ones(to_mod.size()).unsqueeze(0)
    res = correct_mod(to_mod_inter)
    if mode == 'mt5':
        appearences = 0
        for i in range(len(out_add)):
            if out_add[i] == 1:
                extra = 250002 + appearences
                res.insert(i, extra)
                appearences +=1
        to_mt5 = tokenizer.decode(res[1:-1])
        print(to_mt5)
        out_mt5 = generate_mt5(to_mt5)
        to_mt5 = combine_add(to_mt5, out_mt5, appearences)
        to_mt5 = to_mt5.strip() 
        return to_mt5 + '\n'
    else:
         return tokenizer.decode(res[1:-1]) + '\n'


In [1]:
text = "Mrg pe jos"
print(correct(text, "base", "mt5")) #default, mt5

NameError: name 'correct' is not defined

In [16]:
#SAFE TO USE
with open("corpus/results_2/golden_corpus_128_filtered_well_formed_no_duplicates_original.txt", "r", encoding="utf-8") as infile:
    content = infile.readlines()
    i = 0
    while i < len(content):
        res = correct(content[i], "base", "mt5")
        i += 1

KeyboardInterrupt: 

In [8]:
#WILL OVERWRITE
#file_in = "corpus/results_2/W-sentence-original.txt"
#file_out = "corpus/results_2/W-sentence-prediction-10M-base.txt"
file_in = "corpus/results_2/NAC-sentences-original.txt"
file_out = "corpus/results_2/NAC-sentences-prediction-100k-mt5.txt"
with open(file_in, "r", encoding="utf-8") as infile:
    content = infile.readlines()
    with open(file_out, "w", encoding="utf-8") as outfile:
        i = 0
        while i < len(content):
            #res = correct(content[i], "base", "mt5")
            res = correct(content[i], "base", "mt5")
            outfile.write(res)
            i += 1

AttributeError: 'NoneType' object has no attribute 'group'

In [4]:
def tokenize_dataset(filename_old_dataset, filename_new_dataset):
  file_old = open(filename_old_dataset, "r")
  file_new = open(filename_new_dataset, "w")
  
  while True:
    line = file_old.readline()
    if (not line):
      break
    
    encoding = tokenizer(line)
    json_encoding = json.dumps(encoding.data)
    file_new.write(json_encoding + '\n')

  file_old.close()
  file_new.close()
  return

original_dataset = 'corpus/results_2/golden_corpus_128_filtered_well_formed_no_duplicates.txt'
new_dataset = 'corpus/results_2/golden_corpus_128_filtered_well_formed_no_duplicates_inter.txt'

file_new = open(new_dataset, "w")
file_new.write('')
file_new.close()

tokenize_dataset(original_dataset, new_dataset)