In [None]:
import os
import re
from transformers import RobertaForTokenClassification, AutoTokenizer, pipeline
from spacy.lang.es import Spanish
from glob import glob

# Input/output locations. glob() performs no "~" expansion, so the
# home-relative paths are expanded here, once, before anything globs them.
TXTS_PATH = os.path.expanduser("~/scratch/baritone/ReumaCCC_all-txt/txt/")
ANNS_PATH = os.path.expanduser("~/scratch/baritone/ReumaCCC_all-txt/txt")
# Checkpoint directory handed to the model, tokenizer and pipeline loaders.
MODEL_PATH = "/gpfs/projects/bsc14/MN4/bsc14/hugging_face_models/ner/baritone-sintoma-tagger-v1"
# When False, .txt files that already have a matching .ann under ANNS_PATH
# are removed from the list of documents to process.
REPLACE = False

In [1]:
# NOTE(review): leftover inspection cell. On a fresh kernel `txts_paths` is only
# assigned in a later cell, so running the notebook top to bottom raises
# NameError here; consider deleting this cell or moving it below that assignment.
txts_paths

NameError: name 'txts_paths' is not defined

In [2]:
# Sanity check: report whether PyTorch can see a CUDA device. The pipeline
# built further down is created with device=0, which needs one.
import torch
print(torch.cuda.is_available()) 

True


In [3]:
def get_added_spaces(sentence, sentence_pretokenized):
    """Locate the spaces that pre-tokenization inserted into a sentence.

    Both strings are walked in parallel. Whenever the pre-tokenized string
    holds a ' ' that the original does not have at that point, and the
    character after that space matches the current original character, the
    index of that space is recorded.

    Args:
        sentence: The original sentence text.
        sentence_pretokenized: The same text with extra single spaces
            inserted between some characters.

    Returns:
        A list of indices into ``sentence_pretokenized`` (in ascending
        order), one per inserted space.

    Raises:
        AssertionError: If the two strings cannot be aligned, i.e. they
            differ by something other than inserted spaces. Running off the
            end of either string is reported this way too, instead of
            surfacing as an IndexError.
    """
    i = j = 0
    added_spaces = []
    len_sentence = len(sentence)
    len_pretokenized = len(sentence_pretokenized)
    while j < len_pretokenized:
        if i < len_sentence and sentence[i] == sentence_pretokenized[j]:
            i += 1
            j += 1
        elif (
            sentence_pretokenized[j] == ' '
            and i < len_sentence
            and j + 1 < len_pretokenized
            and sentence[i] == sentence_pretokenized[j + 1]
        ):
            added_spaces.append(j)
            j += 1
        else:
            print("sentence:", len(sentence), sentence)
            print("sentence_pretokenized:", len(sentence_pretokenized), sentence_pretokenized)
            print("i, j:", i, j)
            # Clamp the window start: a negative start would wrap around to
            # the end of the string and show unrelated context.
            print(sentence[max(0, i - 3):i + 3])
            print(sentence_pretokenized[max(0, j - 3):j + 3])
            raise AssertionError("This should never be called.")
    return added_spaces

In [4]:
def align_results(results_pre, added_spaces, start_sent_offset):
    """Map entity spans from the pre-tokenized sentence back to document offsets.

    Args:
        results_pre: Iterable of entity dicts with at least the keys
            'word', 'start' and 'end'; 'start'/'end' index into the
            pre-tokenized sentence.
        added_spaces: Indices (in the pre-tokenized sentence) of the spaces
            that were inserted during pre-tokenization.
        start_sent_offset: Offset added to every aligned 'start' and 'end'.

    Returns:
        A new list of entity dicts (shallow copies of the inputs) whose
        'start'/'end' have the inserted spaces discounted and the sentence
        offset added, and whose 'word' is stripped and has the inserted
        spaces lying strictly inside the span removed.
    """
    aligned_results = []
    for entity in results_pre:
        span_start = entity['start']
        span_end = entity['end']

        # Inserted spaces located before each boundary shift that boundary left.
        spaces_before_start = sum(1 for pos in added_spaces if pos < span_start)
        spaces_before_end = sum(1 for pos in added_spaces if pos < span_end)
        inner_spaces = [pos for pos in added_spaces if span_start < pos < span_end]

        stripped_word = entity['word'].strip()
        kept_chars = [
            char
            for idx, char in enumerate(stripped_word)
            if idx + span_start not in inner_spaces
        ]

        aligned_entity = entity.copy()
        aligned_entity['word'] = ''.join(kept_chars)
        aligned_entity['start'] = start_sent_offset + span_start - spaces_before_start
        aligned_entity['end'] = start_sent_offset + span_end - spaces_before_end
        aligned_results.append(aligned_entity)
    return aligned_results

In [5]:
def write_to_ann(ann_path, results):
    """Write entities to ``ann_path``, one tab-separated line per entity.

    Each line has the form ``T<id>\\t<entity_group> <start> <end>\\t<word>``
    with ids counted from 1. The output always ends with a newline, so an
    empty ``results`` produces a file containing a single newline.

    Args:
        ann_path: Destination path; an existing file is overwritten.
        results: Iterable of dicts with the keys 'entity_group', 'start',
            'end' and 'word'.
    """
    ann_lines = [
        f"T{tid}\t{result['entity_group']} {result['start']} {result['end']}\t{result['word']}"
        for tid, result in enumerate(results, start=1)
    ]
    # Explicit UTF-8 so accented characters are written the same way
    # regardless of the locale the kernel happens to run under.
    with open(ann_path, "w", encoding="utf-8") as file:
        file.write("\n".join(ann_lines) + "\n")

In [6]:
# Initialize global variables
# The checkpoint is loaded once here and the resulting objects are handed to
# the pipeline below, rather than letting the pipeline load MODEL_PATH again.
model = RobertaForTokenClassification.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Splits text into runs of letters/digits and single non-alphanumeric characters;
# the capturing group makes re.split() keep both kinds of pieces.
PRETOKENIZATION_REGEX = re.compile(
    r'([0-9A-Za-zÀ-ÖØ-öø-ÿ]+|[^0-9A-Za-zÀ-ÖØ-öø-ÿ])')
# Blank Spanish pipeline with only the sentencizer, used for sentence splitting.
nlp = Spanish()
nlp.add_pipe("sentencizer")
pipe = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy='simple', device=0) # "simple" allows for different tags in a word, otherwise "first", "average", or "max".

In [2]:
# Collect every .txt document under TXTS_PATH (recursively).
# glob() does not expand "~", so the roots are expanded explicitly; joining the
# pattern with os.path.join also avoids a doubled separator when the root
# already ends with one.
txts_paths = glob(os.path.join(os.path.expanduser(TXTS_PATH), "**", "*.txt"), recursive=True)
if not REPLACE:
    # Skip documents that already have an annotation file.
    existing_anns = glob(os.path.join(os.path.expanduser(ANNS_PATH), "**", "*.ann"), recursive=True)
    # Swap only the file extension; str.replace would also rewrite any ".ann"
    # that happens to appear in a directory name.
    existing_anns = [os.path.splitext(ann)[0] + ".txt" for ann in existing_anns]
    already_annotated = {os.path.normpath(path) for path in existing_anns}
    txts_paths = [path for path in txts_paths if os.path.normpath(path) not in already_annotated]
# Sorted so the processing order (and the progress log) is reproducible.
txts_paths = sorted(txts_paths)

NameError: name 'glob' is not defined

In [None]:
%%time
for i_txt, txt_path in enumerate(txts_paths):
    ann_path = txt_path.replace(".txt", ".ann")
    lines = open(txt_path, "r").readlines()
    #print("txt_path:", txt_path)
    results_file = []
    start_sent_offset = 0
    for line in lines:
        doc = nlp(line)
        sents = list(doc.sents)
        for sentence in sents:
            pretokens = [t for t in PRETOKENIZATION_REGEX.split(sentence.text) if t]
            #print("pretokens1:", pretokens)
            # Add space between two non-space pretokens
            i_pret = 1
            len_pretokens = len(pretokens)
            while i_pret < len_pretokens:
                if (not pretokens[i_pret-1].isspace() and not pretokens[i_pret].isspace()):
                    #print(f"Adding space because '{pretokens[i_pret-1]}' and '{pretokens[i_pret]}'")
                    pretokens.insert(i_pret, " ")
                    len_pretokens = len(pretokens)
                    i_pret += 1 # We have to move one more because we added one before
                i_pret += 1
            #print("pretokens2:", pretokens)
            sentence_pretokenized = ''.join(pretokens)
            #print("sentence_pretokenized:", sentence_pretokenized)
            added_spaces = get_added_spaces(sentence.text, sentence_pretokenized)
            results_pre = pipe(sentence_pretokenized)
            results_sent = align_results(results_pre, added_spaces, start_sent_offset)
            #print("results_sent:", results_sent)
            results_file.extend(results_sent)
            start_sent_offset += len(sentence.text)
    #print(results_file)
    write_to_ann(ann_path, results_file)
    print(f"Finished {i_txt+1}/{len(txts_paths)} ({(i_txt+1)*100/len(txts_paths)}%)")



Finished 1/70207 (0.0014243593943623855%)
Finished 2/70207 (0.002848718788724771%)
Finished 3/70207 (0.004273078183087156%)
Finished 4/70207 (0.005697437577449542%)
Finished 5/70207 (0.0071217969718119275%)
Finished 6/70207 (0.008546156366174312%)
Finished 7/70207 (0.009970515760536699%)
Finished 8/70207 (0.011394875154899084%)
Finished 9/70207 (0.01281923454926147%)
Finished 10/70207 (0.014243593943623855%)
Finished 11/70207 (0.01566795333798624%)
Finished 12/70207 (0.017092312732348625%)
Finished 13/70207 (0.01851667212671101%)
Finished 14/70207 (0.019941031521073398%)
Finished 15/70207 (0.021365390915435784%)
Finished 16/70207 (0.022789750309798167%)
Finished 17/70207 (0.024214109704160554%)
Finished 18/70207 (0.02563846909852294%)
Finished 19/70207 (0.027062828492885323%)
Finished 20/70207 (0.02848718788724771%)
Finished 21/70207 (0.029911547281610096%)
Finished 22/70207 (0.03133590667597248%)
Finished 23/70207 (0.032760266070334866%)
Finished 24/70207 (0.03418462546469725%)
Finish