In [1]:
# Spacy library for NER
import spacy
from spacy.matcher import Matcher
from spacy.scorer import Scorer, get_ner_prf
from spacy import displacy

# NLTK for removing stopwords from corpus
import nltk
from nltk.corpus import stopwords

# Aux lib
from glob import glob        # For searching the dataset path
import json                  # For reading the dataset
import numpy as np           # util
import pandas as pd          # util
from tqdm import tqdm        # util


import re                    # For removing punctuation

# viz libs
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer

import copy

import pickle 
import os

# FLAIR
from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings
from flair.data import Sentence
from flair.models import SequenceTagger

from flair.data import Corpus
from flair.datasets import CONLL_03_SPANISH
from flair.embeddings import WordEmbeddings, StackedEmbeddings, FlairEmbeddings


# Google Cloud
from google.cloud import language_v1



def normalize_str(s):
    replacements = (
        ("á", "a"),
        ("é", "e"),
        ("í", "i"),
        ("ó", "o"),
        ("ú", "u"),
    )
    for a, b in replacements:
        s = s.replace(a, b).replace(a.upper(), b.upper())
    return s

def remove_nextlines(s):
    s = s.replace('\r\n','\n').replace('-\n','')
    s = s.replace('\n\n','\n').replace('\n',' ')
    return s

def remove_punctuation(s, sub=' '):
    s = re.sub(r'[^\w\s]', sub, s)
    
    for i in range(5):
        s = s.replace('  ',' ')
        
    s = s.strip()
    
    return s 

def remove_stopwords(s):
    """
        s is a lowercase string
    """
    s = s.lower()
    regular_expr = r'\b(' + '|'.join(stopwords.words('spanish')) + r')\b'
    s = re.sub(regular_expr, '', s)
    
    for i in range(5):
        s = s.replace('  ',' ')
        
    s = s.strip()
    
    return s

def remove_len3(s, stop_words=[]):
    s_split = s.split(' ')
    s_new = []
    for token_ in s_split:
        token = token_.strip()
        if len(token)>3 or token.isupper():
            s_new.append(token)
            
    return ' '.join(s_new)

def prepare_string(s):
    s = remove_len3(s)
    
    s = s.lower()
    
    s = remove_stopwords(s)     # this funct has to be before normalize_str(s) funct
    s = normalize_str(s)
    s = remove_nextlines(s)
    s = remove_punctuation(s)
    
    s = s.strip()
    
    return s



print("PRUEBA:",prepare_string("No se de que me estás hablando si de la CIA o de otra cosa que no conozco"))


  from .autonotebook import tqdm as notebook_tqdm
2022-08-19 06:16:20.891608: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-19 06:16:20.891678: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


PRUEBA: hablando cia cosa conozco


## Testeo con dataset WikiNer y CoNLL

#### CoNLL

In [2]:
def get_conll_pre(path='conll'):
    with open('conll/esp.testa.txt') as f:
        testa = f.read()

    with open('conll/esp.testb.txt') as f:
        testb = f.read()


    conll_pre=[]
    for t in f'{testa}\n\n- O\n\n{testb}'.split('\n\n- O\n\n'):
        texto = t.replace('\n\n','_SEPARADOR_')
        texto = '\n'.join(texto.strip().split('_SEPARADOR_')[:-1])
        if texto:
            conll_pre.append(texto)
            
            
    print("CoNLL - Num documentos: ", len(conll_pre))
    return conll_pre
    

    
def generate_corpus_from_conll(path='conll'):
    conll_pre = get_conll_pre(path=path)
    corpus = []
    for text in conll_pre:
        new_text = []
        for token in text.split('\n'):
            new_text.append(token.split(' ')[0])
        
        corpus.append(' '.join(new_text))
        
    return corpus

#### Wikiner

In [3]:
def get_wikiner_pre(path='wikiner'):
    with open(f'{path}/aij-wikiner-es-wp2', 'r') as f:
        wikiner2 = f.read()

    with open(f'{path}/aij-wikiner-es-wp3', 'r') as f:
        wikiner3 = f.read()

    wikiner_pre = f'{wikiner2}\n\n{wikiner3}'.split('\n')
    print("Wikiner - Num documentos: ", len(wikiner_pre))
    
    return wikiner_pre


def generate_corpus_from_wikiner(path='wikiner'):
    wikiner_pre = get_wikiner_pre(path=path)
    corpus = []
    for i,text in enumerate(wikiner_pre):
        new_text = []
        text = text.replace('\n',' ')
        for j, token in enumerate(text.split(' ')):
            new_text.append(token.split('|')[0])

        corpus.append(' '.join(new_text))
        
    return corpus

#print("Original: ",get_wikiner_pre()[0])
print()
#print("Processed: ",generate_corpus_from_wikiner()[0])

corpus = generate_corpus_from_wikiner()

print(len(corpus))
"""

nlp = spacy.load("es_core_news_lg")
docs = nlp.pipe(generate_corpus_from_wikiner(corpus_pre))

doc = next(docs)

print(doc.ents)
del(nlp)
"""



Wikiner - Num documentos:  286953
286953


'\n\nnlp = spacy.load("es_core_news_lg")\ndocs = nlp.pipe(generate_corpus_from_wikiner(corpus_pre))\n\ndoc = next(docs)\n\nprint(doc.ents)\ndel(nlp)\n'

In [4]:
def generate_spacy_binary_format_wikinier(corpus_pre):
    """
        No funciona (05/08/2022)
    """
    data = []
    for text_id in tqdm(range(len(corpus_pre))):
        if corpus_pre[text_id] == '':
            continue 
            
        paragraphs = [] 
        token_id = 0
        offset = 0
        start = 0
        end = 0
        for paragraph_id in range(len(corpus_pre[text_id].split('\n'))):
            raw_pre = corpus_pre[text_id].split('\n')[paragraph_id]
            
            sentences = []
            for sentence_id in range(len(raw_pre.split('.|FS|O'))):
                tokens = []
                curr_entity = []  # list of tokens (will become an entity)
                last_ner = ''
                for token_idx in range(len(raw_pre.split('.|FS|O')[sentence_id].split(' '))):
                    token_labeled = raw_pre.split('.|FS|O')[sentence_id].split(' ')[token_idx]
                    if len(token_labeled)<=1:
                        continue
                       
                    
                    try:
                        curr_tag = token_labeled.split('|')[1]
                    except:
                        print(token_labeled)
                        print(token_labeled.split('|'))
                        print(raw_pre.split('.|FS|O')[sentence_id].split(' '))
                        raise
                        
                    curr_ner = token_labeled.split('|')[2]
                    curr_token = token_labeled.split('|')[0]
                    dep = ''
                    head = offset
                    orth = '' 
                    
                    if len(curr_ner)==0:
                        print(token_labeled)

                    if ('O' == curr_ner or last_ner.split('-')[-1]!=curr_ner.split('-')[-1]) and len(curr_entity)>0:
                        # Cambio de entidad                   
                        curr_entity = []
                        last_ner = ''
                    elif 'B' == curr_ner[0]:
                        # Comienzo de entidad
                        end = offset-1
                        start = offset
                        curr_entity = [curr_token] 
                        last_ner = curr_ner

                    elif 'I' == curr_ner[0]:
                        # Comienzo o continuación de entidad (depende del dataset)
                        if len(curr_entity)>0:
                            curr_ner = 'B-'+curr_ner.split('-')[-1]
                            
                        curr_entity.append(curr_token)
                        last_ner = curr_ner
                        if start == 0:
                            start = offset

                    offset += len(curr_token) +1

                   
                    token= {
                            "id": token_id,
                            "dep": dep,
                            "head": head,
                            "tag": curr_tag,
                            "orth": orth,
                            "ner":curr_ner
                        }
                    tokens.append(token)
                        
                        
                    token_id +=1
                
                sentence = { "tokens": tokens}
                sentences.append(sentence)
                
            aux = []
            for token_aux in raw_pre.split(' '):
                aux.append(token_aux.split('|')[0])
            raw = ' '.join(aux)
                
            paragraph = { "raw": raw,
                          "sentences": sentences}
            paragraphs.append(paragraph)
                           
        data.append({"id":text_id, "paragraphs": paragraphs})
        
    
                           
    with open('wikiner.spacy','w') as f:
        json.dump(data, f)
        



"""   
print("Original: ",corpus_pre[0])
print()
print("Processed: ",generate_corpus_from_wikiner(corpus_pre)[0])
  

nlp = spacy.load("es_core_news_lg")
docs = nlp.pipe(generate_corpus_from_wikiner(corpus_pre))

doc = next(docs)

print(doc.ents)
del(nlp)
"""


'   \nprint("Original: ",corpus_pre[0])\nprint()\nprint("Processed: ",generate_corpus_from_wikiner(corpus_pre)[0])\n  \n\nnlp = spacy.load("es_core_news_lg")\ndocs = nlp.pipe(generate_corpus_from_wikiner(corpus_pre))\n\ndoc = next(docs)\n\nprint(doc.ents)\ndel(nlp)\n'

In [5]:
#generate_spacy_binary_format_wikinier(corpus_pre)

In [6]:
def show_metrics(dataset, modelo, sufijo_nombre_archivo):
    TPTNFPFN_file = f'{dataset}/TPTNFPFN_{modelo}{sufijo_nombre_archivo}.json'
    
    with open(TPTNFPFN_file) as f:
        dic_confusion = json.load(f)
        
    # Detection
    dic_det =  dic_confusion['detection']
    #accuracy_det = (dic_det['TP']+dic_det['TN'])/(dic_det['TP']+dic_det['TN']+dic_det['FP']+dic_det['FN'])
    recall_det = (dic_det['TP'])/(dic_det['TP']+dic_det['FN']) if dic_det['TP']+dic_det['FN']>0 else 0
    precision_det = (dic_det['TP'])/(dic_det['TP']+dic_det['FP']) if dic_det['TP']+dic_det['FP']>0 else 0
    f1score_det = 2*(recall_det*precision_det)/(recall_det+precision_det) if recall_det+precision_det>0 else 0
    
    jaccard_det = dic_det['TP']/(dic_det['TP']+dic_det['FP']+dic_det['FN']) if dic_det['TP']+dic_det['FP']+dic_det['FN']>0 else 0 # IoU 
    
    dic_metrics_det = {
                        #'acc': accuracy_det,
                        'rec': recall_det,
                        'prec': precision_det,
                        'f1': f1score_det,
                        'jac': jaccard_det
                        }
    
    # Classification    
    dic_clf = dic_confusion['classification']
    dic_metrics_clf = {}
    for class_ in dic_clf.keys():
        dic_class = dic_clf[f'{class_}']
        

        #accuracy_class = (dic_class['TP']+dic_class['TN'])/(dic_class['TP']+dic_class['TN']+dic_class['FP']+dic_class['FN'])
        recall_class = (dic_class['TP'])/(dic_class['TP']+dic_class['FN']) if dic_class['TP']+dic_class['FN']>0 else 0
        precision_class = (dic_class['TP'])/(dic_class['TP']+dic_class['FP']) if dic_class['TP']+dic_class['FP']>0 else 0
        f1score_class = 2*(recall_class*precision_class)/(recall_class+precision_class) if recall_class+precision_class>0 else 0

        jaccard_class = dic_class['TP']/(dic_class['TP']+dic_class['FP']+dic_class['FN']) if dic_class['TP']+dic_class['FP']+dic_class['FN']>0 else 0 # IoU

        
        
        dic_metrics_clf[f'{class_}'] = {
                                        #'acc': accuracy_class,
                                        'rec': recall_class,
                                        'prec': precision_class,
                                        'f1': f1score_class,
                                        'jac': jaccard_class
                                        }
    # Save results
    dic_results = {
                   'detection': dic_metrics_det,
                   'classification': dic_metrics_clf
                  }
    
    
    with open(f'{dataset}/metrics_{modelo}{sufijo_nombre_archivo}.json', 'w') as f:
        json.dump(dic_results, f)
    
    
    # Show metrics
    print("\n")
    print(('='*10+'\n')*3)
    print(f"{dataset.upper()} - {modelo.upper()}")
    
    print('DETECTION:')
    for metric in dic_metrics_det.keys():
        print('\t', metric.upper(),': ',dic_metrics_det[f'{metric}'])
        
        
    
    print('CLASSIFICATION:')
    for class_ in dic_metrics_clf.keys():
        print('\t Class ', class_.upper())
        for metric in dic_metrics_clf[f'{class_}'].keys():
            print('\t\t', metric.upper(),': ',dic_metrics_clf[f'{class_}'][f'{metric}'])    
    
    print(('='*10+'\n')*3)
    
    return dic_results


In [274]:
def extract_true_named_entities(nombre_corpus: str):
    path = f'{nombre_corpus}/true_named_entities.pkl'
    if os.path.exists(path):
        with open(path, 'rb') as f:
            true_named_entities = pickle.load(f)
            
    else:
        if nombre_corpus=='wikiner':
            corpus_pre = get_wikiner_pre(path=nombre_corpus)
            corpus = generate_corpus_from_wikiner()
            split_token = ' '
            split_ner = '|' 
        elif nombre_corpus=='conll':
            corpus_pre = get_conll_pre(path=nombre_corpus)
            corpus = generate_corpus_from_conll()
            split_token = '\n'
            split_ner = ' '
        else:
            # No contemplado
            raise
            
            
            
        true_named_entities = [] # axis 0: textos, axis 1: pares de entidades [entidad, tipo]
        for i in tqdm(range(len(corpus_pre))):
            text_processed = corpus[i]
            text = corpus_pre[i].strip().replace('\n', split_token)
            
            text_true_ne = [] # list of entities

            curr_entity = []  # list of tokens (will become an entity)
            last_tag = ''
            offset = 0
            start = 0
            for token_labeled in text.split(split_token):
                if offset>0:
                    offset += 1
                    
                if len(token_labeled)==0:
                    continue
                    
                curr_tag = token_labeled.split(split_ner)[-1].strip().upper()
                curr_token = token_labeled.split(split_ner)[0]
                
                if ('O' == curr_tag[0] or last_tag.split('-')[-1]!=curr_tag.split('-')[-1]) and len(curr_entity)>0:
                    # Cambio de entidad
                    ent_name = ' '.join(curr_entity)                    
                    text_true_ne.append([ent_name, last_tag.split('-')[-1], start, start+len(ent_name)])   
                    start = 0
                    end = 0
                    
                    curr_entity = []
                    last_tag = ''

                elif 'B' == curr_tag[0]:
                    # Comienzo de entidad
                    if len(curr_entity)>0:
                        ent_name = ' '.join(curr_entity)
                        text_true_ne.append([ent_name, last_tag.split('-')[-1], start, start+len(ent_name)])
                        
                    start = offset
                    curr_entity = [curr_token] 
                    last_tag = curr_tag

                elif 'I' == curr_tag[0]:
                    # Comienzo o continuación de entidad (depende del dataset)
                    curr_entity.append(curr_token)
                    last_tag = curr_tag
                    if start == 0:
                        start = offset
                    
                
                offset += len(curr_token)

                

            true_named_entities.append(text_true_ne)
        
        with open(path,'wb') as f:
            pickle.dump(true_named_entities, f)
        
    return true_named_entities


from nltk.tokenize import word_tokenize

def predict_named_entities(modelo: str, nombre_corpus: str):
    # Load model
    path = f'{nombre_corpus}/{modelo}_pred_named_entities.pkl'
    if os.path.exists(path):        
        with open(path, 'rb') as f:
            pred_named_entities = pickle.load(f)
            
    else:
        
        if nombre_corpus=='wikiner':
            corpus_pre = get_wikiner_pre(path=nombre_corpus)
            generate_corpus = generate_corpus_from_wikiner
        elif nombre_corpus=='conll':
            corpus_pre = get_conll_pre(path=nombre_corpus)
            generate_corpus = generate_corpus_from_conll
        else:
            # No contemplado
            raise
        
        corpus = generate_corpus()
        
        if modelo == 'spacy':
            nlp = spacy.load("es_core_news_lg")
            docs = nlp.pipe(generate_corpus())

        elif modelo == 'flair':
            tagger = SequenceTagger.load('es-ner-large')
            
        elif modelo == 'gc': # google.cloud.language_v1
            client = language_v1.LanguageServiceClient()

            # Available types: PLAIN_TEXT, HTML
            type_ = language_v1.Document.Type.PLAIN_TEXT

            # Optional. If not specified, the language is automatically detected.
            # For list of supported languages:
            # https://cloud.google.com/natural-language/docs/languages
            language = "es"

            # Available values: NONE, UTF8, UTF16, UTF32
            encoding_type = language_v1.EncodingType.UTF32

        else:
            # No contemplado
            raise

        pred_named_entities = []
        for i, text in enumerate(tqdm(corpus_pre[:])):                
            if modelo == 'spacy':
                doc = next(docs)
                text_pred_ne = [ [str(ent), ent.label_, ent.start_char, ent.end_char] for ent in doc.ents]

            elif modelo == 'flair':
                s = corpus[i]
                sentence = Sentence(s)

                # predict NER tags
                tagger.predict(sentence)
                entidades = sentence.get_spans('ner')
                text_pred_ne = [[str(ent.text) , ent.get_label("ner").value, ent.start_position, ent.end_position] for ent in sentence.get_spans('ner')]
            
            elif modelo == 'gc':
                document = {"content": corpus[i], "type_": type_, "language": language}
                response = client.analyze_entities(request = {'document': document, 'encoding_type': encoding_type})  
                
                text_pred_ne = []
                for entity in response.entities:
                    tipo = language_v1.Entity.Type(entity.type_).name
                    if tipo == 'UNKNOWN':
                        continue
                    elif tipo == 'PERSON':
                        tipo = 'PER'
                    elif tipo == 'LOCATION':
                        tipo = 'LOC'
                    elif tipo == 'ORGANIZATION':
                        tipo = 'ORG'
                    else:
                        tipo = 'MISC'
                    
                    for mention in entity.mentions:                            
                        text_pred_ne.append( [str(mention.text.content), tipo, mention.text.begin_offset, mention.text.begin_offset+len(str(mention.text.content))] )
                        
                    text_pred_ne.sort(key = lambda i: i[2])

            pred_named_entities.append(text_pred_ne)
            
        with open(path,'wb') as f:
            pickle.dump(pred_named_entities, f)

    return pred_named_entities


In [275]:
a = extract_true_named_entities('conll')
b = predict_named_entities('gc', 'conll')

print(len(a))
print(len(b))

CoNLL - Num documentos:  404
CoNLL - Num documentos:  404


100%|██████████| 404/404 [01:52<00:00,  3.60it/s]

404
404





In [276]:
a = extract_true_named_entities('conll')
tipos = []
for t in a:
    for token in t:
        tipos.append(token[1])
        
print(len(tipos))
print(set(tipos))

6645
{'ORG', 'MISC', 'PER', 'LOC'}


In [277]:
def test_v0(corpus_pre, modelo: str, nombre_corpus: str):
    """
        Primera versión de test.
        
        En esta versión se procesa el texto a la vez que se 
        va testeando. Se tienen en cuenta, por lo tanto, los
        True Negative. El testeo se hace a nivel de TOKEN.
        
        Problemas:
            - Mucho tiempo para testear nuevas métricas
    """
    modelo = modelo.lower().strip()
    nombre_corpus = nombre_corpus.lower().strip()
    assert(modelo == 'spacy' or modelo == 'flair')
    
    
        
    # Diccionario base
    dic_TPTNFPFN = {'TP':0,'TN':0,'FP':0,'FN':0}

    # Detección de entidades
    dic_detection = copy.deepcopy(dic_TPTNFPFN)
    
    # Clasificación de entidades: cada elemento del diccionario será un diccionario con claves TP, TN, FP, FN
    dic_clases = {
                    'PER': copy.deepcopy(dic_TPTNFPFN),
                    'MISC': copy.deepcopy(dic_TPTNFPFN),
                    'LOC': copy.deepcopy(dic_TPTNFPFN),
                    'ORG': copy.deepcopy(dic_TPTNFPFN)
                }
            
            
    if modelo == 'spacy':
        nlp = spacy.load("es_core_news_lg")
        docs = nlp.pipe(generate_corpus_from_wikiner(corpus_pre))

    elif modelo == 'flair':
        tagger = SequenceTagger.load('es-ner-large')

    pred_named_entities = []
    for i, text in enumerate(tqdm(corpus_pre)):           
        
        if modelo == 'spacy':
            doc = next(docs)
            entidades = doc.ents
            
        elif modelo == 'flair':
            s = generate_corpus_from_wikiner([text])[0]
            sentence = Sentence(s)
            
            # predict NER tags
            tagger.predict(sentence)
            entidades = sentence.get_spans('ner')
        

        curr_token = 0
        tokens = text.split(' ')
        last_token = -1 # Va a permitir no sumar TN_det y FN_det cuando estudiemos una misma entidad
                        # ya que un problema que había era que un "token" original podría ser: "UGR/Universidad|NC|ORG"
                        # y el modelo los predice como dos entidades distintas, entonces no podemos saltar de token en "UGR"
                        # con curr_token+=1, porque en la siguiente interacción perderíamos la entidad "Universidad".
                        # Pero tampoco tiene sentido contarlo como error si es un token normal que no da problemas. Por eso,
                        # con last_token conseguimos no contar esos errores, pero sí inspeccionar de nuevo el último token
                        # para contarlo como entidad positiva en caso de serlo.
        for ent_ in entidades:
            # Get the label entity prediction
            if modelo == 'spacy':
                pred_label = ent_.label_
            elif modelo == 'flair':
                pred_label = ent_.get_label("ner").value
                ent_ = ent_.text
            
            
            for ent in str(ent_).split(' '):
                encontrado = str(ent) in tokens[curr_token].split('|')[0] # {!=} --> {not in}
                
                while not encontrado:   
                    
                    if curr_token > last_token:
                        if 'O' == tokens[curr_token].split('|')[-1].strip().upper():
                            dic_detection['TN']+= 0
                        else:
                            dic_detection['FN'] += 1
                        
                    curr_token+=1
                    try:
                        encontrado = str(ent) in tokens[curr_token].split('|')[0]
                    except:
                        # debug
                        print('ent_.text', str(ent_))
                        print('ent',str(ent))
                        print('curr_token',curr_token)
                        print('tokens[curr_token-1]',tokens[curr_token-1])
                        print('tokens', tokens)
                        raise # throw an error
                    
                last_token = curr_token 
                actual_label = tokens[curr_token].split('|')[-1].strip().upper().split('-')[-1]
                        
                if 'O' == actual_label:
                    dic_detection['FP'] += 1 # una NO entidad ha sido detectada como entidad
                else:
                    dic_detection['TP'] += 1 # una entidad ha sido detectada como entidad
                    
                    if pred_label in actual_label:
                        try:
                            # una entidad ha sido detectada como entidad y además se ha clasificado bien
                            # Sumamos los TP de esa clase de entidad
                            dic_clases[f'{pred_label}']['TP'] += 1 
                        except:
                            dic_clases[f'{pred_label}']['TP'] = 1 
                            
                        for key in dic_clases.keys():
                            # Sumamos como TN al resto de clases de entidad
                            try:
                                # una entidad ha sido detectada como entidad y además se ha clasificado bien
                                dic_clases[f'{key}']['TN'] += 1 
                            except:
                                dic_clases[f'{key}']['TN'] = 1 
                    else:
                        # una entidad ha sido detectada como entidad y PERO NO se ha clasificado bien
                        # Sumamos los FP de la clase predicha
                        try:
                            dic_clases[f'{pred_label}']['FP'] += 1 
                        except:
                            dic_clases[f'{pred_label}']['FP'] = 1 
                            
                            
                        # Sumamos los FN de la clase real
                        try:
                            dic_clases[f'{actual_label}']['FN'] += 1 
                        except:
                            dic_clases[f'{actual_label}']['FN'] = 1 

        
    
    dic_confusion = { 'detection': dic_detection,
                     'classification': dic_clases
                    }
    
    
    with open(f'{nombre_corpus}/TPTNFPFN_{modelo}.json', 'w') as f:
        json.dump(dic_confusion, f)
        
    dic_results = show_metrics(nombre_corpus, modelo)
        
    return dic_results

In [286]:
def test_v1(true_named_entities, pred_named_entities, modelo: str, nombre_corpus: str, clases_ignoradas=[], sufijo_nombre_archivo = ""):
    """
        Segunda versión de test.
        
        En esta versión se mejora el tiempo de testeo 
        guardando en disco la extracción de entidades 
        echas por el modelo y la extracción de las 
        entidades reales del texto, haciendo que solo
        sea necesario calcularse en su primera ejecución.
        
        Además, se han eliminado las STOPWORDS de las entidades.
    """
    
    modelo = modelo.lower().strip()
    nombre_corpus = nombre_corpus.lower().strip()
    assert(modelo == 'spacy' or modelo == 'flair' or modelo == 'gc' or modelo == 'test')
    
    
    dic_errores = {'detection': [],
                   'classification':{
                                        'LOC': [],
                                        'MISC': [],
                                        'ORG': [],
                                        'PER': []
                                    }
                  }
    
    # Detección de entidades
    dic_detection = {'TP':0,'TN':0,'FP':0,'FN':0}
    
    # Clasificación de entidades: cada elemento del diccionario será un diccionario con claves TP, TN, FP, FN
    dic_clases = {
                    'LOC': {'TP':0,'TN':0,'FP':0,'FN':0},
                    'MISC': {'TP':0,'TN':0,'FP':0,'FN':0},
                    'ORG': {'TP':0,'TN':0,'FP':0,'FN':0},
                    'PER': {'TP':0,'TN':0,'FP':0,'FN':0}
                }
    
    for texto_idx in tqdm(range(len(pred_named_entities))):
        last_true_ent_founded_idx = 0
        falsos_negativos_acumulados = 0 # declaración 
        
        if len(pred_named_entities[texto_idx])==0 and len(true_named_entities[texto_idx])==0:
            continue
        
        overlapped_pred_entities = [-1 for i in range(len(pred_named_entities[texto_idx]))]
        overlapped_true_entities = [-1 for i in range(len(true_named_entities[texto_idx]))]
        
        #print(overlapped_pred_entities, '\n', overlapped_true_entities)
        pred_ent_labeled_idx = 0
        true_ent_labeled_idx = 0
        avanza_pred_avanza_true = 2 # 0 avanza pred, 1 avanza true, 2 avanzan las dos
        fin_pred = False
        fin_true = False 
            
        while not fin_pred or not fin_true:
            # Predicted
            if len(pred_named_entities[texto_idx])>0:
                pred_ent_labeled = pred_named_entities[texto_idx][pred_ent_labeled_idx]
            else:
                fin_pred = True
                pred_ent_labeled = ['', '_NAN_', 1e5, 1+1e5] # entrada auxiliar
            #pred_ent = remove_stopwords(remove_punctuation(pred_ent_labeled[0].lower(), sub=''))
            pred_ent = remove_stopwords(pred_ent_labeled[0].lower())
            pred_label = pred_ent_labeled[1]
            pred_start = pred_ent_labeled[2]
            pred_end = pred_ent_labeled[3]
            
            saltamos_pred = len(pred_ent)==0 or pred_label in clases_ignoradas
            
            if avanza_pred_avanza_true==0 or avanza_pred_avanza_true==2:
                pred_overlapped = [False for token in pred_ent.split(' ')]
            
            # Actual
            if len(true_named_entities[texto_idx])>0:
                true_ent_labeled = true_named_entities[texto_idx][true_ent_labeled_idx]
            else:
                fin_true = True
                true_ent_labeled = ['', 'O', 1e5, 1+1e5] # entrada auxiliar
            
            #true_ent = remove_stopwords(remove_punctuation(true_ent_labeled[0].lower(), sub=''))
            true_ent = remove_stopwords(true_ent_labeled[0].lower())
            true_label = true_ent_labeled[1]
            true_start = true_ent_labeled[2]
            true_end = true_ent_labeled[3]
            
            saltamos_true = len(true_ent)==0 or true_label in clases_ignoradas

                
            if avanza_pred_avanza_true==1 or avanza_pred_avanza_true==2:
                true_overlapped = [False for token in true_ent.split(' ')]
               
            """
            print(pred_ent, pred_end, '||', len(pred_named_entities[texto_idx]), '-', pred_ent_labeled_idx, fin_pred)
            print(true_ent, true_end, '||', len(true_named_entities[texto_idx]), '-', true_ent_labeled_idx, fin_true)
            print()
            """
                
            # Vemos si hay solapamiento
            if ((pred_start <= true_start and pred_end > true_start) or (true_start <=pred_start and true_end > pred_start)) and (not saltamos_pred and not saltamos_true):
                # Hay solapamiento
                    
                for pred_token_idx in range(len(pred_ent.split(' '))):
                    pred_token = pred_ent.split(' ')[pred_token_idx]
                    
                    # Comprobamos si el token está en ambas entidades y si no ha sido ya marcado como overlapped
                    if len(pred_token)>0 and pred_token in true_ent and not pred_overlapped[pred_token_idx]:
                        pred_overlapped[pred_token_idx] = True
                                            
                        # CLASIFICACIÓN ( solo lo hacemos en este bucle ya que con el siguiente se duplicarían las acciones )
                        if pred_label in true_label: # por ej. ORG in I-ORG => True. 
                            # Sumamos TP al pred_label y sumamos TN al resto
                            dic_clases[f'{pred_label}']['TP'] += 1
                            
                            
                            for key in dic_clases.keys():
                                if key != pred_label:
                                    dic_clases[f'{key}']['TN'] += 1
                        else:
                            # Sumamos FP al pred_label y FN al true_label
                            dic_clases[f'{pred_label}']['FP'] += 1
                            dic_clases[f'{true_label.split("-")[-1]}']['FN'] += 1
                            
                            error = {
                                        'True':{'ent': true_ent,'label':true_label}, 
                                        'Predicted':{'ent': pred_ent, 'token': pred_token, 'label':pred_label}
                                    }
                            dic_errores['classification'][f'{true_label.split("-")[-1]}'].append(error)
                            
                            for key in dic_clases.keys():
                                if key != pred_label and key != true_label.split("-")[-1]:
                                    dic_clases[f'{key}']['TN'] += 1
                            
                            
                            
                        
                for true_token_idx in range(len(true_ent.split(' '))):
                    true_token = true_ent.split(' ')[true_token_idx]
                    
                    # Comprobamos si el token está en ambas entidades y si no ha sido ya marcado como overlapped
                    if len(true_token)>0 and true_token in pred_ent and not true_overlapped[true_token_idx]:
                        true_overlapped[true_token_idx] = True
                

            
                    
            fin_pred = pred_ent_labeled_idx+1 >= len(pred_named_entities[texto_idx])
            fin_true = true_ent_labeled_idx+1 >= len(true_named_entities[texto_idx])
            
            
            # Avance de entidad
            if (pred_end < true_end or fin_true or saltamos_pred) and not fin_pred:
                # La real acaba después, hay que avanzar con la predicha
                avanza_pred_avanza_true = 0

            elif (pred_end > true_end or fin_pred or saltamos_true) and not fin_true:
                # La predicha acaba después, hay que avanzar con la real
                avanza_pred_avanza_true = 1

            else:
                # Acaban a la vez, avanzamos con las dos
                avanza_pred_avanza_true = 2
           
            
            
            # Contabilización en el avance
            error_added = False
            if (fin_true and fin_pred) or avanza_pred_avanza_true==0 or avanza_pred_avanza_true==2:
                # Avanza pred
                if not saltamos_pred and not saltamos_true:
                    # Sumamos los TP y FP de la entidad predicha
                    suma = int(np.array(pred_overlapped).sum())
                    dic_detection['TP'] += suma
                    dic_detection['FP'] += (len(pred_overlapped)-suma)

                    # Añadimos los errores
                    if len(pred_overlapped)-suma>0: # Hay errores
                        error = {
                                 'True': {'ent': true_ent, 'label': true_label, 'start': true_start, 'end': true_end},
                                 'Predicted': {'ent': pred_ent, 'label': pred_label, 'start': pred_start, 'end': pred_end}
                                }
                        dic_errores['detection'].append(error)
                        error_added = True
                    
                if not fin_pred:
                    pred_ent_labeled_idx += 1


            if (fin_pred and fin_true) or avanza_pred_avanza_true==1 or avanza_pred_avanza_true==2:
                # Avanza true
                if not saltamos_pred and not saltamos_true:
                    # Sumamos los TP y FP de la entidad predicha
                    suma = int(np.array(true_overlapped).sum())
                    dic_detection['TP'] += suma # comentado porque se suman dos veces
                    dic_detection['FN'] += (len(true_overlapped)-suma)


                    # Añadimos los errores
                    if len(pred_overlapped)-suma>0 and not error_added: # Hay errores
                        error = {
                                 'True': {'ent': true_ent, 'label': true_label, 'start': true_start, 'end': true_end},
                                 'Predicted': {'ent': pred_ent, 'label': pred_label, 'start': pred_start, 'end': pred_end}
                                }
                        dic_errores['detection'].append(error)
                    
                if not fin_true:
                    true_ent_labeled_idx += 1                    

                
            
    dic_detection['TP'] /=2  # Están duplicadas
            
    dic_confusion = { 'detection': dic_detection,
                      'classification': dic_clases
                    }
        
    os.makedirs(nombre_corpus, exist_ok=True)
    with open(f'{nombre_corpus}/TPTNFPFN_{modelo}{sufijo_nombre_archivo}.json', 'w') as f:
        json.dump(dic_confusion, f)
        
    with open(f'{nombre_corpus}/errores_{modelo}{sufijo_nombre_archivo}.json', 'w') as f:
        json.dump(dic_errores, f)
        
        
    dic_results = show_metrics(nombre_corpus, modelo, sufijo_nombre_archivo = sufijo_nombre_archivo)
        
    return dic_results


def test_testv1():
    """
        Función para comprobar el buen funcionamiento de 
        la función test_v1
    """
    true_named_entities = [[["Madrid", "LOC", 10,16], ["El Quijote de la Mancha", "PER", 20, 44]],
                           [["Villanueva de la Serena", "LOC", 11, 34]],
                           [["Banco Santander", "MISC", 0, 15]]]
                           
    pred_named_entities =[[["Madrid", "LOC", 10,16], ["El Quijote", "PER", 20, 30],["Mancha", "LOC", 38, 44]],
                           [["Falso Posi", "MISC", 0, 10], ["Villanueva", "LOC", 11, 21]],
                           [["Santander", "MISC", 6, 15]]]
    
    
    test_v1(true_named_entities = true_named_entities,
            pred_named_entities = pred_named_entities,
            modelo = 'spacy', 
            nombre_corpus = 'test',
            sufijo_nombre_archivo = '')
    

In [289]:
corpus = generate_corpus_from_conll()
print(corpus[2])

CoNLL - Num documentos:  404
Novillos de " El Torreón " , bien presentados y de buen juego . Los seis , nobles , con fijeza y movilidad , aplaudidos en el arrastre . Víctor de la Serna : pinchazo , estocada y tres descabellos ( silencio tras un aviso ) ; y pinchazo y estocada que escupe ( silencio ) . Sebastián Castella : estocada desprendida ( silencio tras un aviso ) ; y dos pinchazos , estocada y descabello ( silencio ) . Javier Castaño : estocada chalequera ( vuelta tras petición de oreja ) ; y estocada ( dos orejas ) . Cuadrillas : Domingo Siro saludó tras banderillear al tercero .


In [280]:
s = "Hola que  haces   ? "
s2 = s.replace('  ',' ')
print(s2.split(' '))

['Hola', 'que', 'haces', '', '?', '']


In [281]:
test_testv1()

100%|██████████| 3/3 [00:00<00:00, 721.99it/s]




TEST - SPACY
DETECTION:
	 REC :  0.7142857142857143
	 PREC :  0.7142857142857143
	 F1 :  0.7142857142857143
	 JAC :  0.5555555555555556
CLASSIFICATION:
	 Class  LOC
		 REC :  1.0
		 PREC :  0.6666666666666666
		 F1 :  0.8
		 JAC :  0.6666666666666666
	 Class  MISC
		 REC :  1.0
		 PREC :  1.0
		 F1 :  1.0
		 JAC :  1.0
	 Class  ORG
		 REC :  0
		 PREC :  0
		 F1 :  0
		 JAC :  0
	 Class  PER
		 REC :  0.5
		 PREC :  1.0
		 F1 :  0.6666666666666666
		 JAC :  0.5






In [282]:
s = 'LOC\nHola'
print(s)
print(s.split('\n'))

with open('conll/gc_pred_named_entities.pkl','rb') as f:
    v = pickle.load(f)
    
for i in range(len(v[0])):
    print(v[0][i])
del(v)

LOC
Hola
['LOC', 'Hola']
['multinacional', 'ORG', 3, 16]
['española', 'LOC', 17, 25]
['Telefónica', 'ORG', 26, 36]
['un', 'MISC', 49, 51]
['récord', 'MISC', 52, 58]
['servicio', 'MISC', 79, 87]
['tres millones', 'MISC', 88, 101]
['líneas', 'MISC', 112, 118]
['estado', 'LOC', 125, 131]
['brasileño', 'LOC', 132, 141]
['Sao Paulo', 'LOC', 145, 154]
['control', 'MISC', 175, 182]
['operadora', 'PER', 189, 198]
['Telesp', 'ORG', 199, 205]
['20', 'MISC', 211, 213]
['presidente', 'PER', 237, 247]
['Telefónica', 'ORG', 251, 261]
['Brasil', 'LOC', 265, 271]
['Fernando Xavier Ferreira', 'PER', 274, 298]
['un', 'MISC', 304, 306]
['balance', 'MISC', 307, 314]
['gestión', 'MISC', 330, 337]
['Telefónica', 'ORG', 341, 351]
['Sao Paulo', 'LOC', 355, 364]
['1998', 'MISC', 381, 385]
['1998', 'MISC', 381, 385]
['Ferreira', 'PER', 388, 396]
['empresa', 'ORG', 412, 419]
['metas', 'MISC', 442, 447]
['órgano', 'ORG', 463, 469]
['sector', 'MISC', 484, 490]
['Agencia Nacional de Telecomunicaciones', 'ORG', 496,

In [287]:
#test_v0(corpus_pre, 'spacy', 'wikiner')

for nombre_corpus in ['conll','wikiner']:
    for model in ['spacy','flair','gc']:
        print('='*10)
        print(nombre_corpus.upper(), '-' , model.upper())
        print('='*10)
        
        for clases_ign in [[], ['MISC']]:
            suf = '_'+'_'.join(clases_ign) if len(clases_ign)>0 else ''
            test_v1(true_named_entities = extract_true_named_entities(nombre_corpus)[:],
                    pred_named_entities = predict_named_entities(model, nombre_corpus)[:],
                    modelo = model, 
                    nombre_corpus = nombre_corpus,
                    clases_ignoradas = clases_ign,
                    sufijo_nombre_archivo = suf)






CONLL - SPACY


100%|██████████| 404/404 [00:03<00:00, 103.62it/s]





CONLL - SPACY
DETECTION:
	 REC :  0.9315428202883862
	 PREC :  0.8202247191011236
	 F1 :  0.8723468507333908
	 JAC :  0.7735950112858181
CLASSIFICATION:
	 Class  LOC
		 REC :  0.8971304818624797
		 PREC :  0.5672714823690517
		 F1 :  0.6950503355704698
		 JAC :  0.5326261652201865
	 Class  MISC
		 REC :  0.5309278350515464
		 PREC :  0.519327731092437
		 F1 :  0.5250637213254036
		 JAC :  0.35599078341013823
	 Class  ORG
		 REC :  0.6400911161731208
		 PREC :  0.872369782683684
		 F1 :  0.7383941605839415
		 JAC :  0.5852811849109003
	 Class  PER
		 REC :  0.9044766708701135
		 PREC :  0.9183738796414853
		 F1 :  0.9113722998729352
		 JAC :  0.8371753720455208



100%|██████████| 404/404 [00:04<00:00, 97.09it/s] 





CONLL - SPACY
DETECTION:
	 REC :  0.9061058344640435
	 PREC :  0.9351369517728113
	 F1 :  0.9203925243949501
	 JAC :  0.8525251493642445
CLASSIFICATION:
	 Class  LOC
		 REC :  0.9272523782876329
		 PREC :  0.611439114391144
		 F1 :  0.7369357349344007
		 JAC :  0.5834507042253522
	 Class  MISC
		 REC :  0
		 PREC :  0
		 F1 :  0
		 JAC :  0
	 Class  ORG
		 REC :  0.7136004514672686
		 PREC :  0.9536199095022625
		 F1 :  0.816333118140736
		 JAC :  0.6896645759476411
	 Class  PER
		 REC :  0.9354417998043691
		 PREC :  0.9449934123847167
		 F1 :  0.9401933475339996
		 JAC :  0.8871366728509585

CONLL - FLAIR


100%|██████████| 404/404 [00:03<00:00, 122.21it/s]





CONLL - FLAIR
DETECTION:
	 REC :  0.9746662413997357
	 PREC :  0.9939593885042517
	 F1 :  0.984218275513021
	 JAC :  0.968926937536803
CLASSIFICATION:
	 Class  LOC
		 REC :  0.9270227392913802
		 PREC :  0.9558342420937841
		 F1 :  0.9412080536912752
		 JAC :  0.8889452332657201
	 Class  MISC
		 REC :  0.9559014267185474
		 PREC :  0.9615133724722765
		 F1 :  0.95869918699187
		 JAC :  0.9206745783885072
	 Class  ORG
		 REC :  0.9706583969465649
		 PREC :  0.9601226993865031
		 F1 :  0.965361803084223
		 JAC :  0.9330428800733777
	 Class  PER
		 REC :  0.9984457569163817
		 PREC :  0.9922767995057151
		 F1 :  0.9953517198636505
		 JAC :  0.9907464528069093



100%|██████████| 404/404 [00:03<00:00, 114.77it/s]





CONLL - FLAIR
DETECTION:
	 REC :  0.9785015241456763
	 PREC :  0.9898295915607249
	 F1 :  0.984132960413081
	 JAC :  0.968761582040557
CLASSIFICATION:
	 Class  LOC
		 REC :  0.9309612320764737
		 PREC :  0.9621295279912184
		 F1 :  0.9462887989203778
		 JAC :  0.8980532786885246
	 Class  MISC
		 REC :  0
		 PREC :  0
		 F1 :  0
		 JAC :  0
	 Class  ORG
		 REC :  0.982612895435885
		 PREC :  0.9697330791229742
		 F1 :  0.9761305025788654
		 JAC :  0.9533739456419869
	 Class  PER
		 REC :  0.9984457569163817
		 PREC :  0.9965870307167235
		 F1 :  0.9975155279503106
		 JAC :  0.9950433705080545

CONLL - GC


100%|██████████| 404/404 [00:13<00:00, 29.33it/s]





CONLL - GC
DETECTION:
	 REC :  0.948727322579171
	 PREC :  0.33204318272690925
	 F1 :  0.49192019713771207
	 JAC :  0.32618976542492184
CLASSIFICATION:
	 Class  LOC
		 REC :  0.8677859391395593
		 PREC :  0.6984797297297297
		 F1 :  0.7739822180627048
		 JAC :  0.6312977099236641
	 Class  MISC
		 REC :  0.5762081784386617
		 PREC :  0.6373355263157895
		 F1 :  0.605232331120656
		 JAC :  0.4339305711086226
	 Class  ORG
		 REC :  0.7754247722235903
		 PREC :  0.8838057816446815
		 F1 :  0.8260755508919203
		 JAC :  0.7036871508379888
	 Class  PER
		 REC :  0.9509742300439975
		 PREC :  0.9040932178069914
		 F1 :  0.9269413386429776
		 JAC :  0.8638310019982872



100%|██████████| 404/404 [00:13<00:00, 29.94it/s]





CONLL - GC
DETECTION:
	 REC :  0.931185144729656
	 PREC :  0.6248625668841163
	 F1 :  0.7478726204052988
	 JAC :  0.59728158060674
CLASSIFICATION:
	 Class  LOC
		 REC :  0.8989130434782608
		 PREC :  0.7525022747952684
		 F1 :  0.8192174343734522
		 JAC :  0.6937919463087249
	 Class  MISC
		 REC :  0
		 PREC :  0
		 F1 :  0
		 JAC :  0
	 Class  ORG
		 REC :  0.8390620836664002
		 PREC :  0.954531676265535
		 F1 :  0.8930799773114011
		 JAC :  0.8068152703048936
	 Class  PER
		 REC :  0.9714285714285714
		 PREC :  0.942385549672999
		 F1 :  0.9566866898514069
		 JAC :  0.916969696969697

WIKINER - SPACY


100%|██████████| 286953/286953 [04:14<00:00, 1127.87it/s]





WIKINER - SPACY
DETECTION:
	 REC :  0.9821991503731712
	 PREC :  0.9893720445522334
	 F1 :  0.9857725493894102
	 JAC :  0.9719442604279256
CLASSIFICATION:
	 Class  LOC
		 REC :  0.9744091422341626
		 PREC :  0.967272200212581
		 F1 :  0.9708275547710719
		 JAC :  0.9433089267708893
	 Class  MISC
		 REC :  0.9228210068143708
		 PREC :  0.945247405350095
		 F1 :  0.9338995902009279
		 JAC :  0.8759959020904419
	 Class  ORG
		 REC :  0.9070502327302749
		 PREC :  0.9100165657488861
		 F1 :  0.9085309779935413
		 JAC :  0.8323928207539776
	 Class  PER
		 REC :  0.9802157050308506
		 PREC :  0.9774903478394531
		 F1 :  0.9788511294261986
		 JAC :  0.9585782814176398



100%|██████████| 286953/286953 [04:23<00:00, 1089.34it/s]





WIKINER - SPACY
DETECTION:
	 REC :  0.9825474863486339
	 PREC :  0.9881902737443823
	 F1 :  0.9853608015872024
	 JAC :  0.9711440314237858
CLASSIFICATION:
	 Class  LOC
		 REC :  0.9805495876133818
		 PREC :  0.9805079682250497
		 F1 :  0.980528777477573
		 JAC :  0.9618013297633841
	 Class  MISC
		 REC :  0
		 PREC :  0
		 F1 :  0
		 JAC :  0
	 Class  ORG
		 REC :  0.9434294681985076
		 PREC :  0.9414780450327995
		 F1 :  0.9424527464726241
		 JAC :  0.8911684497587581
	 Class  PER
		 REC :  0.9862698141179154
		 PREC :  0.9869586254747298
		 F1 :  0.9866140995717516
		 JAC :  0.973581830134815

WIKINER - FLAIR


100%|██████████| 286953/286953 [04:16<00:00, 1119.99it/s]





WIKINER - FLAIR
DETECTION:
	 REC :  0.9652009289489113
	 PREC :  0.9752141993513983
	 F1 :  0.9701817280332613
	 JAC :  0.942090225472904
CLASSIFICATION:
	 Class  LOC
		 REC :  0.8107732562188478
		 PREC :  0.9604819636837929
		 F1 :  0.8793008263125243
		 JAC :  0.784600227212919
	 Class  MISC
		 REC :  0.8626786037258094
		 PREC :  0.7153999025085305
		 F1 :  0.7821666495849134
		 JAC :  0.642260822729415
	 Class  ORG
		 REC :  0.8513139596564924
		 PREC :  0.609557049735667
		 F1 :  0.7104315470849335
		 JAC :  0.5509064256953593
	 Class  PER
		 REC :  0.9521115660555113
		 PREC :  0.9671276615322328
		 F1 :  0.9595608709544895
		 JAC :  0.9222652668155434



100%|██████████| 286953/286953 [04:27<00:00, 1072.99it/s]





WIKINER - FLAIR
DETECTION:
	 REC :  0.952834929159148
	 PREC :  0.9864951596034719
	 F1 :  0.9693729303363287
	 JAC :  0.9405661454755577
CLASSIFICATION:
	 Class  LOC
		 REC :  0.883126593892457
		 PREC :  0.9798093146468634
		 F1 :  0.9289591324552254
		 JAC :  0.867342377499326
	 Class  MISC
		 REC :  0
		 PREC :  0
		 F1 :  0
		 JAC :  0
	 Class  ORG
		 REC :  0.9507188830894361
		 PREC :  0.6552637386890311
		 F1 :  0.7758136374919975
		 JAC :  0.633738180110568
	 Class  PER
		 REC :  0.9784810597950045
		 PREC :  0.9823448868867092
		 F1 :  0.9804091664859395
		 JAC :  0.9615711854792967

WIKINER - GC


100%|██████████| 286953/286953 [17:07<00:00, 279.30it/s]





WIKINER - GC
DETECTION:
	 REC :  0.9902834343176575
	 PREC :  0.325708107705915
	 F1 :  0.49019060257507563
	 JAC :  0.3246705202730403
CLASSIFICATION:
	 Class  LOC
		 REC :  0.8726351232969224
		 PREC :  0.9269059373276443
		 F1 :  0.8989521768665066
		 JAC :  0.8164515273352627
	 Class  MISC
		 REC :  0.7124752237371614
		 PREC :  0.7554726594486398
		 F1 :  0.7333442259541716
		 JAC :  0.5789609465970339
	 Class  ORG
		 REC :  0.7908442597635961
		 PREC :  0.5867923545648664
		 F1 :  0.673706491906792
		 JAC :  0.5079618408713843
	 Class  PER
		 REC :  0.9062095320890724
		 PREC :  0.905217643014799
		 F1 :  0.9057133159859713
		 JAC :  0.827674620569869



100%|██████████| 286953/286953 [16:22<00:00, 292.01it/s]





WIKINER - GC
DETECTION:
	 REC :  0.9694178463210918
	 PREC :  0.6013702856186159
	 F1 :  0.7422758999407505
	 JAC :  0.5901738703311665
CLASSIFICATION:
	 Class  LOC
		 REC :  0.9052326887746336
		 PREC :  0.9546336788918895
		 F1 :  0.9292770996470554
		 JAC :  0.8678969127686873
	 Class  MISC
		 REC :  0
		 PREC :  0
		 F1 :  0
		 JAC :  0
	 Class  ORG
		 REC :  0.8680277349768876
		 PREC :  0.7069534553942299
		 F1 :  0.7792540131547097
		 JAC :  0.6383424738249558
	 Class  PER
		 REC :  0.945722075479722
		 PREC :  0.9418325043410335
		 F1 :  0.9437732824067524
		 JAC :  0.8935328624873881



### Metrics examples
Named Entity Recognition aims to **detect** entities and **classify** them. So we have to measure both, detection and classification capacity.

#### For _detection_ capacity it's enough to use the accuracy:
$$ Accuracy = \frac{TP+TN}{TP+TN + FP+FN} $$

where 
- $ TP$ are the true positives entities detected (the model says that a **real** entity **is** an entity)
- $ TN$ are the true negatives entities detected (the model says that a token that **a no real** entity **is not** an entity)
- $ FP$ are the false positives entities detected (the model says that a **no real** entity **is** an entity)
- $ FN$ are the false negatives entities detected (the model says that a **real** entity **is not** an entity)

#### For the _classify_ task, some ideas could be: 
- Measure the accuracy on the entities that are detected
- Measure the capacity of classify a specific class. For example, we want to study the 'PERS' class. We could use other metrics, like _recall_, _precision_ and _F1-Score_
$$ recall = \frac{TP}{TP+FN} $$
$$ precision = \frac{TP}{TP+FP} $$
$$ f1-score = 2 \frac{precision \times recall}{precision+recall} $$
    so, in this point, it's interesting to study each class with these metrics.
    
Here, _TP,TN,FP,FN_ refer to the specific entity class

## Testeo con CoNLL

In [None]:
def calculate_num_original_entities_conll(corpus_pre):
    num_original_entities = 0
    for text in corpus_pre:
        new_text = []
        for token in text.split('\n'):
            ent_type = token.split(' ')[1]
            
            if ent_type.strip().upper() != 'O':
                num_original_entities += 1
        
    return corpus
    

In [22]:
from transformers import pipeline

print("mean,",np.array([len(n) for n in noticias]).mean())
print("max,",np.array([len(n) for n in noticias]).max())
print("min,",np.array([len(n) for n in noticias]).min())
nlp_ner = pipeline(
    "ner",
    model="mrm8488/bert-spanish-cased-finetuned-ner",
    tokenizer=(
        'mrm8488/bert-spanish-cased-finetuned-ner',  
        {"use_fast": False, "model_max_length":87})
)

text = ['Mis amigos están pensando viajar a Londres este verano','segunda frase del 2022 en Madrid o Real Madrid']

entidades_transf = []

output = nlp_ner(text)
print(type(output))
print(output)
for new_list in tqdm(output):
    for ent_dict in new_list:
        entidades_transf.append(prepare_string(ent_dict['entity']))
        

entidades_transf = list(set(entidades_transf))
print("Número de entidades: ",len(entidades_transf))
    

mean, 2486.8713540431586
max, 8087
min, 151
<class 'list'>
[[{'entity': 'B-LOC', 'score': 0.99986696, 'index': 7, 'word': 'Londres', 'start': None, 'end': None}], [{'entity': 'B-LOC', 'score': 0.99954164, 'index': 7, 'word': 'Madrid', 'start': None, 'end': None}, {'entity': 'B-ORG', 'score': 0.63106686, 'index': 9, 'word': 'Real', 'start': None, 'end': None}, {'entity': 'I-LOC', 'score': 0.8082197, 'index': 10, 'word': 'Madrid', 'start': None, 'end': None}]]


100%|██████████| 2/2 [00:00<00:00, 4899.89it/s]

Número de entidades:  3



