In [1]:
import spacy
import stanza
import pandas as pd
from tqdm import tqdm
from spacy import displacy
from matplotlib import pyplot as plt

In [2]:
stanza.download('en', verbose=False)

In [3]:
data = pd.read_csv("data/custom/DATA_PROCESSADO.csv")

In [4]:
data.head()

Unnamed: 0,text,text_lower
0,Long over due !! and lizzo was fire AF omg !!!,long over due !! and lizzo was fire af omg !!!
1,Another Bachour Class at bachour1234 with pavo...,another bachour class at bachour1234 with pavo...
2,Always happy to be here need to move here forever,always happy to be here need to move here forever
3,Morning love it up North this morning First Wa...,morning love it up north this morning first wa...
4,What are your flaws that only God can anoint a...,what are your flaws that only god can anoint a...


In [5]:
nlp_spacy_model = spacy.load("en_core_web_trf")

nlp_stanza_model = stanza.Pipeline(lang='en', 
                      use_gpu=False,
                      verbose=False,
                      processors='tokenize, ner')

In [9]:
def simple_clean_text(text):
    return " ".join(text.replace('\n', '').split())

def chunk_class(tag):
    
    """
        Converte as classes Spacy em um modelo customizado...
    """
    
    final = tag
    
    try:
        pre, suf = tag.split('-')
    except:
        return final
    
    if suf == 'PERSON': 
        final = pre + '-PER'
        
    if suf == 'EVENT':
        final = pre + '-EVENT'
        
    if suf == 'GPE' or suf == 'FAC': 
        final = pre + '-LOC'
        
    if suf == 'ORG':
        final = pre + '-ORG'
        
    return final

def stanza_parser_tags(tag):
    
    if tag != 'O':
        prefix, sufix = tag.split('-')
        if  prefix == "E": return 'I-' + sufix
        elif prefix == 'S': return 'B-' +  sufix
        else: return tag
    else: return tag
    
def spacy_annotator(index, sentence, apply_func):
    
    document = apply_func(sentence)

    orgin_tokens = []
    origin_tags = []
    
    for word_obj in document:
        
        tmp_tag = 'O'
        if word_obj.ent_iob_ != 'O':
            tmp_tag = word_obj.ent_iob_+ '-' + word_obj.ent_type_

        orgin_tokens.append(str(word_obj))
        origin_tags.append(chunk_class(tmp_tag))

    return pd.DataFrame({
        "Sentence": ['Sentence #' + str(index)] * len(orgin_tokens),
        "Word": orgin_tokens,
        "Tag": origin_tags
    })
    
def stanza_annotator(index, text, apply_func):
    
    doc = apply_func(text)
    
    words = []
    tags = []
    
    for sent in doc.sentences:
        for token in sent.tokens:
    
            words.append(token.text)
            #CONVERTE BILOUS TO BIO
            bio_tag = stanza_parser_tags(token.ner)
            tags.append(chunk_class(bio_tag))
    
    return pd.DataFrame({
        "Sentence": ['Sentence #' + str(index)] * len(words),
        "Word": words,
        "Tag": tags
    })

## Reducer NER Class

In [10]:
def reduce_ner_class(candidates_preds: zip) -> list:

    if not isinstance(candidates_preds, zip):
        raise Exception("Tipo de estrutura não é aceita!")
        
    final_tags = []

    for tags in tqdm(candidates_preds):
        if not 'O' in tags:
            tag = max(tags)
            final_tags.append(tag)
        else:
            tag = tuple(filter((lambda x: x!= 'O'), tags))

            if tag:
                final_tags.append(tag[0])
            else:
                final_tags.append('O')
    
    return final_tags

In [13]:
## Erro no Tokenizer de ambos pipelines

In [14]:
%%time

final_data_annotate = pd.DataFrame()

for index, text in enumerate(tqdm(data.text[:10])):
    
    preproc_text = simple_clean_text(text)
    
    tmp_pd_1 = stanza_annotator(index, preproc_text, nlp_stanza_model)
    tmp_pd_2 = spacy_annotator(index, preproc_text, nlp_spacy_model)
    
    tmp_pd_1.Tag = reduce_ner_class(zip(tmp_pd_1.Tag, tmp_pd_2.Tag))
    
    if len(tmp_pd_1.Tag.value_counts()) == 1:
        pass
    else:
        final_data_annotate = final_data_annotate.append(tmp_pd_1)

In [187]:
tmp_pd_1

Unnamed: 0,Sentence,Word,Tag
0,Sentence #1,Another,O
1,Sentence #1,Bachour,O
2,Sentence #1,Class,O
3,Sentence #1,at,O
4,Sentence #1,bachour,O
5,Sentence #1,1234,O
6,Sentence #1,with,O
7,Sentence #1,pavonitalia,O
8,Sentence #1,valrhonausa,O
9,Sentence #1,bravonorthamerica,O


In [188]:
tmp_pd_2

Unnamed: 0,Sentence,Word,Tag
0,Sentence #1,Another,O
1,Sentence #1,Bachour,O
2,Sentence #1,Class,O
3,Sentence #1,at,O
4,Sentence #1,bachour1234,O
5,Sentence #1,with,O
6,Sentence #1,pavonitalia,O
7,Sentence #1,valrhonausa,O
8,Sentence #1,bravonorthamerica,O
9,Sentence #1,BRAVOSPA,O
