In [1]:
# !./requeriments.sh

In [2]:
from collections import Counter
from statistics import mode, StatisticsError

import pandas as pd
import spacy
import spacy_stanza
import stanza
import tqdm
from matplotlib import pyplot as plt
from spacy import displacy

In [3]:
# Load the pre-processed corpus; the path is relative to the notebook's working directory.
data = pd.read_csv("data/custom/DATA_PROCESSADO.csv")

In [4]:
# Preview the first rows to confirm the expected columns were loaded.
data.head()

Unnamed: 0,text,text_lower
0,Long over due !! and lizzo was fire AF omg !!!,long over due !! and lizzo was fire af omg !!!
1,Another Bachour Class at bachour1234 with pavo...,another bachour class at bachour1234 with pavo...
2,Always happy to be here need to move here forever,always happy to be here need to move here forever
3,Morning love it up North this morning First Wa...,morning love it up north this morning first wa...
4,What are your flaws that only God can anoint a...,what are your flaws that only god can anoint a...


In [5]:
# Ask spaCy to use a GPU if it can; the return value is discarded here.
# NOTE(review): the Stanza pipeline built later is created with use_gpu=False,
# so only the spaCy models are affected by this call - confirm that is intended.
spacy.prefer_gpu() 
# Fetch the English Stanza resources (verbose output disabled) before the pipeline is built.
stanza.download('en', verbose=False)

In [6]:
# Annotator #1 (spaCy): the "en_core_web_trf" English pipeline.
nlp_spacy_model = spacy.load("en_core_web_trf")

# Annotator #2 (spaCy): the "en_core_web_lg" English pipeline.
nlp_spacy_model_lg = spacy.load("en_core_web_lg")

# Annotator #3 (Stanza): English pipeline with GPU use disabled and tokenization
# delegated to spaCy through the `processors` mapping.
# NOTE(review): presumably the spaCy tokenizer is chosen so that all three
# annotators produce the same token sequence - confirm against the Stanza docs.
nlp_stanza_model = stanza.Pipeline(lang='en', 
                                   use_gpu=False,
                                   verbose=False,
                                   processors={'tokenize': 'spacy'})

Some weights of the model checkpoint at /home/mpgxc/anaconda3/lib/python3.8/site-packages/en_core_web_trf/en_core_web_trf-3.1.0/transformer/model were not used when initializing RobertaModel: ['embeddings.position_ids']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
def simple_clean_text(text):
    """Normalize whitespace in a string.

    Every run of whitespace (spaces, tabs, line breaks) is collapsed into a
    single space, and leading/trailing whitespace is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty or whitespace-only input yields "".
    """
    # str.split() with no separator already treats line breaks as whitespace.
    # Deleting them first would glue together words that were separated only
    # by a line break (e.g. "hello<newline>world" would become "helloworld").
    return " ".join(text.split())

def chunk_class(tag):
    """Map an entity tag onto the reduced custom label set.

    A tag of the form "<prefix>-<label>" has its label rewritten as follows:
    PERSON becomes PER, and GPE / FAC become LOC. Any other label is kept.

    Args:
        tag: A tag string such as "B-PERSON", "I-GPE" or "O".

    Returns:
        The rewritten tag. The input is returned unchanged when it is not a
        string or does not consist of exactly two "-"-separated parts.
    """
    label_map = {'PERSON': 'PER', 'GPE': 'LOC', 'FAC': 'LOC'}

    # Explicit shape checks replace the former bare `except:`, which also
    # swallowed unrelated errors such as KeyboardInterrupt.
    if not isinstance(tag, str):
        return tag

    parts = tag.split('-')
    if len(parts) != 2:
        return tag

    prefix, label = parts
    return prefix + '-' + label_map.get(label, label)

def stanza_parser_tags(tag):
    """Rewrite a tag that may use E-/S- prefixes so it only uses B-/I- prefixes.

    "E-<label>" becomes "I-<label>" and "S-<label>" becomes "B-<label>";
    the outside tag 'O' and every other prefix are returned untouched.

    Raises:
        ValueError: if a non-'O' tag does not split into exactly two parts
            around "-".
    """
    if tag == 'O':
        return tag

    scheme, label = tag.split('-')

    bio_prefix = {'E': 'I-', 'S': 'B-'}.get(scheme)
    if bio_prefix is None:
        return tag
    return bio_prefix + label
    
def spacy_annotator(index, sentence, apply_func):
    """Tag one sentence with `apply_func` and return one row per token.

    Args:
        index: Sentence number; rendered as "Sentence #<index>" in every row.
        sentence: Text handed to `apply_func`.
        apply_func: Callable whose result is iterable and yields token objects
            exposing `ent_iob_` and `ent_type_` (e.g. a loaded spaCy pipeline).

    Returns:
        A DataFrame with columns "Sentence", "Word" and "Tag", where each tag
        has been passed through `chunk_class`.
    """
    def _raw_tag(token):
        # Tokens marked 'O' carry no entity type; everything else is "<iob>-<type>".
        if token.ent_iob_ == 'O':
            return 'O'
        return token.ent_iob_ + '-' + token.ent_type_

    rows = [
        (str(token), chunk_class(_raw_tag(token)))
        for token in apply_func(sentence)
    ]
    sentence_id = 'Sentence #' + str(index)

    return pd.DataFrame({
        "Sentence": [sentence_id] * len(rows),
        "Word": [word for word, _ in rows],
        "Tag": [label for _, label in rows]
    })
    
def stanza_annotator(index, text, apply_func):
    """Tag one text with `apply_func` and return one row per token.

    Args:
        index: Sentence number; rendered as "Sentence #<index>" in every row.
        text: Text handed to `apply_func`.
        apply_func: Callable whose result exposes `.sentences`, each with
            `.tokens` carrying `.text` and `.ner` (e.g. a Stanza pipeline).

    Returns:
        A DataFrame with columns "Sentence", "Word" and "Tag". Each tag is
        first rewritten by `stanza_parser_tags` and then by `chunk_class`.
    """
    parsed = apply_func(text)

    # Flatten all sentences of the document into a single token sequence.
    flat_tokens = [token for sent in parsed.sentences for token in sent.tokens]
    sentence_id = 'Sentence #' + str(index)

    return pd.DataFrame({
        "Sentence": [sentence_id] * len(flat_tokens),
        "Word": [token.text for token in flat_tokens],
        # Convert E-/S- prefixed tags to B-/I- before reducing the label set.
        "Tag": [chunk_class(stanza_parser_tags(token.ner)) for token in flat_tokens]
    })

## Reducer NER Class

In [33]:
def reduce_ner_class(candidates_preds):
    """Reduce per-token candidate tags to a single tag by strict majority vote.

    Args:
        candidates_preds: An iterable (typically a `zip` of the annotators'
            tag columns) yielding, for each token, the tags proposed for it.

    Returns:
        A list with one tag per token: the most frequent candidate, or 'O'
        when the candidates are empty or the top count is shared by more than
        one tag.

    Raises:
        TypeError: if `candidates_preds` is a string/bytes or is not iterable.
    """
    if isinstance(candidates_preds, (str, bytes)):
        raise TypeError("Tipo de estrutura não é aceita!")
    try:
        per_token_candidates = iter(candidates_preds)
    except TypeError:
        raise TypeError("Tipo de estrutura não é aceita!") from None

    def _majority(candidate_tags):
        # statistics.mode() stopped raising on ties in Python 3.8 and returns
        # the first value seen instead, so ties are detected explicitly here
        # to keep the 'O' fallback effective.
        top_two = Counter(candidate_tags).most_common(2)
        if not top_two:
            return 'O'
        if len(top_two) == 2 and top_two[0][1] == top_two[1][1]:
            return 'O'
        return top_two[0][0]

    return [_majority(tags) for tags in per_token_candidates]

In [38]:
%%time

# Number of leading rows of `data` annotated in this run.
N_SENTENCES = 100

# Per-sentence frames are collected and concatenated once after the loop:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing
# a frame inside a loop re-copies all accumulated rows on every iteration.
annotated_frames = []
# Indices of sentences whose three annotations did not have the same number of tokens.
misaligned_sentences = []

for index, text in enumerate(tqdm.notebook.tqdm(data.text[:N_SENTENCES])):

    preproc_text = simple_clean_text(text)

    # Predictions #1
    stanza_pd = stanza_annotator(index, preproc_text, nlp_stanza_model)
    # Predictions #2
    spacy_pd = spacy_annotator(index, preproc_text, nlp_spacy_model)
    # Predictions #3
    spacy_pd_lg = spacy_annotator(index, preproc_text, nlp_spacy_model_lg)

    # zip() silently truncates to the shortest input, so a token-count mismatch
    # would either misalign the votes or fail when the tags are assigned back.
    # Such sentences are skipped and recorded for inspection.
    if not (len(stanza_pd) == len(spacy_pd) == len(spacy_pd_lg)):
        misaligned_sentences.append(index)
        continue

    # Majority vote across the three annotators; the input frame is not mutated.
    voted_pd = stanza_pd.assign(Tag=reduce_ner_class(zip(stanza_pd.Tag,
                                                         spacy_pd.Tag,
                                                         spacy_pd_lg.Tag)))

    # Keep only sentences whose voted tags contain more than one distinct
    # label; a sentence carrying a single label throughout is dropped.
    if voted_pd.Tag.nunique() > 1:
        annotated_frames.append(voted_pd)

# Row labels restart at 0 for every sentence, as they did with repeated append().
final_data_annotate = (
    pd.concat(annotated_frames)
    if annotated_frames
    else pd.DataFrame(columns=["Sentence", "Word", "Tag"])
)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


CPU times: user 4min 3s, sys: 554 ms, total: 4min 4s
Wall time: 41.2 s


In [41]:
final_data_annotate

Unnamed: 0,Sentence,Word,Tag
0,Sentence #3,Morning,O
1,Sentence #3,love,O
2,Sentence #3,it,O
3,Sentence #3,up,O
4,Sentence #3,North,B-PER
...,...,...,...
15,Sentence #99,down,O
16,Sentence #99,Jess,B-ORG
17,Sentence #99,Frank,I-ORG
18,Sentence #99,Golf,I-ORG


In [40]:
# The target file is a .csv, so write real CSV text. to_excel() picks its writer
# engine from the file extension and is not meant for a ".csv" path.
# index=False omits the row labels, as the previous index=None did.
final_data_annotate.to_csv('data/final_data.csv', index=False)