# Deep Model Evaluation

In [None]:
!pip install transformers 
!pip install datasets
!pip install seqeval
!pip install huggingface_hub
!apt install git-lfs
!pip install spacy==3.2.4
!python -m spacy download pt_core_news_lg

In [None]:
import requests
from tqdm import tqdm
from zipfile import ZipFile
import os 
import shutil

In [None]:
from nltk.tokenize import wordpunct_tokenize
import string

In [None]:
import json
import random

# Load the sample groups written to 'new_samples.json' by the qualitative
# analysis cell and keep only the "diff_n_labels" group. The context manager
# closes the file handle, which a bare open() inside json.load() never does.
with open('new_samples.json') as samples_file:
    data = json.load(samples_file)["diff_n_labels"]

# Shuffle in place so that slicing the head of the list yields a random sample.
random.shuffle(data)


In [None]:
# Persist the first 23 shuffled samples. Writing through a context manager
# guarantees the JSON is flushed and the handle closed when the cell finishes.
with open('test_samples.json', 'w') as test_samples_file:
    json.dump(data[:23], test_samples_file, indent=4)

In [None]:
import re
def join_punctuation_marks(text):
    """
    Glue punctuation marks onto the word that precedes them.

    Drops the single whitespace character found between a word character and
    a run of ``. , ? ! ; :`` marks, e.g. ``"ola , mundo ."`` -> ``"ola, mundo."``.

    :param text: text whose punctuation is separated from words by a space
    :return:  the text with that separating space removed
    """
    return re.sub(r'(\w)\s([.,?!;:]+)', r'\1\2', text)

In [None]:
def transform_sentencesv2(tokens_, labels):
    """
    Rebuild a punctuated sentence from tokens and their punctuation labels.

    Tokens are emitted in order. A token labelled ``I-PERIOD`` is followed by
    ``.`` and one labelled ``I-COMMA`` by ``,``; any other label adds nothing.
    Pairs come from ``zip``, so items beyond the shorter sequence are ignored.

    :param tokens_: iterable of word strings
    :param labels: iterable of labels, one per token
    :return:  space-joined sentence with the punctuation attached to the words
    """
    pieces = []
    for word, tag in zip(tokens_, labels):
        pieces.append(word)
        if tag == "I-PERIOD":
            mark = '.'
        elif tag == 'I-COMMA':
            mark = ','
        else:
            # No punctuation follows this word.
            continue
        pieces.append(mark)

    return join_punctuation_marks(' '.join(pieces))

In [None]:
import pandas as pd

# Build one row per sample of the first 23: the raw token/label lists are
# replaced by two readable sentences, one punctuated with the reference labels
# ('anotadores') and one with the predicted labels ('modelo_IA').
data_samples = []
for item in data[:23]:

    # pop() removes the list fields from the sample dict in place, so the
    # dataframe only gets the remaining fields plus the two sentences below.
    tokens = item.pop('tokens')
    pred_labels = item.pop('pred_labels')
    labels = item.pop('labels')
    item['anotadores'] = transform_sentencesv2(tokens,labels )
    item['modelo_IA'] = transform_sentencesv2(tokens,pred_labels )
    data_samples.append(item)

datadf = pd.DataFrame.from_dict(data_samples)
datadf

In [None]:
datadf = datadf.rename(columns={
    'text': 'contexto'
})
datadf

In [None]:
datadf.to_csv('manual_evaluation_sample.csv', index_label=False)

In [None]:
len(data[:23])

In [None]:
def text2labels(sentence):
    """
    Convert punctuated text into one punctuation label per word.

    The lower-cased sentence is split with ``wordpunct_tokenize``. Every token
    not found in ``string.punctuation`` gets an ``'O'`` label; a ``.``, ``?``,
    ``!`` or ``;`` token relabels the previous word as ``'I-PERIOD'`` and a
    ``,`` token relabels it as ``'I-COMMA'``. Any other punctuation token is
    dropped without producing a label.

    NOTE(review): ``token not in string.punctuation`` is a substring test on a
    string, so a multi-character token such as ``"..."`` (if the tokenizer
    emits one) counts as a word and receives ``'O'`` -- confirm this matches
    how the reference labels were built.

    :param sentence: text to convert
    :return:  list of labels
    :raises ValueError: if a labelled punctuation mark precedes every word
    """
    period_marks = ['.', '?', '!', ';']

    labels = []
    for token in wordpunct_tokenize(sentence.lower()):
        if token not in string.punctuation:
            labels.append('O')
            continue

        if token in period_marks:
            punctuation_label = 'I-PERIOD'
        elif token == ',':
            punctuation_label = 'I-COMMA'
        else:
            # Punctuation the label scheme does not model.
            continue

        try:
            # The mark belongs to the word right before it.
            labels[-1] = punctuation_label
        except IndexError:
            raise ValueError(f"Sentence can't start with punctuation {token}")
    return labels

In [None]:
def bert_transform_sentences(text_, groups):
    """
    Insert the predicted punctuation marks into the raw text.

    For each group, a ``.`` (when ``entity_group`` is ``'PERIOD'``) or a ``,``
    (any other value) is inserted at the group's ``end`` character offset,
    shifted by the number of marks already inserted. Groups whose shifted
    offset falls past the end of the text are skipped.

    :param text_: the unpunctuated text
    :param groups: iterable of dicts with ``'entity_group'`` and ``'end'`` keys
    :return:  the text with punctuation characters inserted
    """
    chars = list(text_)
    inserted = 0

    for group in groups:
        mark = ','
        if group['entity_group'] == 'PERIOD':
            mark = '.'

        position = group['end'] + inserted
        if position >= len(chars) + 1:
            # Offset lies beyond the text: nothing to attach the mark to.
            continue

        chars.insert(position, mark)
        inserted += 1

    return ''.join(chars)

In [None]:
from nltk.tokenize import wordpunct_tokenize
import string
def remove_punctuation(text):
    """
    Tokenize text and drop its punctuation tokens.

    NOTE(review): membership is tested against the ``string.punctuation``
    string, i.e. as a substring, so only tokens that appear verbatim inside it
    are removed.

    :param text: text to remove punctuation from
    :return:  list of the remaining tokens, lower-cased
    """
    kept_tokens = []
    for word in wordpunct_tokenize(text):
        if word in string.punctuation:
            continue
        kept_tokens.append(word.lower())
    return kept_tokens

### Load Dataset

In [None]:
from seqeval.metrics import classification_report

def compute_scores(true_labels, pred_labels):
    """
    Build a seqeval classification report for aligned label sequences.

    Each reference sequence is paired with its predicted sequence and both are
    truncated to the shorter of the two, so every sentence contributes the
    same number of reference and predicted labels.

    The previous version overwrote both parameters with empty lists before
    looping, so the report was always computed on no data, and it appended
    the whole predicted sequence instead of the single predicted label.

    :param true_labels: iterable of reference label sequences, one per sentence
    :param pred_labels: iterable of predicted label sequences, one per sentence
    :return:  the value returned by ``classification_report``
    """
    aligned_true = []
    aligned_pred = []
    for t_lbls, p_lbls in zip(true_labels, pred_labels):
        # zip() stops at the shorter sequence, which performs the truncation.
        pairs = list(zip(t_lbls, p_lbls))
        aligned_true.append([t_lbl for t_lbl, _ in pairs])
        aligned_pred.append([p_lbl for _, p_lbl in pairs])

    return classification_report(aligned_true, aligned_pred)



In [None]:
def t5_transform_sentence(text):
    """
    Turn T5 tagged output into plain punctuated text.

    `` [I-COMMA]`` becomes ``,`` and `` [I-PERIOD]`` becomes ``.``; ``[Other]``
    tags and the ``Recognize Entities: `` prompt are deleted. The space in
    front of an ``[Other]`` tag is kept, so doubled spaces can remain.

    :param text: generated text containing the bracketed tags
    :return:  the text with tags replaced by punctuation
    """
    replacements = (
        (' [I-COMMA]', ','),
        (' [I-PERIOD]', '.'),
        ('[Other]', ''),
        ('Recognize Entities: ', ''),
    )
    # Applied in this order, one full pass over the text per marker.
    for marker, substitute in replacements:
        text = text.replace(marker, substitute)
    return text

In [None]:
from transformers import pipeline, TokenClassificationPipeline

def get_bert_pred_sentence(sentence: str, model: pipeline):
    """
    Punctuate a sentence with a token-classification model.

    :param sentence: unpunctuated text passed to the model
    :param model: callable returning the entity groups for the sentence
    :return:  the sentence with the predicted punctuation inserted
    """
    return bert_transform_sentences(sentence, model(sentence))

def get_t5_pred_sentence(sentence: str, model: pipeline):
    """
    Punctuate a sentence with a text-to-text generation model.

    :param sentence: prompt text passed to the model
    :param model: callable returning a list whose first item holds the
        ``'generated_text'`` key
    :return:  the generated text with its tags converted to punctuation
    """
    outputs = model(sentence)
    first_output = outputs[0]
    return t5_transform_sentence(first_output['generated_text'])

In [None]:
def t5labels2text(text):
    """
    Convert T5 tagged text into plain punctuated text.

    `` [I-COMMA]`` becomes ``,`` and `` [I-PERIOD]`` becomes ``.``; `` [Other]``
    tags (with their leading space) and the ``Recognize Entities: `` prompt
    are deleted.

    :param text: text containing the bracketed tags
    :return:  the text with tags replaced by punctuation
    """
    return (
        text.replace(' [I-COMMA]', ',')
        .replace(' [I-PERIOD]', '.')
        .replace(' [Other]', '')
        .replace('Recognize Entities: ', '')
    )

In [None]:
def get_model(model_path: str, model_type:str):
    """
    Create the Hugging Face pipeline for the requested model family.

    ``'bert'`` builds a ``"ner"`` pipeline with average aggregation and
    ``'t5'`` builds a ``"text2text-generation"`` pipeline limited to 512
    tokens; both are placed on device 0.

    :param model_path: model identifier or path handed to ``pipeline``
    :param model_type: either ``'bert'`` or ``'t5'``
    :return:  the constructed pipeline
    :raises ValueError: if ``model_type`` is neither ``'bert'`` nor ``'t5'``
    """
    if model_type == 'bert':
        return pipeline("ner", model=model_path, aggregation_strategy="average", device=0)

    if model_type == 't5':
        return pipeline("text2text-generation", model_path, max_length=512, device=0, use_auth_token=True)

    raise ValueError("Model type not supported")

In [None]:
from tqdm.notebook import tqdm
from datasets import Dataset
from collections import Counter
from itertools import chain

def compute_report(test_subset: Dataset, 
                   model_path: str,
                   model_type: str = 'bert'):
    """
    Evaluate a punctuation model on a dataset and return a seqeval report.

    For every sample the text is whitespace-normalised, stripped of
    punctuation and lower-cased, punctuated again by the model, and converted
    back to labels. Reference and predicted labels are truncated to the
    shorter of the two before being scored.

    :param test_subset: iterable of samples with ``'text'`` and ``'labels'``
    :param model_path: model identifier or path given to ``get_model``
    :param model_type: ``'bert'`` (default) or ``'t5'``
    :return:  ``classification_report`` output as a dict
    """
    model = get_model(model_path, model_type)

    use_t5 = model_type == "t5"
    predict = get_t5_pred_sentence if use_t5 else get_bert_pred_sentence

    all_true = []
    all_pred = []
    for samples in tqdm(test_subset):
        normalized = ' '.join(samples['text'].split())
        text = ' '.join(remove_punctuation(normalized)).lower()
        if use_t5:
            # T5 expects the task prompt in front of the input.
            text = "Recognize Entities: " + text

        predicted = text2labels(predict(text, model))

        # zip() cuts both sequences to the shorter one.
        pairs = list(zip(samples['labels'], predicted))
        all_pred.append([guess for _, guess in pairs])
        all_true.append([gold for gold, _ in pairs])

    return classification_report(all_true, all_pred, output_dict=True)
  

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from datasets import load_dataset, Dataset

dataset = load_dataset('tiagoblima/mec-punctuation', use_auth_token=True)
dataset

In [None]:
from datasets import concatenate_datasets

both_annotators_ds = dataset['train'].filter(lambda sample: sample['tag']=='both_anotators')
both_annotators_ds

In [None]:
len(list(set(both_annotators_ds['text_id'])))

In [None]:
from collections import Counter 
from itertools import chain

true_labels = both_annotators_ds['labels']
true_labels[:3]
print(Counter(chain.from_iterable(both_annotators_ds['labels'])))

In [None]:
nilc_dataset = load_dataset('tiagoblima/punctuation-nilc', use_auth_token=True)
nilc_dataset

In [None]:
nilc_dataset['test']['text'][790:800]

In [None]:
print(Counter(list(chain.from_iterable(nilc_dataset['test']['labels']))))
 

In [None]:
BASE_MODEL_DIR = 'tiagoblima/punctuation-finetune-mec'
MODEL_NAME = 'bert-portuguese-tedtalk2012'
BERT_BASE = 'tiagoblima/punctuation-nilc-bert-base'
BERT_LARGE = 'tiagoblima/punctuation-nilc-bert-large'
T5_BASE = 'tiagoblima/punctuation-nilc-t5-base'
T5_LARGE = 'tiagoblima/punctuation-nilc-t5-large'

In [None]:
from seqeval.metrics import classification_report
import pandas as pd 

report = compute_report(both_annotators_ds, BERT_LARGE)
df = pd.DataFrame.from_dict(report, orient='index')
df

In [None]:
df.round(3).to_csv()

In [None]:
df.round(3).to_csv('results_mec_t5_base.csv')

In [None]:
from google.colab import files
files.download('results_mec_t5_base.csv') 

### Dataset Statistics

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(BERT_LARGE, use_auth_token=True)

## Qual a quantidade de erros de pontuação? (pontuação|vírgula) (treinar o modelo)

In [None]:
text_ids = list(set(both_annotators_ds['text_id']))
text_ids[:2]

In [None]:
from itertools import chain
# Merge the sentence-level rows back into one record per text_id.
essays = []
for text_id in text_ids:
  # Select every row that carries this text_id (one dataset pass per id).
  sentences = both_annotators_ds.filter(lambda ex: ex['text_id']==text_id)

  # Join the texts with spaces and flatten the per-row token/label lists into
  # one list per essay.
  essay = {
      'text_id':text_id,
      'text': ' '.join(sentences['text']),
      'tokens': list(chain.from_iterable(sentences['tokens'])),
      'labels': list(chain.from_iterable(sentences['labels']))
  }

  essays.append(essay)
  
  

In [None]:
len(essays)

In [None]:
import pandas as pd

df = pd.DataFrame(essays)
df

In [None]:
dataset = Dataset.from_dict(df)
dataset

In [None]:
def calculate_punct(example):
    """
    Store the number of non-``"O"`` labels of an example under ``'n_puncts'``.

    :param example: dict with a ``'labels'`` sequence; it is modified in place
    :return:  the same dict, with the ``'n_puncts'`` count added
    """
    punctuation_count = 0
    for label in example['labels']:
        if label != "O":
            punctuation_count += 1

    example['n_puncts'] = punctuation_count
    return example

In [None]:
def calculate_bert_tokens(example):
    """
    Store the tokenizer's token count for an example under ``'n_bert_tokens'``.

    The count is the length of ``tokenizer(text).tokens()`` for the notebook's
    global ``tokenizer``.
    NOTE(review): presumably includes the tokenizer's special tokens -- confirm.

    :param example: dict with a ``'text'`` string; it is modified in place
    :return:  the same dict, with the ``'n_bert_tokens'`` count added
    """
    encoding = tokenizer(example['text'])
    example['n_bert_tokens'] = len(encoding.tokens())
    return example

In [None]:
dataset = dataset.map(calculate_punct)
dataset['n_puncts'][:3]

In [None]:
dataset = dataset.map(calculate_bert_tokens)
dataset

In [None]:
filtered_dataset = dataset.filter(lambda example:example['n_bert_tokens'] <= 512)
filtered_dataset

In [None]:
from tqdm.notebook import tqdm
model = get_model(BERT_LARGE, 'bert')
report = []

# For each essay, punctuate the text with the model and compare the number of
# predicted punctuation labels with the 'n_puncts' count stored on the example.
for example in tqdm(filtered_dataset):
  # The model is an argument of get_bert_pred_sentence; text2labels only takes
  # the punctuated text. The closing parenthesis used to sit one call too
  # early, which raised a TypeError.
  pred_labels = text2labels(get_bert_pred_sentence(example['text'], model))
  # Count once and reuse for both the total and the difference.
  pred_n_puncts = sum(1 for lbl in pred_labels if lbl != "O")

  pred_report = {
      'text_id': example['text_id'],
      'pred_n_puncts': pred_n_puncts,
      'pred_labels': pred_labels,
      'n_puncts': example['n_puncts'],
      'diff': example['n_puncts'] - pred_n_puncts,
  }
  report.append(pred_report)

In [None]:
report_df = pd.DataFrame(report)
report_df

# Análise Qualitativa


In [None]:
from datasets import load_dataset,concatenate_datasets, Dataset

dataset = load_dataset('tiagoblima/mec-punctuation', use_auth_token=True).filter(lambda ex:ex['tag']=='both_anotators')['train']
dataset

In [None]:
dataset = dataset
dataset

In [None]:
set(dataset['tag'])

In [None]:
from transformers import pipeline, TokenClassificationPipeline


classifier = pipeline("ner", model=BERT_LARGE, aggregation_strategy="average", device=0)

In [None]:
filtered_dataset = dataset.shuffle(seed=42).select(range(int(2190 * 0.2)))
filtered_dataset

In [None]:
example = ' '.join(filtered_dataset[1]['tokens'])
example

In [None]:

classifier(example)

In [None]:
from tqdm.notebook import tqdm 
from seqeval.metrics import accuracy_score

# Buckets filled by the loop below:
#   equal_n_labels - prediction has as many non-'O' labels as the reference
#   diff_n_labels  - the two counts differ
#   full_match     - equal counts and accuracy above 0.99
#   miss_match     - equal counts but accuracy of 0.99 or less
samples_groups = {
    'equal_n_labels':[],
    'diff_n_labels':[],
    'full_match':[],
    'miss_match':[]
}
model = get_model(BERT_LARGE, 'bert')
for example in tqdm(filtered_dataset):
  # Punctuate the space-joined tokens and turn the result back into labels.
  preds = text2labels(get_bert_pred_sentence(' '.join(example['tokens']), model))
  example['pred_labels'] = preds

  # Number of punctuation (non-'O') labels on each side.
  len_pred = len(list(filter(lambda lbl: lbl != 'O', preds)))
  len_true = len(list(filter(lambda lbl: lbl != 'O', example['labels'])))
  if len_true != len_pred:
    samples_groups['diff_n_labels'].append(example)
  else:
    
    samples_groups['equal_n_labels'].append(example)

    # Same count is not the same placement: split further on label accuracy.
    if accuracy_score(preds, example['labels']) > 0.99:
      samples_groups['full_match'].append(example)
    else:
      samples_groups['miss_match'].append(example)

In [None]:
import json 

# Write through a context manager so the JSON is flushed and the handle closed
# before any other cell reopens 'new_samples.json'.
with open('new_samples.json', 'w') as samples_file:
    json.dump(samples_groups, samples_file, indent=4)

In [None]:
import re

In [None]:
def join_punctuation_marks(text):
    """
    Attach ``. , ? ! ; :`` marks to the word character before them.

    Only a single whitespace character directly between the word and the
    marks is removed; everything else is left as is.

    :param text: text with a space between words and punctuation
    :return:  the text with that space removed
    """
    return re.sub(r'(\w)\s([.,?!;:]+)', r'\1\2', text)

### Análise: Número Diferente de Labels

In [None]:
!pip install language-tool-python

In [None]:
samples = samples_groups['diff_n_labels']
samples

In [None]:
TOTAL = 438

In [None]:
len(samples)/TOTAL

In [None]:
len(samples)

In [None]:

# Print predicted vs. reference sentences for the first few samples and
# collect them in new_samples.
new_samples = []

for i,sample in enumerate(samples):
 
  
  # Store both reconstructed sentences on the sample dict itself.
  sample['pred_sentence'] = transform_sentencesv2(sample['tokens'], sample['pred_labels']) 
  sample['true_sentence'] = transform_sentencesv2(sample['tokens'], sample['labels']) 
  print('Prediction: ', sample['pred_sentence'])
  print('Ground truth: ', sample['true_sentence'])
  # The check runs after printing and before appending: the sample at i == 6
  # is printed but not collected, so new_samples holds at most six items.
  if i > 5:
    break
  new_samples.append(sample)

In [None]:
import spacy 


nlp = spacy.load('pt_core_news_lg')



In [None]:
import language_tool_python
tool = language_tool_python.LanguageTool('pt-BR')
text = 'Eu vou para caza usando carro azul'
matches = tool.check(text)
len(matches)

match = matches[0]
match

In [None]:
from spacy.tokens import Span 



In [None]:
len(samples)

In [None]:
words_labels = zip(samples[0]['tokens'], samples[0]['pred_labels'], samples[0]['labels'])
list(words_labels)

In [None]:
import spacy, re


nlp = spacy.load('pt_core_news_lg')

In [None]:
def join_punctuation_marks(text):
    """
    Remove the space that separates a word from following punctuation.

    Matches a word character, one whitespace character and a run of
    ``. , ? ! ; :`` marks, and rewrites it without the whitespace.

    :param text: text to normalise
    :return:  the text with punctuation joined to the preceding word
    """
    return re.sub(r'(\w)\s([.,?!;:]+)', r'\1\2', text)

In [None]:
for i in range(len(samples)):
  print(i, ' '.join(samples[i]['tokens']))

In [None]:
key = 206

In [None]:
pred_sentence = nlp(transform_sentencesv2(samples[key]['tokens'], samples[key]['pred_labels']))
pred_sentence

In [None]:
true_sentence = nlp(transform_sentencesv2(samples[key]['tokens'], samples[key]['labels']))
true_sentence

In [None]:
# Pair each spaCy token of the reference sentence with its POS tag and the
# reference label of the same sample (`key`); the labels used to come from
# sample 0 regardless of which sample was parsed.
# NOTE(review): true_sentence is parsed from text that already contains the
# inserted punctuation, so its tokens may not line up one-to-one with the
# word-level labels -- confirm before relying on this pairing.
[(token, nlp.vocab[token.pos].text, lbl) for token, lbl in zip(true_sentence, samples[key]['labels'])]

In [None]:
text = 'caiu um diamante brilhante na aréa da minha casa e depois peguei o diamante, levei pro meu quarto e mostrei por meus pais.'
fixed = nlp(text)
fixed

In [None]:
fixed_labels = text2labels(text)

[(token, nlp.vocab[token.pos].text, lbl) for token, lbl in zip(fixed, fixed_labels)]

In [None]:
def get_conll_dataset(samples):
    """
    Flatten samples into CoNLL-like rows, one per token.

    Each sample's tokens are joined with spaces and parsed with the global
    ``nlp`` pipeline; sentence ids start at 1. Rows stop at the shortest of
    the parsed tokens, predicted labels and reference labels.

    :param samples: iterable of dicts with ``'text_id'``, ``'tokens'``,
        ``'pred_labels'`` and ``'labels'``
    :return:  list of ``(text_id, sentence_id, token text, POS tag,
        predicted label, reference label)`` tuples
    """
    rows = []
    for sentence_id, sample in enumerate(samples, 1):
        doc = nlp(' '.join(sample['tokens']))
        aligned = zip(doc, sample['pred_labels'], sample['labels'])
        for token, lbl_pred, lbl_true in aligned:
            rows.append((sample['text_id'], sentence_id, token.text,
                         nlp.vocab[token.pos].text, lbl_pred, lbl_true))
    return rows

In [None]:
dataset = get_conll_dataset(samples)
dataset[:3]

In [None]:
import pandas as pd 


df = pd.DataFrame(dataset, columns=['text_id', 'sentence_id', 'tokens','pos_tag', 'pred_labels', 'labels'])
df

In [None]:
# Show only the first sentence group. The loop variable is called sentence_id
# because a top-level `id` would replace the builtin id() for every later cell.
for sentence_id, group in df.groupby('sentence_id'):
  print(sentence_id, group)
  break

In [None]:
from collections import defaultdict
from spacy.tokens import Span

def preprocess_data(dataframe, label_col="labels"):
    """
    Convert a token-level dataframe into dicts for displaCy's manual mode.

    Rows are grouped by ``sentence_id``. For each group the tokens are joined
    with spaces, tokenized again with ``nlp.make_doc`` and every row whose
    value in ``label_col`` is not ``"O"`` becomes an entity covering the
    token at the same position.

    NOTE(review): this assumes ``nlp.make_doc`` splits the joined text into
    exactly the tokens stored in the dataframe -- confirm for your data.

    :param dataframe: frame with ``sentence_id``, ``tokens`` and ``label_col``
    :param label_col: name of the column holding the labels to highlight
    :return:  list of dicts with ``'text'``, ``'ents'`` and ``'title'`` keys
    """
    # Group once and reuse it for both the progress total and the iteration.
    sentence_groups = dataframe.groupby("sentence_id")

    data = []
    for _, group in tqdm(sentence_groups, total=sentence_groups.ngroups):
        text = " ".join(group["tokens"])
        doc = nlp.make_doc(text)

        ents = []
        for i, label in enumerate(group[label_col]):
            if label == "O":
                continue
            # A one-token span gives the character offsets displaCy needs.
            span = Span(doc, i, i + 1, label=label)
            ents.append({
                "start": span.start_char,
                "end": span.end_char,
                "label": label,
            })

        data.append({
            "text": text,
            'ents': ents,
            'title': "Pontuação - " + label_col,
        })
    return data

In [None]:
true_data = preprocess_data(df)
true_data[:1]

In [None]:
pred_data = preprocess_data(df, label_col='pred_labels')
pred_data[:1]

In [None]:
pos_data = preprocess_data(df, label_col='pos_tag')
pos_data[:1]

In [None]:
from spacy import displacy

displacy.render(true_data[:1], style="ent",  manual=True, jupyter=True)

In [None]:
from spacy import displacy

displacy.render(pred_data[:1], style="ent",  manual=True, jupyter=True)

In [None]:
from spacy import displacy

displacy.render(pos_data[:1], style="ent",  manual=True, jupyter=True)

In [None]:
text = 'caiu um diamante brilhante na aréa da minha casa e depois peguei o diamante levei pro meu quarto e mostrei por meus pais'
fixed = nlp(text)
fixed

In [None]:
from spacy import displacy

displacy.render(fixed, style="dep",  jupyter=True)

In [None]:

fixed_df = pd.DataFrame([(1, token.text, nlp.vocab[token.pos].text, lbl) for token, lbl in zip(fixed, fixed_labels)], 
                        columns=['sentence_id','tokens', 'pos_tag', 'pred_labels'])
fixed_df

In [None]:
fixed_data = preprocess_data(fixed_df, label_col='pos_tag')
fixed_data[:1]

In [None]:
from spacy import displacy

displacy.render(fixed_data[:1], style="ent",  manual=True, jupyter=True)

## Análise Full Match

In [None]:
full_match_samples = samples_groups['full_match']
len(full_match_samples)

In [None]:

# Print predicted vs. reference sentences for the first few full-match
# samples and collect them in new_samples.
new_samples = []

for i,sample in enumerate(full_match_samples):
 
  
  # Store both reconstructed sentences on the sample dict itself.
  sample['pred_sentence'] = transform_sentencesv2(sample['tokens'], sample['pred_labels']) 
  sample['true_sentence'] = transform_sentencesv2(sample['tokens'], sample['labels']) 
  print('Prediction: ', sample['pred_sentence'])
  print('Ground truth: ', sample['true_sentence'])
  # The sample at i == 6 is printed but not collected: the break comes
  # before the append.
  if i > 5:
    break
  new_samples.append(sample)

In [None]:
len(full_match_samples)

In [None]:
example = full_match_samples[34]
example

In [None]:
# Token-by-token view of one full-match example: POS tag, predicted label and
# reference label side by side.
fm_doc = nlp(' '.join(example['tokens']))
# predict_labels is not defined anywhere in this notebook; the prediction is
# produced the same way as in the miss-match cell, by punctuating the text
# with the BERT model and converting the result back to labels.
fl_labels = text2labels(get_bert_pred_sentence(' '.join(example['tokens']), model))
fixed_df = pd.DataFrame([(1, token.text, nlp.vocab[token.pos].text, lbl_pred, t_label) for token, lbl_pred, t_label in zip(fm_doc, fl_labels, example['labels'])], 
                        columns=['sentence_id','tokens', 'pos_tag', 'pred_labels', 'true_labels'])
fixed_df

## Miss Match



In [None]:
miss_match = samples_groups['miss_match']
len(miss_match)

In [None]:
# Token-by-token view of one miss-match example: POS tag, predicted label and
# reference label side by side.
example = miss_match[7]
fm_doc = nlp(' '.join(example['tokens']))
# Re-run the model on the space-joined tokens to get the predicted labels.
fl_labels = text2labels(get_bert_pred_sentence(' '.join(example['tokens']), model))
# zip() stops at the shortest of the parsed tokens and the two label lists.
fixed_df = pd.DataFrame([(1, token.text, nlp.vocab[token.pos].text, lbl_pred, t_label) for token, lbl_pred, t_label in zip(fm_doc, fl_labels, example['labels'])], 
                        columns=['sentence_id','tokens', 'pos_tag', 'pred_labels', 'true_labels'])
fixed_df

In [None]:

# Print predicted vs. reference sentences for the first few miss-match
# samples and collect them in new_samples.
new_samples = []

for i,sample in enumerate(miss_match):
 
  
  # Store both reconstructed sentences on the sample dict itself.
  sample['pred_sentence'] = transform_sentencesv2(sample['tokens'], sample['pred_labels']) 
  sample['true_sentence'] = transform_sentencesv2(sample['tokens'], sample['labels']) 
  print('Prediction: ', sample['pred_sentence'])
  print('Ground truth: ', sample['true_sentence'])
  # The sample at i == 6 is printed but not collected: the break comes
  # before the append.
  if i > 5:
    break
  new_samples.append(sample)

In [None]:
sentences = new_samples[:5]

In [None]:
from datasets import load_dataset,concatenate_datasets, Dataset

dataset = load_dataset('tiagoblima/mec-punctuation', use_auth_token=True).filter(lambda ex:ex['tag']=='both_anotators')['train']
dataset

In [None]:
for sample in dataset:
  print(sample['tokens'])

In [None]:
# Prefix the text with the T5 task prompt and select the T5 prediction helper.
# The helper is referenced rather than called: it requires (sentence, model),
# so calling it with no arguments raised a TypeError.
text = "Recognize Entities: " + text
pred_pipeline = get_t5_pred_sentence