# Load Data

In [127]:
import pandas as pd
def load_ner_data(path):
    """Read a tab-separated token/tag file and group its rows into sentences.

    Each non-blank line is stripped and split on tab characters into a list
    of fields; blank lines separate sentences.

    Args:
        path: Path to a UTF-8 encoded file with one tab-separated row per
            line and blank lines between sentences.

    Returns:
        A list of sentences, each sentence being a list of field lists
        (e.g. ``[[token, tag], ...]``). Empty sentences (consecutive blank
        lines) are skipped.
    """
    sentences = []
    sentence = []
    with open(path, 'r', encoding='utf-8') as fd:
        for line in fd:
            line = line.strip()
            if line:
                sentence.append(line.split('\t'))
            elif sentence:
                sentences.append(sentence)
                sentence = []
    # A file that does not end with a blank line still has a pending
    # sentence here; without this flush the last sentence would be lost.
    if sentence:
        sentences.append(sentence)
    return sentences
            
def load_ner_data_as_df(path):
    """Load a tab-separated token/tag file as a flat DataFrame.

    Every non-blank line (after stripping surrounding whitespace) becomes one
    row whose columns are the tab-separated fields; blank lines are ignored,
    so sentence boundaries are not preserved.

    Args:
        path: Path to a UTF-8 encoded tab-separated file.

    Returns:
        A ``pandas.DataFrame`` with default integer column labels.
    """
    with open(path, 'r', encoding='utf-8') as fd:
        stripped_lines = (raw.strip() for raw in fd)
        rows = [text.split('\t') for text in stripped_lines if text != '']
    return pd.DataFrame(rows)

# Data Analysis

In [128]:
# Flat tables (one row per non-blank TSV line) for both corpora;
# the tag counts below read the tag from column 1.
en_df = load_ner_data_as_df('dataset/NEREnglish.tsv')
id_df = load_ner_data_as_df('dataset/NERIndonesia.tsv')

In [129]:
# Tag distribution of the English corpus (column 1), counted per original token.
en_df[1].value_counts()

O             20993
B-DISEASE       815
I-DISEASE       405
B-CHEMICAL      156
I-CHEMICAL       54
I-GENETIC        27
B-GENETIC        20
Name: 1, dtype: int64

In [130]:
# Tag distribution of the Indonesian corpus (column 1), counted per original token.
id_df[1].value_counts()

O             18828
B-DISEASE       863
I-DISEASE       385
B-CHEMICAL      165
I-CHEMICAL       51
I-GENETIC        23
B-GENETIC        22
Name: 1, dtype: int64

# NER Evaluation

In [None]:
!pip3 install transformers

In [131]:
# Utils function source code

# Convert tags data and cascade it into sentence
# usually for NER input
def tokens_to_sentence(tokens):
  """Build a plain sentence string from a list of token records.

  Args:
    tokens: Iterable of records whose first element is the token text
      (e.g. ``[token, tag]``).

  Returns:
    The token texts joined by single spaces ('' for an empty input).
  """
  words = (record[0] for record in tokens)
  return " ".join(words)

# Collapse list of entities from NER prediction
# and get all the highest scored prediction 
def collapse_entities(entities_list):
  """Merge several per-model entity lists, keeping the best entity per index.

  Args:
    entities_list: List of entity lists (one per model). Each entity is a
      dict with at least the keys ``'index'`` and ``'score'``.

  Returns:
    A list with one entity dict per distinct ``'index'``, ordered by
    ascending index. For each index the entity with the highest score is
    kept; on equal scores the entity from the earliest list wins. If one
    list contains the same index more than once, its last occurrence is
    the one considered.
  """
  best_by_index = {}
  for entities in entities_list:
    # Last occurrence per index wins inside a single model's output.
    by_index = {entity['index']: entity for entity in entities}
    for index, entity in by_index.items():
      current = best_by_index.get(index)
      # Compare only real entities against each other: a synthetic
      # zero-score placeholder could otherwise beat an entity whose score
      # is <= 0 and be returned without 'index'/'entity'/'word' keys.
      # Strict '>' keeps the earliest list's entity on ties.
      if current is None or entity['score'] > current['score']:
        best_by_index[index] = entity
  # Sort so the output order does not depend on set/hash iteration order.
  return [best_by_index[index] for index in sorted(best_by_index)]

# Use to convert tags data for evaluation  
# by tokenizing the tagged data using 
# tokenizer but pass through the label
def tokenize_and_preserve_label(tokenizer, tokens):
  """Split each labelled token into sub-tokens and carry its label along.

  Args:
    tokenizer: Object exposing ``tokenize(text)`` that returns a list of
      sub-token strings.
    tokens: Iterable of records whose first element is the token text and
      whose second element is its label.

  Returns:
    A list of ``[sub_token, label]`` pairs. The first sub-token of a token
    keeps the original label; later sub-tokens keep ``'O'`` unchanged and
    otherwise get the label with its first character replaced by ``'I'``.
  """
  labelled_pieces = []
  for record in tokens:
    word, tag = record[0], record[1]
    for position, piece in enumerate(tokenizer.tokenize(word)):
      # Only non-first pieces of a tagged token are rewritten to the
      # continuation form of the tag.
      is_continuation = position > 0 and tag != 'O'
      piece_tag = 'I' + tag[1:] if is_continuation else tag
      labelled_pieces.append([piece, piece_tag])
  return labelled_pieces

In [132]:
# Sentence-grouped [token, tag] rows for both corpora; these are the inputs
# later passed to perform_predictions.
evaluation_data_en = load_ner_data('dataset/NEREnglish.tsv')
evaluation_data_id = load_ner_data('dataset/NERIndonesia.tsv')

In [72]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# A single tokenizer (loaded from the chemical checkpoint) is reused for all
# three models and for tokenizing the gold data.
# NOTE(review): assumes the three checkpoints share one vocabulary - confirm.
tokenizer = AutoTokenizer.from_pretrained('alvaroalon2/biobert_chemical_ner')
chem_model = AutoModelForTokenClassification.from_pretrained('alvaroalon2/biobert_chemical_ner')
gene_model = AutoModelForTokenClassification.from_pretrained('alvaroalon2/biobert_genetic_ner')
dise_model = AutoModelForTokenClassification.from_pretrained('alvaroalon2/biobert_diseases_ner')

# Create one token-classification ('ner') pipeline per fine-tuned model
ner_chem = pipeline('ner', model=chem_model, tokenizer=tokenizer)
ner_gene = pipeline('ner', model=gene_model, tokenizer=tokenizer)
ner_dise = pipeline('ner', model=dise_model, tokenizer=tokenizer)

In [73]:
def perform_predictions(sentences):
    """Run the three NER pipelines on every sentence and align with gold tags.

    Args:
        sentences: List of sentences, each a list of ``[token, tag]`` records.

    Returns:
        A flat list of ``[sub_token, gold_label, predicted_label]`` rows, one
        per sub-token produced by ``tokenize_and_preserve_label``. Sub-tokens
        for which no pipeline reported an entity get the prediction ``'O'``.
    """
    rows = []
    for sentence in sentences:
        # Gold labels expanded to sub-token level.
        gold_pieces = tokenize_and_preserve_label(tokenizer, sentence)

        # Run the chemical and genetic models on the raw sentence text.
        text = tokens_to_sentence(sentence)
        chem_hits = ner_chem(text)
        gene_hits = ner_gene(text)

        # The disease pipeline's results labelled '0' (digit zero) are
        # discarded - presumably that is its "outside" label; confirm.
        dise_hits = [hit for hit in ner_dise(text) if hit['entity'] != '0']

        # Keep the highest-scoring entity for every token index.
        merged_hits = collapse_entities([chem_hits, gene_hits, dise_hits])
        hit_by_position = {hit['index']: hit for hit in merged_hits}

        # Pipeline indices are looked up with a +1 offset relative to the
        # gold sub-token position (presumably a leading special token - verify).
        for position, gold in enumerate(gold_pieces, start=1):
            hit = hit_by_position.get(position, None)
            predicted_label = hit['entity'] if hit else 'O'
            rows.append([gold[0], gold[1], predicted_label])
    return rows

In [78]:
# Quite slow for large datasets: three NER pipelines are run for every sentence.
pred_en = pd.DataFrame(perform_predictions(evaluation_data_en))

In [137]:
# Quite slow for large datasets: three NER pipelines are run for every sentence.
pred_id = pd.DataFrame(perform_predictions(evaluation_data_id))

In [81]:
# Save the English predictions (columns 0/1/2 = sub-token, gold tag, predicted tag).
pred_en.to_csv('pred_en.csv')

In [138]:
# Save the Indonesian predictions (columns 0/1/2 = sub-token, gold tag, predicted tag).
pred_id.to_csv('pred_id.csv')

# Evaluate English

In [82]:
from sklearn.metrics import classification_report
# Column 1 holds the gold tag and column 2 the predicted tag.
# Rows are sub-tokens, so support is counted per sub-token rather than per
# original token (compare with the value_counts output earlier).
report = classification_report(pred_en[1],pred_en[2])
print(report)

              precision    recall  f1-score   support

  B-CHEMICAL       0.72      0.92      0.81       156
   B-DISEASE       0.65      0.89      0.75       815
   B-GENETIC       0.88      0.70      0.78        20
  I-CHEMICAL       0.93      0.86      0.90       404
   I-DISEASE       0.90      0.73      0.81      1581
   I-GENETIC       0.88      0.66      0.75        67
           O       0.98      0.98      0.98     23174

    accuracy                           0.96     26217
   macro avg       0.85      0.82      0.82     26217
weighted avg       0.97      0.96      0.96     26217



# Evaluate Indonesian

In [139]:
from sklearn.metrics import classification_report
# Column 1 holds the gold tag and column 2 the predicted tag.
# Rows are sub-tokens, so support is counted per sub-token rather than per
# original token (compare with the value_counts output earlier).
report = classification_report(pred_id[1],pred_id[2])
print(report)

              precision    recall  f1-score   support

  B-CHEMICAL       0.70      0.53      0.61       165
   B-DISEASE       0.68      0.10      0.18       863
   B-GENETIC       0.46      0.50      0.48        22
  I-CHEMICAL       0.92      0.43      0.58       476
   I-DISEASE       0.88      0.07      0.13      2582
   I-GENETIC       0.36      0.63      0.46        65
           O       0.93      1.00      0.96     46146

    accuracy                           0.93     50319
   macro avg       0.70      0.47      0.49     50319
weighted avg       0.92      0.93      0.90     50319

