<a href="https://colab.research.google.com/github/moustafa100/AI-for-Medical/blob/main/project_1(NER).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U accelerate
!pip install -U transformers
!pip install -U torch

In [None]:
!pip install -U datasets

# **The Dataset**


:






In [None]:
from datasets import get_dataset_config_names
# Collect every configuration name of the "xtreme" dataset and report how many there are.
xtreme_subsets=get_dataset_config_names("xtreme")
print(f"XTREME has {len(xtreme_subsets)} configurations")

XTREME has 183 configurations


In [None]:
# Keep only the configuration names that begin with "PAN" and preview the first three.
panx_subsets=list(filter(lambda name: name.startswith("PAN"), xtreme_subsets))
panx_subsets[:3]

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']

In [None]:
# NOTE(review): load_dataset_builder is not referenced anywhere in the visible part of this notebook.
from datasets.load import load_dataset_builder
from datasets import load_dataset
# Load the "PAN-X.de" configuration of "xtreme"; the returned object is shown as the cell output.
load_dataset("xtreme",name="PAN-X.de")

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [None]:
from collections import defaultdict
from datasets import DatasetDict

In [None]:
# Languages to load and the fraction of each PAN-X split that is kept per language.
langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]
# panx_ch[lang][split] -> shuffled, down-sampled dataset for that language and split.
panx_ch = defaultdict(DatasetDict)
for lang, frac in zip(langs, fracs):
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    for split, split_ds in ds.items():
        # Fixed seed so the same rows are selected on every run.
        n_keep = int(frac * split_ds.num_rows)
        panx_ch[lang][split] = split_ds.shuffle(seed=0).select(range(n_keep))

In [None]:
import pandas as pd
# One column per language holding the number of rows in its training split.
train_sizes = {lang: [panx_ch[lang]["train"].num_rows] for lang in langs}
pd.DataFrame(train_sizes, index=["Number of training examples"])

Unnamed: 0,de,fr,it,en
Number of training examples,12580,4580,1680,1180


## The examples in the German corpus:


In [None]:
# Print every field of the first example in the German training split.
element =panx_ch["de"]["train"][0]
for key, value in element.items():
  print(f"{key}:{value}")

tokens:['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags:[0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs:['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']


In [None]:
# Feature describing a single NER tag; later cells use it to map tag ids to tag names.
tags=panx_ch["de"]["train"].features["ner_tags"].feature
print(tags)

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [None]:
def create_tag_names(batch):
  """Return a ``ner_tags_str`` entry holding the tag name of every id in ``batch["ner_tags"]``.

  The id -> name conversion is delegated to the module-level ``tags`` feature.
  """
  tag_names=list(map(tags.int2str,batch["ner_tags"]))
  return {"ner_tags_str":tag_names}

In [None]:
# Add the human-readable "ner_tags_str" column to the German corpus.
panx_de=panx_ch["de"].map(create_tag_names)

In [None]:
# First German training example: tokens on the first row, tag names on the second.
de_example=panx_de["train"][0]
token_row,tag_row=de_example["tokens"],de_example["ner_tags_str"]
pd.DataFrame([token_row,tag_row])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
1,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


In [None]:
from collections import Counter

In [None]:
# Count entities per split and entity type; each tag starting with "B" opens one entity.
split2freqs=defaultdict(Counter)
for split,dataset in panx_de.items():
  begin_tags=(tag for row in dataset["ner_tags_str"] for tag in row
              if tag.startswith("B"))
  for tag in begin_tags:
    # "B-LOC" -> "LOC"
    split2freqs[split][tag.split("-")[1]] +=1
pd.DataFrame.from_dict(split2freqs,orient="index")

Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071


## **Creating a Custom Model for Token Classification**

In [None]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel


In [None]:
from flax.linen.recurrent import Output
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
  """Token-classification model: a ``RobertaModel`` body plus a linear head.

  The body is built without its pooling layer; dropout followed by a linear
  layer maps every token's hidden state to ``config.num_labels`` logits.
  """
  config_class=XLMRobertaConfig

  def __init__(self,config):
    """Build the encoder body and the classification head from ``config``.

    Args:
      config: model configuration; ``num_labels``, ``hidden_dropout_prob``
        and ``hidden_size`` are read from it.
    """
    super().__init__(config)
    self.num_labels=config.num_labels
    # Model body (no pooling layer: only per-token states are used)
    self.roberta=RobertaModel(config,add_pooling_layer=False)
    # Token classification head
    self.dropout=nn.Dropout(config.hidden_dropout_prob)
    self.classifier=nn.Linear(config.hidden_size,config.num_labels)
    # Initialize weights. This must be a call: the bare attribute reference
    # `self.init_weights` is a no-op and leaves initialization undone.
    self.init_weights()

  def forward(self,input_ids=None,attention_mask=None,token_type_ids=None,
              labels=None,**kwargs):
    """Compute per-token logits and, when ``labels`` is given, the loss.

    Args:
      input_ids, attention_mask, token_type_ids: forwarded to the encoder.
      labels: optional target class ids, flattened for the loss together
        with the logits.
      **kwargs: extra keyword arguments forwarded to the encoder.

    Returns:
      TokenClassifierOutput with ``loss`` (``None`` without labels),
      ``logits`` and the encoder's ``hidden_states`` / ``attentions``.
    """
    # Use model body to get encoder representations
    outputs=self.roberta(input_ids,attention_mask=attention_mask,
                         token_type_ids=token_type_ids,**kwargs)
    # Apply dropout and the classifier to the first encoder output
    sequence_output=self.dropout(outputs[0])
    logits=self.classifier(sequence_output)
    # Calculate the loss over all token positions at once
    loss=None
    if labels is not None:
      # CrossEntropyLoss ignores targets equal to -100 by default
      loss_fct=nn.CrossEntropyLoss()
      loss=loss_fct(logits.view(-1,self.num_labels),labels.view(-1))
    # Return model output object
    return TokenClassifierOutput(loss=loss,logits=logits,
                                 hidden_states=outputs.hidden_states,
                                 attentions=outputs.attentions)

In [None]:
# Two-way lookup between integer class ids and tag names.
index2tag=dict(enumerate(tags.names))
tag2index={tag:idx for idx,tag in index2tag.items()}

In [None]:
from transformers import AutoTokenizer

# Checkpoint names and the tokenizers loaded from them; the BERT tokenizer is
# used for comparison with the XLM-R one in the visible cells.
bert_model_name = "bert-base-cased"
xlmr_model_name = "xlm-roberta-base"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

In [None]:
from transformers import AutoConfig

# XLM-R configuration with one class per NER tag and the id <-> tag-name mappings.
xlmr_config=AutoConfig.from_pretrained(xlmr_model_name,
                                      num_labels=tags.num_classes,
                                      id2label=index2tag, label2id=tag2index)

In [None]:
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the pretrained checkpoint into the custom class, then move it to the device.
xlmr_model=XLMRobertaForTokenClassification.from_pretrained(
    xlmr_model_name,config=xlmr_config)
xlmr_model=xlmr_model.to(device)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the same sentence with both tokenizers to get their sub-word tokens.
text = "Jack Sparrow loves New York!"
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()


In [None]:
# Encode the sentence as a tensor of input ids and show each token above its id.
input_ids=xlmr_tokenizer.encode(text,return_tensors="pt")
pd.DataFrame([xlmr_tokens,input_ids[0].numpy()],index=["Tokens","Input IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Input IDs,0,21763,37456,15555,5161,7,2356,5753,38,2


In [None]:
# Run the freshly loaded model on the example sentence; `outputs` holds the logits.
outputs=xlmr_model(input_ids.to(device)).logits
# Most likely class index per token.
# NOTE(review): the misspelled name `predicions` is read by the following cell,
# so renaming it requires changing both cells together.
predicions=torch.argmax(outputs,dim=-1)
print(f"Number of tokens in sequence : {len (xlmr_tokens)}")
print(f"shape of outputs :{outputs.shape}")

Number of tokens in sequence : 10
shape of outputs :torch.Size([1, 10, 7])


In [None]:
# Translate the predicted class indices of the first sequence into tag names
# and show them under their tokens.
preds=[tags.names[p] for p in predicions[0].cpu().numpy()]
pd.DataFrame([xlmr_tokens,preds],index=["tokens","tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
tags,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG,B-ORG


In [None]:
def tag_text(text, tags, model, tokenizer):
    """Tag every sub-word token of ``text`` with the model's predicted label.

    Args:
        text: the input string.
        tags: object whose ``names`` sequence maps a class index to a tag name.
        model: callable invoked as ``model(input_ids)``; the first item of its
            result is taken as the logits.
        tokenizer: tokenizer used for BOTH the token strings and the input ids.

    Returns:
        ``pd.DataFrame`` with a "Tokens" row and a "Tags" row.
    """
    # Encode once with the tokenizer that was passed in. Previously the ids
    # came from the global `xlmr_tokenizer`, so tokens and predictions could
    # disagree whenever a different tokenizer was supplied.
    encoding = tokenizer(text, return_tensors="pt")
    # Tokens including special characters
    tokens = encoding.tokens()
    input_ids = encoding.input_ids.to(device)
    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(input_ids)[0]
    # Take argmax over the class dimension to get the most likely class per token
    predictions = torch.argmax(outputs, dim=2)
    # Convert to DataFrame
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])


In [None]:
# Words and integer NER tags of the German example.
words,labels=de_example["tokens"],de_example["ner_tags"]

In [None]:
# Tokenize the already word-split example, then map the ids back to token strings.
tokenized_input=xlmr_tokenizer(de_example["tokens"],is_split_into_words=True)
tokens=xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

In [None]:
# Display the sub-word tokens of the example.
pd.DataFrame([tokens],index=["Tokens"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>


In [None]:
# For every token, the index of the word it belongs to; shown under the tokens.
word_ids=tokenized_input.word_ids()
pd.DataFrame([tokens,word_ids],index=["Tokens","Word_IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word_IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,


In [None]:
# Give the word's label to its first sub-word token only; tokens without a
# word id and continuation sub-tokens get -100.
previous_word_idx=None
label_ids=[]
for word_idx in word_ids:
  if word_idx is None or word_idx == previous_word_idx:
    label_ids.append(-100)
  else:
    label_ids.append(labels[word_idx])
  previous_word_idx=word_idx

# Store the tag names under a separate name: overwriting the integer `labels`
# made this cell fail (KeyError) when it was run a second time.
label_names=[index2tag[l] if l != -100 else "ING" for l in label_ids]
index = ["Tokens", "Word IDs", "Label IDs", "Labels"]
pd.DataFrame([tokens,word_ids,label_ids,label_names],index=index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,
Label IDs,-100,0,0,-100,0,0,5,-100,-100,6,...,5,-100,-100,-100,6,-100,-100,0,-100,-100
Labels,ING,O,O,ING,O,O,B-LOC,ING,ING,I-LOC,...,B-LOC,ING,ING,ING,I-LOC,ING,ING,O,ING,ING


In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of word-split examples and align NER labels to sub-words.

    The first sub-word token of each word receives the word's label; tokens
    without a word id and continuation sub-tokens receive -100. The aligned
    labels are stored under ``"labels"`` on the tokenizer output, which is
    returned.
    """
    encoded = xlmr_tokenizer(examples["tokens"], truncation=True,
                             is_split_into_words=True)
    aligned = []
    for row, word_labels in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            # True only for the first sub-word token of a real word
            starts_word = word is not None and word != last_word
            row_labels.append(word_labels[word] if starts_word else -100)
            last_word = word
        aligned.append(row_labels)
    encoded["labels"] = aligned
    return encoded

In [None]:
def encode_panx_dataset(corpus):
  """Apply ``tokenize_and_align_labels`` to ``corpus`` in batches.

  The raw 'langs', 'ner_tags' and 'tokens' columns are removed from the result.
  """
  raw_columns=['langs','ner_tags','tokens']
  encoded=corpus.map(tokenize_and_align_labels,batched=True,
                     remove_columns=raw_columns)
  return encoded

In [None]:
# Tokenize the German corpus and align its labels to the sub-word tokens.
panx_de_encoded=encode_panx_dataset(panx_ch["de"])

Map:   0%|          | 0/6290 [00:00<?, ? examples/s]

## **Performance Measures**

In [None]:
!pip install seqeval



In [None]:
from seqeval.metrics import classification_report

In [None]:
import numpy as np
def align_predictions(predictions,label_ids):
  """Convert raw predictions and label ids into per-example lists of tag names.

  Positions whose label id is -100 are dropped from both outputs.

  Args:
    predictions: array indexed as [example, position, class] holding scores.
    label_ids: array indexed as [example, position] holding class ids or -100.

  Returns:
    ``(preds_list, labels_list)``: two lists with exactly one inner list of
    tag names per example.
  """
  preds=np.argmax(predictions,axis=2)
  batch_size,seq_len=preds.shape
  labels_list,preds_list=[],[]
  for batch_idx in range(batch_size):
    example_labels,example_preds=[],[]
    for seq_idx in range(seq_len):
      # Skip positions that carry no label
      if label_ids[batch_idx,seq_idx] != -100:
        example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
        example_preds.append(index2tag[preds[batch_idx][seq_idx]])
    # Append once per example. These appends used to sit inside the position
    # loop, which added every example `seq_len` times.
    labels_list.append(example_labels)
    preds_list.append(example_preds)
  return preds_list,labels_list

In [None]:
from transformers import TrainingArguments

num_epochs=3
batch_size=24
# Number of full batches in the training split, used as the logging interval.
logging_steps=len(panx_de_encoded["train"])//batch_size
model_name=f"{xlmr_model_name}-finetuned-panx-de"
# NOTE(review): newer transformers releases may expect `eval_strategy` instead
# of `evaluation_strategy` — confirm against the installed version.
training_kwargs=dict(
    output_dir=model_name,
    log_level="error",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_steps=1e6,  # presumably large enough to avoid intermediate checkpoints — TODO confirm
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=True,
)
training_args=TrainingArguments(**training_kwargs)

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget. The training arguments set
# push_to_hub=True, so credentials are presumably required before the trainer
# is created — TODO confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
  """Return the F1 score of an evaluation result under the key "f".

  ``eval_pred.predictions`` and ``eval_pred.label_ids`` are first converted to
  lists of tag names with ``align_predictions``.
  """
  aligned=align_predictions(eval_pred.predictions,eval_pred.label_ids)
  y_pred,y_true=aligned
  score=f1_score(y_true,y_pred)
  return {"f":score}

In [None]:
from transformers import DataCollatorForTokenClassification
# Collator that turns a list of encoded examples into one batch; presumably it
# pads inputs and labels to a common length — see the transformers docs.
data_collator=DataCollatorForTokenClassification(xlmr_tokenizer)

In [None]:
def model_init():
  """Load a fresh model from the XLM-R checkpoint and move it to ``device``."""
  model=XLMRobertaForTokenClassification.from_pretrained(
      xlmr_model_name,config=xlmr_config)
  return model.to(device)

In [None]:
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [None]:
from transformers import Trainer
# Trainer for the German corpus: the model comes from `model_init`, it trains
# on the encoded "train" split and evaluates on "validation" with `compute_metrics`.
# NOTE(review): newer transformers releases may prefer a different argument
# name than `tokenizer` — confirm against the installed version.
trainer = Trainer(model_init=model_init,args=training_args,
                  data_collator=data_collator,compute_metrics=compute_metrics,
                  train_dataset=panx_de_encoded["train"],
                  eval_dataset=panx_de_encoded["validation"],
                  tokenizer=xlmr_tokenizer)

In [None]:
# Fine-tune the model, then upload the result to the Hub with a commit message.
trainer.train()
trainer.push_to_hub(commit_message="Training completed")

Epoch,Training Loss,Validation Loss,F
1,0.2607,0.152943,0.815571
2,0.1265,0.144479,0.848746
3,0.0838,0.134196,0.863693


events.out.tfevents.1700989675.31d09fb7a431.15835.0:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

'https://huggingface.co/moustafafathy100/xlm-roberta-base-finetuned-panx-de/tree/main/'

In [None]:
# Tag a German sentence with the fine-tuned model held by the trainer.
text_de = "Jeff Dean ist ein Informatiker bei Google in Kalifornien"
tag_text(text_de, tags, trainer.model, xlmr_tokenizer)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
Tokens,<s>,▁Jeff,▁De,an,▁ist,▁ein,▁Informati,ker,▁bei,▁Google,▁in,▁Kaliforni,en,</s>
Tags,O,B-PER,I-PER,I-PER,O,O,O,O,O,B-ORG,O,B-LOC,I-LOC,O


## **Error Analysis**

In [None]:
from torch.nn.functional import cross_entropy
def forward_pass_with_label(batch):
  """Compute the per-token loss and predicted label for a batch of examples.

  Args:
    batch: mapping from column name to a list of per-example values; it must
      provide "input_ids", "attention_mask" and "labels".

  Returns:
    dict with "loss" and "predicted_label", each a NumPy array with one row
    per example.
  """
  # Convert dict of lists to a list of dicts, the input format of the collator
  features=[dict(zip(batch,t)) for t in zip (*batch.values())]
  # Collate into tensors (presumably padding to a common length — TODO confirm)
  collated=data_collator(features)
  input_ids=collated["input_ids"].to(device)
  attention_mask=collated["attention_mask"].to(device)
  labels=collated["labels"].to(device)
  with torch.no_grad():
    # Pass the data through the model
    output=trainer.model(input_ids,attention_mask)
    # Logits are indexed [example, position, class]; predict over the class axis
    predicted_label=torch.argmax(output.logits,axis=-1).cpu().numpy()
    # Read the class count from the logits instead of hard-coding 7, so the
    # function keeps working when the tag set changes.
    num_classes=output.logits.size(-1)
    # Unreduced loss: one value per token position
    loss=cross_entropy(output.logits.view(-1,num_classes),
                       labels.view(-1),reduction="none")
  # Unflatten back to one row per example
  loss=loss.view(len(input_ids),-1).cpu().numpy()
  return {"loss":loss, "predicted_label":predicted_label}

In [None]:
# Run the forward pass over the encoded validation split in batches of 32; the
# function's "loss" and "predicted_label" outputs are added to the dataset.
valid_set=panx_de_encoded["validation"]
valid_set=valid_set.map(forward_pass_with_label,batched=True,batch_size=32)
# Convert to a pandas DataFrame for inspection.
df=valid_set.to_pandas()