# Demo 2 - Chinese Words

In [1]:
#pip install transformers datasets evaluate seqeval

In [2]:
from datasets import Dataset,DatasetDict


In [3]:
#Reads the files -function from a demo used previously in this course 
def read_chinese_data(inputfilename):
    with open(inputfilename, "r") as inputfile:
        sentences = []
        collection_words = []
        collection_labels = []
        for line in inputfile:
            if line[0] == '#':
                continue
            columns = line.split()
            #print(words)
            if columns == []:
                sentences.append((''.join(collection_words), collection_labels))
                collection_words = []
                collection_labels = []
                continue
            collection_words.append(columns[1])
            collection_labels += [1] + ([0] * (len(columns[1]) - 1))
            
    return sentences
train_sentences = read_chinese_data('/scratch/lt2316-h20-resources/zh_gsd-ud-train.conllu')
test_sentences = read_chinese_data('/scratch/lt2316-h20-resources/zh_gsd-ud-test.conllu')


In [4]:
print( len(train_sentences[1][0]))
len( train_sentences[1][1])

46


46

In [5]:
#checking the size
cont = 0
for sentence in train_sentences:
    cont += len(sentence[0])
print(cont)

156297


In [6]:
{'abc':[10,11,12]}


{'abc': [10, 11, 12]}

In [7]:
#creates a list with all unique characters 
id_sent_list = []
for sentence in train_sentences:
    for c in sentence[0]:
        if c not in id_sent_list:
           id_sent_list.append(c)
for sentence in test_sentences:
    for c in sentence[0]:
        if c not in id_sent_list:
           id_sent_list.append(c)
print(id_sent_list[:5])


['看', '似', '簡', '單', '，']


In [8]:
#organizes the data into a dictionary and converts the characters in the phrases to corresponding indices using input_ids_list
def create_classification(source,input_ids_list):
    #list[tuple[str,list[int]]]
    classification = {'tokens':[],'input_ids':[],'labels':[]} #{tokens: list[str], input_ids: list[int], labels: list[0,1]}
    #dict[key]=value
    
    for tupla in source:
        words = tupla[0] # phrase
        labels = tupla[1] # binary list
        #for index in range(len(labels)):
        classification['tokens'].append(words)
        classification['labels'].append(labels)
        #abc
        #[1,2,3]
        new_list = []
        for i in words:# each character
            index = input_ids_list.index(i)
            new_list.append(index)
        classification['input_ids'].append(new_list)
        
    #dataset_format = {'tokens':list(classification.keys())}#,'ner_tags':list(classification.values())}
    ds = Dataset.from_dict(classification)
    return ds

In [9]:
ds_train = create_classification(train_sentences,id_sent_list)
ds_test = create_classification(test_sentences,id_sent_list)

In [10]:
raw_datasets = DatasetDict({"train":ds_train,"test":ds_test})
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'input_ids', 'labels'],
        num_rows: 3997
    })
    test: Dataset({
        features: ['tokens', 'input_ids', 'labels'],
        num_rows: 500
    })
})

In [11]:
#label_list = ds.features[f"ner_tags"].feature.names
#label_list
raw_datasets['train'][40]

{'tokens': '現存的三棟屋村位於三棟屋路近和宜合交匯處一帶。',
 'input_ids': [472,
  727,
  20,
  181,
  728,
  729,
  730,
  271,
  84,
  181,
  728,
  729,
  168,
  593,
  406,
  179,
  731,
  266,
  732,
  719,
  9,
  65,
  45],
 'labels': [1,
  0,
  1,
  1,
  0,
  0,
  1,
  1,
  1,
  1,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  1]}

## Data Preprocessing

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

In [13]:
raw_datasets['train']

Dataset({
    features: ['tokens', 'input_ids', 'labels'],
    num_rows: 3997
})

In [14]:
#accessing an especific example
example = raw_datasets['train'][30]
tokenized_input = tokenizer(example['tokens'], is_split_into_words=False)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens
#tokenized_input.__dict__

['[CLS]',
 '該',
 '組',
 '織',
 '的',
 '前',
 '身',
 '，',
 '是',
 '1963',
 '年',
 '由',
 '奧',
 '地',
 '利',
 '裔',
 '美',
 '國',
 '經',
 '濟',
 '學',
 '家',
 '弗',
 '里',
 '茨',
 '·',
 '馬',
 '赫',
 '盧',
 '普',
 '創',
 '立',
 '的',
 '[UNK]',
 '；',
 '目',
 '的',
 '是',
 '為',
 '了',
 '探',
 '求',
 '解',
 '決',
 '國',
 '際',
 '貨',
 '幣',
 '體',
 '系',
 '問',
 '題',
 '的',
 '方',
 '案',
 '，',
 '特',
 '別',
 '是',
 '1960',
 '年',
 '代',
 '美',
 '國',
 '所',
 '面',
 '臨',
 '的',
 '國',
 '際',
 '收',
 '支',
 '危',
 '機',
 '。',
 '[SEP]']

In [15]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    print(labels)
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [16]:
raw_datasets["train"][30]['tokens']

'該組織的前身，是1963年由奧地利裔美國經濟學家弗里茨·馬赫盧普創立的TheBellagioGroup；目的是為了探求解決國際貨幣體系問題的方案，特別是1960年代美國所面臨的國際收支危機。'

In [17]:
#labels = raw_datasets["train"][30]['labels']
#word_ids = tokenized_input.word_ids()
#print(labels)
#print(align_labels_with_tokens(labels, word_ids))
#print(len(word_ids),len(labels))

In [19]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=False
    )
    all_labels = examples["labels"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [20]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/3997 [00:00<?, ? examples/s]

[1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1]
[1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1]
[1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1]
[1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
[1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1]
[1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1]
[1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1]
[1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1]
[1, 0, 1, 1

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

[1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1]
[1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1]
[1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1]
[1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1]
[1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1]
[1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1]
[1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1]
[1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
[1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1

In [21]:
tokenized_datasets['train'][0]

{'input_ids': [101,
  4692,
  849,
  5080,
  1606,
  8024,
  1372,
  3221,
  753,
  6908,
  671,
  976,
  3748,
  3079,
  8024,
  852,
  1071,
  2179,
  800,
  947,
  807,
  6134,
  4638,
  3221,
  872,
  1453,
  6901,
  4638,
  6217,
  3301,
  1962,
  1351,
  8024,
  6275,
  5865,
  5183,
  872,
  679,
  1398,
  4638,
  2692,
  6210,
  8024,
  852,
  6841,
  3418,
  4955,
  2419,
  8024,
  3297,
  2527,
  3748,
  2137,
  4638,
  6917,
  3221,
  5632,
  2346,
  511,
  102],
 'labels': [-100,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  -100],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

In [22]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
data_collator

2023-11-18 10:59:50.515857: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


DataCollatorForTokenClassification(tokenizer=BertTokenizerFast(name_or_path='bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')

## Evaluate

In [23]:
import evaluate

seqeval = evaluate.load("seqeval")

In [24]:
import numpy as np

labels = ['COW' if i==0 else 'BOW' for i in example[f"labels"]]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [25]:
#pip install torch

## Train

In [27]:
label2id = {
    "BOW": 1,
    "COW": 0,
}

In [28]:
id2label = {
    1: "BOW",
    0: "COW",
}

In [29]:
import torch
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-chinese", num_labels=len(id2label), id2label=id2label, label2id=label2id
)

#talvez usar o from LM chinese_model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
#pip install transformers[torch]

In [31]:
training_args = TrainingArguments(
    output_dir="my_awesome_chinese_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)




In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
trainer.train()

In [None]:
#trainer.push_to_hub()

## Inference

In [None]:
import torch

In [None]:
text = raw_datasets['test'][30]['tokens']
text

In [None]:
from transformers import pipeline

classifier = pipeline("ner", model="my_awesome_chinese_model")
classifier(text)

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("BERT_and_chinese")
inputs = tokenizer(text, return_tensors="pt")

In [None]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained("LiviaDutra/my_awesome_chinese_model")
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
predictions = torch.argmax(logits, dim=2)
predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
predicted_token_class