In [2]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 4.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 32.6 MB/s 
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.12.1 transformers-4.21.1


In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

In [5]:
import numpy as np
import pandas as pd

f = open("Train.txt", "r", encoding = 'utf-8')
lines = f.readlines()

train_tagged = []

for line in lines:
    if line == '\n':
        continue
    train_tagged.append(tuple(line.strip().split()))

x = []
y = []
sen_x = []
sen_y = []
for tup in train_tagged:
    x.append(tup[0])
    y.append(tup[1])
    if tup[0] == '.':
        sen_x.append(x)
        sen_y.append(y)
        x = []
        y = []

train_df = pd.DataFrame()
train_df['tags'] = sen_y
train_df['tokens'] = sen_x

tags = {tag for word,tag in train_tagged}

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(list(tags))
train_df['tags']=train_df['tags'].apply(lambda r:le.transform(r))

In [6]:
from datasets import Dataset
dataset_train = Dataset.from_pandas(train_df.iloc[:7000])
dataset_val = Dataset.from_pandas(train_df.iloc[7000:])

In [7]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                #label_ids.append(-100)
                pass
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                #label_ids.append(-100)
                pass
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [8]:
tokenized_dataset_train = dataset_train.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/7 [00:00<?, ?ba/s]

In [9]:
tokenized_dataset_val = dataset_val.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [10]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [11]:
num_classes = 23

In [12]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=num_classes)

Downloading pytorch_model.bin:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10
)

In [14]:
tokenized_dataset_train

Dataset({
    features: ['tags', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 7000
})

In [15]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [16]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags. If tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7000
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4380


Epoch,Training Loss,Validation Loss
1,No log,1.144587
2,1.522900,0.824952
3,1.024800,0.665848
4,0.828300,0.569425
5,0.731500,0.674165
6,0.610900,0.623058
7,0.549800,0.487459
8,0.455600,0.51416
9,0.455600,0.401028
10,0.390800,0.3566


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags. If tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1722
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags. If tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1722
  Batc

TrainOutput(global_step=4380, training_loss=0.7271944368266624, metrics={'train_runtime': 2402.514, 'train_samples_per_second': 29.136, 'train_steps_per_second': 1.823, 'total_flos': 5596573179884496.0, 'train_loss': 0.7271944368266624, 'epoch': 10.0})

In [17]:
f = open("Test.txt", "r", encoding = 'utf-8')
lines = f.readlines()

test_tagged = []

for line in lines:
    if line == '\n':
        continue
    test_tagged.append(tuple(line.strip().split()))

x = []
y = []
sen_x = []
sen_y = []
for tup in test_tagged:
    x.append(tup[0])
    y.append(tup[1])
    if tup[0] == '.':
        sen_x.append(x)
        sen_y.append(y)
        x = []
        y = []

test_df = pd.DataFrame()
test_df['tags'] = sen_y
test_df['tokens'] = sen_x

test_df['tags']=test_df['tags'].apply(lambda r:le.transform(r))

In [18]:
dataset_test = Dataset.from_pandas(test_df)

In [19]:
tokenized_dataset_test = dataset_test.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/10 [00:00<?, ?ba/s]

In [20]:
predictions = trainer.predict(tokenized_dataset_test)

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags. If tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 9279
  Batch size = 16


In [21]:
preds = []
for i,sentence in enumerate(tokenized_dataset_test['tags']):
  l = len(sentence)
  p = predictions.predictions[i][:l]
  t = np.argmax(p, axis=-1)
  preds.append(t)

In [22]:
from sklearn.metrics import accuracy_score
accuracy_score(list(np.concatenate(preds).flat), list(np.concatenate(tokenized_dataset_test['tags']).flat))

0.8872985136107978