In [1]:
import json
import numpy as np
import nltk
from nltk.corpus import brown 
import torch
from datasets import load_dataset
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('brown')
nltk.download('universal_tagset')
corpus = brown.tagged_sents(tagset='universal')
corpus #output is list of lists, with tuples of token, tag 

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\91959\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\91959\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

In [3]:
#tag2int dictionary
tags = {tag for sentence in corpus for _, tag in sentence}
tag2int = {tag: i for i, tag in enumerate(tags)}
tag2int

{'ADV': 0,
 'CONJ': 1,
 'VERB': 2,
 'ADJ': 3,
 'ADP': 4,
 'PRON': 5,
 'X': 6,
 'NUM': 7,
 '.': 8,
 'PRT': 9,
 'NOUN': 10,
 'DET': 11}

In [4]:
#int2tag dictionary
int2tag = {value:key for key, value in tag2int.items()}
int2tag

{0: 'ADV',
 1: 'CONJ',
 2: 'VERB',
 3: 'ADJ',
 4: 'ADP',
 5: 'PRON',
 6: 'X',
 7: 'NUM',
 8: '.',
 9: 'PRT',
 10: 'NOUN',
 11: 'DET'}

In [5]:
#Convert to json
converted_data = [{
    "inputs": [token for token, _ in sentence],
    "targets": [tag2int[tag] for _, tag in sentence]
} for sentence in corpus]

filename = './datasets/brown.json'

# Use 'with' statement to open file to ensure proper closure after writing
with open(filename, 'w') as file:
    json.dump(converted_data, file, indent=4)

In [6]:
data = load_dataset(path='json', data_files='./datasets/brown.json')
data 

Generating train split: 57340 examples [00:00, 70254.38 examples/s]


DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 57340
    })
})

In [7]:
#train test split
datasets = data['train'].train_test_split(test_size=0.3, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 40138
    })
    test: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 17202
    })
})

In [8]:
# casing is better for pos(Bill vs bill)
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
#function for aligning targets:
def align_targets(labels, word_ids):
    aligned_labels = []
    last_id = None
    for id in word_ids:
        if id == None:
            label = -100 #transformers(pytorch crossentropy loss) uses -100 for depicting targets that should not affect loss function 
        else:
            label = labels[id] 
        aligned_labels.append(label)
    return aligned_labels 

In [10]:
#checking target alignment:
labels = datasets['train'][0]['targets']
t = tokenizer(datasets['train'][0]['inputs'], is_split_into_words=True)
word_ids = t.word_ids()
aligned_targets = align_targets(labels, word_ids)
aligned_labels =  [int2tag[t] if t>=0 else None for t in aligned_targets]
for x, y in zip(t.tokens(), aligned_labels):
    print(f"{x}\t{y}")

[CLS]	None
Co	NOUN
##st	NOUN
of	ADP
power	NOUN
and	CONJ
machinery	NOUN
is	VERB
often	ADV
a	DET
serious	ADJ
problem	NOUN
to	ADP
the	DET
small	NOUN
-	NOUN
scale	NOUN
farmer	NOUN
.	.
[SEP]	None


In [11]:
for x, y in zip(datasets['train'][0]['inputs'], datasets['train'][0]['targets']):
    print(f"{x}\t{int2tag[y]}")

Cost	NOUN
of	ADP
power	NOUN
and	CONJ
machinery	NOUN
is	VERB
often	ADV
a	DET
serious	ADJ
problem	NOUN
to	ADP
the	DET
small-scale	NOUN
farmer	NOUN
.	.


In [12]:
#tokenize fn:
def tokenize_fn(batch):
    tokenized_inputs = tokenizer(
        batch['inputs'],
        truncation=True,
        is_split_into_words=True
    )
    labels_batch = batch['targets']
    aligned_labels_batch = []
    for i, labels in enumerate(labels_batch):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels_batch.append(align_targets(labels, word_ids))
    tokenized_inputs['labels'] = aligned_labels_batch
    return tokenized_inputs

In [13]:
datasets['train'].column_names

['inputs', 'targets']

In [14]:
tokenized_datasets = datasets.map(
    function=tokenize_fn,
    batched=True,
    remove_columns=datasets['train'].column_names
)

Map: 100%|██████████| 40138/40138 [00:04<00:00, 9890.78 examples/s] 
Map: 100%|██████████| 17202/17202 [00:01<00:00, 9765.89 examples/s] 


In [15]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 40138
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 17202
    })
})

In [16]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [17]:
#using data collator explicitly:
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch #even padding tokens have -100 as target

{'input_ids': tensor([[  101,  3291,  2050,  1104,  1540,  1105, 11360,  1110,  1510,   170,
          3021,  2463,  1106,  1103,  1353,   118,  3418,  9230,   119,   102],
        [  101, 17323, 17941,   136,   136,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[-100,   10,   10,    4,   10,    1,   10,    2,    0,   11,    3,   10,
            4,   11,   10,   10,   10,   10,    8, -100],
        [-100,    2,   10,    8,    8, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100]])}

In [18]:
#trying out sklearn classification_report
classification_report(
    y_true=[0, 0, 1],
    y_pred=[0, 1, 1],
    output_dict=True
)

{'0': {'precision': 1.0,
  'recall': 0.5,
  'f1-score': 0.6666666666666666,
  'support': 2.0},
 '1': {'precision': 0.5,
  'recall': 1.0,
  'f1-score': 0.6666666666666666,
  'support': 1.0},
 'accuracy': 0.6666666666666666,
 'macro avg': {'precision': 0.75,
  'recall': 0.75,
  'f1-score': 0.6666666666666666,
  'support': 3.0},
 'weighted avg': {'precision': 0.8333333333333334,
  'recall': 0.6666666666666666,
  'f1-score': 0.6666666666666666,
  'support': 3.0}}

In [19]:
def flatten(list_of_lists):
    return [item for row in list_of_lists for item in row]

flatten([[1,2,3], [4, 5]])

[1, 2, 3, 4, 5]

In [20]:
def compute_metrics(logits_and_labels):
    logits, labels = logits_and_labels
    predictions = np.argmax(logits, axis=-1)
    labels_jagged = [[l for l in row if l != -100] for row in labels]
    predictions_jagged = [[p for (p,l) in zip(ps, ls) if l !=-100] for ps, ls in zip(predictions, labels)]
    labels_flat = [label for row in labels_jagged for label in row]
    predictions_flat = [prediction for row in predictions_jagged for prediction in row]
    return classification_report(
        y_true=labels_flat, 
        y_pred=predictions_flat,  
        target_names=int2tag.values(),
        output_dict=True
        )

In [21]:
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=int2tag,
    label2id=tag2int
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
training_args = TrainingArguments(
    output_dir='output_dir',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01
)

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [24]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)
trainer.train()

  3%|▎         | 501/15054 [00:54<26:31,  9.14it/s]

{'loss': 0.2972, 'learning_rate': 1.9335724724325762e-05, 'epoch': 0.1}


  7%|▋         | 1002/15054 [01:49<25:00,  9.36it/s]

{'loss': 0.0743, 'learning_rate': 1.8671449448651522e-05, 'epoch': 0.2}


 10%|▉         | 1501/15054 [02:47<22:57,  9.84it/s]

{'loss': 0.0656, 'learning_rate': 1.8007174172977285e-05, 'epoch': 0.3}


 13%|█▎        | 2002/15054 [03:43<24:45,  8.78it/s]

{'loss': 0.0588, 'learning_rate': 1.7342898897303046e-05, 'epoch': 0.4}


 17%|█▋        | 2501/15054 [04:41<23:52,  8.76it/s]

{'loss': 0.0548, 'learning_rate': 1.6678623621628806e-05, 'epoch': 0.5}


 20%|█▉        | 3002/15054 [05:38<23:20,  8.61it/s]

{'loss': 0.0597, 'learning_rate': 1.6014348345954566e-05, 'epoch': 0.6}


 23%|██▎       | 3501/15054 [06:37<23:34,  8.17it/s]

{'loss': 0.0514, 'learning_rate': 1.5350073070280326e-05, 'epoch': 0.7}


 27%|██▋       | 4002/15054 [07:34<20:02,  9.19it/s]

{'loss': 0.0487, 'learning_rate': 1.4685797794606086e-05, 'epoch': 0.8}


 30%|██▉       | 4501/15054 [08:32<19:53,  8.85it/s]

{'loss': 0.0474, 'learning_rate': 1.4021522518931846e-05, 'epoch': 0.9}


 33%|███▎      | 5001/15054 [09:30<19:44,  8.49it/s]

{'loss': 0.0456, 'learning_rate': 1.3357247243257607e-05, 'epoch': 1.0}


                                                    
 33%|███▎      | 5018/15054 [11:16<21:56,  7.62it/s]

{'eval_loss': 0.041264407336711884, 'eval_ADV': {'precision': 0.975478760917919, 'recall': 0.9664606288632088, 'f1-score': 0.9709487553323614, 'support': 18605.0}, 'eval_CONJ': {'precision': 0.9974632610216935, 'recall': 0.9963302752293578, 'f1-score': 0.9968964462123531, 'support': 11445.0}, 'eval_VERB': {'precision': 0.99338722716249, 'recall': 0.9900098290391711, 'f1-score': 0.991695652524796, 'support': 62061.0}, 'eval_ADJ': {'precision': 0.9637307281331094, 'recall': 0.9676005394801374, 'f1-score': 0.965661756833234, 'support': 32624.0}, 'eval_ADP': {'precision': 0.9892465706210795, 'recall': 0.9933321028102072, 'f1-score': 0.9912851271542544, 'support': 43342.0}, 'eval_PRON': {'precision': 0.9957026791109918, 'recall': 0.9907797153738224, 'f1-score': 0.9932350971198929, 'support': 14967.0}, 'eval_X': {'precision': 0.8966666666666666, 'recall': 0.5729499467518637, 'f1-score': 0.6991552956465237, 'support': 939.0}, 'eval_NUM': {'precision': 0.9854413102820746, 'recall': 0.985262008

 37%|███▋      | 5501/15054 [12:16<18:39,  8.53it/s]   

{'loss': 0.0289, 'learning_rate': 1.2692971967583367e-05, 'epoch': 1.1}


 40%|███▉      | 6001/15054 [13:16<18:19,  8.23it/s]

{'loss': 0.0257, 'learning_rate': 1.2028696691909127e-05, 'epoch': 1.2}


 43%|████▎     | 6502/15054 [14:16<16:03,  8.88it/s]

{'loss': 0.0262, 'learning_rate': 1.1364421416234887e-05, 'epoch': 1.3}


 47%|████▋     | 7001/15054 [15:16<17:47,  7.54it/s]

{'loss': 0.0229, 'learning_rate': 1.070014614056065e-05, 'epoch': 1.39}


 50%|████▉     | 7502/15054 [16:17<14:19,  8.79it/s]

{'loss': 0.0284, 'learning_rate': 1.003587086488641e-05, 'epoch': 1.49}


 53%|█████▎    | 8002/15054 [17:17<12:53,  9.12it/s]

{'loss': 0.0289, 'learning_rate': 9.371595589212171e-06, 'epoch': 1.59}


 56%|█████▋    | 8502/15054 [18:17<13:03,  8.36it/s]

{'loss': 0.0263, 'learning_rate': 8.707320313537931e-06, 'epoch': 1.69}


 60%|█████▉    | 9002/15054 [19:18<11:14,  8.97it/s]

{'loss': 0.0262, 'learning_rate': 8.043045037863691e-06, 'epoch': 1.79}


 63%|██████▎   | 9501/15054 [20:18<11:15,  8.22it/s]

{'loss': 0.0231, 'learning_rate': 7.378769762189451e-06, 'epoch': 1.89}


 66%|██████▋   | 10002/15054 [21:19<09:06,  9.25it/s]

{'loss': 0.024, 'learning_rate': 6.714494486515213e-06, 'epoch': 1.99}


                                                     
 67%|██████▋   | 10036/15054 [22:37<09:30,  8.80it/s]

{'eval_loss': 0.0413452573120594, 'eval_ADV': {'precision': 0.9755427463233314, 'recall': 0.9733404998656275, 'f1-score': 0.9744403788204907, 'support': 18605.0}, 'eval_CONJ': {'precision': 0.9969445656918376, 'recall': 0.9978156400174749, 'f1-score': 0.9973799126637555, 'support': 11445.0}, 'eval_VERB': {'precision': 0.9908237714138785, 'recall': 0.993458049338554, 'f1-score': 0.9921391617787862, 'support': 62061.0}, 'eval_ADJ': {'precision': 0.9660717009916094, 'recall': 0.9705431584109858, 'f1-score': 0.9683022676187709, 'support': 32624.0}, 'eval_ADP': {'precision': 0.9915958647049342, 'recall': 0.9936320428222047, 'f1-score': 0.992612909545596, 'support': 43342.0}, 'eval_PRON': {'precision': 0.9941856579562922, 'recall': 0.9939199572392597, 'f1-score': 0.994052789842967, 'support': 14967.0}, 'eval_X': {'precision': 0.9315707620528771, 'recall': 0.6379126730564431, 'f1-score': 0.7572692793931732, 'support': 939.0}, 'eval_NUM': {'precision': 0.9833664798408968, 'recall': 0.989628820

 70%|██████▉   | 10501/15054 [23:35<09:31,  7.97it/s]   

{'loss': 0.0157, 'learning_rate': 6.0502192108409726e-06, 'epoch': 2.09}


 73%|███████▎  | 11001/15054 [24:37<08:14,  8.20it/s]

{'loss': 0.0136, 'learning_rate': 5.385943935166733e-06, 'epoch': 2.19}


 76%|███████▋  | 11501/15054 [25:39<07:20,  8.06it/s]

{'loss': 0.0139, 'learning_rate': 4.7216686594924946e-06, 'epoch': 2.29}


 80%|███████▉  | 12001/15054 [26:41<05:52,  8.66it/s]

{'loss': 0.0127, 'learning_rate': 4.057393383818255e-06, 'epoch': 2.39}


 83%|████████▎ | 12502/15054 [27:42<04:59,  8.52it/s]

{'loss': 0.0132, 'learning_rate': 3.3931181081440153e-06, 'epoch': 2.49}


 86%|████████▋ | 13001/15054 [28:46<04:08,  8.26it/s]

{'loss': 0.0156, 'learning_rate': 2.728842832469776e-06, 'epoch': 2.59}


 90%|████████▉ | 13501/15054 [29:48<03:39,  7.08it/s]

{'loss': 0.0164, 'learning_rate': 2.064567556795536e-06, 'epoch': 2.69}


 93%|█████████▎| 14001/15054 [30:51<02:06,  8.29it/s]

{'loss': 0.0122, 'learning_rate': 1.4002922811212968e-06, 'epoch': 2.79}


 96%|█████████▋| 14501/15054 [31:54<01:09,  8.01it/s]

{'loss': 0.0135, 'learning_rate': 7.360170054470574e-07, 'epoch': 2.89}


100%|█████████▉| 15001/15054 [32:58<00:06,  8.19it/s]

{'loss': 0.0127, 'learning_rate': 7.174172977281786e-08, 'epoch': 2.99}


                                                     
100%|██████████| 15054/15054 [34:20<00:00,  8.30it/s]

{'eval_loss': 0.04342537000775337, 'eval_ADV': {'precision': 0.9765848394928514, 'recall': 0.9729105079279764, 'f1-score': 0.974744211093161, 'support': 18605.0}, 'eval_CONJ': {'precision': 0.997117903930131, 'recall': 0.9975535168195718, 'f1-score': 0.9973356628084734, 'support': 11445.0}, 'eval_VERB': {'precision': 0.9913520117021105, 'recall': 0.9937480865599975, 'f1-score': 0.9925486030642462, 'support': 62061.0}, 'eval_ADJ': {'precision': 0.9690797320855125, 'recall': 0.9712481608631682, 'f1-score': 0.9701627348019779, 'support': 32624.0}, 'eval_ADP': {'precision': 0.992139053459047, 'recall': 0.9929860181809792, 'f1-score': 0.9925623551388937, 'support': 43342.0}, 'eval_PRON': {'precision': 0.9931959175505303, 'recall': 0.9947885347765083, 'f1-score': 0.993991588223513, 'support': 14967.0}, 'eval_X': {'precision': 0.9120111731843575, 'recall': 0.6954206602768903, 'f1-score': 0.7891238670694865, 'support': 939.0}, 'eval_NUM': {'precision': 0.9828890489913544, 'recall': 0.992903930

100%|██████████| 15054/15054 [34:24<00:00,  7.29it/s]

{'train_runtime': 2064.0528, 'train_samples_per_second': 58.339, 'train_steps_per_second': 7.293, 'train_loss': 0.040027938208067514, 'epoch': 3.0}





TrainOutput(global_step=15054, training_loss=0.040027938208067514, metrics={'train_runtime': 2064.0528, 'train_samples_per_second': 58.339, 'train_steps_per_second': 7.293, 'train_loss': 0.040027938208067514, 'epoch': 3.0})

In [25]:
trainer.save_model('my_model')

In [26]:
pipe = pipeline(
    task='token-classification',
    model='my_model',
    device=device,
)

In [27]:
sentence = 'Need For Speed: Most Wanted is the best video game of all time.'
pipe(sentence)

[{'entity': 'NOUN',
  'score': 0.99942374,
  'index': 1,
  'word': 'Need',
  'start': 0,
  'end': 4},
 {'entity': 'ADP',
  'score': 0.9993299,
  'index': 2,
  'word': 'For',
  'start': 5,
  'end': 8},
 {'entity': 'NOUN',
  'score': 0.9998654,
  'index': 3,
  'word': 'Speed',
  'start': 9,
  'end': 14},
 {'entity': 'NOUN',
  'score': 0.63851696,
  'index': 4,
  'word': ':',
  'start': 14,
  'end': 15},
 {'entity': 'ADV',
  'score': 0.96408814,
  'index': 5,
  'word': 'Most',
  'start': 16,
  'end': 20},
 {'entity': 'VERB',
  'score': 0.91693956,
  'index': 6,
  'word': 'Wanted',
  'start': 21,
  'end': 27},
 {'entity': 'VERB',
  'score': 0.9999552,
  'index': 7,
  'word': 'is',
  'start': 28,
  'end': 30},
 {'entity': 'DET',
  'score': 0.9999356,
  'index': 8,
  'word': 'the',
  'start': 31,
  'end': 34},
 {'entity': 'ADJ',
  'score': 0.9996879,
  'index': 9,
  'word': 'best',
  'start': 35,
  'end': 39},
 {'entity': 'NOUN',
  'score': 0.997012,
  'index': 10,
  'word': 'video',
  'star