In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
from sklearn.metrics import f1_score, classification_report
from torchcrf import CRF

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class WNUTDataset(Dataset):
    def __init__(self, data, label_encoder):
        self.data = data
        self.label_encoder = label_encoder
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        # Preprocess data
        self.sentences = [example['tokens'] for example in self.data]
        self.labels = [example['ner_tags'] for example in self.data]
        self.num_classes = max(label_encoder.classes_)
        
        # Tokenize (basic) and encode labels
        self.tokenized_inputs = [[word.lower() for word in sentence] for sentence in self.sentences]
        # Label encoding tokenizer - assign a number to each word in the dataset vocabulary.
        self.encoded_labels = [self.label_encoder.transform(labels) for labels in self.labels]
        
        # Build vocabulary
        # using bert vocab

    def __len__(self):
        return len(self.sentences)

    def adjust_token_and_label(self,tokens_by_wrd,wrd_lbls):

        tokens = []
        valid_mask = []
        labels = []

        for i in range(len(tokens_by_wrd)):           
            if len(tokens_by_wrd[i]) < 3:
                tokens.append(tokens_by_wrd[i][0])
                valid_mask.append(1)
                labels.append(wrd_lbls[i])

            for j in range(1,len(tokens_by_wrd[i])-1):
                tokens.append(tokens_by_wrd[i][j])
                valid_mask.append(1 if j==1 else 0)
                labels.append(wrd_lbls[i] if j==1 else self.num_classes+1)

        return torch.tensor(tokens, dtype=torch.long), torch.tensor(labels, dtype=torch.long), torch.tensor(valid_mask, dtype=torch.int)

    def __getitem__(self, idx):
        
        tokenized_sentence = self.tokenized_inputs[idx]
        
        bert_tokens_by_wrd = self.tokenizer(tokenized_sentence)['input_ids']
        wrd_lbls = torch.tensor(self.encoded_labels[idx], dtype=torch.long)

        tokens, labels, valid_mask = self.adjust_token_and_label(bert_tokens_by_wrd,wrd_lbls)

        return tokens, labels, valid_mask


In [4]:
dataset = load_dataset('leondz/wnut_17',trust_remote_code=True)

label_encoder = LabelEncoder()
all_labels = [label for example in dataset['train'] for label in example['ner_tags']]
label_encoder.fit(all_labels)

train_dataset = WNUTDataset(dataset['train'], label_encoder)
val_dataset = WNUTDataset(dataset['validation'], label_encoder)
test_dataset = WNUTDataset(dataset['test'], label_encoder)

In [80]:
class BertLinearProbing(nn.Module):
    def __init__(self, num_labels, n_epochs=10, val_data=None):
        super().__init__()
        self.num_labels = num_labels
        self.backbone = BertModel.from_pretrained("bert-base-uncased")

        self.out = nn.Linear(768, self.num_labels)
        
        self.n_epochs = n_epochs
        self.val_dataset = val_data

        self.pst_prcssing = Postprorcessing(num_labels,3,val_dataset)


        
    def forward(self, input_ids,atn_msk):
        # Perform one-hot encoding of input_ids
        with torch.no_grad():
            x = self.backbone(input_ids,attention_mask=atn_msk)['last_hidden_state']

        logits = self.out(x)
        return logits

    def train_model(self, train_dataset, batch_size=16):
        # Prepare DataLoader

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate_fn)

        if self.val_dataset is not None:
            val_dataloader = DataLoader(self.val_dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_collate_fn)


        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(device)
        
        
        # Optimizer and Loss function - feel free to change
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        criterion = nn.CrossEntropyLoss(weight=torch.tensor([0.1]+[1.0]*12+[0.0]).to(device), ignore_index=13)
        
        best_loss = None 
        best_model = None
        # Training loop
        
        
        for epc in range(self.n_epochs):
            total_loss = 0
            YP = []
            YT = []            

            self.train()

            for input_ids, labels, valid_mask, atn_msk in tqdm(train_dataloader):

                input_ids, labels, atn_msk = input_ids.to(device), labels.to(device), atn_msk.to(device)                

                optimizer.zero_grad()

                logits = self(input_ids,atn_msk)
                
                loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
                
                loss.backward()
                
                optimizer.step()

                total_loss += loss.item()

                logits = torch.argmax(logits,dim=-1)
                
                for i2 in range(len(valid_mask)):
                    yp = []
                    yt = []
                    for j2 in range(len(valid_mask[i2])):
                        if valid_mask[i2][j2]==1:
                            yp.append(logits[i2][j2].item())
                            yt.append(labels[i2][j2].item())

                    YP.extend(yp)
                    YT.extend(yt)
                    

            f1_mcro_scr = f1_score(YT, YP, average='macro')
            print(f'Epoch {epc+1} : Training Loss : {total_loss}, Macro F1 : {f1_mcro_scr}')

            if self.val_dataset is not None:
                total_loss = 0
                YP = []
                YT = []
                total_f1 = []

                self.eval()

                with torch.no_grad():
                    for input_ids, labels, valid_mask, atn_msk in tqdm(val_dataloader):

                        input_ids, labels, atn_msk = input_ids.to(device), labels.to(device), atn_msk.to(device)                

                        logits = self(input_ids,atn_msk)
                        
                        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
                        
                        total_loss += loss.item()

                        logits = torch.argmax(logits,dim=-1)
                        
                        for i2 in range(len(valid_mask)):
                            yp = []
                            yt = []
                            for j2 in range(len(valid_mask[i2])):
                                if valid_mask[i2][j2]==1:
                                    yp.append(logits[i2][j2].item())
                                    yt.append(labels[i2][j2].item())

                            YP.extend(yp)
                            YT.extend(yt)
                            
                    if (best_loss is None) or (best_loss > total_loss):
                        best_loss = total_loss
                        best_model = self
                    f1_mcro_scr = f1_score(YT, YP, average='macro')    
                    print(f'Epoch {epc+1} : Val Loss : {total_loss}, Macro F1 : {f1_mcro_scr}')
        
        best_model.pst_prcssing = best_model.pst_prcssing.train_model(train_dataset, best_model, batch_size=16)


        return best_model

    def predict(self, dataset, batch_size=16,use_crf = True) -> list[list[int]]:
        """
        Inference logic for NER task
        """
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_collate_fn)
        
        predictions = []
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(device)
        self.eval()
        
        with torch.no_grad():
            # Note: do not use gold labels for the final inference, we will remove it while testing            
            for input_ids, _, valid_mask, atn_msk in tqdm(dataloader):
                input_ids, atn_msk = input_ids.to(device), atn_msk.to(device)                

                logits = self(input_ids,atn_msk)

                ###logits = torch.argmax(logits,dim=-1)
                if use_crf:
                    logits = self.pst_prcssing.predict(logits,atn_msk)

                else:
                    logits = torch.argmax(logits,dim=-1).tolist()
                        
                for i2 in range(len(valid_mask)):
                    yp = []
                    for j2 in range(len(valid_mask[i2])):
                        if valid_mask[i2][j2]==1:
                            yp.append(logits[i2][j2])                            

                    predictions.append(yp)

        return predictions

    
# This function is meant for padding the dataset's examples so that every input is the same length
def pad_collate_fn(batch):
    
    input_ids, labels, valid_masks = zip(*batch)
    
    max_len = max(len(ids) for ids in input_ids)

    padded_inputs = [F.pad(ids, (0, max_len - len(ids)), value=0) for ids in input_ids]
    padded_inputs = torch.stack(padded_inputs)

    if labels is not None:    
        padded_labels = [F.pad(label, (0, max_len - len(label)), value=13) for label in labels]
        padded_labels = torch.stack(padded_labels)
    else:
        padded_labels = None
    
    padded_valid_masks = [F.pad(valid_mask, (0, max_len - len(valid_mask)), value=0) for valid_mask in valid_masks]
    padded_valid_masks = torch.stack(padded_valid_masks)

    attention_mask = padded_inputs.not_equal(0).clone().detach()

    return padded_inputs, padded_labels, padded_valid_masks, attention_mask



In [86]:
class Postprorcessing(nn.Module):
    def __init__(self, num_labels, n_epochs=10, val_data=None):
        super().__init__()
        self.crf_mdl = CRF(num_labels, batch_first=True)
        self.val_dataset = val_data
        self.n_epochs = n_epochs

    def forward(self, predictions, labels, masks):
        return -self.crf_mdl(predictions, labels, mask=masks)

    def train_model(self, train_dataset, trained_mdl, batch_size=16):

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate_fn)

        if self.val_dataset is not None:
            val_dataloader = DataLoader(self.val_dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_collate_fn)


        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(device)
        trained_mdl.to(device)
        
        # Optimizer and Loss function - feel free to change
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        
        best_loss = None 
        best_model = None
        # Training loop
        
        
        for epc in tqdm(range(self.n_epochs),desc=f'Training Epoch {epc}'):
            total_loss = 0
            
            self.train()

            for input_ids, labels, valid_mask, atn_msk in tqdm(train_dataloader):

                input_ids, labels, atn_msk = input_ids.to(device), labels.to(device), atn_msk.to(device)                

                optimizer.zero_grad()
                
                with torch.no_grad():
                    logits = trained_mdl(input_ids,atn_msk)
                
                loss = self(logits,labels,atn_msk)
                
                loss.backward()
                
                optimizer.step()

                total_loss += loss.item()


                 
            tqdm.write(f'CRF Epoch {epc+1} : Training Loss : {total_loss}')

            if self.val_dataset is not None:
                total_loss = 0                

                self.eval()

                with torch.no_grad():
                    for input_ids, labels, valid_mask, atn_msk in tqdm(val_dataloader):

                        input_ids, labels, atn_msk = input_ids.to(device), labels.to(device), atn_msk.to(device)                

                        with torch.no_grad():
                            logits = trained_mdl(input_ids,atn_msk)
                        
                        loss = self(logits,labels,atn_msk)
                        
                        total_loss += loss.item()


                    if (best_loss is None) or (best_loss > total_loss):
                        best_loss = total_loss
                        best_model = self
                        
                    tqdm.write(f'CRF Epoch {epc+1} : Val Loss : {total_loss}')


        return best_model


    def predict(self, predictions, masks):
        return self.crf_mdl.decode(predictions, mask=masks)


In [77]:
import warnings

# Ignore all warnings globally
warnings.filterwarnings("ignore")

In [96]:
all_lbls = {}

for i in tqdm(range(len(train_dataset))):

    for lbl in list(train_dataset.__getitem__(i)[1].numpy()):
        if lbl not in all_lbls:
            all_lbls[lbl] = 0
        all_lbls[lbl] += 1

  0%|          | 0/3394 [00:00<?, ?it/s]

100%|██████████| 3394/3394 [00:12<00:00, 268.40it/s]


In [101]:
all_lbls[0] / sum([all_lbls[i] for i in range(len(all_lbls))])

0.9496253786067272

In [100]:
all_lbls

{0: 59570,
 7: 548,
 8: 245,
 5: 264,
 1: 221,
 9: 660,
 3: 140,
 11: 142,
 10: 335,
 4: 206,
 2: 46,
 6: 150,
 12: 203}

In [31]:
lgt, msk = trained_mdl.predict(test_dataset, batch_size=16)


  0%|          | 0/81 [00:00<?, ?it/s]


In [56]:
torch.sum(msk,dim=1)

tensor([29, 38, 26, 32, 36, 35, 11, 17,  6, 33,  6, 27, 29, 40,  8, 12],
       device='cuda:0')

In [51]:
lgt.shape,msk.shape

(torch.Size([16, 40, 14]), torch.Size([16, 40]))

In [81]:
model = BertLinearProbing(14,10,val_dataset)

trained_mdl = model.train_model(train_dataset, batch_size=16)

100%|██████████| 213/213 [00:19<00:00, 10.79it/s]


Epoch 1 : Training Loss : 245.93314416706562, Macro F1 : 0.12607467843557385


100%|██████████| 64/64 [00:04<00:00, 13.66it/s]


Epoch 1 : Val Loss : 84.4824566245079, Macro F1 : 0.1738564522949519


100%|██████████| 213/213 [00:21<00:00,  9.74it/s]


Epoch 2 : Training Loss : 168.2527379989624, Macro F1 : 0.3046543325886616


100%|██████████| 64/64 [00:04<00:00, 13.71it/s]


Epoch 2 : Val Loss : 76.25888669490814, Macro F1 : 0.22372801103230083


100%|██████████| 213/213 [00:21<00:00,  9.69it/s]


Epoch 3 : Training Loss : 149.30112915486097, Macro F1 : 0.37520487315617973


100%|██████████| 64/64 [00:05<00:00, 12.16it/s]


Epoch 3 : Val Loss : 72.52490818500519, Macro F1 : 0.2530248487324976


100%|██████████| 213/213 [00:23<00:00,  9.01it/s]


Epoch 4 : Training Loss : 138.04536245763302, Macro F1 : 0.400511810349295


100%|██████████| 64/64 [00:05<00:00, 11.70it/s]


Epoch 4 : Val Loss : 71.83571147918701, Macro F1 : 0.27119960308039925


100%|██████████| 213/213 [00:22<00:00,  9.28it/s]


Epoch 5 : Training Loss : 127.43386917561293, Macro F1 : 0.43206734305302913


100%|██████████| 64/64 [00:05<00:00, 11.46it/s]


Epoch 5 : Val Loss : 69.81153059005737, Macro F1 : 0.3006182635662313


100%|██████████| 213/213 [00:22<00:00,  9.55it/s]


Epoch 6 : Training Loss : 122.99841111898422, Macro F1 : 0.4512649034256518


100%|██████████| 64/64 [00:05<00:00, 11.96it/s]


Epoch 6 : Val Loss : 67.21107423305511, Macro F1 : 0.3334284825652064


100%|██████████| 213/213 [00:23<00:00,  9.04it/s]


Epoch 7 : Training Loss : 118.68055918812752, Macro F1 : 0.4592527588728125


100%|██████████| 64/64 [00:05<00:00, 11.28it/s]


Epoch 7 : Val Loss : 69.24949216842651, Macro F1 : 0.32779383695304415


100%|██████████| 213/213 [00:23<00:00,  8.89it/s]


Epoch 8 : Training Loss : 116.11157240718603, Macro F1 : 0.46825107257232684


100%|██████████| 64/64 [00:05<00:00, 10.88it/s]


Epoch 8 : Val Loss : 67.16994652152061, Macro F1 : 0.33675036652734064


100%|██████████| 213/213 [00:23<00:00,  9.02it/s]


Epoch 9 : Training Loss : 112.85825846344233, Macro F1 : 0.4764288911611042


100%|██████████| 64/64 [00:05<00:00, 11.64it/s]


Epoch 9 : Val Loss : 67.64563456177711, Macro F1 : 0.3351051105152229


100%|██████████| 213/213 [00:24<00:00,  8.65it/s]


Epoch 10 : Training Loss : 110.52168589830399, Macro F1 : 0.49106410785676596


100%|██████████| 64/64 [00:05<00:00, 12.31it/s]


Epoch 10 : Val Loss : 68.56126138567924, Macro F1 : 0.3421851613460948


100%|██████████| 213/213 [00:45<00:00,  4.71it/s]


CRF Epoch 1 : Training Loss : 463533.56747436523


100%|██████████| 64/64 [00:10<00:00,  6.29it/s]


CRF Epoch 1 : Val Loss : 23179.557146072388


100%|██████████| 213/213 [00:45<00:00,  4.64it/s]


CRF Epoch 2 : Training Loss : 443618.33462524414


100%|██████████| 64/64 [00:10<00:00,  6.32it/s]


CRF Epoch 2 : Val Loss : 22063.661163330078


100%|██████████| 213/213 [00:45<00:00,  4.71it/s]


CRF Epoch 3 : Training Loss : 423838.12397384644


100%|██████████| 64/64 [00:10<00:00,  6.21it/s]

CRF Epoch 3 : Val Loss : 20960.874137878418





In [84]:
predictions = trained_mdl.predict(test_dataset, batch_size=16)

true_labels = [label for sentence in dataset["test"] for label in sentence["ner_tags"]]

predicted_labels = [label for sentence in predictions for label in sentence]

macro_f1 = f1_score(true_labels, predicted_labels, average="macro")

print(f"Macro F1 Score: {macro_f1:.4f}")

print(classification_report(true_labels, predicted_labels))

  0%|          | 0/81 [00:00<?, ?it/s]

100%|██████████| 81/81 [00:11<00:00,  7.14it/s]

Macro F1 Score: 0.3373
              precision    recall  f1-score   support

           0       0.97      0.96      0.96     21654
           1       0.09      0.27      0.14        66
           2       0.21      0.14      0.17        22
           3       0.24      0.15      0.19       142
           4       0.27      0.17      0.21       218
           5       0.26      0.29      0.27       165
           6       0.18      0.26      0.21        70
           7       0.26      0.65      0.37       150
           8       0.25      0.26      0.25        94
           9       0.57      0.58      0.58       429
          10       0.59      0.49      0.54       131
          11       0.22      0.18      0.20       127
          12       0.24      0.40      0.30       126

    accuracy                           0.91     23394
   macro avg       0.34      0.37      0.34     23394
weighted avg       0.92      0.91      0.92     23394






In [85]:
predictions = trained_mdl.predict(test_dataset, batch_size=16, use_crf = False)

true_labels = [label for sentence in dataset["test"] for label in sentence["ner_tags"]]

predicted_labels = [label for sentence in predictions for label in sentence]

macro_f1 = f1_score(true_labels, predicted_labels, average="macro")

print(f"Macro F1 Score: {macro_f1:.4f}")

print(classification_report(true_labels, predicted_labels))

100%|██████████| 81/81 [00:07<00:00, 10.67it/s]

Macro F1 Score: 0.3371
              precision    recall  f1-score   support

           0       0.97      0.96      0.96     21654
           1       0.10      0.29      0.14        66
           2       0.23      0.14      0.17        22
           3       0.24      0.15      0.18       142
           4       0.32      0.15      0.20       218
           5       0.28      0.27      0.27       165
           6       0.17      0.23      0.20        70
           7       0.26      0.64      0.37       150
           8       0.27      0.26      0.26        94
           9       0.54      0.59      0.57       429
          10       0.62      0.49      0.54       131
          11       0.23      0.17      0.20       127
          12       0.27      0.37      0.31       126

    accuracy                           0.92     23394
   macro avg       0.34      0.36      0.34     23394
weighted avg       0.92      0.92      0.92     23394






In [7]:
predictions = trained_mdl.predict(test_dataset, batch_size=16)

100%|██████████| 81/81 [00:13<00:00,  5.88it/s]


In [8]:
trained_mdl.to('cpu')

BertLinearProbing(
  (backbone): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [47]:
num_tags = 13  # number of tags is 5
model = CRF(num_tags, batch_first=True)
seq_length = 60  # maximum sequence length in a batch
batch_size = 4  # number of samples in the batch
emissions = torch.randn(batch_size, seq_length, num_tags)

tags = torch.randint(0,13,(batch_size, seq_length))
mask = torch.ones((batch_size, seq_length)).bool()

model(emissions, tags, mask)

tensor(-730.7456, grad_fn=<SumBackward0>)

In [49]:
seq_length = 40  # maximum sequence length in a batch
batch_size = 4  # number of samples in the batch
emissions = torch.randn(batch_size, seq_length, num_tags)

tags = torch.randint(0,13,(batch_size, seq_length))
mask = torch.ones((batch_size, seq_length)).bool()


In [50]:
model.decode(emissions, mask)

[[1,
  2,
  6,
  9,
  9,
  11,
  8,
  6,
  5,
  11,
  11,
  12,
  1,
  1,
  4,
  1,
  11,
  12,
  2,
  3,
  0,
  9,
  8,
  10,
  8,
  4,
  7,
  5,
  7,
  1,
  0,
  1,
  10,
  5,
  2,
  2,
  5,
  9,
  9,
  11],
 [12,
  1,
  3,
  9,
  7,
  8,
  7,
  0,
  11,
  4,
  4,
  9,
  0,
  12,
  1,
  2,
  10,
  3,
  3,
  12,
  12,
  3,
  6,
  12,
  10,
  9,
  11,
  12,
  0,
  10,
  11,
  0,
  5,
  1,
  8,
  10,
  12,
  9,
  10,
  11],
 [4,
  10,
  1,
  1,
  3,
  9,
  0,
  8,
  5,
  5,
  12,
  0,
  9,
  11,
  9,
  3,
  2,
  4,
  5,
  2,
  2,
  6,
  3,
  8,
  1,
  8,
  6,
  6,
  4,
  6,
  1,
  11,
  4,
  6,
  11,
  12,
  3,
  2,
  8,
  6],
 [8,
  8,
  8,
  11,
  4,
  0,
  3,
  10,
  9,
  12,
  1,
  8,
  12,
  3,
  12,
  6,
  12,
  0,
  12,
  11,
  10,
  12,
  8,
  7,
  5,
  1,
  1,
  10,
  12,
  3,
  5,
  6,
  9,
  5,
  6,
  3,
  12,
  9,
  0,
  7]]

In [42]:
torch.ones((batch_size, seq_length)).long().dtype

torch.int64

In [None]:
pst_prcssing2 = Postprorcessing(14,10,val_dataset)
pst_prcssing2 = pst_prcssing2.train_model(train_dataset, model, batch_size=16)

  0%|          | 0/213 [00:00<?, ?it/s]

100%|██████████| 213/213 [00:39<00:00,  5.38it/s]


CRF Epoch 1 : Training Loss : 333943.411239624


100%|██████████| 64/64 [00:07<00:00,  9.07it/s]


CRF Epoch 1 : Val Loss : 19420.014863967896


100%|██████████| 213/213 [00:41<00:00,  5.16it/s]


CRF Epoch 2 : Training Loss : 313893.9598236084


100%|██████████| 64/64 [00:07<00:00,  8.95it/s]


CRF Epoch 2 : Val Loss : 18227.386039733887


100%|██████████| 213/213 [00:41<00:00,  5.12it/s]


CRF Epoch 3 : Training Loss : 294050.1819000244


100%|██████████| 64/64 [00:06<00:00,  9.20it/s]


CRF Epoch 3 : Val Loss : 17078.961069107056


100%|██████████| 213/213 [00:40<00:00,  5.22it/s]


CRF Epoch 4 : Training Loss : 274600.77292633057


100%|██████████| 64/64 [00:06<00:00,  9.30it/s]


CRF Epoch 4 : Val Loss : 16007.840351104736


100%|██████████| 213/213 [00:39<00:00,  5.41it/s]


CRF Epoch 5 : Training Loss : 255772.49723815918


100%|██████████| 64/64 [00:06<00:00,  9.27it/s]


CRF Epoch 5 : Val Loss : 15060.030818939209


100%|██████████| 213/213 [00:40<00:00,  5.31it/s]


CRF Epoch 6 : Training Loss : 237927.10653686523


100%|██████████| 64/64 [00:06<00:00,  9.30it/s]


CRF Epoch 6 : Val Loss : 14299.733177185059


100%|██████████| 213/213 [00:40<00:00,  5.20it/s]


CRF Epoch 7 : Training Loss : 221646.07652282715


100%|██████████| 64/64 [00:06<00:00,  9.35it/s]


CRF Epoch 7 : Val Loss : 13788.702234268188


 52%|█████▏    | 111/213 [00:21<00:19,  5.16it/s]

TypeError: 'WNUTDataset' object cannot be interpreted as an integer

In [9]:
predictions = trained_mdl.predict(val_dataset, batch_size=16)

100%|██████████| 64/64 [00:07<00:00,  8.12it/s]


In [8]:
true_labels = [label for sentence in dataset["test"] for label in sentence["ner_tags"]]

predicted_labels = [label for sentence in predictions for label in sentence]

macro_f1 = f1_score(true_labels, predicted_labels, average="macro")

print(f"Macro F1 Score: {macro_f1:.4f}")

Macro F1 Score: 0.1777


In [10]:
true_labels = [label for sentence in dataset["validation"] for label in sentence["ner_tags"]]

predicted_labels = [label for sentence in predictions for label in sentence]

macro_f1 = f1_score(true_labels, predicted_labels, average="macro")

print(f"Macro F1 Score: {macro_f1:.4f}")

Macro F1 Score: 0.3314


In [11]:
print(classification_report(true_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97     14483
           1       0.12      0.18      0.14        34
           2       0.67      0.17      0.27        12
           3       0.38      0.12      0.19       105
           4       0.47      0.11      0.17       133
           5       0.11      0.23      0.15        39
           6       0.07      0.12      0.09        25
           7       0.50      0.61      0.55        74
           8       0.48      0.33      0.39        33
           9       0.54      0.49      0.52       470
          10       0.42      0.44      0.43       117
          11       0.33      0.19      0.24       114
          12       0.52      0.13      0.21        94

    accuracy                           0.93     15733
   macro avg       0.43      0.32      0.33     15733
weighted avg       0.92      0.93      0.92     15733



In [None]:
mdl = model.train_model(train_dataset, batch_size=16)

In [184]:
model = BertLinearProbing(14,10,test_dataset)

mdl = model.train_model(train_dataset, batch_size=16)

100%|██████████| 213/213 [00:40<00:00,  5.23it/s]


Epoch 1 : Training Loss : 79.8433912023902, Macro F1 : 0.07421472104519823


100%|██████████| 81/81 [00:15<00:00,  5.08it/s]


Epoch 1 : Val Loss : 31.82248128950596, Macro F1 : 0.0736554857533014


100%|██████████| 213/213 [00:43<00:00,  4.95it/s]


Epoch 2 : Training Loss : 46.888765420764685, Macro F1 : 0.07788697775377863


100%|██████████| 81/81 [00:17<00:00,  4.69it/s]


Epoch 2 : Val Loss : 26.950382634997368, Macro F1 : 0.0798424924104802


100%|██████████| 213/213 [00:43<00:00,  4.84it/s]


Epoch 3 : Training Loss : 39.94342878833413, Macro F1 : 0.08299446287895913


100%|██████████| 81/81 [00:17<00:00,  4.63it/s]


Epoch 3 : Val Loss : 25.061790458858013, Macro F1 : 0.08423486176161289


100%|██████████| 213/213 [00:43<00:00,  4.87it/s]


Epoch 4 : Training Loss : 36.42007787339389, Macro F1 : 0.08596639276478411


100%|██████████| 81/81 [00:17<00:00,  4.60it/s]


Epoch 4 : Val Loss : 24.169034481048584, Macro F1 : 0.08577213037636361


100%|██████████| 213/213 [00:44<00:00,  4.83it/s]


Epoch 5 : Training Loss : 33.474986869841814, Macro F1 : 0.08823023797319099


100%|██████████| 81/81 [00:17<00:00,  4.63it/s]


Epoch 5 : Val Loss : 24.081137873232365, Macro F1 : 0.08692038707483282


100%|██████████| 213/213 [00:43<00:00,  4.88it/s]


Epoch 6 : Training Loss : 32.48362027993426, Macro F1 : 0.08950824088225019


100%|██████████| 81/81 [00:17<00:00,  4.68it/s]


Epoch 6 : Val Loss : 23.25082240998745, Macro F1 : 0.08875461419516156


100%|██████████| 213/213 [00:44<00:00,  4.82it/s]


Epoch 7 : Training Loss : 31.063964569941163, Macro F1 : 0.0910846699899935


100%|██████████| 81/81 [00:16<00:00,  5.04it/s]


Epoch 7 : Val Loss : 23.2690064124763, Macro F1 : 0.08894470430007326


100%|██████████| 213/213 [00:39<00:00,  5.43it/s]


Epoch 8 : Training Loss : 29.42448954284191, Macro F1 : 0.0923581608719488


100%|██████████| 81/81 [00:15<00:00,  5.16it/s]


Epoch 8 : Val Loss : 23.190548341721296, Macro F1 : 0.08974364996107292


100%|██████████| 213/213 [00:39<00:00,  5.35it/s]


Epoch 9 : Training Loss : 29.573216265067458, Macro F1 : 0.09213253534927146


100%|██████████| 81/81 [00:15<00:00,  5.16it/s]


Epoch 9 : Val Loss : 22.719720989465714, Macro F1 : 0.09117607501935004


100%|██████████| 213/213 [00:39<00:00,  5.43it/s]


Epoch 10 : Training Loss : 29.195627957582474, Macro F1 : 0.0930209101068192


100%|██████████| 81/81 [00:16<00:00,  5.04it/s]

Epoch 10 : Val Loss : 23.121299996972084, Macro F1 : 0.08975696233107182





In [135]:
YP = mdl[1]
YT = mdl[2]

In [142]:
total_f1 = []

for i in tqdm(range(len(YP))):
    yt = test_dataset.__getitem__(i)[1][torch.where(test_dataset.__getitem__(i)[2])].tolist()
    yp = YP[i]

    total_f1.append(f1_score(yt, yp, labels=np.arange(13), average='macro'))

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

In [4]:
import pickle
raw_test_predictions = pickle.load(open('raw_test_predictions.p','rb'))

In [5]:
test_predictions = torch.argmax(torch.tensor(raw_test_predictions.predictions),dim=-1)

In [85]:
test_predictions = torch.argmax(torch.tensor(raw_test_predictions.predictions),dim=-1)

YP = []
YT = []

for i in tqdm(range(len(test_predictions))):
    yt = dataset['test']['ner_tags'][i]
    yp = test_predictions[i][:len(yt)].tolist()

    YP.extend(yp)
    YT.extend(yt)

f1_score(YT,YP,average='macro')

100%|██████████| 1287/1287 [00:29<00:00, 43.93it/s]


In [86]:
f1_score(YT,YP,average='macro')

0.10374965696404287

In [87]:
print(classification_report(YT,YP))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95     21654
           1       0.00      0.00      0.00        66
           2       0.00      0.00      0.00        22
           3       0.00      0.00      0.00       142
           4       0.18      0.06      0.09       218
           5       0.02      0.01      0.01       165
           6       0.05      0.03      0.04        70
           7       0.01      0.01      0.01       150
           8       0.04      0.03      0.03        94
           9       0.03      0.02      0.02       429
          10       0.02      0.02      0.02       131
          11       0.00      0.00      0.00       127
          12       0.28      0.13      0.18       126

    accuracy                           0.89     23394
   macro avg       0.12      0.10      0.10     23394
weighted avg       0.87      0.89      0.88     23394



In [12]:
f1s = []
f1s2 = []

test_predictions = torch.argmax(torch.tensor(raw_test_predictions.predictions),dim=-1)

for i in tqdm(range(len(test_predictions))):
    yt = dataset['test']['ner_tags'][i]
    yp = test_predictions[i][:len(yt)].tolist()

    f1s.append(f1_score(yt,yp,average='macro'))
    f1s2.append(f1_score(yt,yp,labels=np.arange(13),average='macro'))



  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

In [15]:
f1s = []
f1s2 = []

test_predictions = torch.argmax(torch.tensor(raw_test_predictions.predictions),dim=-1)

for i in tqdm(range(len(test_predictions))):
    
    yp = test_predictions[i].tolist()
    yt = dataset['test']['ner_tags'][i] 
    yt = yt + [0]*(len(yp)-len(yt))

    f1s.append(f1_score(yt,yp,average='macro'))
    f1s2.append(f1_score(yt,yp,labels=np.arange(13),average='macro'))



  0%|          | 0/1287 [00:00<?, ?it/s]

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

In [13]:
np.mean(f1s),np.mean(f1s2)

(0.6258480270129728, 0.0719513450111264)

In [16]:
np.mean(f1s),np.mean(f1s2)

(0.6353335707296868, 0.07612490405135001)

In [10]:
print(dataset['test']['ner_tags'][0])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0]


In [7]:
len(dataset['test']['ner_tags'])

1287

In [165]:
print(dataset['test']['ner_tags'][0])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0]


In [166]:
print(test_predictions[0][:27].tolist())

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8]


In [None]:
total_f1

In [169]:
f1_score(dataset['test']['ner_tags'][0], test_predictions[0][:27].tolist(), average='macro')

0.30666666666666664

In [170]:
f1_score(dataset['test']['ner_tags'][0], pred[0], average='macro')

0.49056603773584906

In [141]:
f1_score(yt, yp, labels=np.arange(13) , average='macro')

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.07239819004524886

In [175]:
np.where(np.array(total_f1)>0.1)

(array([   7,   38,   58,   61,   63,   64,   68,   70,   75,   85,   97,
         116,  117,  119,  125,  132,  136,  148,  155,  165,  168,  174,
         177,  188,  191,  202,  214,  217,  234,  236,  237,  244,  249,
         293,  299,  306,  334,  337,  349,  363,  366,  373,  389,  395,
         400,  447,  471,  558,  561,  578,  610,  681,  698,  718,  721,
         725,  726,  735,  757,  759,  782,  799,  814,  825,  833,  836,
         852,  856,  857,  869,  894,  922,  926,  948,  950,  952,  960,
         963,  977,  984,  990, 1007, 1008, 1011, 1023, 1035, 1043, 1044,
        1058, 1060, 1079, 1084, 1096, 1121, 1130, 1135, 1151, 1163, 1177,
        1183, 1206, 1242, 1250, 1253, 1258]),)

In [180]:
idx = 9
print(classification_report(pred[idx],dataset['test']['ner_tags'][idx]))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98        30
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         0

    accuracy                           0.94        31
   macro avg       0.33      0.32      0.33        31
weighted avg       0.97      0.94      0.95        31



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [124]:
np.mean(total_f1)

0.6594944281883183

In [137]:
total_f1[0]

0.49056603773584906

In [126]:
pred = mdl[0].predict(test_dataset, batch_size=128)

  0%|          | 0/11 [00:00<?, ?it/s]

100%|██████████| 11/11 [00:16<00:00,  1.48s/it]


In [133]:
list(zip(pred[0],dataset['test']['ner_tags'][0]))

[(0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 7),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0)]

In [122]:
test_dataset.__getitem__(0)[1][torch.where(test_dataset.__getitem__(0)[2])].tolist()

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 7,
 0,
 0,
 0,
 0,
 0,
 0]

In [64]:
self = model.train_model(train_dataset, val_dataset, batch_size=16,n_epochs=10)

100%|██████████| 213/213 [00:42<00:00,  4.99it/s]


Epoch 1 : Training Loss : 126.01009038090706, Macro F1 : 0.5878813833323603


100%|██████████| 213/213 [00:42<00:00,  4.98it/s]


Epoch 2 : Training Loss : 77.99768237769604, Macro F1 : 0.6000896524478999


100%|██████████| 213/213 [00:42<00:00,  5.05it/s]


Epoch 3 : Training Loss : 67.21644881367683, Macro F1 : 0.6226264454249515


100%|██████████| 213/213 [00:42<00:00,  5.01it/s]


Epoch 4 : Training Loss : 61.247911155223846, Macro F1 : 0.636334707709453


100%|██████████| 213/213 [00:42<00:00,  5.02it/s]


Epoch 5 : Training Loss : 57.868075743317604, Macro F1 : 0.6521782923560657


100%|██████████| 213/213 [00:43<00:00,  4.95it/s]


Epoch 6 : Training Loss : 55.02920798957348, Macro F1 : 0.664714721177656


100%|██████████| 213/213 [00:42<00:00,  5.02it/s]


Epoch 7 : Training Loss : 52.913348630070686, Macro F1 : 0.6730995152062895


100%|██████████| 213/213 [00:42<00:00,  5.01it/s]


Epoch 8 : Training Loss : 51.386038675904274, Macro F1 : 0.6810420992940202


100%|██████████| 213/213 [00:42<00:00,  4.99it/s]


Epoch 9 : Training Loss : 50.060820788145065, Macro F1 : 0.6834701084024193


100%|██████████| 213/213 [00:42<00:00,  5.01it/s]

Epoch 10 : Training Loss : 48.95029382035136, Macro F1 : 0.6870595683393077





[3,
 3,
 3,
 13,
 1,
 7,
 3,
 3,
 13,
 3,
 6,
 10,
 10,
 10,
 10,
 10,
 1,
 3,
 7,
 3,
 3,
 11,
 3,
 0,
 3,
 3,
 3,
 3,
 3,
 3]

In [54]:
f1_score(YT[16], YP[16], average='macro')

0.28070175438596495

In [None]:
input_ids.shape,labels.shape,valid_mask

AttributeError: 'list' object has no attribute 'shape'

In [22]:
labels

tensor([[ 0, 13, 13, 13,  0,  0,  0, 13,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        [ 1, 13, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 13,  0,  0,
          0,  0,  0, 13,  0,  0,  0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
         13, 13, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        [11,  0, 13,  0, 13, 13,  0,  0, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0, 13,  0,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        [ 7,  0, 13, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 13,
         13,  0,  0,  0,  0,  0, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        [ 0,  0,  0,  0,  0,  0, 13,  0,

In [20]:
valid_mask

[tensor([ 1,  0,  0,  0,  1,  1,  1,  0,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        dtype=torch.int32),
 tensor([ 1,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,
          1,  1,  1,  0,  1,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        dtype=torch.int32),
 tensor([ 1,  1,  0,  1,  0,  0,  1,  1,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  0,  1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        dtype=torch.int32),
 tensor([ 1,  1,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,
          0,  1,  1,  1,  1,  1,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -

In [74]:
model2 = model.train_model(train_dataset, val_dataset, batch_size=16,n_epochs=10)

  attention_mask = torch.tensor(padded_inputs.not_equal(0),dtype=torch.long)
100%|██████████| 213/213 [00:36<00:00,  5.86it/s]


227.87816619873047


100%|██████████| 213/213 [00:36<00:00,  5.80it/s]


184.20082527399063


100%|██████████| 213/213 [00:37<00:00,  5.75it/s]


184.35802870988846


100%|██████████| 213/213 [00:37<00:00,  5.74it/s]


183.7351187467575


100%|██████████| 213/213 [00:36<00:00,  5.77it/s]


184.6822938323021


100%|██████████| 213/213 [00:36<00:00,  5.81it/s]


183.0464592576027


100%|██████████| 213/213 [00:36<00:00,  5.78it/s]


183.12996208667755


100%|██████████| 213/213 [00:36<00:00,  5.80it/s]


183.18058854341507


100%|██████████| 213/213 [00:37<00:00,  5.73it/s]


183.116557598114


100%|██████████| 213/213 [00:36<00:00,  5.78it/s]

183.4493414759636





In [75]:
ground_truths,predictions = model.evaluate(test_dataset)

  attention_mask = torch.tensor(padded_inputs.not_equal(0),dtype=torch.long)
100%|██████████| 81/81 [00:08<00:00,  9.45it/s]


In [76]:
yt_flat = []
yp_flat = []

for i in range(len(ground_truths)):
    for j in range(len(ground_truths[i])):
        yp = []
        yt = []
        for k in range(len(ground_truths[i][j])):
            if ground_truths[i][j][k]!=-1 and ground_truths[i][j][k]!=13:
                yp.append(predictions[i][j][k] if predictions[i][j][k]!=13 else 0)
                yt.append(ground_truths[i][j][k])

        yt_flat.append(yt)
        yp_flat.append(yp)

In [77]:
list(zip(yt_flat[16],yp_flat[16]))

[(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]

In [78]:
f1_score(yt_flat[16], yp_flat[16], labels=list(range(0,13)), average='macro')

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.07692307692307693

In [80]:
f1s = []

for i in range(len(yt_flat)):

    f1s.append(f1_score(yt_flat[i], yp_flat[i],  average='macro'))

In [81]:
f1s

[0.49056603773584906,
 0.32323232323232326,
 1.0,
 1.0,
 0.3111111111111111,
 0.31746031746031744,
 0.2916666666666667,
 0.22,
 0.4444444444444445,
 0.3222222222222222,
 0.09523809523809523,
 0.48936170212765956,
 0.49122807017543857,
 1.0,
 1.0,
 0.47619047619047616,
 1.0,
 0.4864864864864865,
 0.4888888888888889,
 1.0,
 0.30303030303030304,
 0.49122807017543857,
 0.23333333333333334,
 0.4814814814814815,
 1.0,
 1.0,
 1.0,
 1.0,
 0.48717948717948717,
 0.2222222222222222,
 1.0,
 0.3125,
 0.4615384615384615,
 1.0,
 1.0,
 1.0,
 1.0,
 0.16666666666666666,
 0.47619047619047616,
 0.4864864864864865,
 0.47619047619047616,
 1.0,
 1.0,
 1.0,
 0.49019607843137253,
 0.3194444444444445,
 1.0,
 1.0,
 1.0,
 1.0,
 0.3137254901960784,
 1.0,
 1.0,
 0.3148148148148148,
 0.14035087719298248,
 0.48000000000000004,
 1.0,
 0.30769230769230765,
 0.30769230769230765,
 0.26666666666666666,
 1.0,
 0.48717948717948717,
 0.4838709677419355,
 0.4897959183673469,
 0.2857142857142857,
 0.4666666666666667,
 0.473684

In [82]:
sum(f1s)/len(f1s)

0.6611369332122944

In [None]:
ground_truths,predictions = model.evaluate(train_dataset)

In [24]:
list(zip(ground_truths[0][13],predictions[0][13]))

[(0, 0),
 (13, 13),
 (13, 13),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (13, 13),
 (0, 0),
 (0, 0),
 (13, 13),
 (0, 0),
 (13, 13),
 (13, 13),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (13, 0),
 (-1, 0),
 (-1, 0),
 (-1, 0),
 (-1, 0),
 (-1, 0),
 (-1, 0),
 (-1, 0),
 (-1, 0),
 (-1, 0),
 (-1, 0)]