In [1]:
#Config Details

import transformers
import torch
from tqdm import tqdm
import torch.nn as nn

import pandas as pd
import numpy as np

import joblib
import torch

from sklearn import preprocessing
from sklearn import model_selection

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup


import dataset

In [2]:

MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
EPOCHS = 10
BASE_MODEL_PATH = "bert-base-uncased"
MODEL_PATH ="../pkl_model/train_model_13_07.bin"
TRAINING_FILE = "../dataset/2000_BIO_taggingdata_ALL_ROW_WISE.csv"
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BASE_MODEL_PATH,
    do_lower_case=True
)

## Entity Dataset, Entity Model & Training Function

In [3]:
class EntityDataset:
    def __init__(self, texts, tags):
        # texts: [["hi", ",", "my", "name", "is", "abhishek"], ["hello".....]]
        # pos/tags: [[1 2 3 4 1 5], [....].....]]
        self.texts = texts
        #self.pos = pos
        self.tags = tags
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        text = self.texts[item]
        #pos = self.pos[item]
        tags = self.tags[item]

        ids = []
        #target_pos = []
        target_tag =[]
        
        # tokenising for BERT

        for i, s in enumerate(text):
            inputs = TOKENIZER.encode(
                s,
                add_special_tokens=False
            )
            # abhishek: ab ##hi ##sh ##ek
            input_len = len(inputs)
            ids.extend(inputs)
            #target_pos.extend([pos[i]] * input_len)
            target_tag.extend([tags[i]] * input_len)

        ids = ids[:MAX_LEN - 2]
        # -2 for adding special tokens
        #target_pos = target_pos[:config.MAX_LEN - 2]
        target_tag = target_tag[:MAX_LEN - 2]

        ids = [2] + ids + [3]
        #target_pos = [0] + target_pos + [0]
        target_tag = [0] + target_tag + [0]

        mask = [1] * len(ids)
        token_type_ids = [0] * len(ids)

        padding_len = MAX_LEN - len(ids)

        ids = ids + ([0] * padding_len)
        mask = mask + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)
        #target_pos = target_pos + ([0] * padding_len)
        target_tag = target_tag + ([0] * padding_len)

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "target_tag": torch.tensor(target_tag, dtype=torch.long),
        }

In [4]:
def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()
    final_loss = 0
    for data in tqdm(data_loader, total=len(data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)
        optimizer.zero_grad()
        _, loss = model(**data)
        loss.backward()
        optimizer.step()
        scheduler.step()
        final_loss += loss.item()
    return final_loss / len(data_loader)


def eval_fn(data_loader, model, device):
    model.eval()
    final_loss = 0
    for data in tqdm(data_loader, total=len(data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)
        _, loss = model(**data)
        final_loss += loss.item()
    return final_loss / len(data_loader)

In [5]:
def loss_fn(output, target, mask, num_labels):
    lfn = nn.CrossEntropyLoss()
    active_loss = mask.view(-1) == 1
    active_logits = output.view(-1, num_labels)
    active_labels = torch.where(
        active_loss,
        target.view(-1),
        torch.tensor(lfn.ignore_index).type_as(target)
    )
    loss = lfn(active_logits, active_labels)
    return loss


class EntityModel(nn.Module):
    def __init__(self, num_tag):
        super(EntityModel, self).__init__()
        self.num_tag = num_tag
        self.bert = transformers.BertModel.from_pretrained(BASE_MODEL_PATH,return_dict=False)
        self.bert_drop_1 = nn.Dropout(0.3)
        #self.bert_drop_2 = nn.Dropout(0.3)
        self.out_tag = nn.Linear(768, self.num_tag)

    
    def forward(self, ids, mask, token_type_ids, target_tag):
        o1, _ = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)

        bo_tag = self.bert_drop_1(o1)
        #bo_pos = self.bert_drop_2(o1)

        tag = self.out_tag(bo_tag)
        #pos = self.out_pos(bo_pos)

        loss_tag = loss_fn(tag, target_tag, mask, self.num_tag)
        #loss_pos = loss_fn(pos, target_pos, mask, self.num_pos)

        #loss = (loss_tag + loss_pos) / 2
        loss = loss_tag

        return tag, loss

In [6]:

def process_data(data_path):
    df = pd.read_csv(data_path, encoding="latin-1")
    df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")
    print('Number of empty values are ', df["Word"].isna().sum())
    df["Word"].fillna("None", inplace = True)
    enc_pos = preprocessing.LabelEncoder()
    enc_tag = preprocessing.LabelEncoder()

    #df.loc[:, "POS"] = enc_pos.fit_transform(df["POS"])
    df.loc[:, "Tag"] = enc_tag.fit_transform(df["Tag"])
    
    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    #pos = df.groupby("Sentence #")["POS"].apply(list).values
    pos = []
    tag = df.groupby("Sentence #")["Tag"].apply(list).values
    return sentences, pos, tag, enc_pos, enc_tag

## Preprocess Dataset

In [7]:
sentences, pos, tag, enc_pos, enc_tag = process_data(TRAINING_FILE)

Number of empty values are  0


In [8]:
meta_data = {
        "enc_pos": enc_pos,
        "enc_tag": enc_tag
    }
joblib.dump(meta_data, "meta.bin")

['meta.bin']

In [9]:
num_tag = len(list(enc_tag.classes_))

In [10]:
(train_sentences, test_sentences, train_tag,test_tag) = model_selection.train_test_split(sentences,  tag, random_state=42, test_size=0.2)

In [11]:
train_dataset = EntityDataset(
        texts=train_sentences, tags=train_tag
    )

train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=4
    )

valid_dataset = EntityDataset(
        texts=test_sentences, tags=test_tag
    )

valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
    )
device = torch.device("cuda")
model = EntityModel(num_tag=num_tag)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


EntityModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


## Training

In [12]:
#import os
#os.mkdir(MODEL_PATH)

In [13]:
#data

In [14]:
# for data in tqdm(train_data_loader, total=len(train_data_loader)):
#   print('Next Data Loader')

In [None]:
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

num_train_steps = int(len(train_sentences) / TRAIN_BATCH_SIZE * EPOCHS)
optimizer = AdamW(optimizer_parameters, lr=3e-5)
scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )


best_loss = np.inf
for epoch in range(EPOCHS):
    print("\n**** Epoch No : ",epoch,"*****\n")
    train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
    test_loss = eval_fn(valid_data_loader, model, device)
    print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
    if test_loss < best_loss:
        torch.save(model.state_dict(), MODEL_PATH)
        best_loss = test_loss

  0%|          | 0/50 [00:00<?, ?it/s]


**** Epoch No :  0 *****



100%|██████████| 50/50 [00:47<00:00,  1.06it/s]
100%|██████████| 51/51 [00:11<00:00,  4.63it/s]


Train Loss = 0.26925276711583135 Valid Loss = 0.15001491851666393


  0%|          | 0/50 [00:00<?, ?it/s]


**** Epoch No :  1 *****



100%|██████████| 50/50 [00:47<00:00,  1.05it/s]
100%|██████████| 51/51 [00:11<00:00,  4.63it/s]


Train Loss = 0.13164631515741348 Valid Loss = 0.09993404668628


  0%|          | 0/50 [00:00<?, ?it/s]


**** Epoch No :  2 *****



100%|██████████| 50/50 [00:47<00:00,  1.05it/s]
100%|██████████| 51/51 [00:11<00:00,  4.55it/s]


Train Loss = 0.09106713145971299 Valid Loss = 0.08323846775673184


  0%|          | 0/50 [00:00<?, ?it/s]


**** Epoch No :  3 *****



100%|██████████| 50/50 [00:47<00:00,  1.05it/s]
100%|██████████| 51/51 [00:11<00:00,  4.52it/s]


Train Loss = 0.06803649090230465 Valid Loss = 0.07782209817977513


  0%|          | 0/50 [00:00<?, ?it/s]


**** Epoch No :  4 *****



100%|██████████| 50/50 [00:47<00:00,  1.05it/s]
100%|██████████| 51/51 [00:11<00:00,  4.61it/s]


Train Loss = 0.054891033247113225 Valid Loss = 0.07406233837280203


  0%|          | 0/50 [00:00<?, ?it/s]


**** Epoch No :  5 *****



100%|██████████| 50/50 [00:47<00:00,  1.05it/s]
100%|██████████| 51/51 [00:11<00:00,  4.61it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Train Loss = 0.04713058527559042 Valid Loss = 0.07451740897023211

**** Epoch No :  6 *****



100%|██████████| 50/50 [00:47<00:00,  1.05it/s]
100%|██████████| 51/51 [00:11<00:00,  4.61it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Train Loss = 0.04132081493735314 Valid Loss = 0.07537634612298479

**** Epoch No :  7 *****



100%|██████████| 50/50 [00:47<00:00,  1.05it/s]
100%|██████████| 51/51 [00:11<00:00,  4.59it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Train Loss = 0.03481026720255613 Valid Loss = 0.07600270553181569

**** Epoch No :  8 *****



100%|██████████| 50/50 [00:47<00:00,  1.05it/s]
 61%|██████    | 31/51 [00:07<00:03,  5.01it/s]

## Prediction

In [None]:
meta_data = joblib.load("meta.bin")
enc_tag = meta_data["enc_tag"]

#num_pos = len(list(enc_pos.classes_))
num_tag = len(list(enc_tag.classes_))
sentence = """
    it professional with an overall 16 years of technocommercial experience in business process consulting implementation and project delivery space for various enterprise business applications and it infrastructure products and services for clients across various industry sectors automotive retail logistics telecom etcexperienced in global service deliveries with distributed teams delivered large implementation roll out programs upgrades data migration and application value managementengagements using different project methodologiesskills summary project scheduling contracts management technical documentation risk management customer relationship management practice development change management team building and mentoring senior project management professional and business process consultant in it industrynnshe has experience in solution design preparing technocommercial proposal product and solution presentation implementation project governance and delivery of enterprise business applicationsnnshe has worked in gulf and india geographies and delivered successful large engagementsnnother skills summary nnproject scheduling nsow management nrisk management npractice development nchange management presales solution design consultant 21 years of experience in business requirement analysis technical solution design commercial architecting projects delivery and leadership and customer advocacy around industry best practices for multiple clients across various industries project delivery management business applications presales solution designing account manager financial services at wipro technologies ltd principal consultant banking bfs at wipro limited presales sap practice at wipro infotech consumer healthcare product development medical device software engineering investment banking front office operations and supply chain
    """
tokenized_sentence = TOKENIZER.encode(sentence)

sentence = sentence.split()
print(sentence)
print(tokenized_sentence)

test_dataset = EntityDataset(
        texts=[sentence], 
        tags=[[0] * len(sentence)]
    )

device = torch.device("cuda")
pred_model = EntityModel(num_tag=num_tag)
pred_model.load_state_dict(torch.load(MODEL_PATH))
pred_model.to(device)

with torch.no_grad():
  data = test_dataset[0]
  for k, v in data.items():
    data[k] = v.to(device).unsqueeze(0)
  tag, _ = pred_model(**data)

print(
    enc_tag.inverse_transform(
        tag.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]
    )
# print(
#     enc_pos.inverse_transform(
#         pos.argmax(2).cpu().numpy().reshape(-1)
#         )[:len(tokenized_sentence)]
#       )

In [None]:
preds = enc_tag.inverse_transform(
        tag.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]

In [None]:
TOKENIZER.decode(4003)

In [None]:
for elem,cat in zip(tokenized_sentence , preds):
  print(TOKENIZER.decode([elem]), '=====>',cat)

In [None]:
tag.argmax(2)

In [None]:
tokenized_sentence

In [None]:
enc_tag

In [None]:
TOKENIZER.decode(tokenized_sentence)

In [None]:
# if __name__ == "__main__":
#     sentences, pos, tag, enc_pos, enc_tag = process_data(config.TRAINING_FILE)
    
#     meta_data = {
#         "enc_pos": enc_pos,
#         "enc_tag": enc_tag
#     }

#     joblib.dump(meta_data, "meta.bin")

#     num_pos = len(list(enc_pos.classes_))
#     num_tag = len(list(enc_tag.classes_))

#     (
#         train_sentences,
#         test_sentences,
#         train_pos,
#         test_pos,
#         train_tag,
#         test_tag
#     ) = model_selection.train_test_split(sentences, pos, tag, random_state=42, test_size=0.1)

#     train_dataset = dataset.EntityDataset(
#         texts=train_sentences, pos=train_pos, tags=train_tag
#     )

#     train_data_loader = torch.utils.data.DataLoader(
#         train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
#     )

#     valid_dataset = dataset.EntityDataset(
#         texts=test_sentences, pos=test_pos, tags=test_tag
#     )

#     valid_data_loader = torch.utils.data.DataLoader(
#         valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1
#     )

#     device = torch.device("cuda")
#     model = EntityModel(num_tag=num_tag, num_pos=num_pos)
#     model.to(device)

#     param_optimizer = list(model.named_parameters())
#     no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
#     optimizer_parameters = [
#         {
#             "params": [
#                 p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
#             ],
#             "weight_decay": 0.001,
#         },
#         {
#             "params": [
#                 p for n, p in param_optimizer if any(nd in n for nd in no_decay)
#             ],
#             "weight_decay": 0.0,
#         },
#     ]

#     num_train_steps = int(len(train_sentences) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
#     optimizer = AdamW(optimizer_parameters, lr=3e-5)
#     scheduler = get_linear_schedule_with_warmup(
#         optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
#     )

#     best_loss = np.inf
#     for epoch in range(config.EPOCHS):
#         train_loss = engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
#         test_loss = engine.eval_fn(valid_data_loader, model, device)
#         print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
#         if test_loss < best_loss:
#             torch.save(model.state_dict(), config.MODEL_PATH)
#             best_loss = test_loss