In [None]:
# Mount Google Drive at /content/drive; the dataset and checkpoint paths
# configured later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!ls "/content/drive/My Drive/bert/"

bert-base-uncased  dataset  model_2000_manually_ROW_WISE  train_model_save


In [None]:
# !pip install transformers
# !pip install dataset

In [None]:
# !pip install simpletransformers

In [None]:
# Config Details

import transformers
import torch
from tqdm import tqdm
import torch.nn as nn

import pandas as pd
import numpy as np

import joblib
import torch

from sklearn import preprocessing
from sklearn import model_selection

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup


import dataset

# Fixed sequence length (including the two special tokens) that EntityDataset
# truncates/pads every example to.
MAX_LEN = 128
# Batch sizes handed to the training and validation DataLoaders.
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
# Number of passes over the training loader in the training loop.
EPOCHS = 8
# Checkpoint name used for both the tokenizer and the BertModel encoder.
# The commented-out alternatives below are earlier experiments.
#BASE_MODEL_PATH = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
BASE_MODEL_PATH = "bert-base-uncased"
#TOKENIZER_PATH = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
# TOKENIZER_PATH = "/content/drive/MyDrive/bert-fine-tuned_version_2/tokenizer"
# Destination of the best (lowest validation loss) state_dict saved during training.
MODEL_PATH ="/content/drive/MyDrive/bert/train_model_save/model.bin"
# CSV read by process_data; it is expected to have "Sentence #", "Word" and "Tag" columns.
TRAINING_FILE = "/content/drive/My Drive/bert/dataset/BIO_taggingdata_ALL_Spacy_TRAIN.csv"
# Module-level tokenizer shared by EntityDataset and the prediction cells.
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BASE_MODEL_PATH,
    do_lower_case=True
)

In [None]:
# model = torch.load('/content/drive/MyDrive/bert/model_2000_manually_ROW_WISE')

NameError: ignored

In [None]:
#!cp model.bin /content/drive/MyDrive/model_ner_ft_2.bin
#!cp meta.bin /content/drive/MyDrive/meta_ft_2.bin

In [None]:
# !pip install transformers
# !pip install dataset

## Entity Dataset, Entity Model & Training Function

In [None]:
class EntityDataset:
    """Map-style dataset that converts word/tag lists into fixed-length BERT inputs.

    Each item is tokenized word by word with the module-level ``TOKENIZER``;
    every word-piece of a word inherits that word's tag. The sequence is
    truncated to ``MAX_LEN - 2`` pieces, wrapped in the tokenizer's [CLS]/[SEP]
    ids and right-padded to ``MAX_LEN``.

    Args:
        texts: sequence of sentences, each a list of word strings, e.g.
            [["hi", ",", "my", "name", "is", "abhishek"], ["hello", ...]].
        tags: sequence of integer tag lists aligned one-to-one with ``texts``.
    """

    def __init__(self, texts, tags):
        self.texts = texts
        self.tags = tags

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return a dict of ``torch.long`` tensors, each of length ``MAX_LEN``.

        Keys: "ids", "mask", "token_type_ids", "target_tag".

        Raises:
            IndexError: if a sentence has more words than its tag list.
        """
        text = self.texts[item]
        tags = self.tags[item]

        ids = []
        target_tag = []

        for i, s in enumerate(text):
            inputs = TOKENIZER.encode(
                s,
                add_special_tokens=False
            )
            # A word may split into several word-pieces
            # (abhishek: ab ##hi ##sh ##ek); repeat its tag for each piece.
            ids.extend(inputs)
            target_tag.extend([tags[i]] * len(inputs))

        # Leave room for the [CLS] and [SEP] tokens.
        ids = ids[:MAX_LEN - 2]
        target_tag = target_tag[:MAX_LEN - 2]

        # Take the special-token ids from the tokenizer instead of hard-coding
        # them: the previous literals 2 and 3 are not the [CLS]/[SEP] ids of
        # bert-base-uncased (whose [CLS] is 101, as the TOKENIZER.encode output
        # later in this notebook shows).
        # NOTE(review): checkpoints trained with the old ids should be retrained.
        ids = [TOKENIZER.cls_token_id] + ids + [TOKENIZER.sep_token_id]
        # NOTE(review): special tokens are labelled 0 and are *not* masked out
        # of the loss; 0 is also a real class id produced by LabelEncoder, so
        # consider labelling them with the loss ignore_index instead.
        target_tag = [0] + target_tag + [0]

        mask = [1] * len(ids)
        token_type_ids = [0] * len(ids)

        padding_len = MAX_LEN - len(ids)

        ids = ids + ([TOKENIZER.pad_token_id] * padding_len)
        mask = mask + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)
        # Padded positions have mask == 0, so loss_fn ignores their tag value.
        target_tag = target_tag + ([0] * padding_len)

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "target_tag": torch.tensor(target_tag, dtype=torch.long),
        }

In [None]:
def train_fn(data_loader, model, optimizer, device, scheduler):
    """Train ``model`` for one pass over ``data_loader``.

    Each batch is a dict of tensors that is moved to ``device`` and passed to
    the model as keyword arguments; the model must return ``(output, loss)``.
    The optimizer and the scheduler are both stepped once per batch.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    n_batches = len(data_loader)
    running_loss = 0
    for batch in tqdm(data_loader, total=n_batches):
        # Move every tensor of the batch onto the target device (in place in the dict).
        batch.update({name: tensor.to(device) for name, tensor in batch.items()})
        optimizer.zero_grad()
        _logits, batch_loss = model(**batch)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += batch_loss.item()
    return running_loss / n_batches


def eval_fn(data_loader, model, device):
    """Evaluate ``model`` on ``data_loader`` and return the mean per-batch loss.

    Each batch is a dict of tensors that is moved to ``device`` and passed to
    the model as keyword arguments; the model must return ``(output, loss)``.
    """
    model.eval()
    final_loss = 0
    # No gradients are needed for validation; disabling autograd avoids
    # building the graph and keeps memory use down.
    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):
            for k, v in data.items():
                data[k] = v.to(device)
            _, loss = model(**data)
            final_loss += loss.item()
    return final_loss / len(data_loader)

In [None]:
def loss_fn(output, target, mask, num_labels):
    """Cross-entropy over the positions where ``mask`` equals 1.

    ``output`` is flattened to ``(-1, num_labels)`` and ``target``/``mask`` to
    1-D. Targets at positions whose mask is not 1 are replaced with the
    criterion's ``ignore_index`` so they do not contribute to the loss.
    """
    criterion = nn.CrossEntropyLoss()
    flat_logits = output.view(-1, num_labels)
    flat_targets = target.view(-1)
    is_active = mask.view(-1) == 1
    # Sentinel label that CrossEntropyLoss skips, cast to the target's tensor type.
    ignored = torch.tensor(criterion.ignore_index).type_as(target)
    masked_targets = torch.where(is_active, flat_targets, ignored)
    return criterion(flat_logits, masked_targets)


class EntityModel(nn.Module):
    """BERT encoder followed by dropout and a per-token linear tag classifier.

    Args:
        num_tag: number of tag classes the output layer predicts.
    """

    def __init__(self, num_tag):
        super(EntityModel, self).__init__()
        self.num_tag = num_tag
        self.bert = transformers.BertModel.from_pretrained(BASE_MODEL_PATH, return_dict=False)
        self.bert_drop_1 = nn.Dropout(0.3)
        # Size the classifier from the encoder config rather than hard-coding
        # 768, so a different BASE_MODEL_PATH keeps working.
        self.out_tag = nn.Linear(self.bert.config.hidden_size, self.num_tag)

    def forward(self, ids, mask, token_type_ids, target_tag=None):
        """Return ``(tag_logits, loss)``.

        Args:
            ids: input token ids.
            mask: attention mask; also selects which positions count in the loss.
            token_type_ids: segment ids.
            target_tag: optional gold tag ids. When omitted (inference), no
                loss is computed and ``loss`` is ``None``.
        """
        # With return_dict=False the encoder returns a tuple; the first
        # element is the per-token sequence output.
        o1, _ = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)

        bo_tag = self.bert_drop_1(o1)
        tag = self.out_tag(bo_tag)

        loss = None
        if target_tag is not None:
            loss = loss_fn(tag, target_tag, mask, self.num_tag)

        return tag, loss

In [None]:

def process_data(data_path):
    """Load the tagging CSV and group words and encoded tags by sentence.

    The CSV is read as latin-1 and must contain "Sentence #", "Word" and "Tag"
    columns. Whitespace-only cells are treated as missing; missing sentence
    ids and tags are forward-filled and missing words become the string "None".

    Args:
        data_path: path of the CSV file.

    Returns:
        Tuple ``(sentences, pos, tag, enc_pos, enc_tag)``:
        ``sentences`` and ``tag`` are arrays of per-sentence lists (words and
        integer tag ids), ``pos`` is always an empty list, ``enc_pos`` is an
        unfitted ``LabelEncoder`` and ``enc_tag`` is the encoder fitted on "Tag".
    """
    df = pd.read_csv(data_path, encoding="latin-1")
    df = df.replace(r'^\s*$', np.nan, regex=True)
    # Use .ffill() and plain assignment: fillna(method=...) is deprecated, and
    # a chained `df[col].fillna(..., inplace=True)` does not update `df` when
    # pandas copy-on-write is active.
    df["Sentence #"] = df["Sentence #"].ffill()
    print('Number of empty values are ', df["Word"].isna().sum())
    df["Word"] = df["Word"].fillna("None")
    df["Tag"] = df["Tag"].ffill()

    print("Check 1")
    enc_pos = preprocessing.LabelEncoder()
    enc_tag = preprocessing.LabelEncoder()
    print("Check 2")

    df["Tag"] = enc_tag.fit_transform(df["Tag"])
    print("Check 3")

    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    print("Check 4")

    # POS tags are not used in this notebook; an empty placeholder keeps the
    # return signature unchanged.
    pos = []
    tag = df.groupby("Sentence #")["Tag"].apply(list).values
    print("Check 5")

    return sentences, pos, tag, enc_pos, enc_tag

## Preprocess Dataset

In [None]:
sentences, pos, tag, enc_pos, enc_tag = process_data(TRAINING_FILE)
# Print a summary instead of the full arrays: dumping every sentence bloats
# the notebook and writes the raw corpus text into the saved output.
print("number of sentences:", len(sentences))
print("number of tag sequences:", len(tag))
print("tag classes:", list(enc_tag.classes_))

Number of empty values are  1001
Check 1
Check 2
Check 3
Check 4
Check 5

sentences:
 [list(['currently', 'employed', 'as', 'a', 'trading', 'agreements', 'specialist', 'at', 'caceis', 'bank', 'luxembourg', 'i', 'am', 'in', 'charge', 'of', 'negotiating', 'standard', 'trading', 'agreements', 'mainly', 'gmsla', 'gmra', 'and', 'isda', 'sharing', 'my', 'knowledge', 'with', 'other', 'legal', 'advisors', 'within', 'the', 'bank', 'and', 'keeping', 'an', 'eye', 'on', 'regulatory', 'improvements', 'likely', 'to', 'affect', 'our', 'current', 'templates', 'and', 'negotiation', 'guidelines', 'eg', 'eu', 'regulations', 'and', 'directives', 'from', 'time', 'to', 'time', 'i', 'get', 'to', 'be', 'involved', 'in', 'the', 'review', 'and', 'the', 'negotiation', 'of', 'tailormade', 'agreements', 'for', 'use', 'in', 'relation', 'to', 'oneshot', 'trades', 'could', 'be', 'trading', 'agreements', 'as', 'well', 'as', 'master', 'confirmation', 'agreements', 'and', 'longform', 'confirmations', 'depending', 'on', 

In [None]:
# Persist both label encoders to meta.bin in the working directory so the
# prediction cells can reload them.
meta_data = dict(enc_pos=enc_pos, enc_tag=enc_tag)
joblib.dump(meta_data, "meta.bin")

['meta.bin']

In [None]:
# Number of distinct tag classes learned by the fitted label encoder.
num_tag = len(enc_tag.classes_)

In [None]:
print(num_tag)

3


In [None]:
# Hold out 20% of the sentences (with their tags) for validation; the split is seeded.
train_sentences, test_sentences, train_tag, test_tag = model_selection.train_test_split(
    sentences,
    tag,
    test_size=0.2,
    random_state=42,
)

In [None]:
train_dataset = EntityDataset(
        texts=train_sentences, tags=train_tag
    )

# Reshuffle the training examples every epoch; without shuffle=True the
# batches are identical and in the same order in every epoch.
train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=2, shuffle=True
    )

valid_dataset = EntityDataset(
        texts=test_sentences, tags=test_tag
    )

# Validation order does not affect the mean loss, so no shuffling here.
valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
    )
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EntityModel(num_tag=num_tag)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


EntityModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


## Training

In [None]:
#import os
#os.mkdir(MODEL_PATH)

In [None]:
#data

In [None]:
# for data in tqdm(train_data_loader, total=len(train_data_loader)):
#   print('Next Data Loader')

In [None]:
# Apply weight decay to all parameters except biases and LayerNorm weights.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

# The scheduler is stepped once per batch in train_fn, so the schedule length
# is the number of batches times the number of epochs. Deriving it from the
# sentence count undercounts whenever the last batch is partial, which drives
# the learning rate to zero before training ends.
num_train_steps = len(train_data_loader) * EPOCHS
optimizer = AdamW(optimizer_parameters, lr=3e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
)

# Keep only the checkpoint with the lowest validation loss seen so far.
best_loss = np.inf
for epoch in range(EPOCHS):
    train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
    test_loss = eval_fn(valid_data_loader, model, device)
    print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
    if test_loss < best_loss:
        torch.save(model.state_dict(), MODEL_PATH)
        best_loss = test_loss

100%|██████████| 25/25 [00:27<00:00,  1.11s/it]
100%|██████████| 26/26 [00:07<00:00,  3.60it/s]


Train Loss = 0.2784672385454178 Valid Loss = 0.21914497419045523


100%|██████████| 25/25 [00:28<00:00,  1.14s/it]
100%|██████████| 26/26 [00:07<00:00,  3.64it/s]


Train Loss = 0.20560020446777344 Valid Loss = 0.20615282654762268


100%|██████████| 25/25 [00:28<00:00,  1.13s/it]
100%|██████████| 26/26 [00:07<00:00,  3.67it/s]


Train Loss = 0.18090063333511353 Valid Loss = 0.1459878270442669


100%|██████████| 25/25 [00:28<00:00,  1.16s/it]
100%|██████████| 26/26 [00:07<00:00,  3.62it/s]


Train Loss = 0.12466846287250519 Valid Loss = 0.11833639586201081


100%|██████████| 25/25 [00:29<00:00,  1.16s/it]
100%|██████████| 26/26 [00:07<00:00,  3.62it/s]


Train Loss = 0.10317366421222687 Valid Loss = 0.10232346619550999


100%|██████████| 25/25 [00:29<00:00,  1.17s/it]
100%|██████████| 26/26 [00:07<00:00,  3.64it/s]


Train Loss = 0.0893955871462822 Valid Loss = 0.09557091272794284


100%|██████████| 25/25 [00:29<00:00,  1.17s/it]
100%|██████████| 26/26 [00:07<00:00,  3.62it/s]


Train Loss = 0.08305739119648933 Valid Loss = 0.08910037233279301


100%|██████████| 25/25 [00:29<00:00,  1.18s/it]
100%|██████████| 26/26 [00:07<00:00,  3.66it/s]


Train Loss = 0.07843738451600074 Valid Loss = 0.08866441421783887


## Prediction

In [None]:
# Reload the label encoders saved to meta.bin and recover the fitted tag encoder.
meta_data = joblib.load("meta.bin")
enc_tag = meta_data["enc_tag"]

# Only the tag head is used in this notebook (no POS head), so only its class count is needed.
num_tag = len(enc_tag.classes_)
# sentence = "seasoned backend developer. entrepreneur. open source contributor. scalable, highly-available web development: python (django), ruby (rails, sinatra), node.js, go, react js/native, angular, java. web backend scalability and performance tuning: new relic, ruby-prof, cprofile. queue-based solutions: kue, resque/sidekiq, celery, jms, rabbitmq. mobile: swift, objective-c, restkit/afnetworking, coredata, corelocation, gcd, sentestingkit, android studio, play service, retrofit. deep learning: convolutional neural network. test-driven-development: rspec, cucumber, python unittest, junit, jasmine. continuous integration/delivery: travis, jenkins, capistrano, vagrant, git, subversion, rake, maven, ant, buildout, make/gnu make. cloud/container: aws, azure, docker, docker-compose, dcos, kubernetes. devops: mesos, chef, puppet, mcollective, pxe, ipmi, nagios, zabbix. scripting: bash, python, ruby, perl. open source projects that enjoy 200+ stars on github and 100+k downloads on sourceforge., seasoned backend developer and entrepreneur.\n\nspecialties: \nscalable, highly-available web development: java, ruby (rails, sinatra), python (django), node.js.\nqueue-based solutions: resque/sidekiq, celery, jms, rabbitmq.\nweb backend performance tuning.\nios: coredata, corelocation, gcd, restkit/afnetworking, sentestingkit, swift.\ntest-driven-development: rspec, cucumber, python unittest, junit.\ncontinuous integration/delivery: jenkins, chef, capistrano, vagrant, git, subversion, rake, maven, ant, buildout, make/gnu make.\ninfrastructure-as-a-service: openstack nova, aws ec2.\nplatform-as-a-service: cloudfoundry, heroku, rightscale, enstratus, scalr, juju.\ncloud storage: hadoop hdfs, aws s3, openstack swift, mongodb.\nagile methodologies: scrum, fdd (feature-driven-development).\ndevops: chef, puppet, mcollective, pxe, ipmi, nagios, zabbix.\nscripting: bash, python, ruby, perl., programmer. 
# entrepreneur at banian labs, pdh - networking/network engineering, pdh - network planner/provisioning, vp engineering at rhumbix"
# tokenized_sentence = TOKENIZER.encode(sentence)

# sentence = sentence.split()
# print(sentence)
# print(tokenized_sentence)

# test_dataset = EntityDataset(
#         texts=[sentence], 
#         tags=[[0] * len(sentence)]
#     )

# device = torch.device("cuda")
# pred_model = EntityModel(num_tag=num_tag)
# pred_model.load_state_dict(torch.load(MODEL_PATH))
# pred_model.to(device)

# with torch.no_grad():
#   data = test_dataset[0]
#   for k, v in data.items():
#     data[k] = v.to(device).unsqueeze(0)
#   tag, _ = pred_model(**data)

# print(
#     enc_tag.inverse_transform(
#         tag.argmax(2).cpu().numpy().reshape(-1)
#         )[:len(tokenized_sentence)]
#     )
# # print(
# #     enc_pos.inverse_transform(
# #         pos.argmax(2).cpu().numpy().reshape(-1)
# #         )[:len(tokenized_sentence)]
# #       )

FileNotFoundError: ignored

In [None]:
# Fall back to CPU when no GPU is available, matching the device selection
# used when the training model was created.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# # 
# train_dataset = EntityDataset(
#         texts=train_sentences, tags=train_tag
#     )

# train_data_loader = torch.utils.data.DataLoader(
#         train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=2
#     )

# valid_dataset = EntityDataset(
#         texts=test_sentences, tags=test_tag
#     )

# valid_data_loader = torch.utils.data.DataLoader(
#         valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
#     )
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = EntityModel(num_tag=num_tag)
# model.to(device)
# # 

# NOTE(review): `df` is not defined at notebook top level anywhere above (it is
# only a local inside process_data); a DataFrame with a "summaries" column must
# be loaded before this cell for it to run on a fresh kernel.

# Build the model and load the fine-tuned weights once, not once per sentence.
pred_model = EntityModel(num_tag=num_tag)
pred_model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
pred_model.to(device)
# Switch to inference mode so dropout is disabled and predictions are deterministic.
pred_model.eval()

# Skip missing summaries: a NaN entry has no .split().
for i in df["summaries"].dropna():
    sentence = i
    tokenized_sentence = TOKENIZER.encode(sentence)
    sentence = sentence.split()
    print("******** sentence ********", sentence)
    print("******** tokenised sentence ********", tokenized_sentence)
    # Dummy all-zero tags: EntityDataset needs a tag per word, but the values
    # are not used for the prediction itself.
    test_dataset = EntityDataset(
        texts=[sentence],
        tags=[[0] * len(sentence)]
    )
    with torch.no_grad():
        data = test_dataset[0]
        # Add the batch dimension and move every tensor to the device first ...
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        # ... then run the model once on the fully prepared batch.
        tag, _ = pred_model(**data)

    # NOTE(review): `preds` has MAX_LEN entries while `tokenized_sentence` is
    # not truncated, so zip() stops at the shorter one; for inputs longer than
    # MAX_LEN the last prediction belongs to the [SEP] position of the model
    # input rather than to the word-piece printed next to it.
    preds = enc_tag.inverse_transform(
        tag.argmax(2).cpu().numpy().reshape(-1)
    )[:len(tokenized_sentence)]

    for elem, cat in zip(tokenized_sentence, preds):
        print(TOKENIZER.decode([elem]), '=====>', cat)


  


  

# sentence = "seasoned backend developer. entrepreneur. open source contributor. scalable, highly-available web development: python (django), ruby (rails, sinatra), node.js, go, react js/native, angular, java. web backend scalability and performance tuning: new relic, ruby-prof, cprofile. queue-based solutions: kue, resque/sidekiq, celery, jms, rabbitmq. mobile: swift, objective-c, restkit/afnetworking, coredata, corelocation, gcd, sentestingkit, android studio, play service, retrofit. deep learning: convolutional neural network. test-driven-development: rspec, cucumber, python unittest, junit, jasmine. continuous integration/delivery: travis, jenkins, capistrano, vagrant, git, subversion, rake, maven, ant, buildout, make/gnu make. cloud/container: aws, azure, docker, docker-compose, dcos, kubernetes. devops: mesos, chef, puppet, mcollective, pxe, ipmi, nagios, zabbix. scripting: bash, python, ruby, perl. open source projects that enjoy 200+ stars on github and 100+k downloads on sourceforge., seasoned backend developer and entrepreneur.\n\nspecialties: \nscalable, highly-available web development: java, ruby (rails, sinatra), python (django), node.js.\nqueue-based solutions: resque/sidekiq, celery, jms, rabbitmq.\nweb backend performance tuning.\nios: coredata, corelocation, gcd, restkit/afnetworking, sentestingkit, swift.\ntest-driven-development: rspec, cucumber, python unittest, junit.\ncontinuous integration/delivery: jenkins, chef, capistrano, vagrant, git, subversion, rake, maven, ant, buildout, make/gnu make.\ninfrastructure-as-a-service: openstack nova, aws ec2.\nplatform-as-a-service: cloudfoundry, heroku, rightscale, enstratus, scalr, juju.\ncloud storage: hadoop hdfs, aws s3, openstack swift, mongodb.\nagile methodologies: scrum, fdd (feature-driven-development).\ndevops: chef, puppet, mcollective, pxe, ipmi, nagios, zabbix.\nscripting: bash, python, ruby, perl., programmer. 
# entrepreneur at banian labs, pdh - networking/network engineering, pdh - network planner/provisioning, vp engineering at rhumbix"
# tokenized_sentence = TOKENIZER.encode(sentence)

# sentence = sentence.split()
# print(sentence)
# print(tokenized_sentence)

# test_dataset = EntityDataset(
#         texts=[sentence], 
#         tags=[[0] * len(sentence)]
#     )

# device = torch.device("cuda")
# pred_model = EntityModel(num_tag=num_tag)
# pred_model.load_state_dict(torch.load(MODEL_PATH))
# pred_model.to(device)

# with torch.no_grad():
#   data = test_dataset[0]
#   for k, v in data.items():
#     data[k] = v.to(device).unsqueeze(0)
#   tag, _ = pred_model(**data)

# print(
#     enc_tag.inverse_transform(
#         tag.argmax(2).cpu().numpy().reshape(-1)
#         )[:len(tokenized_sentence)]
#     )
# # print(
# #     enc_pos.inverse_transform(
# #         pos.argmax(2).cpu().numpy().reshape(-1)
# #         )[:len(tokenized_sentence)]
# #       )

In [None]:
# Decode the most recent `tag` logits into label strings: argmax over the class
# axis, flatten, map ids back to labels with the fitted encoder, then keep at
# most len(tokenized_sentence) entries.
# Depends on `tag` and `tokenized_sentence` left over from the prediction cell above.
preds = enc_tag.inverse_transform(
        tag.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]

In [None]:
TOKENIZER.decode(30000)

'# # ᄌ'

In [None]:
# Print each token id of the last tokenized sentence (decoded back to text)
# next to its predicted label; zip() stops at the shorter of the two sequences.
for elem,cat in zip(tokenized_sentence , preds):
  print(TOKENIZER.decode([elem]), '=====>',cat)

[CLS] =====> B-ORG
seasoned =====> O
back =====> O
##end =====> O
developer =====> O
. =====> O
entrepreneur =====> O
. =====> O
open =====> O
source =====> O
contributor =====> O
. =====> O
scala =====> O
##ble =====> O
, =====> O
highly =====> O
- =====> O
available =====> O
web =====> O
development =====> O
: =====> O
python =====> B-ORG
( =====> O
dj =====> O
##ango =====> O
) =====> O
, =====> O
ruby =====> O
( =====> O
rails =====> O
, =====> O
sinatra =====> O
) =====> O
, =====> O
node =====> O
. =====> O
j =====> O
##s =====> O
, =====> O
go =====> O
, =====> O
react =====> O
j =====> O
##s =====> O
/ =====> O
native =====> O
, =====> O
angular =====> O
, =====> O
java =====> B-ORG
. =====> O
web =====> O
back =====> O
##end =====> O
scala =====> O
##bility =====> O
and =====> O
performance =====> O
tuning =====> O
: =====> O
new =====> O
relic =====> O
, =====> O
ruby =====> O
- =====> O
prof =====> O
, =====> O
cp =====> O
##ro =====> O
##fi =====> O
##le =====> O
. =====> O

In [None]:
tag.argmax(2)

tensor([[0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2,
         0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 0]], device='cuda:0')

In [None]:
tokenized_sentence

[101,
 28223,
 2067,
 10497,
 9722,
 1012,
 10670,
 1012,
 2330,
 3120,
 12130,
 1012,
 26743,
 3468,
 1010,
 3811,
 1011,
 2800,
 4773,
 2458,
 1024,
 18750,
 1006,
 6520,
 23422,
 1007,
 1010,
 10090,
 1006,
 15168,
 1010,
 19643,
 1007,
 1010,
 13045,
 1012,
 1046,
 2015,
 1010,
 2175,
 1010,
 10509,
 1046,
 2015,
 1013,
 3128,
 1010,
 16108,
 1010,
 9262,
 1012,
 4773,
 2067,
 10497,
 26743,
 8553,
 1998,
 2836,
 17372,
 1024,
 2047,
 24933,
 1010,
 10090,
 1011,
 11268,
 1010,
 18133,
 3217,
 8873,
 2571,
 1012,
 24240,
 1011,
 2241,
 7300,
 1024,
 13970,
 2063,
 1010,
 24501,
 4226,
 1013,
 2217,
 3211,
 4160,
 1010,
 8292,
 3917,
 2100,
 1010,
 1046,
 5244,
 1010,
 10442,
 2213,
 4160,
 1012,
 4684,
 1024,
 9170,
 1010,
 7863,
 1011,
 1039,
 1010,
 2717,
 23615,
 1013,
 21358,
 7159,
 21398,
 1010,
 4563,
 2850,
 2696,
 1010,
 4563,
 4135,
 10719,
 1010,
 1043,
 19797,
 1010,
 2741,
 4355,
 2075,
 23615,
 1010,
 11924,
 2996,
 1010,
 2377,
 2326,
 1010,
 22307,
 8873,
 2102,
 10

In [None]:
enc_tag

LabelEncoder()

In [None]:
TOKENIZER.decode(tokenized_sentence)

'[CLS] seasoned backend developer. entrepreneur. open source contributor. scalable, highly - available web development : python ( django ), ruby ( rails, sinatra ), node. js, go, react js / native, angular, java. web backend scalability and performance tuning : new relic, ruby - prof, cprofile. queue - based solutions : kue, resque / sidekiq, celery, jms, rabbitmq. mobile : swift, objective - c, restkit / afnetworking, coredata, corelocation, gcd, sentestingkit, android studio, play service, retrofit. deep learning : convolutional neural network. test - driven - development : rspec, cucumber, python unittest, junit, jasmine. continuous integration / delivery : travis, jenkins, capistrano, vagrant, git, subversion, rake, maven, ant, buildout, make / gnu make. cloud / container : aws, azure, docker, docker - compose, dcos, kubernetes. devops : mesos, chef, puppet, mcollective, pxe, ipmi, nagios, zabbix. scripting : bash, python, ruby, perl. open source projects that enjoy 200 + stars on 

In [None]:
# if __name__ == "__main__":
#     sentences, pos, tag, enc_pos, enc_tag = process_data(config.TRAINING_FILE)
    
#     meta_data = {
#         "enc_pos": enc_pos,
#         "enc_tag": enc_tag
#     }

#     joblib.dump(meta_data, "meta.bin")

#     num_pos = len(list(enc_pos.classes_))
#     num_tag = len(list(enc_tag.classes_))

#     (
#         train_sentences,
#         test_sentences,
#         train_pos,
#         test_pos,
#         train_tag,
#         test_tag
#     ) = model_selection.train_test_split(sentences, pos, tag, random_state=42, test_size=0.1)

#     train_dataset = dataset.EntityDataset(
#         texts=train_sentences, pos=train_pos, tags=train_tag
#     )

#     train_data_loader = torch.utils.data.DataLoader(
#         train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
#     )

#     valid_dataset = dataset.EntityDataset(
#         texts=test_sentences, pos=test_pos, tags=test_tag
#     )

#     valid_data_loader = torch.utils.data.DataLoader(
#         valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1
#     )

#     device = torch.device("cuda")
#     model = EntityModel(num_tag=num_tag, num_pos=num_pos)
#     model.to(device)

#     param_optimizer = list(model.named_parameters())
#     no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
#     optimizer_parameters = [
#         {
#             "params": [
#                 p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
#             ],
#             "weight_decay": 0.001,
#         },
#         {
#             "params": [
#                 p for n, p in param_optimizer if any(nd in n for nd in no_decay)
#             ],
#             "weight_decay": 0.0,
#         },
#     ]

#     num_train_steps = int(len(train_sentences) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
#     optimizer = AdamW(optimizer_parameters, lr=3e-5)
#     scheduler = get_linear_schedule_with_warmup(
#         optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
#     )

#     best_loss = np.inf
#     for epoch in range(config.EPOCHS):
#         train_loss = engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
#         test_loss = engine.eval_fn(valid_data_loader, model, device)
#         print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
#         if test_loss < best_loss:
#             torch.save(model.state_dict(), config.MODEL_PATH)
#             best_loss = test_loss