In [2]:
import pandas as pd
import torch 
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim import SGD

In [7]:
FILEPATH = '../data/Test1NER.csv'
header_names = ['Sentences', 'Word', 'POST', 'Predicted']
test1_df = pd.read_csv(FILEPATH, sep=';', encoding= 'unicode_escape', names=header_names)
test1_df.head()

Unnamed: 0,Sentences,Word,POST,Predicted
0,Sentence: 9001,In,IN,
1,,2005,CD,
2,,",",",",
3,,Zambia,NNP,
4,,qualified,VBD,


In [8]:
test1_df.shape[0]

19402

In [9]:
import re

def get_clean_word(inword):
  if len(str(inword)) > 1:
    stri = re.sub(".|-|'", '', str(inword))
  else:
    stri = re.sub("\b-|'\b", '', str(inword))
  return stri

In [10]:
def get_clean_word2(inword):
  if len(str(inword)) > 1:
    stri = re.sub("-|'", '', str(inword))
    stri = stri.replace('.', '')
  else:
    stri = re.sub("-|'", '', str(inword))
  return stri

In [11]:
import re
words = test1_df['Word'].values

sentence_column = test1_df['Sentences'].values
sentence_column = [str(cell) for cell in sentence_column]

sentences = []
sentence_tags = []

for i in range(len(sentence_column)):
    if sentence_column[i] != 'nan':
        if i != 0 :sentences.append(sentence)
        sentence = []
        sentence.append(get_clean_word2(words[i]))
    else:
        sentence.append(get_clean_word2(words[i]))
sentences.append(sentence)
print(f'len of sentences:{(len(sentences))}')
print('An example is:\n', sentences[54])

len of sentences:900
An example is:
 ['A', 'US', 'Energy', 'Department', 'report', 'Wednesday', 'says', 'the', 'amount', 'of', 'oil', 'available', 'in', 'the', 'United', 'States', 'declined', 'slightly', 'last', 'week', 'by', 'a', 'bit', 'more', 'than', 'one', 'million', 'barrels', ',', 'to', 'a', 'total', 'of', 'nearly', '314', 'million', 'barrels', '.']


In [12]:
words = test1_df['Word'].values

sentence_column = test1_df['Sentences'].values
sentence_column = [str(cell) for cell in sentence_column]

sentences = []
sentence_tags = []

for i in range(len(sentence_column)):
    if sentence_column[i] != 'nan':
        if i != 0 :sentences.append(sentence)
        sentence = []
        sentence.append(str(words[i]))
    else:
        sentence.append(str(words[i]))
sentences.append(sentence)
print(f'len of sentences:{(len(sentences))}')
print('An example is:\n', sentences[0])

len of sentences:900
An example is:
 ['In', '2005', ',', 'Zambia', 'qualified', 'for', 'debt', 'relief', 'under', 'the', 'Highly', 'Indebted', 'Poor', 'Country', 'Initiative', ',', 'consisting', 'of', 'approximately', 'USD', '6', 'billion', 'in', 'debt', 'relief', '.']


In [13]:
list_of_string_sentences = [[' '.join(sentence)] for sentence in sentences]
list_of_string_sentences = [item for sublist in list_of_string_sentences for item in sublist]
print('An example is:\n', list_of_string_sentences[42])

An example is:
 I was arrested for striking a happy medium ...


In [15]:
df1 = pd.read_csv('../data/myNER.csv')
df2 = pd.read_csv('../data/ner.csv')
df = pd.concat([df2, df1], axis=0)
df.head()

Unnamed: 0,text,labels
0,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
1,Iranian officials say they expect to get acces...,B-gpe O O O O O O O O O O O O O O B-tim O O O ...
2,Helicopter gunships Saturday pounded militant ...,O O B-tim O O O O O B-geo O O O O O B-org O O ...
3,They left after a tense hour-long standoff wit...,O O O O O O O O O O O
4,U.N. relief coordinator Jan Egeland said Sunda...,B-geo O O B-per I-per O B-tim O B-geo O B-gpe ...


In [16]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [17]:
label_all_tokens = False

def align_label(texts, labels):
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

class DataSequence(torch.utils.data.Dataset):

    def __init__(self, df):

        lb = [i.split() for i in df['labels'].values.tolist()]
        txt = df['text'].values.tolist()
        self.texts = [tokenizer(str(i),
                               padding='max_length', max_length = 512, truncation=True, return_tensors="pt") for i in txt]
        self.labels = [align_label(i,j) for i,j in zip(txt, lb)]
    def __len__(self):

        return len(self.labels)

    def get_batch_data(self, idx):

        return self.texts[idx]

    def get_batch_labels(self, idx):

        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):

        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_data, batch_labels

In [18]:
#df = df[0:1000]

labels = [i.split() for i in df['labels'].values.tolist()]
unique_labels = set()

for lb in labels:
        [unique_labels.add(i) for i in lb if i not in unique_labels]
labels_to_ids = {k: v for v, k in enumerate(unique_labels)}
ids_to_labels = {v: k for v, k in enumerate(unique_labels)}

df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.8 * len(df)), int(.9 * len(df))])

In [19]:
df_train.shape[0]

45567

In [20]:
class BertModel(torch.nn.Module):

    def __init__(self):

        super(BertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

In [21]:
def align_word_ids(texts):
  
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(1 if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids


def evaluate_one_text(model, sentence):


    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    text = tokenizer(sentence, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")

    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

    logits = model(input_id, mask, None)
    logits_clean = logits[0][label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [ids_to_labels[i] for i in predictions]
    return prediction_label

In [27]:
from transformers import DistilBertTokenizer


use_cuda = False
MODELPATH = '../model/bertmodel.pkl'
device = torch.device("cuda" if use_cuda else "cpu")
tokenizer = BertTokenizerFast.from_pretrained("./models/tokenizer/")

model2 = BertModel()
model2.load_state_dict(torch.load(MODELPATH, map_location=torch.device(device)))
evaluate_one_text(model2, 'Bill Gates is the founder of Microsoft')

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': './models/tokenizer/'. Use `repo_type` argument if needed.

In [79]:
list_of_string_sentences = [[' '.join(sentence)] for sentence in sentences]
list_of_string_sentences = [item for sublist in list_of_string_sentences for item in sublist]
print('An example is:\n', list_of_string_sentences[54])
print(len(list_of_string_sentences))

An example is:
 A US Energy Department report Wednesday says the amount of oil available in the United States declined slightly last week by a bit more than one million barrels , to a total of nearly 314 million barrels .
900


In [None]:
use_cuda = False
MODELPATH = '../model/bertmodel.pkl'

model2 = BertModel()
model2.load_state_dict(torch.load(MODELPATH))
model2.eval()
evaluate_one_text(model2, 'Bill Gates is the founder of Microsoft')

In [80]:
predictions = [evaluate_one_text(model, sentenc) for sentenc in list_of_string_sentences[300]]

In [83]:
predictions

[['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 [],
 ['B-obj'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 [],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 [],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 [],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 [],
 ['I-per'],
 [],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 [],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 [],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 [],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 [],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 [],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 [],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 [],
 ['I-per'],
 ['I-per'],
 [],
 ['I-per'],
 ['I-per'],
 ['I-per'],
 ['I-p

In [82]:
for i in range(len(predictions)):
  if len(predictions[i]) != len(list_of_string_sentences[i].split()):
    print(len(predictions[i]), len(list_of_string_sentences[i].split()))
    print(predictions[i])
    print(list_of_string_sentences[i])
    print(i)

1 26
['I-per']
In 2005 , Zambia qualified for debt relief under the Highly Indebted Poor Country Initiative , consisting of approximately USD 6 billion in debt relief .
0
1 13
['I-per']
Poverty remains a significant problem in Zambia , despite a stronger economy .
1
1 39
['I-per']
Zambia s dependency on copper makes it vulnerable to depressed commodity prices , but record high copper prices and a bumper maize crop in 2010 helped Zambia rebound quickly from the world economic slowdown that began in 2008 .
2
1 34
['I-per']
A high birth rate , relatively high HIV / AIDS burden , and market distorting agricultural policies have meant that Zambia s economic growth has not dramatically decreased the stubbornly high poverty rates .
3
1 15
['I-per']
Slovakia s roots can be traced to the 9th century state of Great Moravia .
4
1 20
['I-per']
Subsequently , the Slovaks became part of the Hungarian Kingdom , where they remained for the next 1 years .
5
1 50
['I-per']
Following the formation of the

In [13]:
use_cuda = False
MODELPATH = './model/bert_NER_10_24.pt'
device = torch.device("cuda" if use_cuda else "cpu")

model = torch.load(MODELPATH, map_location=torch.device(device))


In [29]:
def train_loop(model, df_train, df_val):

    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)

    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

    if use_cuda:
        model = model.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][train_label[i] != -100]
              label_clean = train_label[i][train_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_train += acc
              total_loss_train += loss.item()

            loss.backward()
            optimizer.step()
            model.eval()

        total_acc_val = 0
        total_loss_val = 0

        for val_data, val_label in val_dataloader:

            val_label = val_label.to(device)
            mask = val_data['attention_mask'].squeeze(1).to(device)
            input_id = val_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, val_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][val_label[i] != -100]
              label_clean = val_label[i][val_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_val += acc
              total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')

LEARNING_RATE = 5e-3
EPOCHS = 10
BATCH_SIZE = 2

model = BertModel()
train_loop(model, df_train, df_val)



KeyboardInterrupt: 