In [44]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tqdm

In [3]:
df_data = pd.read_csv("ner_dataset.csv",sep=",",encoding="latin1").fillna(method='ffill')
df_data.shape

(1048575, 4)

In [34]:
tag_list=df_data.Tag.unique()

In [22]:
from sklearn.model_selection import train_test_split

x_train,x_test=train_test_split(df_data,test_size=0.20,shuffle=False)

agg_func = lambda s: [ [w,t] for w,t in zip(s["Word"].values.tolist(),s["Tag"].values.tolist())]

x_train_grouped = x_train.groupby("Sentence #").apply(agg_func)
x_test_grouped = x_test.groupby("Sentence #").apply(agg_func)

In [28]:
x_train_sentences = [[s[0] for s in sent] for sent in x_train_grouped.values]
x_test_sentences = [[s[0] for s in sent] for sent in x_test_grouped.values]

In [30]:
x_train_tags = [[t[1] for t in tag] for tag in x_train_grouped.values]
x_test_tags = [[t[1] for t in tag] for tag in x_test_grouped.values]

In [27]:
MAX_LENGTH=128
BERT_MODEL="bert-base-cased"

BATCH_SIZE=32

pad_token=0
pad_token_segment_id=0
sequence_a_segment_id=0

In [33]:
from transformers import (
    TF2_WEIGHTS_NAME,
    BertConfig,
    BertTokenizer,
    TFBertForTokenClassification,
    create_optimizer)

MODEL_CLASSES = {"bert": (BertConfig, TFBertForTokenClassification, BertTokenizer)}

In [36]:
label_map = {label: i for i, label in enumerate(tag_list)}

In [40]:
num_labels = len(tag_list) + 1

In [35]:
pad_token_label_id = 0

In [38]:
config_class, model_class, tokenizer_class = MODEL_CLASSES['bert']

In [41]:
config = config_class.from_pretrained(BERT_MODEL,num_labels=num_labels)

In [42]:
tokenizer = tokenizer_class.from_pretrained(BERT_MODEL,do_lower_case=False)

In [43]:
model = model_class.from_pretrained(
                BERT_MODEL,
                from_pt=bool(".bin" in BERT_MODEL),
                config=config)

All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
model.layers[-1].activation = tf.keras.activations.softmax

In [46]:
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [47]:
from keras.preprocessing.sequence import pad_sequences
max_seq_length =128

def convert_to_input(sentences,tags):
  input_id_list,attention_mask_list,token_type_id_list=[],[],[]
  label_id_list=[]
  
  for x,y in tqdm.tqdm(zip(sentences,tags),total=len(tags)):
  
    tokens = []
    label_ids = []

    for word, label in zip(x, y):
      word_tokens = tokenizer.tokenize(word)
      tokens.extend(word_tokens)
      # Use the real label id for the first token of the word, and padding ids for the remaining tokens
      label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

  
    special_tokens_count =  2
    if len(tokens) > max_seq_length - special_tokens_count:
      tokens = tokens[: (max_seq_length - special_tokens_count)]
      label_ids = label_ids[: (max_seq_length - special_tokens_count)]

    label_ids = [pad_token_label_id]+label_ids+[pad_token_label_id]
    inputs = tokenizer.encode_plus(tokens,add_special_tokens=True, max_length=max_seq_length)

    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
    attention_masks = [1] * len(input_ids)

    attention_mask_list.append(attention_masks)
    input_id_list.append(input_ids)
    token_type_id_list.append(token_type_ids)

    label_id_list.append(label_ids)

  return input_id_list,token_type_id_list,attention_mask_list,label_id_list

NameError: name 'x' is not defined