In [None]:
# Install simpletransformers, which provides the NERModel / NERArgs classes imported further down.
# -q keeps pip's output quiet.
!pip install -q simpletransformers

In [None]:
import pandas as pd
import json
from datasets import load_dataset

In [None]:
# Directory holding the LST20 corpus files that load_dataset reads from.
lst20_data_dir = "/kaggle/input/lst20-corpus/LST20_Corpus"
lst20 = load_dataset("lst20", data_dir=lst20_data_dir)

In [None]:
# Materialise each split of the loaded dataset as its own pandas DataFrame.
train_df, validation_df, test_df = (
    pd.DataFrame(lst20[split_name]) for split_name in ('train', 'validation', 'test')
)

In [None]:
# Label inventory: "O" first, then the B_-, I_- and E_-prefixed variant of every entity type.
# A tag's position in this list is the integer id that convert_data_to_df maps it from.
ENTITY_TYPES = ["BRN", "DES", "DTM", "LOC", "MEA", "NUM", "ORG", "PER", "TRM", "TTL"]
NER_TAGS = ["O"] + [
    prefix + "_" + entity_type
    for prefix in ("B", "I", "E")
    for entity_type in ENTITY_TYPES
]
print(NER_TAGS)

In [None]:
# Keep only the columns used downstream; the same selection is applied to every split.
df_filter = ['id', 'tokens', 'ner_tags']
train_df, validation_df, test_df = (
    split_df[df_filter] for split_df in (train_df, validation_df, test_df)
)

In [None]:
def convert_data_to_df(df, tags=None):
  """Flatten a sentence-level frame into one row per token.

  Args:
    df: DataFrame with a 'tokens' column (a sequence of words per row) and an
      'ner_tags' column (a sequence of integer tag ids per row, one per word).
    tags: Optional sequence mapping tag id -> tag string. Defaults to the
      module-level NER_TAGS list.

  Returns:
    DataFrame with the columns 'sentence_id' (0-based position of the source
    row in `df`), 'words' and 'labels' (tag strings). An empty `df` yields an
    empty frame with the same three columns.

  Raises:
    ValueError: if a row has a different number of tokens and tag ids.
  """
  if tags is None:
    tags = NER_TAGS

  sentence_id = []
  words = []
  labels = []

  # Walk the two columns positionally instead of looking rows up with
  # df['tokens'][n]: that lookup is label-based and fails (or picks the wrong
  # row) as soon as the frame's index is not the default 0..n-1.
  for position, (tokens, tag_ids) in enumerate(zip(df['tokens'], df['ner_tags'])):
    if len(tokens) != len(tag_ids):
      raise ValueError(
          "Row at position %d has %d tokens but %d tags"
          % (position, len(tokens), len(tag_ids))
      )
    sentence_id.extend([position] * len(tokens))
    words.extend(tokens)
    labels.extend(tags[tag_id] for tag_id in tag_ids)  # Map 0 to "O", 1 to "B_BRN"

  return pd.DataFrame(
      {"sentence_id": sentence_id, "words": words, "labels": labels}
  )

In [None]:
# Flatten every split to token-level rows; eval_data is built from the validation split.
train_data, eval_data, test_data = map(
    convert_data_to_df, (train_df, validation_df, test_df)
)

In [None]:

import logging
from simpletransformers.ner import NERModel, NERArgs
import torch

# Simple Transformer https://simpletransformers.ai/docs/ner-minimal-start/
# Log at INFO level in general, but only WARNING and above for the "transformers" logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Training options, set one by one on top of the NERArgs defaults.
ner_args = NERArgs()
for option_name, option_value in {
    "train_batch_size": 202,  # 192 fits a T4 GPU, 512 an A100
    "use_multiprocessing": True,
    "evaluate_during_training": True,
    "eval_batch_size": 202,
    "num_train_epochs": 20,
    "max_seq_length": 81,
    "overwrite_output_dir": True,
}.items():
  setattr(ner_args, option_name, option_value)

In [None]:
# Build the NER model from a pretrained checkpoint, labelled with NER_TAGS.
# The model id must not carry stray whitespace: a trailing space makes it an invalid
# repository name and the checkpoint cannot be resolved.
# NOTE(review): ignore_mismatched_sizes=True is presumably forwarded to the underlying
# from_pretrained call so a checkpoint whose label count differs from NER_TAGS can load - confirm.
model = NERModel(
     "camembert", # Model Type
     "pythainlp/thainer-corpus-v2-base-model",  # NER pre-trained model
     args=ner_args, use_cuda=torch.cuda.is_available(), labels=NER_TAGS , ignore_mismatched_sizes=True)

In [None]:
# Fine-tune on the training rows; eval_data is supplied because
# ner_args.evaluate_during_training is switched on.
model.train_model(train_data, eval_data=eval_data)

In [None]:
# Evaluate the model on the validation rows and unpack the three values eval_model returns.
result, model_outputs, preds_list = model.eval_model(eval_data)

In [None]:
# Show the first value returned by eval_model for the validation data.
print(result)

In [None]:
# Read the competition test tokens.
# keep_default_na=False: by default pandas converts empty fields and literal tokens such as
# "NA", "null" or "nan" to float NaN. That would put non-string entries into the token list
# and make blank tokens impossible to detect with an == '' comparison.
# dtype={'word': str} keeps every token a string even if it looks numeric.
texts_test = pd.read_csv(
    '/kaggle/input/nithan-chadok-name-entity-recognition/test.csv',
    keep_default_na=False,
    dtype={'word': str},
)
texts_test_raw = texts_test['word'].tolist()

In [None]:
def blank_space(x):
  """Return '_' for a blank token, otherwise return the token unchanged.

  A token counts as blank when it is the empty string, a whitespace-only
  string, None, or a float NaN (what pandas.read_csv produces for an empty
  field under its default NA handling).

  Args:
    x: A single token, normally a string.

  Returns:
    '_' if the token is blank, else `x` itself.
  """
  # NaN is the only float that is not equal to itself.
  if x is None or (isinstance(x, float) and x != x):
    return '_'
  # NOTE(review): whitespace-only tokens are treated like '' on the assumption
  # that they also stand for a blank - confirm against the test data.
  if isinstance(x, str) and x.strip() == '':
    return '_'
  return x

# Replace every blank token with "_" (slice assignment keeps the same list object).
texts_test_raw[:] = [blank_space(token) for token in texts_test_raw]

def split_into_sentences(tokens, tokens_per_sentence=20):
    """Cut `tokens` into consecutive chunks of at most `tokens_per_sentence` items.

    The final chunk holds whatever is left over and may be shorter; an empty
    input yields an empty list.
    """
    chunk_starts = range(0, len(tokens), tokens_per_sentence)
    return [tokens[start:start + tokens_per_sentence] for start in chunk_starts]

# Chunk the flat token list into fixed-size groups (the function's default of 20 tokens each).
my_token = split_into_sentences(texts_test_raw)
def data_inside(data_list):
  """Return the combined length of all items in `data_list` (0 when it is empty)."""
  return sum(len(data_list[position]) for position in range(len(data_list)))

# Join each chunk into one space-separated string, printing it as it is collected.
my_token_list = []
for sentence_tokens in my_token:
  sent_join = ' '.join(sentence_tokens)
  print(sent_join)
  my_token_list.append(sent_join)

In [None]:
# Each chunk is already a list of words, so whitespace splitting is turned off.
predictions, raw_outputs = model.predict(my_token, split_on_space=False)

In [None]:
# Flatten the per-chunk predictions into one label per test token.
# Each entry of predictions[i] is a dict whose values are the predicted tag(s) for one word.
# NOTE(review): the model can return fewer entries than a chunk has tokens (for example when
# the tokenised chunk is longer than max_seq_length) - confirm against the library version used.
# Without padding, one short chunk would shift every following label against the test rows,
# so missing positions are filled with the "O" tag.
final_test_df = []
padded_tokens = 0
for sentence_tokens, sentence_preds in zip(my_token, predictions):
  sentence_labels = []
  for word_pred in sentence_preds:
    sentence_labels += word_pred.values()
  missing = len(sentence_tokens) - len(sentence_labels)
  if missing > 0:
    sentence_labels += ['O'] * missing
    padded_tokens += missing
  final_test_df += sentence_labels

if padded_tokens:
  print("Padded %d token(s) without a prediction with 'O'" % padded_tokens)

# One column (named 0) holding the predicted tag string for every flattened test token.
final_result = pd.DataFrame(final_test_df)

# Tag -> numeric id table used for the submission: a tag's id is its position in this list.
new_tags = [
    'O', 'B_ORG', 'B_PER', 'B_LOC', 'B_MEA', 'I_DTM', 'I_ORG', 'E_ORG', 'I_PER', 'B_TTL',
    'E_PER', 'B_DES', 'E_LOC', 'B_DTM', 'B_NUM', 'I_MEA', 'E_DTM', 'E_MEA', 'I_LOC', 'I_DES',
    'E_DES', 'I_NUM', 'E_NUM', 'B_TRM', 'B_BRN', 'I_TRM', 'E_TRM', 'I_TTL', 'I_BRN', 'E_BRN',
    'E_TTL', 'B_NAME',
]
data = {
    'New_id': list(range(len(new_tags))),
    'New_tag': new_tags,
}
# Create DataFrame
tag_df = pd.DataFrame(data)

# Look up each predicted tag string in the table to get its numeric id.
tag_to_id = tag_df.set_index('New_tag')['New_id']
final_result['New_id'] = final_result[0].map(tag_to_id)
# Start from the sample submission and attach the tokens, tag strings and numeric ids by row index.
submisstion_df = pd.read_csv('/kaggle/input/nithan-chadok-name-entity-recognition/sample_submission.csv')
submisstion_df['token'] = pd.DataFrame({'Token': texts_test_raw})
submisstion_df['Final_pred'] = pd.DataFrame({'Final_pred': final_test_df})
submisstion_df['Predicted'] = final_result['New_id']

# Keep only the id and the numeric prediction, renamed to 'pred' and indexed by 'i', then write it out.
submission_df = (
    submisstion_df[['i', 'Predicted']]
    .rename(columns={'Predicted': 'pred'})
    .set_index('i')
)
submission_df.to_csv("submission3.csv")