# **INSTALL simpletransformers**

In [None]:
!pip install simpletransformers

# **IMPORT LIBRARIES**

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
from io import BytesIO
import pandas as pd
import numpy as np
import requests
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Seed NumPy's global random generator.
# NOTE(review): this seeds NumPy only; whether it influences the NERModel
# training run further down is not visible from this notebook - confirm.
np.random.seed(0)
plt.style.use("ggplot")
import tensorflow as tf
from simpletransformers.ner import NERModel, NERArgs

# Report the installed TensorFlow version and the GPUs TensorFlow can see.
print('Tensorflow version:', tf.__version__)
print('GPU detected:', tf.config.list_physical_devices('GPU'))

# **LOAD DATA**

In [None]:
# Read the word-level BIO train/dev splits and the span-extraction test split,
# dropping the redundant 'Unnamed: 0' column from each frame as it is loaded.

train = pd.read_csv('data/Sequence_labeling_based_version/Word/train_BIO_Word.csv').drop(columns=['Unnamed: 0'])
dev = pd.read_csv('data/Sequence_labeling_based_version/Word/dev_BIO_Word.csv').drop(columns=['Unnamed: 0'])
test = pd.read_csv('data/Span Extraction-based version/test.csv').drop(columns=['Unnamed: 0'])

# **BASIC PROCESS DATA BEFORE TRAINING**

In [None]:
# Replace every missing value (NaN) with the literal string "NULL" in all three splits
for split_frame in (train, dev, test):
  split_frame.replace(np.nan, "NULL", inplace=True)

In [None]:
# Distinct tags of the training set and how many there are.
# The list is sorted so the label order handed to the model is the same on
# every run; iterating a set of strings yields an arbitrary order that can
# change from one kernel session to the next.
tags = sorted(train["Tag"].unique())
num_tags = len(tags)

In [None]:
# Convert the training and dev sets to the column layout used by simpletransformers
# ('sentence_id', 'words', 'labels').
# Renaming instead of rebuilding from the 'Word'/'Tag' columns keeps this cell
# re-runnable: on a second run the rename is a no-op, whereas indexing the
# already-converted frame with 'Word' would raise a KeyError.
train = train.rename(columns={'Word': 'words', 'Tag': 'labels'})[['sentence_id', 'words', 'labels']]
dev = dev.rename(columns={'Word': 'words', 'Tag': 'labels'})[['sentence_id', 'words', 'labels']]

# **TRAINING MODEL**

The model should be trained several times with different random seeds, keeping the best-performing run.

In [None]:
# Hyperparameters and runtime options for the NER model

ner_options = {
  # optimisation
  'num_train_epochs': 10,
  'learning_rate': 2e-5,
  'train_batch_size': 10,
  'eval_batch_size': 10,
  # output / caching / preprocessing behaviour
  'overwrite_output_dir': True,
  'use_cached_eval_features': False,
  'use_multiprocessing': False,
  'reprocess_input_data': True,
}

args = NERArgs()
for option, value in ner_options.items():
  setattr(args, option, value)


In [None]:
# Build the NER model from the "vinai/phobert-large" checkpoint ("auto" model type),
# using the tag list and the arguments configured above in this notebook
model = NERModel("auto", "vinai/phobert-large", labels=tags, args=args)

In [None]:
# Train the model.
# The dev set is passed through `eval_data`, the keyword train_model documents
# for evaluation data; any other keyword (such as `validation_data`) is taken
# as the name of an extra metric function. The dev set is only evaluated while
# training when args.evaluate_during_training is enabled.
# `acc` is an additional metric computed with accuracy_score during evaluation.
model.train_model(train, eval_data=dev, acc=accuracy_score)

In [None]:
# Evaluate the trained model on `test` and display the returned metrics dictionary.
# NOTE(review): `test` is read from the span-extraction CSV and is indexed by
# 'content' / 'index_spans' further down, while train/dev are converted to
# 'sentence_id' / 'words' / 'labels' before training - confirm that `test`
# has the columns eval_model expects.
result, model_outputs, wrong_predictions = model.eval_model(test)
result

# **EVALUATE AND PREDICT**

In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval

In [None]:
# Predict IBO-format tags for the test data

# Parse 'index_spans' from its string form into Python lists. Values that are
# no longer strings are left untouched, so re-running this cell does not fail
# on a column that has already been parsed.
test['index_spans'] = test['index_spans'].apply(
    lambda spans: literal_eval(spans) if isinstance(spans, str) else spans)


def IBO_pred(tokens):
  """Tag one tokenised comment with the global `model`.

  The whole token list is sent to the model as a single pre-split sentence
  (`split_on_space=False`). Passing the bare token list would make the model
  treat every token as its own one-word sentence and tag it without any
  surrounding context.

  Args:
    tokens: list of word strings for one comment.

  Returns:
    A pair `(value_list, ibo_list)`: the words and the tag predicted for each
    word, both aligned with `tokens`.
  """
  if not tokens:
    # Nothing to tag; avoid calling the model with an empty sentence.
    return [], []

  predictions, _ = model.predict([tokens], split_on_space=False)
  value_list = []
  ibo_list = []
  # predictions[0] holds one {word: tag} dict per token of the only sentence.
  for token_pred in predictions[0]:
    word, tag = next(iter(token_pred.items()))
    value_list.append(word)
    ibo_list.append(tag)

  # The model may return fewer predictions than tokens when the sentence is
  # truncated to its maximum sequence length; tag the remainder as outside
  # ('O') so the output stays aligned with `tokens`.
  for word in tokens[len(value_list):]:
    value_list.append(word)
    ibo_list.append('O')
  return value_list, ibo_list

def IBO_pred_test(test):
  """Run IBO_pred on every comment of `test` and gather the results.

  Each entry of the 'content' column is split on whitespace and tagged.

  Returns:
    A DataFrame with one row per comment and two columns:
    'Value_pred' holds the second element returned by IBO_pred and
    'IBO_pred' holds the first element.
  """
  per_comment = []
  for row in range(len(test)):
    first, second = IBO_pred(test['content'][row].split())
    per_comment.append((first, second))

  df_pred = pd.DataFrame()
  df_pred['Value_pred'] = [second for _, second in per_comment]
  df_pred['IBO_pred'] = [first for first, _ in per_comment]
  return df_pred

In [None]:
# Tag every comment in the test data; df_ibo gets one row per comment with the
# 'Value_pred' and 'IBO_pred' columns built by IBO_pred_test

df_ibo = IBO_pred_test(test)

In [None]:
# Build character-level masks for the predicted and the gold (true) toxic spans

def tokenize(text, pos):
    """Split `text` on whitespace and map every token to its slice of `pos`.

    Tokens are located left to right, each search starting where the previous
    token ended, so repeated tokens map to their own occurrence.

    Returns:
        (tokens, alignment) where alignment[k] is pos[start:end] for the
        character range covered by tokens[k] in `text`.
    """
    tokens = text.split()
    alignment = []
    cursor = 0
    for token in tokens:
        begin = text.find(token, cursor)
        cursor = begin + len(token)
        alignment.append(pos[begin:cursor])
    assert len(alignment) == len(tokens)
    return tokens, alignment
def y_pred(data, df_predict):
  """Build a character-level 0/1 mask of the predicted toxic spans.

  Args:
    data: frame whose 'content' column holds the raw comment text.
    df_predict: frame whose 'Value_pred' column holds, per comment, the list
      of predicted tags for its whitespace-separated tokens.

  Returns:
    A list with one mask per row of `df_predict`; each mask is a list of ints
    as long as the comment text, with 1 at every character of a token tagged
    'B-T' or 'I-T' and 0 elsewhere.
  """
  index_pred = []
  for row in range(len(df_predict)):
    predicted_tags = df_predict['Value_pred'][row]
    text = data['content'][row]
    _, alignment = tokenize(text, list(range(len(text))))
    # A plain list is used for the mask: writing through
    # df['spans'][k] = 1 is chained assignment, which pandas warns about and
    # which does not modify the frame under Copy-on-Write.
    spans = [0] * len(text)
    for tag, char_positions in zip(predicted_tags, alignment):
      if tag == 'B-T' or tag == 'I-T':
        for position in char_positions:
          spans[position] = 1
    index_pred.append(spans)
  return index_pred
def y_true(data):
  """Build a character-level 0/1 mask of the gold toxic spans.

  Args:
    data: frame with a 'content' column (raw comment text) and an
      'index_spans' column (per comment, the list of toxic character indices;
      empty when the comment has no toxic span).

  Returns:
    A list with one mask per row of `data`; each mask is a list of ints as
    long as the comment text, with 1 at every listed index and 0 elsewhere.
  """
  index_true = []
  for row in range(len(data)):
    text = data['content'][row]
    # A plain list is used for the mask: writing through
    # df['spans'][k] = 1 is chained assignment, which pandas warns about and
    # which does not modify the frame under Copy-on-Write.
    spans = [0] * len(text)
    char_indices = data['index_spans'][row]
    if char_indices:
      for position in char_indices:
        # Indices outside the text are skipped so the mask always has
        # exactly one entry per character.
        if 0 <= position < len(spans):
          spans[position] = 1
    index_true.append(spans)
  return index_true

# Character-level masks for every test comment: gold spans from 'index_spans',
# predicted spans from the tags stored in df_ibo
true = y_true(test)
pred = y_pred(test, df_ibo)

# **SCORE**

In [None]:
# Character-level precision / recall / F1 for every test comment (macro- and
# micro-averaged), averaged over all comments and stored in a one-row
# DataFrame together with the metrics returned by model.eval_model.

from sklearn.metrics import precision_recall_fscore_support
scores_f1_macro = []
scores_f1_micro = []
scores_precision_macro = []
scores_precision_micro = []
scores_recall_macro = []
scores_recall_micro = []

for true_mask, pred_mask in zip(true, pred):
  # precision_recall_fscore_support returns (precision, recall, fscore, support)
  precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(true_mask, pred_mask, average='macro')
  precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(true_mask, pred_mask, average='micro')

  scores_f1_macro.append(f1_macro)
  scores_f1_micro.append(f1_micro)
  scores_precision_macro.append(precision_macro)
  scores_precision_micro.append(precision_micro)
  scores_recall_macro.append(recall_macro)
  scores_recall_micro.append(recall_micro)

# The model metrics are looked up by key. Picking them by position is fragile:
# the second entry of the simpletransformers NER result dict is 'precision',
# so positional access reported precision under the 'F1_ner' column.
scores = pd.DataFrame({
  'eval_loss': [result['eval_loss']],
  'F1_ner': [result['f1_score']],
  'F1-micro': [np.mean(scores_f1_micro)],
  'F1-macro': [np.mean(scores_f1_macro)],
  'Precision-macro': [np.mean(scores_precision_macro)],
  'Precision-micro': [np.mean(scores_precision_micro)],
  'Recall-macro': [np.mean(scores_recall_macro)],
  'Recall-micro': [np.mean(scores_recall_micro)],
})

print(scores)

# **ERROR DATAFRAME FOR EVALUATING RESULTS**

In [None]:
def word_true(df, num_sentences=None):
  """Extract the gold toxic spans of every sentence as strings.

  A span starts at a word labelled 'B-T' and continues over the directly
  following words labelled 'I-T'; its words are joined with single spaces.
  An 'I-T' that does not follow a 'B-T'/'I-T' run is ignored.

  Args:
    df: frame with 'sentence_id', 'words' and 'labels' columns, one row per
      word.
    num_sentences: how many sentence ids (0 .. num_sentences - 1) to emit.
      Defaults to the number of rows of the global `test` frame, which is the
      bound this function has always used.

  Returns:
    A list with one entry per sentence id; each entry is the list of span
    strings of that sentence (empty when the id has no rows or no span).
  """
  if num_sentences is None:
    num_sentences = len(test)

  # Group once instead of filtering the whole frame for every sentence id.
  # sort=False keeps first-seen order and avoids comparing the group keys.
  by_sentence = {
    sentence_id: (list(group['words']), list(group['labels']))
    for sentence_id, group in df.groupby('sentence_id', sort=False)
  }

  list_toxic_true = []
  for idx in range(num_sentences):
    words, labels = by_sentence.get(idx, ([], []))
    toxic_true = []
    current = None  # span being built, or None when outside a span
    for word, label in zip(words, labels):
      if label == 'B-T':
        # A new span begins; close the one that was open, if any.
        if current is not None:
          toxic_true.append(current)
        current = word
      elif label == 'I-T' and current is not None:
        current = current + " " + word
      else:
        if current is not None:
          toxic_true.append(current)
          current = None
    if current is not None:
      toxic_true.append(current)
    list_toxic_true.append(toxic_true)
  return list_toxic_true
def word_pred(df):
  """Collect, per row, the entries of 'IBO_pred' whose tag is toxic.

  For every row of `df`, the items of the 'IBO_pred' list are kept at the
  positions where the matching item of the 'Value_pred' list is 'B-T' or
  'I-T'.

  Returns:
    A list with one list of kept items per row of `df`.
  """
  pre_value = []
  for row in range(len(df)):
    row_tags = df['Value_pred'][row]
    kept = [
      df['IBO_pred'][row][k]
      for k in range(len(row_tags))
      if row_tags[k] == 'B-T' or row_tags[k] == 'I-T'
    ]
    pre_value.append(kept)
  return pre_value



# Error dataframe: gold toxic spans ('True') next to the predicted toxic words ('Pred')
# NOTE(review): word_true reads the 'sentence_id', 'words' and 'labels'
# columns, while `test` is loaded from the span-extraction CSV and is indexed
# elsewhere by 'content' / 'index_spans' - confirm that `test` carries the
# BIO columns, or pass the word-level BIO test frame here instead.
error = pd.DataFrame()
error['True'] = word_true(test)
error['Pred'] = word_pred(df_ibo)

# **SAVE MODEL**

In [None]:
import torch
# Serialize the whole `model` object to 'model.pt'.
# NOTE(review): torch.save pickles the entire object, so loading the file
# later needs compatible library versions and the same class definitions -
# confirm this is preferred over the model files written during training.
torch.save(model, 'model.pt')