# Install related dependencies

In [28]:
!pip install datasets transformers[sentencepiece] simpletransformers -qq

^C


In [29]:
!pip install seqeval -q

^C


In [None]:
!pip install sentencepiece -q

In [None]:
!pip install tensorboard

# Code

## Import libraries

In [None]:
import datetime as dt
import math
import pickle
import logging
import numpy as np
import pandas as pd
import tensorboard
from tqdm import tqdm
from pathlib import Path
from pprint import pprint

import sklearn
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim

from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForTokenClassification,
    CamembertTokenizer,
    AutoModelForTokenClassification,
    )
from simpletransformers.ner import NERModel, NERArgs

# Seed NumPy's global RNG. read_data() shuffles the corpus with np.random.shuffle,
# so this makes the shuffle repeatable when the notebook is run from a fresh kernel.
np.random.seed(1)
# Log at INFO level by default, but keep the "transformers" logger at WARNING.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

## Prepare model and process the data

In [None]:
def read_data(train_ratio, label_type='NER', ret_type='list'):
    """Load the tagged corpus, shuffle it and split it into train and test parts.

    The pickle at ``./dataset/processed_data.pickle`` is read as a sequence of
    sentences, each sentence being a sequence of ``(text, pos, ner)`` triples.
    Sentences are shuffled in place with ``np.random.shuffle`` (seed NumPy's
    global RNG beforehand for a repeatable split) and then flattened to one
    entry per token; every token carries the id ``sent<N>`` of its sentence.

    The split point aims at ``train_ratio`` of the token rows but is moved to
    the nearest sentence boundary, so a sentence never ends up partly in the
    training part and partly in the test part.

    The length of the longest sentence is printed as a side effect.

    Args:
        train_ratio: Fraction of the token rows to aim for in the training part.
        label_type: ``'NER'`` to use the NER tag as label, ``'POS'`` to use the
            POS tag.
        ret_type: ``'dataframe'`` returns two DataFrames with the columns
            ``sentence_id``, ``words`` and ``labels``; any other value returns
            four flat lists.

    Returns:
        ``(train_df, test_df)`` when ``ret_type == 'dataframe'``, otherwise
        ``(train_data, train_label, test_data, test_label)``.

    Raises:
        ValueError: If ``label_type`` is neither ``'NER'`` nor ``'POS'``.
    """
    if label_type not in ('NER', 'POS'):
        raise ValueError(
            "label_type must be 'NER' or 'POS', got {!r}".format(label_type)
        )

    # NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
    with open('./dataset/processed_data.pickle', 'rb') as file:
        processed_file = pickle.load(file)
    np.random.shuffle(processed_file)

    sent_id = []
    text_lst = []
    pos_tag = []
    ner_tag = []
    # Token offsets at which a sentence ends (0 = "everything goes to test").
    boundaries = [0]
    max_length = -1
    # enumerate(tqdm(...)) rather than tqdm(enumerate(...)) so the bar knows its total.
    for i, inp in enumerate(tqdm(processed_file)):
        for (text, pos, ner) in inp:
            sent_id.append('sent{}'.format(i+1))
            text_lst.append(text)
            pos_tag.append(pos)
            ner_tag.append(ner)
        boundaries.append(len(sent_id))
        max_length = max(max_length, len(inp))
    print(max_length)

    # Snap the requested split position to the closest sentence boundary
    # (ties go to the earlier boundary because `boundaries` is ascending).
    target = round(len(sent_id) * train_ratio)
    sep_pos = min(boundaries, key=lambda bound: abs(bound - target))

    labels = ner_tag if label_type == 'NER' else pos_tag
    train_id, test_id = sent_id[:sep_pos], sent_id[sep_pos:]
    train_data, test_data = text_lst[:sep_pos], text_lst[sep_pos:]
    train_label, test_label = labels[:sep_pos], labels[sep_pos:]

    if ret_type == 'dataframe':
        train_df = pd.DataFrame({
            'sentence_id': train_id,
            'words': train_data,
            'labels': train_label
        })
        test_df = pd.DataFrame({
            'sentence_id': test_id,
            'words': test_data,
            'labels': test_label
        })
        return train_df, test_df
    return train_data, train_label, test_data, test_label

In [None]:
train_df, test_df = read_data(0.7, label_type='NER', ret_type='dataframe')
train_df, val_df = train_test_split(train_df, test_size=.2, shuffle=False)

## Define tokenizer, model and training

In [None]:
model_checkpoint = "airesearch/wangchanberta-base-att-spm-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Label set handed to NERModel below.
_NER_TAGS = [
    "O",
    "B-Claim",
    "B-Person",
    "B-Scence",
    "I-Claim",
    "I-Person",
    "I-Scence",
]

# Configure the model
N_Epoch = 30
Batch_size = 16
ner_args = NERArgs()

if N_Epoch > 10:
  # Long runs: checkpoint every SAVE_EVERY_N_EPOCHS epochs instead of after every epoch.
  SAVE_EVERY_N_EPOCHS = 5
  # save_steps is expressed in training steps (batches). train_df holds one row per
  # token, so count sentences and divide by the batch size.
  # Assumes one training example per sentence_id and no gradient accumulation — TODO confirm.
  steps_per_epoch = math.ceil(train_df['sentence_id'].nunique() / Batch_size)
  ner_args.save_steps = steps_per_epoch * SAVE_EVERY_N_EPOCHS
  ner_args.save_model_every_epoch = False
ner_args.max_seq_length = 100
# Timestamped output directory so successive runs do not overwrite each other.
ner_args.output_dir = "./outputs/NER-BERT-ST_{}".format(dt.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
ner_args.train_batch_size = Batch_size
ner_args.show_running_loss = True
ner_args.evaluate_during_training = True
ner_args.overwrite_output_dir = True
ner_args.num_train_epochs = N_Epoch #10

model = NERModel(
    "camembert", model_checkpoint, args=ner_args, use_cuda=torch.cuda.is_available(), labels=_NER_TAGS
)

# Train the model
model.train_model(train_df, eval_data=val_df, show_running_loss=True)

In [None]:
%tensorboard dev upload --logdir \
    'runs'

In [None]:
result, model_outputs, wrong_preds = model.eval_model(test_df)

In [None]:
print(len(wrong_preds), len(np.unique(test_df.sentence_id)))

In [None]:
# NOTE(review): test_df['labels'] holds one label per token row, while wrong_preds is
# treated as a list of sub-lists (it is flattened in a later cell). The two arguments
# therefore do not appear to have the same shape — confirm this call runs and
# measures what is intended.
test = sklearn.metrics.classification_report(list(test_df['labels']), wrong_preds)

In [None]:
flat_list = [item for sublist in wrong_preds for item in sublist]

In [None]:
len(flat_list)

In [None]:
# Per-class recall (average=None) for a single sentence: the gold labels of the rows
# tagged 'sent1536' against the fifth entry of wrong_preds.
# NOTE(review): the sentence id and the index 4 are hard-coded; verify they refer to
# the same sentence, since ids are assigned after the shuffle in read_data().
test = sklearn.metrics.recall_score(list(test_df[test_df.sentence_id=='sent1536']['labels']), wrong_preds[4], average = None)

In [None]:
test

In [None]:
# Dump the first entry of model_outputs: its length and content, then for every
# element its length and content plus the tag with the highest score per vector.
for sentence_scores in model_outputs:
  print('len: ', len(sentence_scores), sentence_scores)
  for token_scores in sentence_scores:
    print(len(token_scores), token_scores)
    for score_vector in token_scores:
      print(_NER_TAGS[np.argmax(score_vector)])
  # Only the first entry is inspected.
  break

In [None]:
predictions, raw_outputs = model.predict(["กรอบ นอก นุ่ม ใน ทานง่าย ทำ ไม มัน ทำ นาย ออก มา ไม่ ครบ แว้ ต้อง หาย ไป ตลอด เลย กี่อัน ต่อ กี่อัน ก็หายไป ไม่ เข้า ใจโว้ยยยยยย ทด ทด/ ทกทฟห ท าฟกหืฟหก  ร าฟดืากห ่ฟ าสา สฟืาหากหฟส กนกาหห น นกนฟ ำไา าห น าน test test"], split_on_space=True)

In [None]:
print(predictions)

# Using the fine-tuned model

In [None]:
# Tokenizer of the base checkpoint; it is used below to pre-tokenize raw text
# before the text is handed to the fine-tuned model.
model_checkpoint = "airesearch/wangchanberta-base-att-spm-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Same label set as in the training section.
_NER_TAGS = [
        "O",
        "B-Claim", 
        "B-Person", 
        "B-Scence", 
        "I-Claim", 
        "I-Person", 
        "I-Scence"
    ]

# Configure the model
# NOTE(review): output_dir is the same directory the weights are loaded from and
# overwrite_output_dir is True; the batch size, epoch count and
# evaluate_during_training are training settings, while this section only calls
# predict — confirm they are needed here.
# NOTE(review): max_seq_length is 450 here but 100 in the training cell.
ner_args = NERArgs()
ner_args.output_dir = "./outputs/NER-BERT-ST_2023-04-26_22-48-00/"
ner_args.train_batch_size = 8
ner_args.evaluate_during_training = True
ner_args.overwrite_output_dir = True
ner_args.num_train_epochs = 20 #10
ner_args.max_seq_length = 450

# Load the fine-tuned weights from a previous training run's output directory.
model = NERModel(
    "camembert", "./outputs/NER-BERT-ST_2023-04-26_22-48-00/", args=ner_args, use_cuda=torch.cuda.is_available(), labels=_NER_TAGS
)

In [None]:
#@title Predict the text
input_text = "\u0E02\u0E19\u0E32\u0E14 60 \u0E01\u0E23\u0E31\u0E21/\u0E41\u0E1E\u0E47\u0E04   1: \u0E44\u0E27\u0E17\u0E4C\u0E0A\u0E47\u0E2D\u0E04 \u0E42\u0E01\u0E41\u0E25\u0E47\u0E15(\u725B\u5976\u5473\uFF09 2: \u0E0A\u0E32\u0E40\u0E02\u0E35\u0E22\u0E27(\u7EFF\u8336\u5473\uFF09 3: \u0E0A\u0E47\u0E2D\u0E04 \u0E42\u0E01\u0E41\u0E25\u0E47\u0E15(\u5DE7\u514B\u529B\u5473\uFF09 4: \u0E14\u0E2D\u0E01\u0E01\u0E38\u0E2B\u0E25\u0E32\u0E1A\uFF08\u73AB\u7470\u5473\uFF09  \u0E2B\u0E2D\u0E21 \u0E25\u0E30\u0E21\u0E38\u0E19 \u0E44\u0E21\u0E48\u0E2B\u0E27\u0E32\u0E19\u0E08\u0E19\u0E40\u0E01\u0E34\u0E19\u0E44\u0E1B \u0E2D\u0E23\u0E48\u0E2D\u0E22\u0E25\u0E07\u0E15\u0E31\u0E27\u0E2A\u0E38\u0E14\u0E46 \u0E43\u0E04\u0E23\u0E44\u0E14\u0E49\u0E25\u0E2D\u0E07\u0E0A\u0E34\u0E21\u0E15\u0E49\u0E2D\u0E07\u0E15\u0E34\u0E14\u0E43\u0E08 \u0E0B\u0E37\u0E49\u0E2D\u0E40\u0E1B\u0E47\u0E19\u0E02\u0E2D\u0E07\u0E02\u0E27\u0E31\u0E0D \u0E02\u0E2D\u0E07\u0E1D\u0E32\u0E01 \u0E01\u0E47\u0E40\u0E2B\u0E21\u0E32\u0E30 * * \u0E23\u0E2A\u0E41\u0E19\u0E30\u0E19\u0E33 \u0E2A\u0E48\u0E27\u0E19\u0E15\u0E31\u0E27\u0E41\u0E21\u0E48\u0E04\u0E49\u0E32\u0E0A\u0E2D\u0E1A \u0E44\u0E27\u0E17\u0E4C\u0E0A\u0E47\u0E2D\u0E04\u0E42\u0E01\u0E41\u0E25\u0E47\u0E15\u0E17\u0E35\u0E48\u0E2A\u0E38\u0E14 \u0E2B\u0E2D\u0E21\u0E19\u0E21 \u0E19\u0E38\u0E48\u0E21\u0E25\u0E34\u0E49\u0E19 \u0E15\u0E31\u0E14\u0E40\u0E1B\u0E23\u0E35\u0E49\u0E22\u0E27\u0E44\u0E14\u0E49\u0E14\u0E35  \u0E41\u0E25\u0E30\u0E43\u0E2B\u0E49\u0E04\u0E27\u0E32\u0E21\u0E2B\u0E2D\u0E21\u0E02\u0E2D\u0E07\u0E0A\u0E47\u0E2D\u0E04\u0E42\u0E01\u0E41\u0E25\u0E47\u0E15\u0E01\u0E33\u0E25\u0E31\u0E07\u0E14\u0E35 \u0E44\u0E21\u0E48\u0E2B\u0E27\u0E32\u0E19\u0E08\u0E19\u0E40\u0E01\u0E34\u0E19 \u0E44\u0E21\u0E48\u0E40\u0E25\u0E35\u0E48\u0E22\u0E19 \u0E17\u0E32\u0E19\u0E40\u0E1E\u0E25\u0E34\u0E19\u0E46" #@param {type:"string"}
tokens = tokenizer(input_text)

# Decode every id back to its token text and keep only real tokens.
# The sentence markers are filtered by value: the previous `!= '<s>' or != '</s>'`
# test was always true and relied on slicing off the first and last element instead.
# Every empty piece is dropped, not just the first one.
model_inps = []
for i in tokens['input_ids']:
  decode_token = tokenizer.decode(i)
  if decode_token not in ('<s>', '</s>', ''):
    model_inps.append(decode_token)

# model.predict() expects a list of texts; with split_on_space=True each text is
# split on whitespace, so the tokens are joined with single spaces.
model_inps = [" ".join(model_inps)]
print('model input: ',model_inps)

predictions, outputs = model.predict(model_inps, split_on_space=True)
pprint(predictions)

In [None]:
#remove emoji
import re

# Compiled once instead of on every call.
# The former catch-all range U+24C2..U+1F251 also covered CJK ideographs, kana,
# Hangul and fullwidth punctuation, so ordinary text was deleted together with the
# emoji. It is replaced by the symbol blocks it was meant to reach.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (stars, squares)
    "\U0001F000-\U0001F251"  # mahjong, cards, enclosed alphanumerics/ideographs
    "\U0000FE0F"             # variation selector-16 left behind by emoji sequences
    "]+",
    flags=re.UNICODE,
)

def remove_emoji(string):
    """Return ``str(string)`` with emoji and pictographic symbols removed.

    The argument is converted with ``str()`` first, so non-string values are
    accepted and returned as their string form. Letters of any script
    (Thai, CJK, ...) and fullwidth punctuation are left untouched.
    """
    return _EMOJI_PATTERN.sub(r'', str(string))

In [None]:
df = pd.read_csv('./dataset/shopee1_stu.csv')

## Join name and description
# A missing name or description counts as empty text. Without this the sum is NaN,
# which remove_emoji() would turn into the literal string 'nan'.
df['name_desc'] = df['name'].fillna('') + ' ' + df['description'].fillna('')
## Rename Unnamed: 0 to id
df = df.rename(columns={'Unnamed: 0': 'id'})
## Keep only id and name_desc
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of df (pandas' SettingWithCopyWarning).
name_desc_df = df[['id', 'name_desc']].copy()

## Remove emoji in name_desc_df
name_desc_df['name_desc'] = name_desc_df['name_desc'].apply(remove_emoji)
name_desc_df.head()

In [None]:
# NER
# Same dump as in the training section: first entry of model_outputs only.
# model_outputs is assigned by the eval_model cell of the training section.
for sentence_scores in model_outputs:
  print('len: ', len(sentence_scores), sentence_scores)
  for token_scores in sentence_scores:
    print(len(token_scores), token_scores)
    for score_vector in token_scores:
      print(_NER_TAGS[np.argmax(score_vector)])
  # Stop after the first entry.
  break

In [None]:
# Work on the first 1000 rows only. .copy() makes this an independent frame, so the
# 'ner' column added in a later cell is not written into a slice of name_desc_df
# (pandas' SettingWithCopyWarning).
name_desc_df_test = name_desc_df[:1000].copy()
name_desc_df_test.head()

In [None]:
## NER
def ner_predict(text):
  """Run the global NER ``model`` on one text and return its prediction.

  The text is wrapped in a one-element list for ``model.predict``; the first
  entry of the returned predictions is handed back and the raw outputs are
  discarded.
  """
  batch_predictions, _unused_raw = model.predict([text])
  return batch_predictions[0]

In [None]:
## Test ner_predict
ner_predict('กรอบ นอก นุ่ม ใน ทานง่าย ทำ ไม มัน ทำ นาย ออก มา ไม่ ครบ แว้ ต้อง หาย ไป ตลอด เลย กี่อัน ต่อ กี่อัน ก็หายไป ไม่ เข้า ใจโว้ยยยยยย ทด ทด/ ทกทฟห ท าฟกหืฟหก  ร าฟดืากห ่ฟ าสา สฟืาหากหฟส กนกาหห น นกนฟ ำไา าห น าน test')

In [None]:
## ner_predict all name_desc_df_test
# One model.predict call per row, via ner_predict.
name_desc_df_test['ner'] = name_desc_df_test['name_desc'].apply(ner_predict)

In [None]:
## Print ner_predict result
name_desc_df_test.head()
name_desc_df_test.to_csv("./dataset/name_desc_df_test.csv")

In [None]:
## Break array to string
def break_array_to_string(array):
  """Join the items of ``array`` into one string, each item followed by a space.

  String items are emitted unchanged, so a list of strings gives the same result
  as before (including the trailing space). Dict items are also accepted: the
  model predictions processed elsewhere in this notebook (see extract_c_t) are
  single-entry ``{word: tag}`` dicts, which previously raised TypeError here.
  Each dict entry is rendered as ``word/tag``.
  NOTE(review): the ``word/tag`` rendering is a reviewer's choice — confirm it is
  the format wanted in the exported CSV.
  """
  pieces = []
  for item in array:
    if isinstance(item, dict):
      pieces.extend('{}/{}'.format(word, tag) for word, tag in item.items())
    else:
      pieces.append(item)
  return ''.join(piece + ' ' for piece in pieces)

## Break array for name_desc_df_test, then remove emoji from the result
name_desc_df_test['ner'] = (
    name_desc_df_test['ner']
    .apply(break_array_to_string)
    .apply(remove_emoji)
)
name_desc_df_test.head()
# Overwrites the CSV written by the previous export cell.
name_desc_df_test.to_csv("./dataset/name_desc_df_test.csv")

# Real Data processing

In [None]:
import pandas as pd

In [None]:
_main_df = pd.read_csv('./drive/MyDrive/Dataset/2022.07.11_shopee_simplified_category.csv')

In [None]:
# Write _main_df to disk in consecutive chunks of length//10 rows. When the row
# count is not a multiple of the step, a shorter final chunk holds the remainder.
# File names carry the first and last row position (inclusive) of the chunk.
length = len(_main_df)
# max(1, ...) keeps the loop advancing for frames with fewer than 10 rows, where
# length//10 is 0 and the previous while-loop never terminated.
step = max(1, length//10)
for start in range(0, length, step):
  end = min(start+step, length)
  print(start, end-1)
  tmp_df = _main_df.iloc[start:end]
  tmp_df.to_csv('./drive/MyDrive/Dataset/Chunk/2022.07.11_shopee_simplified_category_{}-{}.csv'.format(start, end-1))

In [None]:
len(_main_df)

In [None]:
_main_df.tail()

In [None]:
tmp_df

In [None]:
_main_df.iloc[length-1:length]

In [None]:
# Pre-tokenized descriptions are cached on disk: load them when the cache exists,
# otherwise rebuild them from the source CSVs and write the cache. This replaces
# the hard-coded `if True:` switch that made the rebuild branch unreachable.
PREPROCESSED_PATH = Path('./drive/MyDrive/Dataset/preprocessed_unseen_shopee.pickle')
if PREPROCESSED_PATH.exists():
  # NOTE(review): pickle.load can execute arbitrary code; only load files you produced.
  with open(PREPROCESSED_PATH, 'rb') as f:
    inps = pickle.load(f)
else:
  _is_official_df = pd.read_csv('./drive/MyDrive/Dataset/add_col_is_official_shop_2022_07_11_shopee.csv')
  _main_df = pd.read_csv('./drive/MyDrive/Dataset/2022.07.11_shopee_simplified_category.csv')
  joined_df = pd.merge(_main_df, _is_official_df, on='itemid', how='inner')
  sentences = list(joined_df.description.dropna())
  inps = []
  for sent in tqdm(sentences):
    tokens = tokenizer(sent)
    # Keep only real tokens. The sentence markers are filtered by value: the
    # previous `!= '<s>' or != '</s>'` test was always true and relied on slicing
    # off the first and last element. Every empty piece is dropped.
    model_inps = []
    for i in tokens['input_ids']:
      decode_token = tokenizer.decode(i)
      if decode_token not in ('<s>', '</s>', ''):
        model_inps.append(decode_token)
    inps.append(" ".join(model_inps))
  # Store the result so the next run takes the fast path above.
  with open(PREPROCESSED_PATH, 'wb') as f:
    pickle.dump(inps, f)

In [None]:
# Predict `inps` in chunks of len(inps)//20 items and pickle each chunk's
# (predictions, outputs) tuple separately, so all results are never held at once.
total = len(inps)
# max(1, ...) keeps the loop advancing when there are fewer than 20 inputs.
step = max(1, total//20)
for start in range(0, total, step):
  # Exclusive slice bound. The previous code clamped it to total-1, which left the
  # very last input out of every run.
  stop = min(start+step, total)
  # File names keep the established convention, in which a chunk reaching the end
  # is labelled total-1; the cell that reads these files back builds the same names.
  end = stop if stop < total else total-1
  output = model.predict(inps[start:stop], split_on_space=True) #predictions, outputs
  with open('./drive/MyDrive/Dataset/predict_output{}-{}.pickle'.format(start, end), 'wb') as f:
    pickle.dump(output, f)
  del output

In [None]:
def extract_c_t(prediction):
  """Collect the tagged spans and their tags from model predictions.

  Args:
    prediction: Iterable of sentences. Each sentence is a sequence of
      single-entry dicts ``{word: label}``, where the label is ``'O'`` or has the
      form ``'B-<Tag>'`` / ``'I-<Tag>'``.

  Returns:
    ``(claims, tags)``: two lists with one entry per sentence. ``claims[k]`` holds
    the text of every span found in sentence ``k`` (its words concatenated without
    separator) and ``tags[k]`` the matching tag names, in the same order. A
    sentence without any ``B-``/``I-`` label yields two empty lists.

  A ``B-`` label closes the span collected so far and opens a new one; an ``I-``
  label extends the current span. ``O`` labels are skipped and do not close a
  span. The tag reported for a span is the one of its most recent ``B-``/``I-``
  token.
  """
  claims = []
  tags = []
  # Iterate over the argument; the previous version read a global named
  # `predictions` and ignored what the caller passed in.
  for sentence in tqdm(prediction):
    sentence_claims = []
    sentence_tags = []
    span_text = ''
    span_tag = None  # None means no span is open
    for token in sentence:
      word, label = list(token.items())[0]
      prefix, _, tag_name = label.partition('-')
      if prefix == 'B':
        if span_tag is not None:
          sentence_claims.append(span_text)
          sentence_tags.append(span_tag)
        span_text = word
        span_tag = tag_name
      elif prefix == 'I':
        span_text += word
        span_tag = tag_name
    # Close the last span with its own tag. Previously the tag was one token
    # behind here, so a span starting on the final token got a wrong tag or None.
    if span_tag is not None:
      sentence_claims.append(span_text)
      sentence_tags.append(span_tag)
    claims.append(sentence_claims)
    tags.append(sentence_tags)
  return claims, tags

In [None]:
# Number of inputs that were predicted (presumably len(inps) — TODO confirm) and the
# chunk size used when the prediction files were written.
N_INPUTS = 214379
step = N_INPUTS//20
frames = []
for start in range(0, N_INPUTS, step):
  # Same file-name convention as the prediction cell: a chunk reaching the end is
  # labelled N_INPUTS-1.
  end = min(start+step, N_INPUTS-1)
  # NOTE(review): pickle.load can execute arbitrary code; only load files you produced.
  with open('./drive/MyDrive/Dataset/predict_output{}-{}.pickle'.format(start, end), 'rb') as f:
    predictions, _raw_outputs = pickle.load(f)
  del _raw_outputs

  claims, tags = extract_c_t(predictions)
  frames.append(pd.DataFrame(data={'Claims':claims, 'NER_Tags':tags}))

# Concatenate once at the end; growing a frame inside the loop copies it every time.
c_t_df = pd.concat(frames, ignore_index=True)
del frames

In [None]:
# NOTE(review): this deletes the frame built in the previous cell. The export to
# 'shopee-claims_tags_dropna-description.csv' is commented out, so nothing is
# persisted here — confirm the result has been saved before running this.
del c_t_df#.to_csv('./drive/MyDrive/Dataset/shopee-claims_tags_dropna-description.csv')

In [None]:
# NOTE(review): as written, the three `del` statements remove the frames and the last
# line then reads joined_df, which was deleted on the line directly above it — that
# raises NameError. The commented-out tails show the loads/merge these lines once
# performed; decide whether to restore them or to drop the filter below.
del _is_official_df# = pd.read_csv('./drive/MyDrive/Dataset/add_col_is_official_shop_2022_07_11_shopee.csv')
del _main_df# = pd.read_csv('./drive/MyDrive/Dataset/2022.07.11_shopee_simplified_category.csv')
del joined_df# = pd.merge(_main_df, _is_official_df, on='itemid', how='inner')
joined_df = joined_df[joined_df['description'].notna()]

In [None]:
with open('./drive/MyDrive/Dataset/predict_output214360-214378.pickle', 'rb') as f:
  test = pickle.load(f)

In [None]:
len(test)

In [None]:
test[0]