In [1]:
import os

import kashgari
from kashgari.tasks.labeling import BiLSTM_Model
from kashgari.embeddings import BertEmbedding

In [4]:
## DATA
# Paths to the train / test / dev files of the "noclus" dataset variant.
# The commented-out "clus" paths are an alternative variant of the same
# splits (presumably a clustered version of the data -- confirm with the
# dataset's author before switching).
# SAVE_DATA_clus_tr = "Save/clus_bistlstm_dataset_train.csv"
# SAVE_DATA_clus_ts = "Save/clus_bistlstm_dataset_test.csv"
# SAVE_DATA_clus_dv = "Save/clus_bistlstm_dataset_dev.csv"

SAVE_DATA_noclus_tr = "Save/noclus_bistlstm_dataset_train.csv"
SAVE_DATA_noclus_ts = "Save/noclus_bistlstm_dataset_test.csv"
SAVE_DATA_noclus_dv = "Save/noclus_bistlstm_dataset_dev.csv"

# Read each split fully into memory as one string. The `with` blocks make sure
# every file handle is closed as soon as its contents have been read.
with open(SAVE_DATA_noclus_tr) as train_file:
    train = train_file.read()
with open(SAVE_DATA_noclus_ts) as test_file:
    test = test_file.read()
with open(SAVE_DATA_noclus_dv) as val_file:
    val = val_file.read()

def file_convert2format(file):
    """Parse tab-separated token/label text into parallel per-sentence lists.

    Every non-blank line is expected to look like ``token<TAB>label``. A line
    that is empty or holds only whitespace (for example a lone tab) closes the
    current sentence. Leading, trailing, or repeated separator lines are
    ignored, so they never produce an empty sentence or an empty-string
    token/label pair.

    Args:
        file: The full contents of a dataset file as a single string.

    Returns:
        A tuple ``(t_x, t_y)`` where ``t_x[i]`` is the list of tokens of
        sentence ``i`` and ``t_y[i]`` is the list of labels aligned with it.

    Raises:
        IndexError: If a non-blank line contains no tab character.
    """
    t_x = []
    t_y = []
    tokens = []
    labels = []
    for raw_line in file.split("\n"):
        # Drop a stray carriage return so labels never end in "\r".
        line = raw_line.rstrip("\r")
        if not line.strip():
            # Separator (or blank) line: flush the sentence collected so far.
            if tokens:
                t_x.append(tokens)
                t_y.append(labels)
                tokens, labels = [], []
            continue
        line_split = line.split("\t")
        tokens.append(line_split[0])
        labels.append(line_split[1])

    # The last sentence has no separator after it when the file ends on data.
    if tokens:
        t_x.append(tokens)
        t_y.append(labels)

    return (t_x, t_y)

In [5]:
# Turn each raw split into per-sentence token lists (x) and label lists (y),
# in the order train, test, dev.
(train_x, train_y), (test_x, test_y), (valid_x, valid_y) = (
    file_convert2format(raw_text) for raw_text in (train, test, val)
)

In [24]:
## Build a BiLSTM labelling model on top of a pretrained BERT embedding
bert_embed = BertEmbedding('BERTmodels/cased_L-12_H-768_A-12')

# Read the default hyper-parameters from the class itself; there is no need to
# construct a throwaway model instance just to query them.
hyper = BiLSTM_Model.default_hyper_parameters()

# Explicit overrides for the layers being tuned. The dict printed at the end
# of this cell shows the values actually in effect.
hyper['layer_blstm']['units'] = 128
hyper['layer_dropout']['rate'] = 0.4
hyper['layer_activation']['activation'] = 'softmax'
model = BiLSTM_Model(bert_embed, sequence_length=100, hyper_parameters=hyper)

print(hyper)

2021-05-05 01:30:29,394 [DEBUG] kashgari - ------------------------------------------------
2021-05-05 01:30:29,395 [DEBUG] kashgari - Loaded transformer model's vocab
2021-05-05 01:30:29,396 [DEBUG] kashgari - config_path       : BERTmodels/cased_L-12_H-768_A-12/bert_config.json
2021-05-05 01:30:29,397 [DEBUG] kashgari - vocab_path      : BERTmodels/cased_L-12_H-768_A-12/vocab.txt
2021-05-05 01:30:29,398 [DEBUG] kashgari - checkpoint_path : BERTmodels/cased_L-12_H-768_A-12/bert_model.ckpt
2021-05-05 01:30:29,399 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '[unused27]', '[unused28]', '[unused29]', '[unused30]', '[unused31]', '[unused3

{'layer_blstm': {'units': 128, 'return_sequences': True}, 'layer_dropout': {'rate': 0.4}, 'layer_time_distributed': {}, 'layer_activation': {'activation': 'softmax'}}


In [25]:
# Train on the training split and validate on the dev split. Batch size and
# epoch count are written out so the training budget is visible in the cell;
# they match the values the logged run used.
model.fit(train_x, train_y, valid_x, valid_y, batch_size=64, epochs=5)

Preparing text vocab dict: 100%|██████████| 3081/3081 [00:00<00:00, 85297.46it/s]
Preparing text vocab dict: 100%|██████████| 410/410 [00:00<00:00, 78241.26it/s]
2021-05-05 01:30:35,628 [DEBUG] kashgari - --- Build vocab dict finished, Total: 3279 ---
2021-05-05 01:30:35,629 [DEBUG] kashgari - Top-10: ['[PAD]', '[UNK]', '[CLS]', '[SEP]', 'the', ',', '.', 'to', 'of', 'and']
Preparing text vocab dict: 100%|██████████| 3081/3081 [00:00<00:00, 123424.33it/s]
Preparing text vocab dict: 100%|██████████| 410/410 [00:00<00:00, 112972.32it/s]
2021-05-05 01:30:35,665 [DEBUG] kashgari - --- Build vocab dict finished, Total: 7 ---
2021-05-05 01:30:35,666 [DEBUG] kashgari - Top-10: ['[PAD]', 'O', 'I-CLAIM', 'I-ACT', 'B-CLAIM', 'B-ACT', '']
2021-05-05 01:30:41,548 [DEBUG] kashgari - fit input shape: (2, 64, 100)
2021-05-05 01:30:41,549 [DEBUG] kashgari - fit input shape: (64, 100)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fa00846eac8>

In [26]:
# Evaluate the trained model on the held-out test split. The call's return
# value is the last expression of the cell, so it is displayed as the output.
model.evaluate(test_x, test_y)

2021-05-05 01:45:24,200 [DEBUG] kashgari - predict seq_length: None, input: (2, 617, 95)




2021-05-05 01:45:57,537 [DEBUG] kashgari - predict output: (617, 95)
2021-05-05 01:45:57,539 [DEBUG] kashgari - predict output argmax: [[0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 ...
 [0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]]



           precision    recall  f1-score   support

    CLAIM     0.0658    0.0581    0.0617        86
      ACT     0.1818    0.0615    0.0920        65

micro avg     0.0918    0.0596    0.0723       151
macro avg     0.1157    0.0596    0.0747       151



{'detail': {'CLAIM': {'precision': 0.06578947368421052,
   'recall': 0.05813953488372093,
   'f1-score': 0.06172839506172839,
   'support': 86},
  'ACT': {'precision': 0.18181818181818182,
   'recall': 0.06153846153846154,
   'f1-score': 0.09195402298850576,
   'support': 65}},
 'precision': 0.11573560632466175,
 'recall': 0.059602649006622516,
 'f1-score': 0.07473942695073853,
 'support': 151}

In [27]:
# Persist the trained model once, then reload it from disk. The save is
# guarded so that a fresh "Restart & Run All" works without manually
# un-commenting a save line, while an already existing folder is reused as-is.
SAVED_MODEL_DIR = 'saved_act_claim_model_1'
if not os.path.isdir(SAVED_MODEL_DIR):
    model.save(SAVED_MODEL_DIR)
loaded_model = BiLSTM_Model.load_model(SAVED_MODEL_DIR)

2021-05-05 05:09:41,962 [DEBUG] kashgari - ------------------------------------------------
2021-05-05 05:09:41,963 [DEBUG] kashgari - Loaded transformer model's vocab
2021-05-05 05:09:41,964 [DEBUG] kashgari - config_path       : BERTmodels/cased_L-12_H-768_A-12/bert_config.json
2021-05-05 05:09:41,965 [DEBUG] kashgari - vocab_path      : BERTmodels/cased_L-12_H-768_A-12/vocab.txt
2021-05-05 05:09:41,965 [DEBUG] kashgari - checkpoint_path : BERTmodels/cased_L-12_H-768_A-12/bert_model.ckpt
2021-05-05 05:09:41,966 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '[unused27]', '[unused28]', '[unused29]', '[unused30]', '[unused31]', '[unused3

In [31]:
# Run the reloaded model on a single test sentence: show its tokens first,
# then predict on a one-element slice so the input stays a list of sentences.
sample_idx = 133
print(test_x[sample_idx])
loaded_model.predict(test_x[sample_idx:sample_idx + 1])

2021-05-05 05:10:51,567 [DEBUG] kashgari - predict seq_length: None, input: (2, 1, 19)


["'", "'", 'This', 'characteristic', 'delusion', 'of', 'imperial', 'power', 'is', 'to', 'confuse', 'global', 'power', 'with', 'global', 'domination', '.']


2021-05-05 05:10:56,355 [DEBUG] kashgari - predict output: (1, 19)
2021-05-05 05:10:56,356 [DEBUG] kashgari - predict output argmax: [[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0]]


[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O']]