In [1]:
## Libraries
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari.embeddings import BertEmbedding
import numpy as np

In [2]:
## DATA
SAVE_DATA_noclus_tr = "Save/noclus_bistlstm_dataset_train.csv"
SAVE_DATA_noclus_ts = "Save/noclus_bistlstm_dataset_test.csv"
SAVE_DATA_noclus_dv = "Save/noclus_bistlstm_dataset_dev.csv"

train = open(SAVE_DATA_noclus_tr).read()
test = open(SAVE_DATA_noclus_ts).read()
val = open(SAVE_DATA_noclus_dv).read()

def file_convert2format(file):
    split_file = file.split("\n\t\n")
    t_x = []
    t_y = []
    for sent in split_file:
        t_x.append([])
        t_y.append([])
        sent_split = sent.split("\n")
        for lines in sent_split:
            line_split = lines.split("\t")
            t_x[-1].append(line_split[0])
            t_y[-1].append(line_split[1])
            
    return(t_x,t_y)

In [3]:
train_x, train_y = file_convert2format(train)
test_x, test_y = file_convert2format(test)
valid_x, valid_y = file_convert2format(val)

In [4]:
total_x = np.array(list(train_x + test_x + valid_x))
total_y = np.array(list(train_y + test_y + valid_y))

  """Entry point for launching an IPython kernel.
  


In [24]:
print(len(total_x))
print(len(total_y))
index = np.array(list(range(len(total_x))))    

## shuffling the dataset
np.random.shuffle(index)
total_x = total_x[index]
total_y = total_y[index]

## dividing the dataset
train = int(len(total_x)*0.75)          ## 75% of the 100%
val = int((len(total_x) - train)*0.4)  ## 40% of the 30%
test = (len(total_x)-train-val)        ## 60% of the 30%
print(test,train,val)

## the dataset
## tokens
train_x = list(total_x[:train])
test_x = list(total_x[train:(train+test)])
valid_x = list(total_x[(train+test):])

## labels
train_y = list(total_y[:train])
test_y = list(total_y[train:(train+test)])
valid_y = list(total_y[(train+test):])


4108
4108
617 3081 410


In [25]:
## using pretrained embeddings to fine tune for task
## BERT_Tiny_2_128
## cased_L-12_H-768_A-12

bert_embed = BertEmbedding('BERTmodels/BERT_Tiny_2_128')
model = BiLSTM_CRF_Model()
hyper = model.default_hyper_parameters()

hyper['layer_blstm']['units'] = 32
hyper['layer_dropout']['rate'] = 0.8
hyper['layer_activation']['activation'] = 'softmax'
model = BiLSTM_CRF_Model(bert_embed, sequence_length=50,hyper_parameters=hyper)

print(hyper)

In [26]:
## Model training
# model.fit(train_x, train_y, valid_x, valid_y,epochs=1000)

In [27]:
# Evaluate the model
model.evaluate(test_x, test_y)

2021-05-05 14:36:39,032 [DEBUG] kashgari - predict seq_length: None, input: (2, 617, 95)




2021-05-05 14:36:40,186 [DEBUG] kashgari - predict output: (617, 95)
2021-05-05 14:36:40,188 [DEBUG] kashgari - predict output argmax: [[0 5 3 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 ...
 [0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]]



           precision    recall  f1-score   support

    CLAIM     0.5410    0.4371    0.4835       151
      ACT     0.5432    0.4490    0.4916        98

micro avg     0.5419    0.4418    0.4867       249
macro avg     0.5419    0.4418    0.4867       249



{'detail': {'CLAIM': {'precision': 0.5409836065573771,
   'recall': 0.4370860927152318,
   'f1-score': 0.4835164835164836,
   'support': 151},
  'ACT': {'precision': 0.5432098765432098,
   'recall': 0.4489795918367347,
   'f1-score': 0.49162011173184356,
   'support': 98}},
 'precision': 0.5418598092024036,
 'recall': 0.44176706827309237,
 'f1-score': 0.4867058632960229,
 'support': 249}

In [28]:
# Model data will save to 'saved_act_claim_model' folder
# model.save('saved_act_claim_model_1000_BERTtiny_bilstmcrf_2')
loaded_model = BiLSTM_CRF_Model.load_model('saved_act_claim_model_1000_BERTtiny_bilstmcrf_2')

2021-05-05 14:39:45,654 [DEBUG] kashgari - ------------------------------------------------
2021-05-05 14:39:45,655 [DEBUG] kashgari - Loaded transformer model's vocab
2021-05-05 14:39:45,656 [DEBUG] kashgari - config_path       : BERTmodels/BERT_Tiny_2_128/bert_config.json
2021-05-05 14:39:45,657 [DEBUG] kashgari - vocab_path      : BERTmodels/BERT_Tiny_2_128/vocab.txt
2021-05-05 14:39:45,658 [DEBUG] kashgari - checkpoint_path : BERTmodels/BERT_Tiny_2_128/bert_model.ckpt
2021-05-05 14:39:45,659 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused0]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '[unused27]', '[unused28]', '[unused29]', '[unused30]', '[unused31]', '[unused32]', 

In [35]:
# Load predict model
print((" ").join(test_x[15]))
(" ").join((loaded_model.predict(test_x[15:16]))[0])

2021-05-05 14:55:08,715 [DEBUG] kashgari - predict seq_length: None, input: (2, 1, 57)


The memo had its effect last Friday , when Dr. Watson lost his bid for re - election after the administration threw its weight behind the ' ' let 's drag our feet ' ' candidate , Dr. Rajendra Pachauri of New Delhi , who is known for his virulent anti - American statements .


2021-05-05 14:55:08,788 [DEBUG] kashgari - predict output: (1, 57)
2021-05-05 14:55:08,790 [DEBUG] kashgari - predict output argmax: [[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 2 2 2
  2 1 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 0]]


'O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM O I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM O O O O O O O O O O O O'