In [1]:
## Libraries
from kashgari.tasks.labeling import BiLSTM_Model
from kashgari.embeddings import BertEmbedding
import numpy as np

In [2]:
## DATA
# SAVE_DATA_clus_tr = "Save/clus_bistlstm_dataset_train.csv"
# SAVE_DATA_clus_ts = "Save/clus_bistlstm_dataset_test.csv"
# SAVE_DATA_clus_dv = "Save/clus_bistlstm_dataset_dev.csv"

SAVE_DATA_noclus_tr = "Save/noclus_bistlstm_dataset_train.csv"
SAVE_DATA_noclus_ts = "Save/noclus_bistlstm_dataset_test.csv"
SAVE_DATA_noclus_dv = "Save/noclus_bistlstm_dataset_dev.csv"

# Read every split into memory as a single string. The `with` blocks close each
# file handle as soon as it has been read instead of leaving it open until the
# garbage collector happens to reclaim it.
with open(SAVE_DATA_noclus_tr) as train_file:
    train = train_file.read()
with open(SAVE_DATA_noclus_ts) as test_file:
    test = test_file.read()
with open(SAVE_DATA_noclus_dv) as dev_file:
    val = dev_file.read()

def file_convert2format(file):
    """Split raw dataset text into parallel token and label sequences.

    Parameters
    ----------
    file : str
        Full text of a dataset. Each line is read as tab-separated fields, the
        first being the token and the second its label. Sentences are
        separated by a line that contains only a tab character.

    Returns
    -------
    tuple
        ``(t_x, t_y)`` where ``t_x[i]`` is the list of tokens of sentence ``i``
        and ``t_y[i]`` is the list of labels aligned with those tokens.

    Raises
    ------
    IndexError
        If a non-blank line has no tab character (i.e. no label field).

    Notes
    -----
    Whitespace-only lines are skipped and sentences left without any token are
    dropped, so empty input, a trailing newline or a trailing sentence
    separator no longer raise ``IndexError``.
    """
    t_x = []
    t_y = []
    for sent in file.split("\n\t\n"):
        tokens = []
        labels = []
        for line in sent.split("\n"):
            # Blank lines carry no token/label pair; indexing them would fail.
            if not line.strip():
                continue
            fields = line.split("\t")
            tokens.append(fields[0])
            labels.append(fields[1])
        # Keep only sentences that actually contain tokens.
        if tokens:
            t_x.append(tokens)
            t_y.append(labels)

    return (t_x, t_y)

In [3]:
# Parse each raw split, in train / test / dev order, into its
# (token sequences, label sequences) pair.
(train_x, train_y), (test_x, test_y), (valid_x, valid_y) = map(
    file_convert2format, (train, test, val)
)

In [4]:
def _as_object_array(sequences):
    """Pack a list of variable-length lists into a 1-D object array.

    Calling ``np.array`` directly on ragged nested lists is deprecated since
    NumPy 1.19 and raises ``ValueError`` from NumPy 1.24. Filling a
    pre-allocated object array element by element always yields one Python
    list per element, even if every sentence happens to have the same length.
    """
    packed = np.empty(len(sequences), dtype=object)
    for position, sequence in enumerate(sequences):
        packed[position] = sequence
    return packed


# Pool all three splits so they can be reshuffled and re-divided later.
total_x = _as_object_array(train_x + test_x + valid_x)
total_y = _as_object_array(train_y + test_y + valid_y)

  """Entry point for launching an IPython kernel.
  


In [60]:
print(len(total_x))
print(len(total_y))

## shuffling the dataset
## A fixed seed makes the split reproducible after a kernel restart. The
## permuted data goes into new names so that re-running this cell does not
## shuffle already-shuffled arrays.
SPLIT_SEED = 42
index = np.random.default_rng(SPLIT_SEED).permutation(len(total_x))
shuffled_x = total_x[index]
shuffled_y = total_y[index]

## dividing the dataset
## The counts have their own names so they do not overwrite `train`, `test`
## and `val`, which hold the raw file contents read earlier.
n_train = int(len(total_x) * 0.75)           ## 75% of all sentences
n_val = int((len(total_x) - n_train) * 0.4)  ## 40% of the remaining 25%
n_test = len(total_x) - n_train - n_val      ## the rest, about 60% of that 25%
print(n_test, n_train, n_val)

## the dataset: [0, n_train) -> train, next n_test -> test, remainder -> validation
## tokens
train_x = list(shuffled_x[:n_train])
test_x = list(shuffled_x[n_train:(n_train + n_test)])
valid_x = list(shuffled_x[(n_train + n_test):])

## labels
train_y = list(shuffled_y[:n_train])
test_y = list(shuffled_y[n_train:(n_train + n_test)])
valid_y = list(shuffled_y[(n_train + n_test):])

## every sentence must land in exactly one split, with tokens and labels aligned
assert len(train_x) + len(test_x) + len(valid_x) == len(total_x)
assert (len(train_y), len(test_y), len(valid_y)) == (len(train_x), len(test_x), len(valid_x))

4108
4108
617 3081 410


In [61]:
## Fine-tune on top of a pretrained BERT embedding for this tagging task.
## Checkpoint folder names noted for this experiment:
##   BERT_Tiny_2_128
##   cased_L-12_H-768_A-12

bert_embed = BertEmbedding('BERTmodels/BERT_Tiny_2_128')

## Read the default hyper-parameters off a bare model, then override a few
model = BiLSTM_Model()
hyper = model.default_hyper_parameters()
hyper['layer_blstm'].update(units=32)
hyper['layer_dropout'].update(rate=0.8)
hyper['layer_activation'].update(activation='softmax')

## Rebuild the model on the BERT embedding with the adjusted settings
model = BiLSTM_Model(
    bert_embed,
    sequence_length=32,
    hyper_parameters=hyper,
)

print(hyper)

{'layer_blstm': {'units': 32, 'return_sequences': True}, 'layer_dropout': {'rate': 0.8}, 'layer_time_distributed': {}, 'layer_activation': {'activation': 'softmax'}}


In [62]:
## training the model with our data for 1000 epochs
# model.fit(train_x, train_y, valid_x, valid_y,epochs=1000)

In [63]:
# Evaluate the model on the held-out test split (tokens `test_x`, labels `test_y`).
# NOTE(review): the `model.fit(...)` call in the previous cell is commented out, so
# on a fresh Restart-and-Run-All `model` reaches this line without being trained
# in this notebook — confirm whether `loaded_model` should be evaluated instead.
model.evaluate(test_x, test_y)

2021-05-05 14:31:49,353 [DEBUG] kashgari - predict seq_length: None, input: (2, 617, 94)




2021-05-05 14:31:50,335 [DEBUG] kashgari - predict output: (617, 94)
2021-05-05 14:31:50,337 [DEBUG] kashgari - predict output argmax: [[0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 ...
 [0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]]



           precision    recall  f1-score   support

    CLAIM     0.1414    0.0892    0.1094       157
      ACT     0.1111    0.0114    0.0206        88

micro avg     0.1389    0.0612    0.0850       245
macro avg     0.1305    0.0612    0.0775       245



{'detail': {'CLAIM': {'precision': 0.1414141414141414,
   'recall': 0.08917197452229299,
   'f1-score': 0.10937499999999999,
   'support': 157},
  'ACT': {'precision': 0.1111111111111111,
   'recall': 0.011363636363636364,
   'f1-score': 0.020618556701030927,
   'support': 88}},
 'precision': 0.1305297876726448,
 'recall': 0.06122448979591836,
 'f1-score': 0.07749513465179886,
 'support': 245}

In [64]:
# Saving is currently disabled; when enabled, the line below targets the
# 'saved_act_claim_model_1000_dropout_0.8_BERTtiny_2' folder.
# model.save('saved_act_claim_model_1000_dropout_0.8_BERTtiny_2')
# Reload the model from that folder (presumably written by an earlier run — verify it exists).
loaded_model = BiLSTM_Model.load_model('saved_act_claim_model_1000_dropout_0.8_BERTtiny_2')

2021-05-05 14:32:09,087 [DEBUG] kashgari - ------------------------------------------------
2021-05-05 14:32:09,088 [DEBUG] kashgari - Loaded transformer model's vocab
2021-05-05 14:32:09,089 [DEBUG] kashgari - config_path       : BERTmodels/BERT_Tiny_2_128/bert_config.json
2021-05-05 14:32:09,090 [DEBUG] kashgari - vocab_path      : BERTmodels/BERT_Tiny_2_128/vocab.txt
2021-05-05 14:32:09,091 [DEBUG] kashgari - checkpoint_path : BERTmodels/BERT_Tiny_2_128/bert_model.ckpt
2021-05-05 14:32:09,091 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused0]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '[unused27]', '[unused28]', '[unused29]', '[unused30]', '[unused31]', '[unused32]', 

In [71]:
# Print one test sentence, then display the tags the reloaded model predicts for it
print(test_x[112])
predicted_tags = loaded_model.predict(test_x[112:113])[0]
" ".join(predicted_tags)

2021-05-05 14:55:33,754 [DEBUG] kashgari - predict seq_length: None, input: (2, 1, 39)


['This', 'sort', 'of', 'fuzzy', 'thinking', 'provides', 'the', 'warm', 'illusion', 'of', 'saving', 'the', 'sky', ',', 'which', 'we', "'re", 'not', 'even', 'sure', 'needs', 'saving', ',', 'but', 'it', 'wo', "n't", 'have', 'any', 'impact', 'at', 'all', 'on', 'global', 'warming', ',', "''"]


2021-05-05 14:55:33,823 [DEBUG] kashgari - predict output: (1, 39)
2021-05-05 14:55:33,824 [DEBUG] kashgari - predict output argmax: [[0 1 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2
  1 1 0]]


'O I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM O I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM O I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM I-CLAIM O O'