# Cài đặt môi trường và import các package

In [None]:
# Mount Google Drive at /content/drive; the project path used by later cells lives under this mount point.
from google.colab import drive  
drive.mount("/content/drive")

In [None]:
# Pinned keras-bert release; provides the BERT loader, tokenizer and layers imported in later cells.
!pip install -q keras-bert==0.85.0
# NOTE(review): keras-rectified-adam is not version-pinned — consider pinning it for reproducibility.
!pip install -q keras-rectified-adam
# Standalone Keras pinned to 2.3.1 (unlike the two installs above, this one is not run with -q).
!pip install keras==2.3.1
# Colab magic selecting the TensorFlow 1.x runtime.
# NOTE(review): presumably has to run before tensorflow is first imported — confirm.
%tensorflow_version 1.x


In [None]:
import pickle
import pandas as pd
# Project root on the mounted Google Drive; later cells append relative
# locations (data files, weights, figures) to this string.
path = "/content/drive/MyDrive/ML/Sentiment-Analysis-using-BERT/"
import os
# Set before keras_radam and keras_bert are imported below.
# NOTE(review): presumably this makes those packages build on tf.keras instead of standalone keras — confirm against their docs.
os.environ['TF_KERAS'] = '1'
import codecs
import tensorflow as tf
import tensorflow.keras as keras
from keras_radam import RAdam
# NOTE(review): the next import (and `l1` further down) comes from standalone `keras`, while the model is built with tensorflow.keras — verify the mix is intended.
from keras import backend as K
from keras_bert import load_trained_model_from_checkpoint
import numpy as np
from keras_bert.layers import Extract
from keras.regularizers import l1

# Tạo hệ thống từ điển

Tải về và giải nén hệ thống từ điển được lưu sẵn trên Google Storage.

In [None]:
# Download the multi_cased_L-12_H-768_A-12 archive from Google Cloud Storage (-q: no progress output).
!wget -q https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
# Extract it into the current working directory; -o overwrites existing files without prompting.
!unzip -o multi_cased_L-12_H-768_A-12.zip

Đọc file *vocab.txt* chứa các từ vựng (token) của BERT; thứ tự dòng của mỗi token trong file chính là chỉ số (id) của token đó trong từ điển.

In [None]:
from keras_bert import Tokenizer

# Locations of the files inside the extracted pre-trained BERT archive.
pretrained_path = 'multi_cased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

# Build the token -> id lookup: every line of vocab.txt holds one token, and
# the token is assigned the dictionary size at the moment it is read.
token_dict = {}
with codecs.open(vocab_path, 'rb', 'utf-8') as vocab_file:
    for vocab_line in vocab_file:
        token_dict[vocab_line.strip()] = len(token_dict)

# Case-sensitive tokenizer backed by that vocabulary.
tokenizer = Tokenizer(token_dict, cased=True)

In [None]:
# Sequence length: passed as max_len to tokenizer.encode and as seq_len when the BERT model is loaded.
SEQ_LEN = 256
# Intended mini-batch size for training.
BATCH_SIZE = 16
# Intended number of training epochs.
EPOCHS = 1
# Learning rate handed to the RAdam optimizer when the model is compiled.
LR = 2e-5

# Load dữ liệu train và dữ liệu test từ file lên

Định nghĩa hàm **load_data** chuyển câu thành vector.

In [None]:
from sklearn.model_selection import train_test_split
def load_data(data, sentiments):
    """Encode raw texts into the two-input format expected by the BERT model.

    Args:
        data: iterable of text strings to encode.
        sentiments: labels aligned with ``data``; may be empty when the
            function is used for inference only.

    Returns:
        A pair ``([token_ids, segment_ids], labels)``. ``token_ids`` is an
        array with one row of token ids per text, ``segment_ids`` is an
        all-zero array of the same shape (every input is a single segment),
        and ``labels`` is ``np.array(sentiments)``.
    """
    # The tokenizer is asked for exactly SEQ_LEN ids per text; only the token
    # ids are kept, the segment ids are rebuilt below as zeros.
    # Converting to an ndarray keeps both model inputs the same type.
    token_ids = np.array(
        [tokenizer.encode(text, max_len=SEQ_LEN)[0] for text in data]
    )
    return [token_ids, np.zeros_like(token_ids)], np.array(sentiments)

Load dữ liệu huấn luyện và kiểm thử từ NTC_SV

In [None]:
# Read the NTC_SV training CSV and discard rows that contain missing values.
df = pd.read_csv(path+"/Data/NTC_SV/NTC_SV_train.csv").dropna()
data = df["review"].tolist()
label = df["label"].tolist()

# Hold out 15% of the samples for validation; the fixed seed keeps the split repeatable.
data_train, data_valid, label_train, label_valid = train_test_split(
    data, label, test_size=0.15, random_state=48
)

# Encode both splits into model inputs and label arrays.
X_train, Y_train = load_data(data_train, label_train)
X_valid, Y_valid = load_data(data_valid, label_valid)

In [None]:
# Read the NTC_SV test CSV, discard incomplete rows, and encode the reviews.
test = pd.read_csv(path+'/Data/NTC_SV/NTC_SV_test.csv').dropna()
data_test = test["review"].tolist()
label_test = test["label"].tolist()
X_test, Y_test = load_data(data_test, label_test)

# Khởi tạo model và train

In [None]:
# Load the pre-trained BERT encoder with trainable weights so it can be fine-tuned.
bert = load_trained_model_from_checkpoint(
    config_path,
    checkpoint_path,
    trainable=True,
    seq_len=SEQ_LEN,
    output_layer_num=4,
)

# Classification head: take position 0 of the encoder output
# (presumably the [CLS] token — confirm), then a 768-unit ReLU layer and a
# single sigmoid unit producing the positive-class probability.
inputs = bert.inputs[:2]
first_position = Extract(index=0)(bert.output)
hidden = keras.layers.Dense(768, activation='relu')(first_position)
outputs = keras.layers.Dense(units=1, activation='sigmoid')(hidden)

model = keras.models.Model(inputs, outputs)
model.compile(
    optimizer=RAdam(learning_rate=LR),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
from keras.callbacks import ModelCheckpoint

# Callback that writes the model to "<path>checkpoint.ckpt" and logs each save.
checkpoint_file = path+"checkpoint.ckpt"
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_file, verbose=1)
callbacks_list = [checkpoint]

In [None]:
# Restore previously saved weights into the classifier before training continues.
base_weights_path = path+"/Data/Base_weight/lastweight"
model.load_weights(base_weights_path)

In [None]:
# Fine-tune the classifier using the shared EPOCHS / BATCH_SIZE settings and
# report validation metrics after every epoch.
# Pass callbacks=callbacks_list as well to write a checkpoint during training.
model.fit(
    X_train,
    Y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
    validation_data=(X_valid, Y_valid),  # Keras documents this argument as a tuple
)

# Kiểm thử

In [None]:
def classify_sentiment(list_text):
  """Label each text as positive or negative using the fine-tuned model.

  Args:
      list_text: list of text strings to classify.

  Returns:
      A list with one entry per input text: "Tích cực" when the predicted
      probability rounds to 1, otherwise "Tiêu cực". An empty input yields
      an empty list.
  """
  if len(list_text) == 0:
    # Nothing to encode or predict.
    return []
  sample, _ = load_data(list_text, [])
  probability = model.predict(sample)
  # Show the raw probabilities a single time rather than once per text.
  print(probability)
  # One rounded value per text; predict returns one row per input.
  rounded = np.round(probability).reshape(-1)
  return ["Tích cực" if value == 1 else "Tiêu cực" for value in rounded]

In [None]:
# Quick smoke test of the classifier on a single review.
sample_reviews = ["thức ăn cực kì ngon"]
print(classify_sentiment(sample_reviews))

In [None]:
# Persist the weights obtained after this training run.
one_epoch_weights_path = path+"/Data/OneEpoch/lastweight"
model.save_weights(one_epoch_weights_path)

In [None]:
from sklearn.metrics import precision_score,recall_score,f1_score

# Round the predicted probabilities on the test set to hard 0/1 labels.
y_pred = np.round(model.predict(X_test))

# Print precision, recall and F1 in that order, one value per line.
for score_fn in (precision_score, recall_score, f1_score):
    print(score_fn(Y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import itertools

# Use a larger default font for every figure drawn from here on.
matplotlib.rcParams['font.size'] = 16

# Confusion matrix on the test set, with rows/columns ordered as class 0 then class 1.
labels = [0, 1]
cn = confusion_matrix(Y_test, y_pred, labels=labels)

In [None]:
#@title plot confusion matrix
def plot_confusion_matrix(cm,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True,
                          target_names=None,
                          path_file='1.svg'):
    """Draw a confusion matrix as a heat map, save it as SVG and display it.

    Args:
        cm: 2-D array of counts; rows are drawn on the "True label" axis and
            columns on the "Predicted label" axis.
        title: figure title.
        cmap: matplotlib colormap; the 'Blues' colormap is used when None.
        normalize: when True, each row is divided by its sum and the cells
            show fractions with four decimals; otherwise raw values are shown.
        target_names: optional tick labels for both axes.
        path_file: destination of the saved SVG file.
    """
    cm = np.asarray(cm)

    if normalize:
        # Normalise before drawing so cell colours, cell text and the text
        # colour threshold all refer to the same values. Rows that sum to
        # zero stay at 0 instead of turning into NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names)
        plt.yticks(tick_marks, target_names)

    # Cells above the threshold get white text, the others black text.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass after the axis labels exist so they are accounted for.
    plt.tight_layout()
    plt.savefig(path_file, format='svg')
    plt.show()

In [None]:
# Plot the raw-count confusion matrix of the test set and save the figure to Drive.
plot_confusion_matrix(
    cn,
    title='Data vreview',
    normalize=False,
    target_names=labels,
    path_file=path+'/Data/Base_weight/bert_base_vreview_cm.svg',
)

In [None]:
# Print the misclassified reviews among the first 1000 test samples: the
# predicted label, the true label, then the review text. The range is capped
# at the test-set size so a smaller test set does not raise IndexError.
for i in range(min(1000, len(data_test))):
  if y_pred[i]!=Y_test[i]:
    print(y_pred[i],Y_test[i])
    print(data_test[i])