In [0]:
# Mount Google Drive at /content/drive; the Linux branch of the path configuration
# in this notebook points below '/content/drive/My Drive/colab/'.
from google.colab import drive
drive.mount('/content/drive')


### global config


In [0]:

import os
import platform
import datetime,pytz

# Project root: on Linux the Google Drive mount is used, otherwise a local checkout.
root_ = '/content/drive/My Drive/colab/' if platform.system() == 'Linux' else '/Users/love/Code/'
_research_ = '' if platform.system() == 'Linux' else 'research'


def _ensure_dir(path):
    """Create ``path`` (including missing parents) when it does not exist yet and return it."""
    # exist_ok avoids the check-then-create race of `if not exists: makedirs`.
    os.makedirs(path, exist_ok=True)
    return path


# Base directories (presumably "ro" = read-only inputs, "rw" = generated artefacts — TODO confirm).
rod_ = _ensure_dir(os.path.join(root_, _research_, 'ro-database'))
rwd_ = _ensure_dir(os.path.join(root_, _research_, 'rw-database'))
log_ = _ensure_dir(os.path.join(root_, _research_, 'log'))

word2idx_path_extend_ = os.path.join(rod_, 'bert_word2idx_extend.json')

corpus_dir_ = _ensure_dir(os.path.join(rwd_, 'corpus_dir'))

#bert.model.epoch.4
bert_model_ = os.path.join(rod_, "bert_model")

# train model
sentiment_model_ = _ensure_dir(os.path.join(rwd_, "sentiment_model"))

# One timestamp (Asia/Bangkok, minute resolution) shared by every per-run log file, so the
# pickle and the Excel report of the same run always carry the same prefix.
_run_stamp_ = datetime.datetime.now(tz=pytz.timezone('Asia/Bangkok')).strftime('%Y-%m-%d %H-%M')

sentiment_train_log_ = os.path.join(log_, 'df_train_log.pickle')
sentiment_test_log_ = os.path.join(log_, _run_stamp_ + '-df_test_log.pickle')

sentiment_test_excel_log_ = os.path.join(log_, _run_stamp_ + '.xlsx')

# Output folders kept below the log directory.
best_model_ = _ensure_dir(os.path.join(log_, 'best_model'))
best_model_script_ = _ensure_dir(os.path.join(log_, 'best_model_script'))
best_excel_ = _ensure_dir(os.path.join(log_, 'best_excel'))
best_df_ = _ensure_dir(os.path.join(log_, 'best_df'))

# Raw corpus CSV files.
corpus_weibo_ = os.path.join(rod_, 'weibo_senti_100k.csv')
corpus_online_shopping_ = os.path.join(rod_, 'online_shopping_10_cats.csv')
corpus_online_chnsenticorp_ = os.path.join(rod_, 'chnsenticorp.csv')


### util


In [0]:

import json
from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt
import openpyxl
import numpy as np
from sklearn.utils import shuffle
import os
import matplotlib.pyplot as plt
from openpyxl import Workbook
from openpyxl.styles import Color, PatternFill, Font, Border
from openpyxl.styles import colors
from openpyxl.cell import Cell


def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current matplotlib figure.

    Create or size the figure before calling (e.g. ``plt.subplots(figsize=(22, 7))``);
    this function neither creates a figure nor calls ``plt.show()``.

    :param cm: 2-D numpy array; rows are plotted on the y axis ("True label") and columns
        on the x axis ("Predicted label").
    :param classes: tick labels, one per row/column of ``cm``.
    :param normalize: when True every row is divided by its sum before plotting, so both
        the colours and the cell texts show per-row fractions.
    :param title: title of the plot.
    :param cmap: matplotlib colour map used for the heat map.
    """
    if normalize:
        # Normalise BEFORE drawing so that the image and the annotations show the same values.
        # A row whose sum is 0 yields NaN entries (numpy division by zero).
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Fractions are shortened to two decimals; raw values are printed unchanged.
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called last so the axis labels are taken into account when the layout is computed.
    plt.tight_layout()


def calculateDictLen(path):
    """Return ``len()`` of the JSON document stored in the UTF-8 file at ``path``."""
    with open(path, mode='r', encoding='utf8') as handle:
        return len(json.load(handle))


def writeExcel(path, sheetName, data, label, predict, all_store=False):
    """Add a sheet with (mis)classified rows to the workbook at ``path`` and save it.

    The workbook is loaded when ``path`` exists and created otherwise; the new sheet is
    inserted at position 0.

    :param path: location of the .xlsx file.
    :param sheetName: title of the new sheet (converted with ``str``).
    :param data: indexable collection of rows; ``data[0]`` is written as the header row.
    :param label: index/key of the true label inside a row.
    :param predict: index/key of the predicted label inside a row.
    :param all_store: when True every row is written, otherwise only rows whose label
        and prediction differ. Differing rows are always filled red.
    """
    workbook = openpyxl.load_workbook(path) if os.path.exists(path) else openpyxl.Workbook()
    worksheet = workbook.create_sheet(title=str(sheetName), index=0)

    # Header goes to the first spreadsheet row (cells are 1-based).
    for column, heading in enumerate(data[0], start=1):
        worksheet.cell(1, column, heading)

    row_count = len(data)
    highlight = PatternFill("solid", fgColor="FF0000")
    next_row = 2
    for position in range(1, row_count):
        record = data[position]
        if not (all_store or (record[label] != record[predict])):
            continue
        for column, value in enumerate(record, start=1):
            if record[label] != record[predict]:
                worksheet.cell(next_row, column).fill = highlight
            worksheet.cell(next_row, column, value)
        next_row += 1
    workbook.save(filename=path)


def _write_samples(path, samples):
    """Write ``str(sample)`` for every sample to ``path``, one per line, without a trailing newline."""
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(str(sample) for sample in samples))


def getTrainTestData(dataset, index, test_proportion=0.2, sentence_len=110):
    """Build one train/test fold of a sentiment corpus and store it as train.json / test.json.

    ``positive.txt`` and ``negative.txt`` are read from ``<rod_>/corpus_dir/<dataset>/``
    (one sample per line), labelled 1 / 0, shuffled with a fixed seed and split. The two
    parts are written to ``<rwd_>/corpus_dir/<dataset>/`` as one Python dict repr per line
    and are also returned.

    :param dataset: name of the corpus sub-directory.
    :param index: number of the fold used as test set; the test slice is
        ``[fold_size * index, fold_size * (index + 1))`` of the shuffled data.
    :param test_proportion: fraction of all samples that forms one fold.
    :param sentence_len: length used only for the printed "shorter than" statistic.
    :return: ``(train_data, test_data)``, lists of ``{"text": ..., "label": ...}`` dicts.
    """
    corpus_positive = os.path.join(rod_, 'corpus_dir', dataset, 'positive.txt')
    corpus_negative = os.path.join(rod_, 'corpus_dir', dataset, 'negative.txt')

    with open(corpus_negative, "r", encoding="utf-8") as f:
        all_negative = [line.strip() for line in f]
    with open(corpus_positive, "r", encoding="utf-8") as f:
        all_positive = [line.strip() for line in f]

    # Character length of every sample, used for the statistics below.
    all_length = [len(text) for text in all_negative] + [len(text) for text in all_positive]

    print('positive: ', len(all_positive))
    print('negative: ', len(all_negative))
    print('all: ', len(all_length))

    plt.hist(all_length, bins=30)
    plt.show()

    print('sentence length ration <', sentence_len, np.mean(np.array(all_length) < sentence_len))

    # Every sample becomes {"text": text, "label": class}; positives first, then negatives.
    all_data = [{"text": text, "label": 1} for text in all_positive]
    all_data += [{"text": text, "label": 0} for text in all_negative]

    # Fixed seed: every call sees the same order, so different `index` values give disjoint test folds.
    all_data = shuffle(all_data, random_state=666)

    fold_size = int(len(all_data) * test_proportion)
    fold_start, fold_end = fold_size * index, fold_size * (index + 1)

    train_data = all_data[:fold_start] + all_data[fold_end:]
    test_data = all_data[fold_start:fold_end]

    # Both files live in the same directory.
    out_dir = os.path.join(rwd_, 'corpus_dir', dataset)
    os.makedirs(out_dir, exist_ok=True)
    _write_samples(os.path.join(out_dir, 'train.json'), train_data)
    _write_samples(os.path.join(out_dir, 'test.json'), test_data)

    # The original printed the training size under the label 'total test:'.
    print('total train:', len(train_data))
    print('total test:', len(test_data))
    return train_data, test_data




### metrics


In [0]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score


def find_best_threshold(all_predictions, all_labels):
    """Pick the decision threshold with the highest F1 and report metrics at that threshold.

    Both inputs are flattened. The 100 candidate thresholds 0.00, 0.01, ..., 0.99 are tried;
    a score counts as positive when it is ``>=`` the threshold. On ties the smallest
    threshold wins (first maximum).

    :param all_predictions: predicted scores (presumably probabilities in [0, 1] — TODO confirm).
    :param all_labels: ground-truth labels, same number of elements as the predictions.
    :return: ``(accuracy, precision, recall, f1, best_threshold)``.
    """
    scores = np.ravel(all_predictions)
    targets = np.ravel(all_labels)

    candidates = [step / 100 for step in range(100)]
    # F1 obtained with each candidate threshold, in candidate order.
    f1_per_candidate = [
        f1_score(y_true=targets, y_pred=(scores >= cut).astype("int"))
        for cut in candidates
    ]
    best_threshold = candidates[int(np.argmax(np.array(f1_per_candidate)))]

    # Final hard decisions and their metrics.
    decisions = (scores >= best_threshold).astype("int")
    ac = accuracy_score(targets, decisions)
    pc = precision_score(targets, decisions)
    rc = recall_score(targets, decisions)
    f1 = f1_score(targets, decisions)
    return ac, pc, rc, f1, best_threshold


### sentiment dataset


In [0]:
from torch.utils.data import Dataset
import tqdm
import json
import torch
import random
import numpy as np
from sklearn.utils import shuffle
import re


class CLSDataset(Dataset):
    """Character-level classification dataset read from a file with one dict literal per line.

    Every line of ``corpus_path`` must be the repr of a dict with the keys ``"text"`` and
    ``"label"``. All lines are loaded into memory and shuffled once on construction.

    :param corpus_path: path of the corpus file (UTF-8).
    :param word2idx: mapping from a character to its vocabulary index.
    :param max_seq_len: maximum number of token ids returned per sample.
    """

    def __init__(self, corpus_path, word2idx, max_seq_len):

        self.word2idx = word2idx
        # define max length
        self.max_seq_len = max_seq_len
        # directory of corpus dataset
        self.corpus_path = corpus_path
        # define special symbols
        self.pad_index = 0
        self.unk_index = 1
        self.cls_index = 2
        self.sep_index = 3
        self.mask_index = 4
        self.num_index = 5

        # load corpus: the whole file is parsed into memory
        with open(corpus_path, "r", encoding="utf-8") as f:
            self.lines = [self._parse_line(line) for line in tqdm.tqdm(f, desc="Loading Dataset")]
        # shuffle (no fixed seed, so the order differs between runs)
        self.lines = shuffle(self.lines)
        print(corpus_path, 'length:', len(self.lines))
        # Get data length
        self.corpus_lines = len(self.lines)

    @staticmethod
    def _parse_line(line):
        """Parse one corpus line (a Python literal) without executing arbitrary code."""
        # Local import keeps this class self-contained; literal_eval accepts only literals,
        # unlike eval(), which would run any expression found in the corpus file.
        import ast
        return ast.literal_eval(line.strip())

    def __len__(self):
        return self.corpus_lines

    def __getitem__(self, item):
        """Return ``{"text_input": LongTensor of ids, "label": tensor([label]), "text": str}``."""
        # Get the raw text and the corresponding sentiment classification
        text, label = self.get_text_and_label(item)

        text_input = self.tokenize_char(text)

        # Add #CLS# and #SEP# special tokens
        text_input = [self.cls_index] + text_input + [self.sep_index]
        # Truncate to max_seq_len.
        # NOTE(review): for over-long texts this cuts off the trailing #SEP# token — confirm this is intended.
        text_input = text_input[:self.max_seq_len]

        output = {"text_input": torch.tensor(text_input),
                  "label": torch.tensor([label]), "text": text}
        return output

    def get_text_and_label(self, item):
        """Return the ``(text, label)`` pair stored at position ``item``."""
        record = self.lines[item]
        return record["text"], record["label"]

    def tokenize_char(self, segments):
        """Map every character of ``segments`` to its id; unknown characters map to ``unk_index``."""
        return [self.word2idx.get(char, self.unk_index) for char in segments]



### bert base model


In [0]:

from __future__ import absolute_import, division, print_function, unicode_literals
import copy
import math
import sys
from io import open
import torch
from torch import nn
from torch.nn import CrossEntropyLoss


def gelu(x):
    """Exact (erf-based) Gaussian Error Linear Unit: ``x * Phi(x)``.

    OpenAI GPT uses a tanh approximation that gives slightly different results:
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    See https://arxiv.org/abs/1606.08415
    """
    half_input = x * 0.5
    erf_term = 1.0 + torch.erf(x / math.sqrt(2.0))
    return half_input * erf_term


# Maps an activation name (the value of a config's `hidden_act`) to the callable that applies it.
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu}


class BertConfig(object):
    """Plain container for the hyper-parameters of a `BertModel`.

    Every constructor argument is stored unchanged as an attribute of the same name.

    :param vocab_size: dictionary length (number of rows of the word embedding table).
    :param hidden_size: hidden layer / embedding width.
    :param num_hidden_layers: number of transformer blocks.
    :param num_attention_heads: attention heads per block.
    :param intermediate_size: width of the linear mapping inside the feed-forward layer.
    :param hidden_act: name of the activation function (a key of ``ACT2FN``).
    :param hidden_dropout_prob: dropout probability for hidden states.
    :param attention_probs_dropout_prob: dropout probability for the attention matrix.
    :param max_position_embeddings: stored as given; not read in the visible part of this file.
    :param type_vocab_size: number of token types (used for next-sentence prediction).
    :param initializer_range: standard deviation used to initialise model parameters.
    :param layer: stored as given; not read in the visible part of this file.
    """

    def __init__(self, vocab_size, hidden_size=384, num_hidden_layers=6, num_attention_heads=12,
                 intermediate_size=384 * 4, hidden_act="gelu", hidden_dropout_prob=0.4,
                 attention_probs_dropout_prob=0.4, max_position_embeddings=512 * 2,
                 type_vocab_size=2, initializer_range=0.02, layer=3):
        # Model size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # Feed-forward sub-layer
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        # Regularisation
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        # Remaining settings
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer = layer


class BertEmbeddings(nn.Module):
    """
    Sum of word, externally supplied positional, and token-type embeddings,
    followed by LayerNorm and dropout.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        tables = (self.word_embeddings, self.token_type_embeddings)

        # Orthogonal initialisation of both lookup tables (word table first)
        for table in tables:
            nn.init.orthogonal_(table.weight)

        # Scale every row of both tables to (almost) unit L2 norm; epsilon guards against division by zero
        epsilon = 1e-8
        for table in tables:
            row_norms = torch.norm(table.weight, p=2, dim=1, keepdim=True).data
            table.weight.data = table.weight.data.div(row_norms + epsilon)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, positional_enc, token_type_ids=None):
        """
        :param input_ids: dimension [batch_size, sequence_length]
        :param positional_enc: positional encoding [sequence_length, embedding_dimension]
        :param token_type_ids: During BERT training, the first sentence is 0 and the second sentence is 1;
            all zeros when omitted
        :return: dimension [batch_size, sequence_length, embedding_dimension]
        """
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # Table lookups, then element-wise sum (positional_enc is broadcast over the batch)
        summed = (self.word_embeddings(input_ids)
                  + positional_enc
                  + self.token_type_embeddings(token_type_ids))
        return self.dropout(self.LayerNorm(summed))


class BertSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention (without output projection)."""

    def __init__(self, config):
        super().__init__()
        # The embedding dimension must split evenly across the heads
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Linear projections producing Q, K and V
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """Split the last dimension into heads.

        [batch_size, seq_length, embedding_dim] -> [batch_size, num_heads, seq_length, embedding_dim / num_heads]
        """
        per_head = x.view(*x.shape[:-1], self.num_attention_heads, self.attention_head_size)
        return per_head.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask, get_attention_matrices=False):
        """Return ``(context, attention)``.

        ``context`` is [batch_size, seq_length, all_head_size]. ``attention`` is the softmax
        output before dropout, [batch_size, num_heads, seq_length, seq_length], or None
        unless ``get_attention_matrices`` is True. ``attention_mask`` is added to the raw
        scores (broadcast from [batch_size, 1, 1, seq_length]).
        """
        # Project, then split into heads: [batch_size, num_heads, seq_length, head_size]
        queries = self.transpose_for_scores(self.query(hidden_states))
        keys = self.transpose_for_scores(self.key(hidden_states))
        values = self.transpose_for_scores(self.value(hidden_states))

        # Raw scores Q.K^T scaled by sqrt(head_size): [batch_size, num_heads, seq_length, seq_length]
        scores = torch.matmul(queries, keys.transpose(-1, -2))
        scores = scores / math.sqrt(self.attention_head_size)
        # Additive mask (precomputed in BertModel.forward)
        scores = scores + attention_mask

        # Normalise the scores to probabilities
        probs_before_dropout = nn.functional.softmax(scores, dim=-1)

        # This drops entire tokens to attend to, which might seem unusual,
        # but is taken from the original Transformer paper.
        probs = self.dropout(probs_before_dropout)

        # Weight V and merge the heads back: [batch_size, seq_length, all_head_size]
        context = torch.matmul(probs, values).permute(0, 2, 1, 3).contiguous()
        context = context.view(*context.shape[:-2], self.all_head_size)

        # The un-dropped attention matrix is only handed out on request (for visualisation)
        return context, (probs_before_dropout if get_attention_matrices else None)


class BertLayerNorm(nn.Module):
    """Layer normalisation over the last dimension, TF style (epsilon inside the square root)."""

    def __init__(self, hidden_size, eps=1e-12):
        """Create a learnable scale (ones) and shift (zeros) of length ``hidden_size``."""
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        # Biased variance of the last dimension
        variance = (x - mean).pow(2).mean(-1, keepdim=True)
        normalized = (x - mean) / torch.sqrt(variance + self.variance_epsilon)
        return self.weight * normalized + self.bias


class BertSelfOutput(nn.Module):
    """Post-processing of the self-attention output: dense -> dropout -> residual add -> LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """``input_tensor`` is the residual branch added before normalisation."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)


class BertAttention(nn.Module):
    """Complete attention sub-layer: multi-head self-attention plus its residual/LayerNorm output stage."""

    def __init__(self, config):
        super().__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask, get_attention_matrices=False):
        """Return ``(attention_output, attention_matrices)``; the matrices are passed through from ``self.self``."""
        context, attention_matrices = self.self(
            input_tensor, attention_mask, get_attention_matrices=get_attention_matrices)
        # The un-attended input is the residual branch
        return self.output(context, input_tensor), attention_matrices


class BertIntermediate(nn.Module):
    """First half of the feed-forward sub-layer: expansion to ``intermediate_size`` plus activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # Activation looked up by name
        self.intermediate_act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states):
        return self.intermediate_act_fn(self.dense(hidden_states))


class BertOutput(nn.Module):
    """Second half of the feed-forward sub-layer: projection back to ``hidden_size``, dropout, residual add, LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """``input_tensor`` is the residual branch added before normalisation."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)


class BertLayer(nn.Module):
    """One transformer block: attention sub-layer followed by the feed-forward sub-layer."""

    def __init__(self, config):
        super().__init__()
        self.attention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask, get_attention_matrices=False):
        """Return ``(layer_output, attention_matrices)``."""
        # Attention (its LayerNorm and residual connection are inside)
        attended, attention_matrices = self.attention(
            hidden_states, attention_mask, get_attention_matrices=get_attention_matrices)
        # Feed-forward; the attention output doubles as the residual branch
        expanded = self.intermediate(attended)
        return self.output(expanded, attended), attention_matrices


class BertEncoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` transformer blocks applied one after another."""

    def __init__(self, config):
        super(BertEncoder, self).__init__()
        layer = BertLayer(config)
        # Copy N transformer blocks (deep copies of one freshly built block)
        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, get_attention_matrices=False):
        """Run all blocks and collect their outputs.

        :param hidden_states: input of the first block.
        :param attention_mask: passed unchanged to every block.
        :param output_all_encoded_layers: when True the output of every block is collected,
            otherwise only the output of the last block.
        :param get_attention_matrices: forwarded to every block.
        :return: ``(all_encoder_layers, all_attention_matrices)``, two lists of equal length.
            With an empty stack and ``output_all_encoded_layers=False`` the lists are
            ``[hidden_states]`` and ``[None]``.
        """
        all_attention_matrices = []
        all_encoder_layers = []
        # Defined up front so that an empty layer stack cannot leave the name unbound below.
        attention_matrices = None
        for layer_module in self.layer:
            hidden_states, attention_matrices = layer_module(hidden_states, attention_mask, get_attention_matrices=get_attention_matrices)
            if output_all_encoded_layers:
                all_encoder_layers.append(hidden_states)
                all_attention_matrices.append(attention_matrices)
        if not output_all_encoded_layers:
            # Only the final state is reported
            all_encoder_layers.append(hidden_states)
            all_attention_matrices.append(attention_matrices)
        return all_encoder_layers, all_attention_matrices


class BertPooler(nn.Module):
    """Sentence-level summary: dense + tanh applied to the hidden state of the first token."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # "Pooling" simply means picking position 0 of the sequence dimension.
        leading_token = hidden_states[:, 0]
        return self.activation(self.dense(leading_token))


# Linear mapping, activation, LayerNorm
class BertPredictionHeadTransform(nn.Module):
    """Dense layer, activation and LayerNorm applied before the vocabulary decoder."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Activation looked up by name
        self.transform_act_fn = ACT2FN[config.hidden_act]
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

    def forward(self, hidden_states):
        activated = self.transform_act_fn(self.dense(hidden_states))
        return self.LayerNorm(activated)


class BertLMPredictionHead(nn.Module):
    """Maps hidden states to vocabulary scores with a decoder tied to the embedding matrix."""

    def __init__(self, config, bert_model_embedding_weights):
        """``bert_model_embedding_weights`` is the [vocab_size, hidden_size] parameter reused as decoder weight."""
        super().__init__()
        # Linear mapping, activation, LayerNorm
        self.transform = BertPredictionHeadTransform(config)

        vocab_size = bert_model_embedding_weights.size(0)
        embedding_dim = bert_model_embedding_weights.size(1)

        # The decoder shares its weight with the input embeddings; only the
        # per-token output bias below is specific to this head.
        self.decoder = nn.Linear(embedding_dim, vocab_size, bias=False)
        self.decoder.weight = bert_model_embedding_weights
        self.bias = nn.Parameter(torch.zeros(vocab_size))

    def forward(self, hidden_states):
        transformed = self.transform(hidden_states)
        return self.decoder(transformed) + self.bias


class BertPreTrainingHeads(nn.Module):
    """Both pre-training heads: token prediction and 2-way sentence relationship."""

    def __init__(self, config, bert_model_embedding_weights):
        super().__init__()

        # Maps [batch_size, seq_len, embed_dim] from the transformer blocks to [batch_size, seq_len, vocab_size]
        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
        # Maps pooled_output (the vector corresponding to #CLS#) to 2 categories,
        # used to predict Next Sentence
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        """Return ``(prediction_scores, seq_relationship_score)``."""
        token_scores = self.predictions(sequence_output)
        relationship_scores = self.seq_relationship(pooled_output)
        return token_scores, relationship_scores


# Used to initialize the model parameters
class BertPreTrainedModel(nn.Module):
    """
    Abstract base class that stores the configuration and provides the weight
    initialisation shared by the BERT models.
    """

    def __init__(self, config, *inputs, **kwargs):
        """Store ``config``; raises ValueError when it is not a `BertConfig`."""
        super().__init__()
        if not isinstance(config, BertConfig):
            class_name = self.__class__.__name__
            raise ValueError(
                "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
                "To create a model from a Google pretrained model use "
                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(class_name, class_name))
        self.config = config

    def init_bert_weights(self, module):
        """ Initialize the weights of a single sub-module (meant to be used with ``nn.Module.apply``).

        Linear: weight ~ N(0, initializer_range), bias (if any) zero.
        BertLayerNorm: weight one, bias zero. Other modules are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, BertLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


class BertModel(BertPreTrainedModel):
    """BERT model ("Bidirectional Embedding Representations from a Transformer").

    Params:
        config: a BertConfig class instance with the configuration to build a new model

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the token indices in the vocabulary; index 0 is treated as padding
            when no `attention_mask` is given.
        `positional_enc`: positional encoding that is added to the word embeddings,
            [sequence_length, hidden_size].
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details). Defaults to all zeros.
        `attention_mask`: an optional tensor of shape [batch_size, sequence_length] with 1 for
            positions to attend to and 0 for masked positions. Defaults to `input_ids > 0`.
        `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers`
            output as described below. Default: `True`.
        `get_attention_matrices`: when True, ONLY the list of attention matrices collected by
            the encoder is returned (see below). Default: `False`.

    Outputs:
        If `get_attention_matrices` is True: the list of attention matrices returned by the encoder.
        Otherwise a tuple of (encoded_layers, pooled_output):
        `encoded_layers`: controlled by the `output_all_encoded_layers` argument:
            - `output_all_encoded_layers=True`: a list with the full sequence of hidden states
                at the end of each transformer block, each a tensor of size
                [batch_size, sequence_length, hidden_size],
            - `output_all_encoded_layers=False`: only the hidden states of the last block,
                of shape [batch_size, sequence_length, hidden_size],
        `pooled_output`: a tensor of size [batch_size, hidden_size], the pooler output
            computed from the hidden state of the first token of the last block.

    Example usage:
    ```python
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    positional_enc = torch.zeros(3, 384)  # placeholder positional encoding

    config = BertConfig(vocab_size=32000)
    model = BertModel(config=config)
    all_encoder_layers, pooled_output = model(input_ids, positional_enc)
    ```
    """

    def __init__(self, config):
        super().__init__(config)
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)
        # Re-initialise every sub-module that init_bert_weights knows about
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, positional_enc, token_type_ids=None, attention_mask=None,
                output_all_encoded_layers=True, get_attention_matrices=False):
        if attention_mask is None:
            # Every non-zero token id is attended to: [batch_size, length]
            attention_mask = (input_ids > 0)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # Turn the 2D mask into [batch_size, 1, 1, to_seq_length] so that it broadcasts to
        # [batch_size, num_heads, from_seq_length, to_seq_length]; no triangular (causal)
        # masking is needed here.
        parameter_dtype = next(self.parameters()).dtype  # fp16 compatibility
        additive_mask = attention_mask.unsqueeze(1).unsqueeze(2).to(dtype=parameter_dtype)

        # 1.0 (attend) becomes 0.0 and 0.0 (masked) becomes -10000.0. Added to the raw
        # scores before the softmax, this effectively removes the masked positions.
        additive_mask = (1.0 - additive_mask) * -10000.0

        # embedding
        embedded = self.embeddings(input_ids, positional_enc, token_type_ids)
        # Output after all defined transformer blocks
        encoded_layers, all_attention_matrices = self.encoder(
            embedded,
            additive_mask,
            output_all_encoded_layers=output_all_encoded_layers,
            get_attention_matrices=get_attention_matrices)

        if get_attention_matrices:
            return all_attention_matrices

        final_states = encoded_layers[-1]
        pooled_output = self.pooler(final_states)
        return (encoded_layers if output_all_encoded_layers else final_states), pooled_output


class BertForPreTraining(BertPreTrainedModel):
    """BERT encoder topped with the two pre-training heads.

    The wrapped ``BertModel`` feeds a ``BertPreTrainingHeads`` module (built
    with the encoder's word-embedding weight) which produces:
        - the masked language modeling predictions, and
        - the next sentence classification predictions.

    Params:
        config: configuration object handed to ``BertModel`` and
            ``BertPreTrainingHeads``; its ``vocab_size`` is kept on the instance.

    Inputs (``forward``):
        `input_ids`, `positional_enc`, `token_type_ids`, `attention_mask`:
            forwarded unchanged to the wrapped ``BertModel``.
        `masked_lm_labels`, `next_sentence_label`: accepted by the signature but
            not read by ``forward``.

    Outputs (``forward``):
        The tuple ``(mlm_preds, next_sen_preds)`` produced by the heads module.
        ``forward`` never computes a loss; use ``compute_loss`` on the returned
        predictions for that.
    """

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.vocab_size
        encoder = BertModel(config)
        self.bert = encoder
        # The heads receive the encoder's word-embedding weight tensor.
        self.cls = BertPreTrainingHeads(config, encoder.embeddings.word_embeddings.weight)
        self.apply(self.init_bert_weights)
        self.next_loss_func = CrossEntropyLoss()
        self.mlm_loss_func = CrossEntropyLoss(ignore_index=0)

    def compute_loss(self, predictions, labels, num_class=2, ignore_index=-100):
        """Cross entropy between flattened predictions and flattened labels.

        ``predictions`` is reshaped to ``[-1, num_class]`` and ``labels`` to
        ``[-1]``; targets equal to ``ignore_index`` are skipped by the loss.
        """
        flat_predictions = predictions.view(-1, num_class)
        flat_labels = labels.view(-1)
        criterion = CrossEntropyLoss(ignore_index=ignore_index)
        return criterion(flat_predictions, flat_labels)

    def forward(self, input_ids, positional_enc, token_type_ids=None, attention_mask=None,
                masked_lm_labels=None, next_sentence_label=None):
        """Run the encoder and both heads; returns ``(mlm_preds, next_sen_preds)``."""
        # Only the last encoder layer is requested from the wrapped model.
        encoder_outputs = self.bert(input_ids, positional_enc, token_type_ids, attention_mask,
                                    output_all_encoded_layers=False)
        sequence_output, pooled_output = encoder_outputs
        mlm_preds, next_sen_preds = self.cls(sequence_output, pooled_output)
        return mlm_preds, next_sen_preds


### bert sentiment analysis


In [0]:
from torch import nn


class Bert_Sentiment_Analysis(nn.Module):
    """Binary sentiment classifier on top of a ``BertModel`` encoder.

    The hidden states of one encoder layer (selected by ``config.layer``) are
    mean-pooled and max-pooled over dimension 1, concatenated, and mapped through
    two linear layers and a sigmoid to a single score per sample.
    """

    def __init__(self, config):
        super().__init__()
        self.bert = BertModel(config)
        # Input is the concatenation of the mean- and max-pooled features.
        self.dense = nn.Linear(config.hidden_size * 2, config.hidden_size)
        self.final_dense = nn.Linear(config.hidden_size, 1)
        self.activation = nn.Sigmoid()
        # Index of the encoder layer whose output is pooled in ``forward``.
        self.layer = config.layer

    def compute_loss(self, predictions, labels):
        """Mean binary cross entropy between sigmoid scores and 0/1 labels.

        Both tensors are flattened first so that shapes such as ``[batch, 1]``
        and ``[batch]`` line up. Returns a 0-dim tensor usable for backprop.
        """
        scores = predictions.view(-1)
        targets = labels.float().view(-1)
        epsilon = 1e-8  # keeps log() away from zero
        positive_term = targets * torch.log(scores + epsilon)
        negative_term = (1.0 - targets) * torch.log(1.0 - scores + epsilon)
        return torch.mean(-positive_term - negative_term)

    def forward(self, text_input, positional_enc, labels=None):
        """Score a batch; returns ``predictions`` or ``(predictions, loss)``.

        The loss is only computed (and returned) when ``labels`` is given.
        """
        all_layers, _ = self.bert(text_input, positional_enc,
                                  output_all_encoded_layers=True)
        # Hidden states of the selected layer: [batch_size, seq_len, embed_dim]
        hidden_states = all_layers[self.layer]
        mean_features = hidden_states.mean(1)
        max_features, _ = torch.max(hidden_states, dim=1)

        # [batch_size, hidden_dim * 2] -> [batch_size, hidden_dim] -> [batch_size, 1]
        features = self.dense(torch.cat((mean_features, max_features), dim=1))
        logits = self.final_dense(features)

        # Sigmoid squashes the score into the 0-1 range.
        predictions = self.activation(logits)
        if labels is None:
            return predictions
        return predictions, self.compute_loss(predictions, labels)


### sentiment training


In [0]:

from torch.optim import Adam
from torch.utils.data import DataLoader
from sklearn import metrics
import tqdm
import pandas as pd
import numpy as np
import configparser
import os
import json
import datetime, pytz


class Sentiment_trainer:
    """Fine-tunes and evaluates ``Bert_Sentiment_Analysis`` on a labelled corpus.

    The trainer owns the model, the train/test data loaders, the positional
    encoding table and the Adam optimizer, and provides helpers to load and save
    checkpoints whose file names end in ``.epoch.<N>``.
    """

    def __init__(self, config, max_seq_len,
                 batch_size,
                 lr,  # learning rate
                 with_cuda=True,  # GPU
                 ):
        """Build the model, data loaders, positional encoding and optimizer.

        Args:
            config: mapping read for the keys ``vocab_size``, ``word2idx_path``,
                ``layer``, ``dropout``, ``train_corpus_path``,
                ``test_corpus_path`` and ``sentiment_model``.
            max_seq_len: maximum sentence length; also the number of rows of the
                positional encoding table.
            batch_size: mini-batch size used by both data loaders.
            lr: learning rate for the Adam optimizer.
            with_cuda: use ``cuda:0`` when CUDA is available, otherwise the CPU.
        """
        self.config = config
        self.vocab_size = int(self.config["vocab_size"])
        self.batch_size = batch_size
        self.lr = lr
        # Load the token -> index dictionary
        with open(self.config["word2idx_path"], "r", encoding="utf-8") as f:
            self.word2idx = json.load(f)
        # Use the GPU only when one is available and it was requested
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")
        # Maximum sentence length
        self.max_seq_len = max_seq_len
        # Model hyperparameters
        bertconfig = BertConfig(vocab_size=self.vocab_size, layer=self.config["layer"], hidden_dropout_prob=self.config['dropout'])
        # Build the model and send it to the computing device (GPU or CPU)
        self.bert_model = Bert_Sentiment_Analysis(config=bertconfig)
        self.bert_model.to(self.device)
        # Training dataset; the identity collate_fn hands the raw list of samples
        # to `iteration`, which pads each mini-batch dynamically via `padding`.
        train_dataset = CLSDataset(corpus_path=self.config["train_corpus_path"],
                                   word2idx=self.word2idx,
                                   max_seq_len=self.max_seq_len
                                   )
        self.train_dataloader = DataLoader(train_dataset,
                                           batch_size=self.batch_size,
                                           num_workers=0,
                                           collate_fn=lambda x: x  # dynamic padding
                                           )
        # Test dataset
        test_dataset = CLSDataset(corpus_path=self.config["test_corpus_path"],
                                  word2idx=self.word2idx,
                                  max_seq_len=self.max_seq_len
                                  )
        self.test_dataloader = DataLoader(test_dataset,
                                          batch_size=self.batch_size,
                                          num_workers=0,
                                          collate_fn=lambda x: x)
        # Positional encoding table
        self.hidden_dim = bertconfig.hidden_size
        self.positional_enc = self.init_positional_encoding()

        # positional_enc: [max_sen_len, embedding_dimension] => [1, max_sen_len, embedding_dimension]
        self.positional_enc = torch.unsqueeze(self.positional_enc, dim=0)

        # Parameters to be optimized
        self.optim_parameters = list(self.bert_model.parameters())

        self.init_optimizer(lr=self.lr)
        # makedirs(exist_ok=True) also creates missing parents and tolerates an
        # already existing directory.
        os.makedirs(self.config["sentiment_model"], exist_ok=True)

    def init_optimizer(self, lr):
        """(Re)create the Adam optimizer over the model parameters with ``lr``."""
        self.optimizer = torch.optim.Adam(self.optim_parameters, lr=lr, weight_decay=1e-3)

    def init_positional_encoding(self):
        """Return the ``[max_seq_len, hidden_dim]`` positional encoding table.

        Row 0 is all zeros; every other row holds sin (even columns) / cos (odd
        columns) values and is scaled to unit L2 norm.
        """
        # NOTE(review): the exponent is 2*i/hidden_dim for every column i (not the
        # paired 2*(i//2) form). Presumably this must match the encoding used
        # when the checkpoint was pre-trained - confirm before changing.
        position_enc = np.array([
            [pos / np.power(10000, 2 * i / self.hidden_dim) for i in range(self.hidden_dim)]
            if pos != 0 else np.zeros(self.hidden_dim) for pos in range(self.max_seq_len)])

        position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
        position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
        denominator = np.sqrt(np.sum(position_enc ** 2, axis=1, keepdims=True))
        # Normalization (the epsilon protects the all-zero first row)
        position_enc = position_enc / (denominator + 1e-8)
        position_enc = torch.from_numpy(position_enc).type(torch.FloatTensor)
        return position_enc

    def load_model(self, model, dir_path="../output", load_bert=False):
        """Load the newest checkpoint found in ``dir_path`` into ``model``.

        Args:
            model: module that receives the weights (loaded with ``strict=False``).
            dir_path: directory searched by ``find_most_recent_state_dict``.
            load_bert: when True, keep only the keys starting with ``bert`` that
                do not contain ``pooler`` and drop their first five characters.

        Raises:
            FileNotFoundError: if ``dir_path`` holds no usable checkpoint.
        """
        checkpoint_dir = self.find_most_recent_state_dict(dir_path)
        # Deserialize onto the CPU so a checkpoint written on a GPU machine also
        # loads on a CPU-only one; the model is moved to self.device below.
        checkpoint = torch.load(checkpoint_dir, map_location="cpu")

        # When the sentiment analysis model is trained first, it needs to load the pre-trained BERT
        if load_bert:
            # NOTE(review): this strips the "bert." prefix, while the sentiment
            # model visible in this file also owns a `bert` submodule - verify the
            # resulting keys really match (see the warning printed below).
            checkpoint["model_state_dict"] = {k[5:]: v for k, v in checkpoint["model_state_dict"].items()
                                              if k[:4] == "bert" and "pooler" not in k}

        load_result = model.load_state_dict(checkpoint["model_state_dict"], strict=False)
        # strict=False silently ignores checkpoint entries that match nothing in
        # the model; surface them so a failed load is not mistaken for success.
        unexpected = list(getattr(load_result, "unexpected_keys", None) or [])
        if unexpected:
            print("warning: {} checkpoint tensors matched no model parameter (e.g. {})".format(
                len(unexpected), unexpected[0]))
        torch.cuda.empty_cache()
        model.to(self.device)
        print("{} loaded!".format(checkpoint_dir))

    def train(self, epoch, df_path="./output_bert/df_log.pickle"):
        """Run one training epoch over the training data loader."""
        self.bert_model.train()
        self.iteration(epoch, self.train_dataloader, train=True, df_name=df_path)

    def test(self, epoch, df_path="./output_bert/df_log.pickle"):
        """Run one evaluation epoch; returns ``(f1, all_labels, all_predictions)``."""
        self.bert_model.eval()
        with torch.no_grad():
            return self.iteration(epoch, self.test_dataloader, train=False, df_name=df_path)

    def padding(self, output_dic_lis):
        """Dynamic padding: pad to the longest sentence of the current mini-batch.

        Args:
            output_dic_lis: list of samples, each with a ``text_input`` and a
                ``label`` tensor.

        Returns:
            dict with the zero-padded ``text_input`` batch and the concatenated
            ``label`` tensor.
        """
        text_input = [sample["text_input"] for sample in output_dic_lis]
        text_input = torch.nn.utils.rnn.pad_sequence(text_input, batch_first=True)
        label = torch.cat([sample["label"] for sample in output_dic_lis])
        return {"text_input": text_input, "label": label}

    @staticmethod
    def _compute_auc(labels, scores):
        """Area under the ROC curve of ``scores`` against ``labels``."""
        fpr, tpr, _ = metrics.roc_curve(y_true=labels, y_score=scores)
        return metrics.auc(fpr, tpr)

    def iteration(self, epoch, data_loader, train=True, df_name="df_log.pickle"):
        """Run one pass over ``data_loader``, training or evaluating the model.

        Args:
            epoch: epoch number, used for the progress bar and the logs.
            data_loader: yields lists of samples carrying ``text``,
                ``text_input`` and ``label``.
            train: when True, back-propagate and step the optimizer per batch.
            df_name: pickle file holding the test-log DataFrame (test mode only).

        Returns:
            ``(f1, all_labels, all_predictions)`` in test mode; ``None`` in
            training mode.

        Raises:
            ValueError: in test mode when ``data_loader`` yields no batch.
        """
        # Progress bar
        str_code = "train" if train else "test"
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        total_loss = 0
        num_batches = 0
        # Store all predicted results and labels for logging
        all_predictions, all_labels, all_texts = [], [], []

        for i, data in data_iter:
            texts = [sample["text"] for sample in data]
            # padding
            data = self.padding(data)
            # Send data to computing device
            data = {key: value.to(self.device) for key, value in data.items()}
            # Slice the positional encoding to the padded length of this batch
            positional_enc = self.positional_enc[:, :data["text_input"].size()[-1], :].to(self.device)

            # Forward propagation, get prediction results and loss.
            # The module is called directly (not via .forward) so that any
            # registered hooks run as well.
            predictions, loss = self.bert_model(text_input=data["text_input"],
                                                positional_enc=positional_enc,
                                                labels=data["label"]
                                                )
            # Accumulate predictions, labels and texts of the whole epoch
            all_predictions.extend(predictions.detach().cpu().numpy().reshape(-1).tolist())
            all_labels.extend(data["label"].cpu().numpy().reshape(-1).tolist())
            all_texts.extend(texts)

            # Backpropagation
            if train:
                # Clear the previous gradient
                self.optimizer.zero_grad()
                # Backpropagation, get a new gradient
                loss.backward()
                # Update model parameters with the obtained gradient
                self.optimizer.step()

            # Accumulate to compute the average loss of the current epoch
            total_loss += loss.item()
            num_batches = i + 1

            if i % 500 == 0:
                # The AUC spans everything accumulated so far, so it is computed
                # only when a log line is actually emitted rather than per batch.
                auc = self._compute_auc(all_labels, all_predictions)
                log_dic = {"epoch": epoch,
                           str_code + "_loss": total_loss / num_batches,
                           str_code + "_auc": auc}
                data_iter.write(str({k: v for k, v in log_dic.items() if v != 0}))

        # logging, just store test logs
        if not train:
            if num_batches == 0:
                raise ValueError("cannot evaluate: the data loader produced no batches")
            # Initialize a pandas DataFrame for storage of the test logs
            if not os.path.isfile(df_name):
                df = pd.DataFrame(columns=["epoch", "pc", "rc", "f1", "ac", "test_loss", "test_auc"])
                df.to_pickle(df_name)
                print("===========log test DataFrame created!==========")
            auc = self._compute_auc(all_labels, all_predictions)
            ac_, pc_, rc_, f1_, threshold_ = find_best_threshold(all_predictions, all_labels)
            log_dic = {
                "epoch": epoch, "pc": pc_, "rc": rc_, "f1": f1_, "ac": ac_, "threshold": threshold_,
                "test_loss": total_loss / num_batches, "test_auc": auc
            }
            data_iter.write('===========' + str({k: v for k, v in log_dic.items() if v != 0}) + '===========')
            df = pd.read_pickle(df_name)
            # DataFrame.append was removed in pandas 2.0; concat is the supported
            # replacement and ignore_index renumbers the rows.
            df = pd.concat([df, pd.DataFrame([log_dic])], ignore_index=True)
            df.to_pickle(df_name)

            # store Excel
            if f1_ > 0.95:
                binary_predictions = (np.ravel(all_predictions) >= threshold_).astype("int")
                datas = [('text', 'predict value', 'threshold', 'label', 'predict')]
                datas.extend(zip(all_texts,
                                 all_predictions,
                                 [threshold_] * len(all_predictions),
                                 all_labels,
                                 binary_predictions,
                                 ))
                writeExcel(sentiment_test_excel_log_, str(f1_) + '-' + str(epoch), datas, 3, 4)

            # Return f1, as a measure of early stop
            return f1_, all_labels, all_predictions

    def find_most_recent_state_dict(self, dir_path):
        """Return the path of the checkpoint in ``dir_path`` with the highest epoch.

        A checkpoint is a directory entry whose name contains ``model`` and whose
        last dot-separated component is an integer (the epoch number).

        Raises:
            FileNotFoundError: if no such entry exists.
        """
        def epoch_of(name):
            # Entries without an integer suffix are not checkpoints; skip them
            # instead of failing on int().
            try:
                return int(name.split(".")[-1])
            except ValueError:
                return None

        candidates = [(epoch_of(name), name) for name in os.listdir(dir_path) if "model" in name]
        candidates = [entry for entry in candidates if entry[0] is not None]
        # Checked after filtering so a directory that only holds unrelated files
        # is reported as "no state dict" rather than raising IndexError.
        if not candidates:
            raise FileNotFoundError("can not find any state dict in {}!".format(dir_path))
        candidates.sort(key=lambda entry: entry[0])
        return os.path.join(dir_path, candidates[-1][1])

    def save_state_dict(self, model, epoch, sentiment_model="../output", file_path="bert.model"):
        """Store the current model parameters as ``<file_path>.epoch.<epoch>``.

        The model is moved to the CPU for saving and back to ``self.device``
        afterwards.
        """
        os.makedirs(sentiment_model, exist_ok=True)
        save_path = os.path.join(sentiment_model, file_path + ".epoch.{}".format(str(epoch)))
        model.to("cpu")
        torch.save({"model_state_dict": model.state_dict()}, save_path)
        print("{} saved!".format(save_path))
        model.to(self.device)




In [0]:

if __name__ == '__main__':
    # Fine-tune the sentiment model with early stopping on the test-set f1, then
    # archive the best checkpoint together with its logs.
    import shutil  # local import: only needed to archive the notebook below

    dataset = 'weibo_senti_100k'
    cross = 2
    layer = 3
    dropout = 0.4

    config = {}
    config["train_corpus_path"] = os.path.join(corpus_dir_, dataset, 'train.json')
    config["test_corpus_path"] = os.path.join(corpus_dir_, dataset, 'test.json')
    config["word2idx_path"] = word2idx_path_extend_
    config["bert_model"] = bert_model_
    config["sentiment_model"] = sentiment_model_
    config["state_dict_dir"] = corpus_dir_
    config["df_train_log"] = sentiment_train_log_
    config["df_test_log"] = sentiment_test_log_
    config["layer"] = layer
    config["dropout"] = dropout
    config["batch_size"] = 24
    config["max_seq_len"] = 155
    config["vocab_size"] = 32162
    config["lr"] = 2e-6
    config["num_workers"] = 0

    # split dataset into test and train
    getTrainTestData(dataset, cross, test_proportion=0.2, sentence_len=config["max_seq_len"])


    def init_trainer(dynamic_lr, batch_size=config["batch_size"]):
        """Create a trainer for the current config; returns ``(trainer, dynamic_lr)``."""
        trainer = Sentiment_trainer(max_seq_len=config["max_seq_len"], batch_size=batch_size, lr=dynamic_lr, with_cuda=True, config=config)
        return trainer, dynamic_lr


    def move_if_exists(src, dst):
        """Move ``src`` to ``dst``; report and skip when ``src`` does not exist."""
        if os.path.exists(src):
            os.rename(src, dst)
        else:
            print('skip archiving, file not found:', src)


    start_epoch = 0
    train_epoches = 999
    trainer, dynamic_lr = init_trainer(dynamic_lr=config["lr"], batch_size=config["batch_size"])

    all_f1score = []
    threshold = 0   # number of consecutive epochs without a new best f1
    patient = 10    # early-stop once `threshold` reaches this value
    best_loss = 99999999  # not read anywhere in this cell
    for epoch in range(start_epoch, start_epoch + train_epoches):
        if epoch == start_epoch and epoch == 0:
            print('loading bert')
            # The training of the first epoch needs to load the pre-trained BERT model
            trainer.load_model(trainer.bert_model, dir_path=config["bert_model"], load_bert=True)
            print('loading bert finish')
        elif epoch == start_epoch:
            # Resuming: continue from the newest sentiment checkpoint
            print('loading sentiment')
            trainer.load_model(trainer.bert_model, dir_path=trainer.config["sentiment_model"])
            print('loading sentiment finish')
        print("train with learning rate {}".format(str(dynamic_lr)))
        # Train an epoch
        trainer.train(epoch, df_path=config["df_train_log"])
        # Save the current epoch model parameters
        trainer.save_state_dict(trainer.bert_model, epoch,
                                sentiment_model=trainer.config["sentiment_model"],
                                file_path="sentiment.model")

        f1score, all_labels, all_predictions = trainer.test(epoch, df_path=config["df_test_log"])

        all_f1score.append(f1score)
        best_f1score = max(all_f1score)
        if all_f1score[-1] < best_f1score:
            # No improvement: count it and decay the learning rate
            threshold += 1
            dynamic_lr *= 0.8
            trainer.init_optimizer(lr=dynamic_lr)
        else:
            threshold = 0

        if threshold >= patient:
            best_epoch = (start_epoch + np.argmax(np.array(all_f1score)))

            filename = dataset + '-' + str(best_f1score) + '-L' + str(layer) + '-C' + str(cross) + '-E' + str(best_epoch) + '-D' + str(dropout) + '-' + "T" + datetime.datetime.now(tz=pytz.timezone('Asia/Bangkok')).strftime('%Y-%m-%d %H-%M')
            print('data:', filename)
            model_file = os.path.join(sentiment_model_, "sentiment.model.epoch." + str(best_epoch))
            move_if_exists(model_file, os.path.join(best_model_, filename + "-sentiment.model.epoch." + str(best_epoch)))

            # Archive a copy of the notebook instead of moving it, so the
            # notebook stays in place for the next run.
            notebook_path = os.path.join(root_, 'TCBERT.ipynb')
            if os.path.exists(notebook_path):
                shutil.copy2(notebook_path, os.path.join(best_model_script_, filename + '.ipynb'))
            else:
                print('skip archiving, file not found:', notebook_path)

            # The Excel workbook is only written when some epoch exceeded the f1
            # cut-off inside Sentiment_trainer.iteration, so it may not exist;
            # an unguarded rename would abort before the early stop below.
            move_if_exists(sentiment_test_excel_log_, os.path.join(best_excel_, filename + '.xlsx'))

            move_if_exists(sentiment_test_log_, os.path.join(best_df_, filename + '.test.pickle'))

            print("early stop!")
            break
