In [1]:
# Hide all CUDA devices from TensorFlow so the notebook runs on CPU only.
# The environment variables are set before TensorFlow is imported below.
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
# Sanity check: with the devices hidden this is expected to report 0 GPUs.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [2]:
import configs
import os
import json

import numpy as np
import jieba
import tensorflow as tf

from transformers import TFGPT2LMHeadModel, TFGPT2ForSequenceClassification, GPT2Config
from transformers import TFTrainer, TFTrainingArguments
from transformers import XLNetTokenizer


# Task identifier; the loaders below join it onto configs.DATA_PATH to locate
# the task's JSON files.
task_name = 'chid'

In [3]:
import jieba


# Bind the upstream class under a private alias so that re-running this cell
# always subclasses the original transformers tokenizer. Subclassing the name
# `XLNetTokenizer` directly would, on a second run, inherit from the previous
# definition of this very class and apply the jieba pre-segmentation twice.
from transformers import XLNetTokenizer as _HFXLNetTokenizer


class XLNetTokenizer(_HFXLNetTokenizer):
    """XLNet tokenizer that pre-segments text with jieba before tokenizing.

    Spaces and newlines inside the input are replaced by the placeholder
    characters U+2582 and U+2583 so they survive tokenization; ``_decode``
    reverses that mapping.
    """

    # " " -> U+2582, "\n" -> U+2583
    translator = str.maketrans(" \n", "\u2582\u2583")

    def _tokenize(self, text, *args, **kwargs):
        """Segment ``text`` with jieba, then delegate to the parent tokenizer.

        Each jieba segment has its spaces/newlines replaced by the placeholder
        characters, and the segments are joined with single spaces so the
        parent tokenizer sees the word boundaries jieba found.
        """
        text = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]
        text = " ".join(text)
        return super()._tokenize(text, *args, **kwargs)

    def _decode(self, *args, **kwargs):
        """Decode with the parent tokenizer and undo the placeholder mapping.

        All spaces in the decoded text are dropped first (they are the join
        spaces added in ``_tokenize``), then U+2582 becomes a space and
        U+2583 becomes a newline.
        """
        text = super()._decode(*args, **kwargs)
        text = text.replace(" ", "").replace("\u2582", " ").replace("\u2583", "\n")
        return text



In [3]:

def load_json2list(filename, task_name):
    """Yield one decoded JSON object per non-blank line of a JSON-lines file.

    Args:
        filename: Path of the file to read; each non-blank line must hold one
            JSON document.
        task_name: Unused; kept so existing callers keep working.

    Yields:
        The object decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(filename, encoding="utf-8") as f:
        # Iterate lazily instead of readlines() to avoid loading the whole file.
        for line in f:
            # Lines keep their trailing "\n", so a bare truthiness test never
            # filters blank lines; strip first.
            line = line.strip()
            if not line:
                continue
            yield json.loads(line)


def load_dataset(task_name: str):
    """Load the train, dev and test splits of a task.

    Reads ``train.json``, ``dev.json`` and ``test.json`` from the directory
    ``configs.DATA_PATH/<task_name>`` via ``load_json2list``.

    Returns:
        A 3-tuple ``(train, dev, test)`` of lists of decoded records.
    """
    task_dir = os.path.join(configs.DATA_PATH, task_name)
    return tuple(
        list(load_json2list(os.path.join(task_dir, split_file), task_name))
        for split_file in ("train.json", "dev.json", "test.json")
    )

# Materialise all three splits for the configured task as in-memory lists.
train_dataset, dev_dataset, test_dataset = load_dataset(task_name)

In [4]:

def load_tokenizer():
    """Return the tokenizer loaded from ``configs.MODEL_PATH``, padding on the right."""
    return XLNetTokenizer.from_pretrained(configs.MODEL_PATH, padding_side="right")

# Shared tokenizer instance used by the cells below.
tokenizer = load_tokenizer()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
import re

def load_answer_dict(task_name: str):
    """Load the answer files for the train and dev splits of a task.

    Reads ``train_answer.json`` and ``dev_answer.json`` from
    ``configs.DATA_PATH/<task_name>``.

    Args:
        task_name: Sub-directory of ``configs.DATA_PATH`` holding the files.

    Returns:
        A 2-tuple ``(train_answer, dev_answer)`` of the decoded JSON documents.
    """
    path = os.path.join(configs.DATA_PATH, task_name)

    def _read_json(name):
        # The context manager closes the handle deterministically; the bare
        # json.load(open(...)) form left both files open until GC.
        with open(os.path.join(path, name), encoding="utf-8") as f:
            return json.load(f)

    train_answer = _read_json("train_answer.json")
    dev_answer = _read_json("dev_answer.json")
    return train_answer, dev_answer

def conver4ZeroShot(item, tokenizer, answer_dict):
    """Expand one record into candidate-filled sentences for zero-shot scoring.

    For every ``#idiom<digits>#`` placeholder found in the sentences of
    ``item['content']``, the placeholder is replaced by each entry of
    ``item['candidates']`` in turn, and any other placeholders left in the
    sentence are removed.

    Args:
        item: Mapping with a ``'content'`` list of sentences and a
            ``'candidates'`` list of replacement strings.
        tokenizer: Currently unused; kept so the signature stays unchanged.
        answer_dict: Mapping from placeholder text (e.g. ``"#idiom1#"``) to its
            answer. Placeholders without an entry get ``None``.

    Returns:
        A list with one dict per placeholder occurrence, in reading order:
        ``{'idiom': <placeholder>, 'answer': <answer or None>,
        'sentences': [<sentence filled with candidate 0>, ...]}``.
    """
    # Raw string so that "\d" reaches the regex engine untouched.
    placeholder = re.compile(r"#idiom\d+#")
    candidates = item['candidates']

    examples = []
    for sentence in item['content']:
        for match in placeholder.finditer(sentence):
            prefix, suffix = sentence[:match.start()], sentence[match.end():]
            filled = [
                # Drop the remaining placeholders so only one blank is scored.
                placeholder.sub('', prefix + candidate + suffix)
                for candidate in candidates
            ]
            examples.append({
                'idiom': match.group(),
                'answer': answer_dict.get(match.group()),
                'sentences': filled,
            })
    return examples


# Load the answer files for the train and dev splits.
train_answer, dev_answer = load_answer_dict(task_name)
# NOTE(review): bare expression that is not the last statement of the cell, so
# it is evaluated but never displayed.
dev_dataset[0]
# The scoring cells below look answers up through the name `answer_dict`.
answer_dict = dev_answer
# conver4ZeroShot(dev_dataset[0], tokenizer, dev_answer)

In [7]:
def load_model(task_name: str) -> TFGPT2LMHeadModel:
    """Load the language-model-head GPT-2 checkpoint from ``configs.MODEL_PATH``.

    Args:
        task_name: Unused; kept so existing callers keep working.

    Returns:
        The ``TFGPT2LMHeadModel`` restored by ``from_pretrained``. The previous
        return annotation named ``TFGPT2ForSequenceClassification``, which is
        not the class this function instantiates.
    """
    return TFGPT2LMHeadModel.from_pretrained(configs.MODEL_PATH)

# Shared model instance used by the scoring cells below.
model = load_model(task_name)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at /data2/Text-Suggestion/models.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [17]:
# Build one filled-in sentence per candidate for the third sentence of the
# first dev record, then tokenise the batch for scoring.
candidates = dev_dataset[0]['candidates']

sentence = dev_dataset[0]['content'][2]
# Compiled once from a raw string: "\d" inside a plain string literal is an
# invalid escape sequence, and the same pattern is needed twice below.
idiom_pattern = re.compile(r"#idiom\d+#")
res = idiom_pattern.finditer(sentence)
sentence_filled = []

for r in res:
    # `answer` is overwritten per placeholder, so after the loop it holds the
    # answer of the last placeholder found in `sentence`.
    answer = answer_dict[r.group()]
    for candidate in candidates:
        new_sentence = sentence[:r.start()] + candidate + sentence[r.end():]
        # Remove any other placeholders that remain in the sentence.
        new_sentence = idiom_pattern.sub('', new_sentence)
        print(r.group(), new_sentence)
        sentence_filled.append(new_sentence)


ids = tokenizer(sentence_filled, padding=True, return_tensors='tf', add_special_tokens=False)
# Labels are the input ids themselves (language-model scoring); `labels` drops
# the first position of every row.
ids['labels'] = ids['input_ids']
labels = ids['labels'][:, 1:]
# Earlier masking experiments, kept for reference:
# labels = ids['input_ids']
# mask = tf.equal(labels, tokenizer.pad_token_id)
# active_label = tf.not_equal(labels, tokenizer.pad_token_id)
# active_label = labels * tf.cast(active_label, tf.int32)
# mask = tf.cast(mask, tf.int32) * -100
# labels = active_label + mask
# print(active_label)
# print(ids)
# out = model(ids, return_dict=True)
# out.loss
# labels = labels[:, :-1]

#idiom577159# 对消费者而言，保险是一种较为特殊的金融理财产品，其最基本原则是对风险的保障。虽然目前保险公司开发了诸多强调投资收益的险种，但无论任何种类的产品都不能偏离保险的本质。尤其是对于注重保障的人身保险而言，强调投资价值实属祸不单行。消费者在投资保险理财产品时，需端正心态，树立科学、健康的保险理念，避免走进购买误区。
#idiom577159# 对消费者而言，保险是一种较为特殊的金融理财产品，其最基本原则是对风险的保障。虽然目前保险公司开发了诸多强调投资收益的险种，但无论任何种类的产品都不能偏离保险的本质。尤其是对于注重保障的人身保险而言，强调投资价值实属急功近利。消费者在投资保险理财产品时，需端正心态，树立科学、健康的保险理念，避免走进购买误区。
#idiom577159# 对消费者而言，保险是一种较为特殊的金融理财产品，其最基本原则是对风险的保障。虽然目前保险公司开发了诸多强调投资收益的险种，但无论任何种类的产品都不能偏离保险的本质。尤其是对于注重保障的人身保险而言，强调投资价值实属瓜熟蒂落。消费者在投资保险理财产品时，需端正心态，树立科学、健康的保险理念，避免走进购买误区。
#idiom577159# 对消费者而言，保险是一种较为特殊的金融理财产品，其最基本原则是对风险的保障。虽然目前保险公司开发了诸多强调投资收益的险种，但无论任何种类的产品都不能偏离保险的本质。尤其是对于注重保障的人身保险而言，强调投资价值实属画蛇添足。消费者在投资保险理财产品时，需端正心态，树立科学、健康的保险理念，避免走进购买误区。
#idiom577159# 对消费者而言，保险是一种较为特殊的金融理财产品，其最基本原则是对风险的保障。虽然目前保险公司开发了诸多强调投资收益的险种，但无论任何种类的产品都不能偏离保险的本质。尤其是对于注重保障的人身保险而言，强调投资价值实属本末倒置。消费者在投资保险理财产品时，需端正心态，树立科学、健康的保险理念，避免走进购买误区。
#idiom577159# 对消费者而言，保险是一种较为特殊的金融理财产品，其最基本原则是对风险的保障。虽然目前保险公司开发了诸多强调投资收益的险种，但无论任何种类的产品都不能偏离保险的本质。尤其是对于注重保障的人身保险而言，强调投资价值实属因噎废食。消费者在投资保险理财产品时，需端正心态，树立科学、健康的保

In [18]:

def loss_fun(labels, logits):
    """Return the unreduced sparse categorical cross-entropy of ``logits`` vs ``labels``.

    ``logits`` are treated as raw scores (``from_logits=True``) and no
    reduction is applied (``Reduction.NONE``).
    """
    return tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True,
        reduction=tf.keras.losses.Reduction.NONE,
    )(labels, logits)


def build_loss(tokenizer):
    """Build a loss function that averages token losses over non-pad positions.

    Args:
        tokenizer: Object whose ``pad_token_id`` marks padded label positions.

    Returns:
        ``custom_loss(labels, logits)``, which returns the summed
        cross-entropy over non-pad positions divided by the number of non-pad
        positions, reduced along the last axis.
    """
    def custom_loss(labels, logits):
        # Positions whose label is not the pad id, in the dtypes needed below.
        keep = tf.not_equal(labels, tokenizer.pad_token_id)
        keep_int = tf.cast(keep, tf.int32)
        keep_float = tf.cast(keep, tf.float32)

        token_loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
        )
        # Pad labels are replaced by 0 before the loss is computed; their
        # contribution is zeroed out by the float mask right after.
        per_token = token_loss_fn(tf.cast(labels, tf.int32) * keep_int, logits)
        masked = tf.cast(per_token, tf.float32) * keep_float

        return tf.reduce_sum(masked, axis=-1) / tf.reduce_sum(keep_float, axis=-1)

    return custom_loss

# Replace the model's loss with the pad-masked mean loss and run a forward
# pass on the tokenised batch (which carries a 'labels' entry).
# NOTE(review): presumably the model calls `compute_loss` internally when
# labels are supplied -- confirm for the installed transformers version.
loss = build_loss(tokenizer)
model.compute_loss = loss
out = model(ids, return_dict=True)

print("loss1:", out.loss)
# Index of the smallest loss value.
print(np.argmin(out.loss))

# Recompute the same quantity by hand from the logits for comparison.
# NOTE(review): `labels` is sliced from the token ids in the earlier cell, so
# it presumably never contains -100 and this mask is all True -- i.e. padding
# is NOT excluded here, unlike in `build_loss`. Verify.
mask = tf.not_equal(labels, -100)

labels = tf.cast(labels,tf.int32) * tf.cast(mask,tf.int32) 
# NOTE(review): requires out.logits to have the same sequence length as the
# shifted `labels` -- TODO confirm against the model's output shape.
raw_loss = loss_fun(labels,out.logits)

mask = tf.cast(mask,tf.float32)

# Mean loss per row over the positions kept by `mask`.
losses = tf.cast(raw_loss, tf.float32) * mask
losses = tf.reduce_sum(losses,axis=-1) / tf.reduce_sum(mask,axis=-1)
print(losses)
print(np.argmin(losses))

# `answer` was set in the candidate-filling cell.
print("truth answer", answer)
print(labels)
# from transformers.modeling_tf_utils import shape_list

# logits, labels = out.logits, ids['labels'][:,:-1]
# print(labels.shape)
# shape1 = labels.shape[-1]
# active_loss = tf.not_equal(tf.reshape(labels, (-1,)), -100)
# labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
# print(labels)
# reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
# reduced_logits = tf.reshape(reduced_logits, (-1, shape1, reduced_logits.shape[-1]))
# labels = tf.reshape(labels, (-1, shape1))
# print(reduced_logits)
# print(loss_fun(labels, reduced_logits))


loss1: tf.Tensor(
[7.7241654 7.719291  7.808548  7.731908  7.706756  7.7336817 7.72089
 7.7075067 7.7773967 7.680266 ], shape=(10,), dtype=float32)
9
tf.Tensor(
[7.7241654 7.719291  7.808548  7.731908  7.706756  7.7336817 7.72089
 7.7075067 7.7773967 7.680266 ], shape=(10,), dtype=float32)
9
truth answer 4
tf.Tensor(
[[27480 27856 27205 ... 28346 27322 26966]
 [27480 27856 27205 ... 28346 27322 26966]
 [27480 27856 27205 ... 28346 27322 26966]
 ...
 [27480 27856 27205 ... 28346 27322 26966]
 [27480 27856 27205 ... 28346 27322 26966]
 [27480 27856 27205 ... 28346 27322 26966]], shape=(10, 141), dtype=int32)


In [76]:
# Display the tokenizer output (input_ids, token_type_ids, attention_mask, labels).
ids

{'input_ids': <tf.Tensor: shape=(10, 119), dtype=int32, numpy=
array([[  39,    8, 2272, ...,    8,   12,    5],
       [  39,    8, 2272, ...,   12,    5,    5],
       [  39,    8, 2272, ...,    8,   12,    5],
       ...,
       [  39,    8, 2272, ...,    8,   12,    5],
       [  39,    8, 2272, ...,  194,    8,   12],
       [  39,    8, 2272, ...,   12,    5,    5]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(10, 119), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 3, 3],
       [0, 0, 0, ..., 0, 0, 3],
       ...,
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 3, 3]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(10, 119), dtype=int32, numpy=
array([[1, 1, 1, ..., 1, 1, 0],
       [1, 1, 1, ..., 1, 0, 0],
       [1, 1, 1, ..., 1, 1, 0],
       ...,
       [1, 1, 1, ..., 1, 1, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 0, 0]], dtype=int32)>, 'labels': <tf.Tensor: shape=(10, 11

In [9]:
# Split `out.loss` into consecutive chunks, one per row of the batch, sized by
# the number of attended tokens in that row, and average each chunk.
# NOTE(review): this treats `out.loss` as a flat vector of per-token losses.
# Once the custom `compute_loss` from the other cell is installed, `out.loss`
# holds one value per row instead -- confirm which state this cell expects.
# NOTE(review): rebinds `loss`, `losses` and `mask`, which other cells also use.
start_i = 0
losses = []
for mask in ids['attention_mask']:
    # Number of non-padded positions in this row.
    active_count = sum(mask)
    loss = out.loss[start_i: start_i+active_count]
    print(loss[:3].numpy())
    loss_sum = sum(loss)
    print(loss_sum, active_count)
    loss = loss_sum / tf.cast(active_count, tf.float32)
    losses.append(loss.numpy())
    # Advance to the start of the next row's chunk.
    # NOTE(review): the recorded output shows successive windows overlapping,
    # which suggests the chunk boundaries may not line up -- verify.
    start_i += active_count
print(losses)

[9.835991  6.812191  4.5883446]
tf.Tensor(1113.1414, shape=(), dtype=float32) tf.Tensor(125, shape=(), dtype=int32)
[ 6.812191   4.5883446 13.638993 ]
tf.Tensor(1087.4413, shape=(), dtype=float32) tf.Tensor(122, shape=(), dtype=int32)
[ 4.5883446 13.638993  11.257914 ]
tf.Tensor(1110.6835, shape=(), dtype=float32) tf.Tensor(124, shape=(), dtype=int32)
[13.638993 11.257914 10.230735]
tf.Tensor(1050.977, shape=(), dtype=float32) tf.Tensor(122, shape=(), dtype=int32)
[8.90513, 8.913453, 8.957125, 8.614566]


In [113]:
def loss_fun(labels, logits):
    """Per-position sparse categorical cross-entropy, computed from raw logits.

    Redefines the ``loss_fun`` already defined earlier in this notebook with
    the same behaviour; no reduction is applied to the result.
    """
    unreduced_xent = tf.keras.losses.SparseCategoricalCrossentropy(
        reduction=tf.keras.losses.Reduction.NONE,
        from_logits=True,
    )
    return unreduced_xent(labels, logits)
# Step-by-step inspection of the masked loss, printing every intermediate.
# NOTE(review): here positions with label 0 are the ones masked out, whereas
# `build_loss` masks `tokenizer.pad_token_id` -- confirm which id marks
# padding in the `labels` this cell is run with.
mask = tf.not_equal(labels, 0)
print("mask", mask)
labels = tf.cast(labels,tf.int32) * tf.cast(mask,tf.int32)
print("labels", labels)
raw_loss = loss_fun(labels,out.logits)
print("raw_loss", raw_loss)
mask = tf.cast(mask,tf.float32)
# Zero the loss at masked positions.
losses = tf.cast(raw_loss,tf.float32) * mask
print("losses", losses)
# losses = tf.reduce_sum(losses,axis=-1) / tf.reduce_sum(mask,axis=-1)
# print("losses", losses)

mask tf.Tensor(
[[ True  True  True  True  True  True False False False]
 [ True  True  True  True  True  True  True  True  True]], shape=(2, 9), dtype=bool)
labels tf.Tensor(
[[28051  5749 26988 27841 27160 27840     0     0     0]
 [26977 27172 26971 26968 27320 27952 27970  6796  1149]], shape=(2, 9), dtype=int32)
raw_loss tf.Tensor(
[[9.3518257e+00 4.0583386e+00 4.1490216e+00 6.1921062e+00 5.2340332e-02
  1.2677022e-03 1.0334850e+01 7.9349532e+00 7.4530206e+00]
 [4.3851023e+00 5.5624809e+00 1.8138936e+00 6.0646114e+00 1.5625546e-02
  1.1773869e+01 4.4465199e+00 1.3087559e+00 6.4660233e-01]], shape=(2, 9), dtype=float32)
losses tf.Tensor(
[[9.3518257e+00 4.0583386e+00 4.1490216e+00 6.1921062e+00 5.2340332e-02
  1.2677022e-03 0.0000000e+00 0.0000000e+00 0.0000000e+00]
 [4.3851023e+00 5.5624809e+00 1.8138936e+00 6.0646114e+00 1.5625546e-02
  1.1773869e+01 4.4465199e+00 1.3087559e+00 6.4660233e-01]], shape=(2, 9), dtype=float32)


In [115]:
# Per-row numerator (summed masked loss) and denominator (number of unmasked
# positions) of the mean loss, shown separately.
tf.reduce_sum(losses,axis=-1), tf.reduce_sum(mask,axis=-1)

(<tf.Tensor: shape=(2,), dtype=float32, numpy=array([23.804901, 36.01746 ], dtype=float32)>,
 <tf.Tensor: shape=(2,), dtype=float32, numpy=array([6., 9.], dtype=float32)>)