In [118]:
from bertviz import head_view, model_view
from transformers import GPT2Tokenizer, GPT2Model

from tokenizations.bpe_tokenizer import get_encoder
from module import GPT2Config, GPT2Model, GPT2LMHeadModel
from tokenizations import tokenization_bert
from utils import get_sentence_pinyin_finals

In [13]:
import argparse
import os

parser = argparse.ArgumentParser()


def _option(flag, kind, default, help_text=None):
    """Register an optional argument that takes a value of type `kind`."""
    parser.add_argument(flag, type=kind, default=default, help=help_text)


def _switch(flag, help_text):
    """Register a boolean switch that is False unless the flag is given."""
    parser.add_argument(flag, action='store_true', help=help_text)


# Data and output locations
_option('--model_dir', str, 'model', '模型存放位置')
_option('--root_path', str, 'data/lyrics/', '根目录')
_option('--raw_data_dir', str, 'lyric_with_final_small', '原始数据目录名称')
_option('--model_sign', str, '1a', '模型签名: 区分模型和log存储子目录')
_option('--writer_dir', str, 'tensorboard_summary/', 'Tensorboard路径')

# Vocabulary files, one per embedding stream
_option('--tokenizer_path', str, 'tokenizations/chinese_dicts.txt', '选择词库')
_option('--finalizer_path', str, 'tokenizations/finals.txt', '选择韵母词库')
_option('--sentencer_path', str, 'tokenizations/sentences.txt', '选择句子词库')
_option('--poser_path', str, 'tokenizations/sentences.txt', '选择相对位置词库')
_option('--beater_path', str, 'tokenizations/beats.txt', '选择鼓点词库')

# Training parameters
_option('--device', str, '0', '设置使用哪些显卡')
_option('--init_device', int, 0, '设置使用主显卡')
_option('--model_config', str, 'config/model_config_small.json', '选择模型参数')
_option('--epochs', int, 5, '训练循环')
_option('--start_epoch', int, 0, '从哪个epoch开始训练')
_option('--batch_size', int, 8, '训练batch size')
_option('--lr', float, 1.5e-4, '学习率')
_option('--warmup_steps', int, 2000, 'warm up步数')
_option('--log_step', int, 10, '多少步汇报一次loss，设置为gradient accumulation的整数倍')
_option('--stride', int, 512, '训练时取训练数据的窗口步长')
_option('--gradient_accumulation', int, 1, '梯度积累')
_switch('--fp16', '混合精度')
_option('--fp16_opt_level', str, 'O1')
_option('--max_grad_norm', float, 1.0)
_option('--num_pieces', int, 100, '将训练语料分成多少份')
_option('--min_length', int, 0, '最短收录文章长度')
_option('--pretrained_model', str, '', '模型训练起点路径')

# Data-processing switches
_option('--encoder_json', str, "tokenizations/encoder.json", "encoder.json")
_option('--vocab_bpe', str, "tokenizations/vocab.bpe", "vocab.bpe")
_switch('--raw', '是否从preprocessing开始')
_switch('--tokenize', '是否作tokenize')
_switch('--segment', '中文以词为单位')
_switch('--bpe_token', 'subword')
_switch('--enable_final', '是否加入韵母embedding')
_switch('--enable_sentence', '是否加入sentence embedding')
_switch('--enable_relative_pos', '是否加入inner-sentence positional embedding')
_switch('--enable_beat', '是否加入beat embedding')
_switch('--reverse', '是否采用反向生成')
_switch('--with_beat', '是否同时生成beat')
_option('--beat_mode', int, 0, 'beat控制模式：0.不控制；1.global；2.local')

# The notebook has no real command line, so the overrides are passed explicitly.
_notebook_argv = [
    '--model_dir', 'model/lyrics/lyrics_rap_train_reverse/1c_train/model_epoch1',
    '--raw_data_dir', 'lyrics_rap_test',
    '--device', '2',
    '--batch_size', '1',
    '--enable_final',
    '--enable_sentence',
    '--enable_relative_pos',
    '--enable_beat',
    '--reverse',
    '--with_beat',
    '--beat_mode', '0',
    '--num_pieces', '2',  # overrides the default of 100
]
args = parser.parse_args(_notebook_argv)


# Select which GPUs the process may use; set ahead of the torch import below.
os.environ["CUDA_VISIBLE_DEVICES"] = args.device
import torch
import torch.nn.functional as F
from module import GPT2LMHeadModel

# '--device cpu' forces the CPU; any other value uses CUDA when torch reports it available.
wants_gpu = args.device != 'cpu'
device = "cuda" if wants_gpu and torch.cuda.is_available() else "cpu"

# Load the checkpoint with attention outputs enabled, switch to eval mode,
# then move it to the chosen device (the last expression is the cell output).
model = GPT2LMHeadModel.from_pretrained(args.model_dir, output_attentions=True)
model.eval()
model.to(device)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(16000, 768)
    (wpe): Embedding(1024, 768)
    (wse): Embedding(1032, 768)
    (wfe): Embedding(32, 768)
    (wre): Embedding(1032, 768)
    (wbe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_d

In [5]:
def _vocab_tokenizer(vocab_path, **options):
    """Build a case-preserving BertTokenizer over the vocabulary file at `vocab_path`.

    Extra keyword arguments are forwarded unchanged to the constructor.
    """
    return tokenization_bert.BertTokenizer(
        vocab_file=vocab_path,
        do_lower_case=False,
        **options
    )


# Character vocabulary: keeps the constructor's default for tokenize_chinese_chars.
full_tokenizer = _vocab_tokenizer(args.tokenizer_path)

# Auxiliary vocabularies (finals, sentence index, in-line position, beat):
# all are built with tokenize_chinese_chars=False.
full_finalizer = _vocab_tokenizer(args.finalizer_path, tokenize_chinese_chars=False)
full_sentencer = _vocab_tokenizer(args.sentencer_path, tokenize_chinese_chars=False)
full_poser = _vocab_tokenizer(args.poser_path, tokenize_chinese_chars=False)
full_beater = _vocab_tokenizer(args.beater_path, tokenize_chinese_chars=False)


In [48]:
def get_pinyin(text):
    """Convert '[SEP]'-delimited text into a '[SEP]'-delimited string of per-character units.

    `text` is split on the literal '[SEP]'. Each piece is passed to
    get_sentence_pinyin_finals, and the first element of the returned pair is
    joined with single spaces. The per-piece strings are then joined with
    ' [SEP] ', so a trailing '[SEP]' in the input yields a trailing ' [SEP] '
    in the result.
    """
    per_piece = [
        ' '.join(units)
        for units, _ in map(get_sentence_pinyin_finals, text.split('[SEP]'))
    ]
    return ' [SEP] '.join(per_piece)


我们有黄色的皮肤流着滚烫的血[SEP]让这首歌来医疗失眠的夜[SEP]
o en ou ang e e i u ou e en ang e ie [SEP] ang e ou e ai i ao i an e ie [SEP] 


In [121]:
# Input lyric: every line is terminated by '[SEP]'.
text = '我们有黄色的皮肤流着滚烫的血[SEP]还在战斗坚定了勇气面对一切[SEP]用说唱来续写古昔峥嵘岁月[SEP]'
texts = text.split('[SEP]')
if len(texts[-1]) == 0:
    # A trailing '[SEP]' leaves an empty final piece; drop it.
    texts = texts[0:-1]

# Reverse form: each line is reversed character by character and '[SEP]' leads
# every line instead of trailing it.
texts_reverse = [t[::-1] for t in texts]
text_reverse = '[SEP]' + '[SEP]'.join(texts_reverse)

# Character tokens, forward and reverse.
token = full_tokenizer.tokenize(text)
token_ids = full_tokenizer.convert_tokens_to_ids(token)
token_reverse = full_tokenizer.tokenize(text_reverse)
token_ids_reverse = full_tokenizer.convert_tokens_to_ids(token_reverse)

# Per-character finals, forward and reverse.
pinyin = get_pinyin(text)
final = full_finalizer.tokenize(pinyin)
final_ids = full_finalizer.convert_tokens_to_ids(final)
pinyin_reverse = get_pinyin(text_reverse)
final_reverse = full_finalizer.tokenize(pinyin_reverse)
final_ids_reverse = full_finalizer.convert_tokens_to_ids(final_reverse)

# Split the forward finals back into lines; get_pinyin leaves a lone ' ' after
# a trailing '[SEP]', which is not a real line.
pinyins = pinyin.split('[SEP]')
if pinyins[-1] == ' ':
    pinyins = pinyins[0:-1]
num_lines = len(pinyins)
# Number of finals on each line, computed once and reused below.
line_lengths = [len(line.split()) for line in pinyins]

# Sentence index: line i contributes its index once per final plus once for its
# '[SEP]'. That count is the same whether '[SEP]' leads or trails the line, so
# the reverse sequence reuses the forward one.
sentence = ' '.join(' '.join([str(i)] * (n + 1)) for i, n in enumerate(line_lengths))
sentence = full_sentencer.tokenize(sentence)
sentence_ids = full_sentencer.convert_tokens_to_ids(sentence)
sentence_ids_reverse = sentence_ids

# In-line position: 0..n-1 within each line, lines separated by '[SEP]'.
# Forward appends the final '[SEP]'; reverse prepends it.
line_positions = ' [SEP] '.join(' '.join(str(k) for k in range(n)) for n in line_lengths)
pos = full_poser.tokenize(line_positions + ' [SEP]')
pos_ids = full_poser.convert_tokens_to_ids(pos)
pos_reverse = full_poser.tokenize('[SEP] ' + line_positions)
pos_ids_reverse = full_poser.convert_tokens_to_ids(pos_reverse)

# Beat stream: a constant '0' for every token, shared by both directions.
beat = ' '.join(['0'] * len(token_ids))
beat = full_beater.tokenize(beat)
beat_ids = full_beater.convert_tokens_to_ids(beat)
beat_ids_reverse = beat_ids

# All streams must line up token for token. The reverse streams are the ones
# fed to the model, so they are checked as well as the forward ones.
assert len(token_ids) == len(final_ids) == len(sentence_ids) == len(pos_ids) == len(beat_ids)
assert (len(token_ids_reverse) == len(final_ids_reverse) == len(sentence_ids_reverse)
        == len(pos_ids_reverse) == len(beat_ids_reverse)), 'reverse input streams are misaligned'
print(pos_ids_reverse)
print(sentence_ids_reverse)
print(final_ids_reverse)

[1024, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1024, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1024, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[0, 18, 10, 11, 14, 10, 24, 7, 6, 10, 10, 11, 24, 14, 5, 0, 18, 6, 13, 8, 6, 17, 10, 19, 8, 24, 8, 12, 12, 0, 18, 13, 17, 14, 6, 7, 18, 6, 12, 11, 5, 17]


In [123]:
def _as_batch(ids):
    """Wrap one id sequence as a single-row int64 tensor on `device`.

    Built directly as int64: `torch.Tensor(...)` would create float32 first,
    which only represents integers exactly up to 2**24.
    """
    return torch.tensor([ids], dtype=torch.long, device=device)


# One single-row batch per input stream, all taken from the reverse sequences.
batch_token_ids = _as_batch(token_ids_reverse)
batch_sentence_ids = _as_batch(sentence_ids_reverse)
batch_final_ids = _as_batch(final_ids_reverse)
batch_pos_ids = _as_batch(pos_ids_reverse)
batch_beat_ids = _as_batch(beat_ids_reverse)
print(batch_token_ids)

tensor([[    0,  9738,   624,  5900,  3786, 10633,  6574,  4741,  3815,   624,
          7723,  4362,  3478,  4162,  5589,     0,  6603,   645, 10193,  3594,
          5418,  4205,  6228,  1736,  2604,  3515,  3467,  9885,  9167,     0,
          1533,  2925,  8642,  3428,  2558,  7734,  3026,  5703,  3290,  4452,
          3014,  7365]], device='cuda:0')


In [125]:
# Public IPython display API; `display` is imported explicitly rather than
# relying on the notebook injecting it.
from IPython.display import HTML, display

# Inference only: skip autograd bookkeeping so the returned attention tensors
# carry no graph. The last element of the model output is taken as the
# attention (the model was loaded with output_attentions=True).
with torch.no_grad():
    attention = model(input_ids=batch_token_ids,
                      sentence_ids=batch_sentence_ids,
                      final_ids=batch_final_ids,
                      pos_ids=batch_pos_ids,
                      beat_ids=batch_beat_ids)[-1]

# Loading require.js this way is only needed when running in Colab.
display(HTML('<script src="/static/components/requirejs/require.js"></script>'))

# Token labels for the visualisation, in the same (reverse) order as the input ids.
tokens = full_tokenizer.convert_ids_to_tokens(token_ids_reverse)
print(tokens)
head_view(attention, tokens)


['[SEP]', '血', '的', '烫', '滚', '着', '流', '肤', '皮', '的', '色', '黄', '有', '们', '我', '[SEP]', '切', '一', '对', '面', '气', '勇', '了', '定', '坚', '斗', '战', '在', '还', '[SEP]', '月', '岁', '嵘', '峥', '昔', '古', '写', '续', '来', '唱', '说', '用']


<IPython.core.display.Javascript object>

In [119]:
# Render bertviz's model view for the `attention` tensors and `tokens` labels
# defined in the head-view cell.
# NOTE(review): this cell's execution count (119) is lower than that of the cell
# defining `attention` (125), so the saved output may come from an earlier run;
# re-run the notebook top to bottom to confirm.
model_view(attention, tokens)

<IPython.core.display.Javascript object>