In [13]:
%pylab inline

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import sys
import codecs
import pandas as pd
import numpy as np
from tqdm import tqdm


def read_data(path='data_v2/'):
    """Load the video metadata, training view sequences and candidate items.

    Parameters
    ----------
    path : str
        Directory containing ``vid_info.csv``, ``main_vv_seq_train.csv`` and
        ``candidate_items_A.csv``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(vid_info, seq_train, candidate_items)``.

        * ``vid_info`` is sorted by ``cid`` then ``serialno``; its ``stars``,
          ``tags`` and ``key_word`` cells are parsed from their textual form
          into Python objects.
        * ``seq_train`` is sorted by ``did`` then ``seq_no`` and carries a
          fresh 0..n-1 index; the pre-sort row index is kept as an ``index``
          column.
        * ``candidate_items`` is returned exactly as read.

    Raises
    ------
    ValueError, SyntaxError
        If a ``stars``/``tags``/``key_word`` cell is not a plain Python literal.
    """
    # Imported here so the function works regardless of which cells have run.
    import ast

    vid_info = pd.read_csv(os.path.join(path, 'vid_info.csv'))
    # literal_eval only accepts Python literals (lists, strings, numbers, ...),
    # so a crafted CSV cell cannot execute code the way it could with eval().
    for column in ('stars', 'tags', 'key_word'):
        vid_info[column] = vid_info[column].apply(ast.literal_eval)
    vid_info = vid_info.sort_values(by=['cid', 'serialno'])

    seq_train = pd.read_csv(os.path.join(path, 'main_vv_seq_train.csv'))
    # reset_index() without drop=True keeps the old row labels as an 'index' column.
    seq_train = seq_train.sort_values(by=['did', 'seq_no']).reset_index()

    candidate_items = pd.read_csv(os.path.join(path, 'candidate_items_A.csv'))
    return vid_info, seq_train, candidate_items


# Load the three tables from the default 'data_v2/' directory.
vid_info, seq_train, candidate_items = read_data()

Populating the interactive namespace from numpy and matplotlib


In [14]:
from transformers import AutoTokenizer

import torch
from torch.utils.data import DataLoader, Dataset

from transformers import AutoConfig, AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

In [15]:
# Start from the stock bert-base-uncased tokenizer; candidate vids are added to it in a later cell.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/lyz/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolv

In [16]:
# Add every candidate vid to the tokenizer vocabulary; the displayed value is the number of tokens added.
# NOTE(review): assumes the vid column holds strings — confirm against the CSV.
tokenizer.add_tokens(candidate_items['vid'].tolist())

13406

In [4]:
# Minimal custom dataset for masked-LM pre-training
class PreTrainDataset(Dataset):
    """Serve item sequences as fixed-length tokenizer encodings.

    Parameters
    ----------
    data_list : iterable of sequences of str
        One sequence of item tokens per example (a list, a pandas Series of
        lists, ...). It is copied into a plain list on construction.
    tokenizer : object
        Tokenizer exposing ``encode_plus``.
    max_seq_len : int
        Length every example is padded or truncated to.
    """

    def __init__(self, data_list, tokenizer, max_seq_len):
        super().__init__()
        # Materialise into a list so that integer indices are always
        # positional. A pandas Series would otherwise resolve ``series[i]``
        # against its labels (KeyError for an integer index, deprecated
        # positional fallback for other index types).
        self.data_list = list(data_list)
        self.len = len(self.data_list)
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __getitem__(self, index):
        """Return ``input_ids``, ``token_type_ids`` and ``attention_mask`` as long tensors."""
        example = self.data_list[index]
        # Items are joined with spaces so the tokenizer sees one "sentence" per sequence.
        data = self.tokenizer.encode_plus(' '.join(example), return_token_type_ids=True, padding='max_length', truncation=True,
                                          return_attention_mask=True, max_length=self.max_seq_len)
        # The slice is a safety net in case the tokenizer returns more than max_seq_len entries.
        return {key: torch.tensor(data[key][:self.max_seq_len], dtype=torch.long)
                for key in ('input_ids', 'token_type_ids', 'attention_mask')}

    def __len__(self):
        """Number of examples."""
        return self.len

In [17]:
# Collect each did's vids into one list, giving a Series indexed by did.
# seq_train was sorted by (did, seq_no) in read_data, so each list follows seq_no order.
seq_vid_list = seq_train.groupby(['did'])['vid'].apply(list)

In [18]:
# The last 2000 sequences are held out for validation; everything before them
# is used for training. Both datasets pad/truncate to 16 tokens.
train_dataset = PreTrainDataset(data_list=seq_vid_list[:-2000], tokenizer=tokenizer, max_seq_len=16)

val_dataset = PreTrainDataset(data_list=seq_vid_list[-2000:], tokenizer=tokenizer, max_seq_len=16)

In [7]:
# Use CUDA when torch can see a device, otherwise the CPU.
# NOTE(review): the first cell sets CUDA_VISIBLE_DEVICES to "", which looks like it forces the CPU path — confirm this is intended.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the model
# Start from the bert-base-uncased configuration, with the vocabulary sized to
# the extended tokenizer.
config = AutoConfig.from_pretrained('bert-base-uncased')
config.vocab_size = len(tokenizer)
# from_config instantiates the network from the configuration object only.
model = AutoModelForMaskedLM.from_config(config)
# NOTE(review): config.vocab_size already equals len(tokenizer), so this resize looks redundant — confirm before removing.
model.resize_token_embeddings(len(tokenizer))

model.to(device)
# Define the trainer
# eval_steps is None, and the training log shows an evaluation every 100 steps
# (matching logging_steps) and a checkpoint every 200 steps (save_steps).
training_args = TrainingArguments(
                          output_dir='pretrain_bert',
                          overwrite_output_dir=True,
                          do_train=True, 
                          do_eval=True,
                          per_device_train_batch_size=256,
                          per_device_eval_batch_size=256,
                          evaluation_strategy='steps',
                          logging_steps=100,
                          eval_steps = None,
                          prediction_loss_only=True,
                          learning_rate = 1e-5,
                          weight_decay=0.01,
                          adam_epsilon = 1e-8,
                          max_grad_norm = 1.0,
                          num_train_epochs = 10,
                          save_steps = 200,
                        push_to_hub=False
)

# The collator builds the masked-LM batches; mlm_probability is the masking rate.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

# Classification, top 6 (NOTE(review): leftover note, intent unclear — confirm or remove)

In [None]:
# Run masked-LM pre-training, then write the final weights to the trainer's output directory.
trainer.train()
trainer.save_model()
# Save the tokenizer next to the model. It was extended with the candidate
# vids after loading, so the stock bert-base-uncased tokenizer no longer matches
# the model's vocabulary; without this the saved model cannot be reloaded
# together with the token ids it was trained on.
# The trailing semicolon hides the returned file list from the cell output.
tokenizer.save_pretrained(trainer.args.output_dir);

***** Running training *****
  Num examples = 168909
  Num Epochs = 10
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 6600


Step,Training Loss,Validation Loss
100,8.0598,6.861248
200,6.2097,5.313494
300,4.9362,4.277523
400,4.064,3.528285


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 256
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 256
Saving model checkpoint to pretrain_bert/checkpoint-200
Configuration saved in pretrain_bert/checkpoint-200/config.json
Model weights saved in pretrain_bert/checkpoint-200/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 256
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 256
Saving model checkpoint to pretrain_bert/checkpoint-400
Configuration saved in pretrain_bert/checkpoint-400/config.json
Model weights saved in pretrain_bert/checkpoint-400/pytorch_model.bin


In [58]:
from transformers import AutoTokenizer

import torch
from torch.utils.data import DataLoader, Dataset

from transformers import AutoConfig, AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

# Reload the trained masked-LM weights from the step-6600 checkpoint.
# The tokenizer is not reloaded (line below is disabled); the in-memory `tokenizer` from the earlier cells is reused.
# tokenizer = AutoTokenizer.from_pretrained('./pretrain_bert/checkpoint-6600/')
model = AutoModelForMaskedLM.from_pretrained('./pretrain_bert/checkpoint-6600/')
# NOTE(review): this cell's output shows the checkpoint config with vocab_size 30522, while the
# resize below yields Embedding(43928, 768). The extra rows therefore appear to be created here
# rather than loaded from the checkpoint — confirm the checkpoint was trained with the extended vocabulary.
model.resize_token_embeddings(len(tokenizer))

loading configuration file ./pretrain_bert/checkpoint-6600/config.json
Model config BertConfig {
  "_name_or_path": "./pretrain_bert/checkpoint-6600/",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file ./pretrain_bert/checkpoint-6600/pytorch_model.bin
All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the model checkpoint a

Embedding(43928, 768)

In [76]:
# Minimal custom dataset for next-item prediction
class TestDataset(Dataset):
    """Serve item sequences whose final item is replaced by ``[MASK]``.

    Each example keeps only its most recent items, so that the sequence fits
    into ``max_seq_len`` tokens once two slots are reserved for the special
    tokens the tokenizer adds around it, and ends with ``[MASK]``.

    Parameters
    ----------
    data_list : iterable of sequences of str
        One sequence of item tokens per example (a list, a pandas Series of
        lists, ...). The sequences themselves are never modified.
    tokenizer : object
        Tokenizer exposing ``encode_plus``.
    max_seq_len : int
        Length every example is padded or truncated to.
    """

    def __init__(self, data_list, tokenizer, max_seq_len):
        super().__init__()
        # Materialise into a list so that integer indices are always
        # positional, whatever index a pandas Series input carries.
        self.data_list = list(data_list)
        self.len = len(self.data_list)
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __getitem__(self, index):
        """Return ``input_ids``, ``token_type_ids`` and ``attention_mask`` as long tensors.

        Raises ``IndexError`` if the stored sequence is empty.
        """
        # Work on a copy: writing '[MASK]' into the stored list would overwrite
        # the caller's data and erase the true last item.
        example = list(self.data_list[index])
        example[-1] = '[MASK]'
        # Two positions are reserved for special tokens (14 items for max_seq_len=16).
        # NOTE(review): assumes every item encodes to a single token; items that
        # split into several pieces can still push [MASK] past the truncation point.
        keep = max(self.max_seq_len - 2, 1)
        data = self.tokenizer.encode_plus(' '.join(example[-keep:]), return_token_type_ids=True, padding='max_length', truncation=True,
                                          return_attention_mask=True, max_length=self.max_seq_len)
        return {key: torch.tensor(data[key][:self.max_seq_len], dtype=torch.long)
                for key in ('input_ids', 'token_type_ids', 'attention_mask')}

    def __len__(self):
        """Number of examples."""
        return self.len

In [77]:
# Rebuild the validation set (last 2000 sequences) with the masked-last-item
# dataset, and iterate over it in its stored order in batches of 16.
val_dataset = TestDataset(data_list=seq_vid_list[-2000:], tokenizer=tokenizer, max_seq_len=16)

val_dataloader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

In [78]:
# Take the first validation batch. Unlike a for/break loop, next() raises
# StopIteration on an empty loader instead of silently leaving `data`
# undefined or stale from an earlier run.
data = next(iter(val_dataloader))

In [79]:
# Inference only: no_grad avoids building the autograd graph, so the logits
# carry no gradient history.
with torch.no_grad():
    pred = model(input_ids=data['input_ids'], attention_mask=data['attention_mask']).logits

In [80]:
# Convert the logits to a NumPy array. detach() replaces the legacy `.data`
# accessor and cpu() makes this work wherever the model runs. The tensor check
# keeps the cell re-runnable once `pred` is already an array.
if torch.is_tensor(pred):
    pred = pred.detach().cpu().numpy()

In [82]:
# Row of the batch to inspect.
idx = 0
# Token ids of that example as a NumPy array.
input_ids = data['input_ids'][idx].numpy()

2683

In [89]:
# Find the [MASK] slot using the tokenizer's own mask id rather than a hard-coded 103.
mask_positions = np.where(input_ids == tokenizer.mask_token_id)[0]
if len(mask_positions) == 0:
    # Happens when truncation cut the trailing [MASK] off the encoded sequence.
    raise ValueError('no [MASK] token found in input_ids')
# Decode the highest-scoring vocabulary entry at the masked position of example `idx`.
tokenizer.decode(int(pred[idx, mask_positions[0]].argmax()))

'##9'