In [None]:
import torch
from transformers import LineByLineTextDataset

# 数据预处理

In [None]:
# Input files for this notebook. They are tab-separated; the preprocessing
# cell below only uses the first two columns of each line.
train_path = './data/gaiic_track3_round1_train_20210228.tsv'
test_path = './data/gaiic_track3_round1_testA_20210228.tsv'

In [None]:
# Vocabulary file: written by this notebook with one token per line.
vocab_file = './tokens/tokens.txt'

In [None]:
# Pre-training corpus file: one "sentence1 [SEP] sentence2" pair per line.
raw_text = './data/raw_text.txt'

In [None]:
def _read_sentence_pairs(path):
    """Return the first two tab-separated columns of every non-blank line.

    Args:
        path: Path of a utf-8 encoded, tab-separated file.

    Returns:
        A list of ``(first_sentence, second_sentence)`` tuples in file order.
    """
    pairs = []
    with open(path, encoding="utf-8") as f:
        for line in f.read().splitlines():
            # A blank line has no second column and would otherwise crash.
            if not line.strip():
                continue
            first, second = line.split('\t')[0:2]
            pairs.append((first, second))
    return pairs


train_pairs = _read_sentence_pairs(train_path)
test_pairs = _read_sentence_pairs(test_path)

# One "sentence1 [SEP] sentence2" string per pair; train lines first, then test.
train_result = [first + ' [SEP] ' + second for first, second in train_pairs]
test_result = [first + ' [SEP] ' + second for first, second in test_pairs]
all_result = train_result + test_result

# The vocabulary is every space-separated token seen in either file.
vocab = set()
for first, second in train_pairs + test_pairs:
    vocab.update(first.split(' '))
    vocab.update(second.split(' '))

# Write with the same explicit encoding that was used for reading, instead of
# the platform default.
with open(raw_text, 'w', encoding="utf-8") as f:
    for key in all_result:
        f.write(str(key) + '\n')

# Sort the tokens: iterating a set of strings has no stable order between
# Python processes, so an unsorted dump would assign different ids to the same
# token on every run and no longer match a checkpoint trained earlier.
vocab = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'] + sorted(vocab)

In [None]:
# Dump the vocabulary one token per line. The encoding is given explicitly so
# the file does not depend on the platform default (the data was read as utf-8).
with open(vocab_file, 'w', encoding='utf-8') as f:
    f.writelines(str(key) + '\n' for key in vocab)

In [None]:
from transformers import BertTokenizer, LineByLineTextDataset
# Build the tokenizer from the vocabulary file. `vocab_file` is reused instead
# of repeating the path literal so the writer and the reader cannot drift apart.
tokenizer = BertTokenizer(vocab_file=vocab_file)

In [None]:
# Masked-LM training set built from the corpus file written earlier.
BLOCK_SIZE = 128  # maximum sequence length
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=raw_text, block_size=BLOCK_SIZE)

In [None]:
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling

# Architecture of the encoder that is pre-trained from scratch. The embedding
# table is sized 1000 entries larger than the tokenizer vocabulary.
bert_config_kwargs = dict(
    vocab_size=tokenizer.vocab_size + 1000,
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=512,
)
config = BertConfig(**bert_config_kwargs)

# Randomly initialised model with a masked-language-modelling head.
model = BertForMaskedLM(config)

# Collator configured for masked LM with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [None]:
from transformers import Trainer, TrainingArguments

# Pre-training run configuration; checkpoints are written under ./mybert/.
training_args = TrainingArguments(
    output_dir='./mybert/', overwrite_output_dir=True,
    num_train_epochs=100, per_device_train_batch_size=96,
    save_steps=10000, save_total_limit=2,
    prediction_loss_only=True,
)

# Wire the model, the corpus dataset and the masked-LM collator together.
trainer = Trainer(model=model, args=training_args, train_dataset=dataset, data_collator=data_collator)

In [None]:
# Run the masked-LM pre-training, then save the resulting model to ./mybert,
# the directory the fine-tuning section loads its encoder from.
trainer.train()
trainer.save_model("./mybert")

# 训练

In [1]:
from transformers import RobertaTokenizerFast,BertTokenizerFast,BertTokenizer
from torch.utils.data.dataset import Dataset
from transformers import DataCollatorForLanguageModeling,DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
from transformers import RobertaTokenizerFast,BertTokenizer,RobertaTokenizer
from transformers import RobertaModel,BertModel
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader
import math

import torch
import numpy as np
import torch.nn as nn


In [2]:
# Tab-separated input files: two sentence columns per line; a third column,
# when present, is parsed as an integer label by load_data below.
train_path = './data/gaiic_track3_round1_train_20210228.tsv'
test_path = './data/gaiic_track3_round1_testA_20210228.tsv'

In [3]:
def load_data(path):
    """Load sentence pairs (and labels, when present) from a tab-separated file.

    Each non-blank line holds two sentences in its first two columns and,
    optionally, an integer label in the third. The two sentences are joined
    with a single space to form one sample. Blank or whitespace-only lines are
    skipped instead of raising ``IndexError`` on the missing second column.

    Args:
        path: Path of the utf-8 encoded file.

    Returns:
        A tuple ``(samples, labels, max_length)``: the joined sentence pairs,
        the integer labels (empty when no line has a third column), and the
        length in characters of the longest sample (0 when there are none).
    """
    samples = []
    labels = []
    with open(path, encoding="utf-8") as f:
        for line in f.read().splitlines():
            if not line.strip():
                continue
            columns = line.split('\t')
            samples.append(columns[0] + ' ' + columns[1])
            if len(columns) > 2:
                labels.append(int(columns[2]))
    # This is a character count (spaces included), not a token count.
    max_length = max(map(len, samples), default=0)
    return samples, labels, max_length

In [4]:
from sklearn.model_selection import train_test_split

train_texts, train_labels, max_length = load_data(train_path)
test_texts, _, _ = load_data(test_path)

# Hold out 10% of the training pairs for validation. The seed is fixed so that
# re-running the notebook yields the same split and comparable validation AUC.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=.1, random_state=42
)

In [5]:
# Display the number of samples in the training and validation splits.
f'train: {len(train_texts)},val:{len(val_texts)}'

'train: 90000,val:10000'

In [6]:
# Rebuild the tokenizer from the vocabulary file and load the encoder weights
# from the ./mybert checkpoint directory.
tokenizer = BertTokenizer(vocab_file='./tokens/tokens.txt')
bert = BertModel.from_pretrained('./mybert')

Some weights of BertModel were not initialized from the model checkpoint at ./mybert and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
class MatchingDataset(torch.utils.data.Dataset):
    """Sentence-pair texts that are tokenized lazily, one item per access.

    Args:
        texts: Sequence of strings, each already containing both sentences.
        labels: Sequence of labels aligned with ``texts``, or ``None`` / empty
            for unlabelled data.
        tokenizer: Callable used to encode one text. It is invoked as
            ``tokenizer(text, add_special_tokens=True, truncation=True,
            max_length=max_length)`` and must return a mapping.
        max_length: Truncation limit forwarded to the tokenizer. The default
            of 512 equals the ``max_position_embeddings`` the encoder in this
            notebook is configured with.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        # Keep the tokenizer that was passed in. Previously the argument was
        # ignored and notebook globals named `tokenizer` and `max_length` were
        # read instead, so the class only worked by accident of kernel state.
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Same meaning as the former truthiness test, but also safe for
        # array-like label containers.
        self.has_labels = labels is not None and len(labels) > 0

    def __getitem__(self, idx):
        """Return the encoding of item ``idx``, plus a float ``labels`` tensor when labelled."""
        examples = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
        )
        if self.has_labels:
            # Float dtype because the training loss is a binary cross-entropy.
            examples['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return examples

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)


In [9]:
class TextMatching(nn.Module):
    """Binary classifier head on top of an encoder's pooled output.

    Args:
        embeding_layer: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and must return an object
            exposing ``pooler_output`` whose last dimension is
            ``embeding_size``.
        embeding_size: Width of the pooled representation.
    """

    def __init__(self, embeding_layer, embeding_size):
        super().__init__()
        self.embeding_layer = embeding_layer
        self.embeding_size = embeding_size
        self.liner = nn.Linear(embeding_size, embeding_size)
        self.dropout = nn.Dropout(0.1)

        self.liner2 = nn.Linear(embeding_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one probability per input row, as a tensor of shape ``(batch,)``."""
        hidden = self.embeding_layer(input_ids=input_ids, attention_mask=attention_mask)
        hidden = self.dropout(hidden.pooler_output)
        hidden = self.liner(hidden)
        hidden = self.liner2(hidden)
        output = self.sigmoid(hidden)
        # Drop only the trailing size-1 feature dimension. A bare squeeze()
        # would also remove the batch dimension when the batch holds a single
        # sample, yielding a 0-d tensor that no longer matches the label shape
        # and cannot be concatenated with other batches.
        return output.squeeze(-1)

In [10]:
# Put the classification head on top of the loaded encoder (768 is the hidden
# size the encoder was configured with) and move everything to `device`.
model = TextMatching(bert,768)
model.to(device)

TextMatching(
  (embeding_layer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21605, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [11]:
# DataCollatorWithPadding serves as the collate function for every loader.
data_collator = DataCollatorWithPadding(tokenizer)

train_dataset = MatchingDataset(train_texts, train_labels, tokenizer)
val_dataset = MatchingDataset(val_texts, val_labels, tokenizer)

# Both loaders share the same batch size, shuffling and collation settings.
loader_options = dict(batch_size=64, shuffle=True, collate_fn=data_collator)
train_loader = DataLoader(train_dataset, **loader_options)
val_loader = DataLoader(val_dataset, **loader_options)

In [12]:
# Fixed learning rate; `lr` is also read by train() for its progress lines.
lr = 1e-5
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Binary cross-entropy applied to the probabilities the model outputs.
criterion = nn.BCELoss()
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [13]:
import time
def train():
    """Run one training pass over ``train_loader`` and print running statistics.

    Reads the notebook globals ``model``, ``train_loader``, ``optimizer``,
    ``criterion``, ``device``, ``lr`` and ``epoch`` (the last two are only used
    in the progress line). A line is printed after every ``log_interval``
    batches with the mean loss of exactly those batches.
    """
    log_interval = 200  # number of batches covered by each progress line
    model.train()  # switch to training mode
    total_loss = 0.
    start_time = time.time()
    num_batches = len(train_loader)
    for batch_index, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        output = model(input_ids, attention_mask)
        loss = criterion(output, batch['labels'].to(device))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        total_loss += loss.item()

        # Count completed batches so every window holds exactly `log_interval`
        # losses. Testing the zero-based index made the first window sum 201
        # batches while still dividing by 200.
        batches_done = batch_index + 1
        if batches_done % log_interval == 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.6f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batches_done, num_batches, lr,
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()


def evaluate(eval_model, val_loader):
    """Return the ROC AUC of ``eval_model`` over all batches of ``val_loader``.

    Args:
        eval_model: Model called as ``eval_model(input_ids, attention_mask)``;
            it is switched to eval mode and left in that mode.
        val_loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``labels`` entries.

    Returns:
        The value of ``roc_auc_score(y_true, y_pred)`` over every sample seen.

    Uses the notebook global ``device`` to place the inputs.
    """
    eval_model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            output = eval_model(input_ids, attention_mask)
            # Flatten each batch: a model that squeezes its output returns a
            # 0-d array for a batch of one, which np.concatenate rejects.
            y_pred.append(output.cpu().numpy().reshape(-1))
            y_true.append(batch['labels'].numpy().reshape(-1))
    return roc_auc_score(np.concatenate(y_true), np.concatenate(y_pred))

In [14]:
import copy

# AUC is maximised, so start below any attainable score. Starting from +inf
# meant `val_auc > best_val_auc` could never be true and no model was kept.
best_val_auc = float("-inf")
epochs = 5
best_model = None
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_auc = evaluate(model, val_loader)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid auc {:5.5f} '.format(epoch, (time.time() - epoch_start_time),val_auc))
    print('-' * 89)

    if val_auc > best_val_auc:
        # Update the tracked best score itself (it used to be written to an
        # unrelated name, so the threshold never moved).
        best_val_auc = val_auc
        # Snapshot the weights: a plain reference would alias `model` and keep
        # changing as training continues.
        best_model = copy.deepcopy(model)

| epoch   1 |   200/ 1407 batches | lr 0.000010 | ms/batch 147.76 | loss  0.52 | ppl     1.68
| epoch   1 |   400/ 1407 batches | lr 0.000010 | ms/batch 142.60 | loss  0.46 | ppl     1.58
| epoch   1 |   600/ 1407 batches | lr 0.000010 | ms/batch 143.66 | loss  0.42 | ppl     1.52
| epoch   1 |   800/ 1407 batches | lr 0.000010 | ms/batch 146.42 | loss  0.39 | ppl     1.48
| epoch   1 |  1000/ 1407 batches | lr 0.000010 | ms/batch 142.41 | loss  0.39 | ppl     1.47
| epoch   1 |  1200/ 1407 batches | lr 0.000010 | ms/batch 141.16 | loss  0.37 | ppl     1.45
| epoch   1 |  1400/ 1407 batches | lr 0.000010 | ms/batch 142.57 | loss  0.36 | ppl     1.44
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 209.58s | valid auc 0.92586 
-----------------------------------------------------------------------------------------
| epoch   2 |   200/ 1407 batches | lr 0.000010 | ms/batch 140.87 | loss  0.32 | ppl     1.38
| epoch   2 

KeyboardInterrupt: 

In [15]:
# ROC AUC of the current model on the training batches.
evaluate(model, train_loader)

0.991646027756472

# test

In [16]:
# No labels are passed for the test set (None), and shuffling is off so the
# predictions come out in the same order as the lines of the test file.
test_dataset = MatchingDataset(test_texts,None,tokenizer)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False,collate_fn=data_collator)

In [17]:
# Score every test pair with the current model, without gradient tracking.
model.eval()
y_pred = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        output = model(input_ids, attention_mask)
        # Flatten each batch: if the last batch holds a single sample and the
        # model squeezes its output, the result is 0-d and np.concatenate
        # would raise.
        y_pred.append(output.cpu().numpy().reshape(-1))
y_pred = np.concatenate(y_pred)


In [18]:
import os

result_path = './result/result.txt'
# open(..., 'w') does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
# One predicted score per line, in test-file order.
with open(result_path, 'w') as f:
    f.writelines(str(val) + '\n' for val in y_pred)