In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader

from transformers import AutoModel, AutoTokenizer, AdamW
from datasets import load_dataset, load_from_disk
import evaluate
import math
import os
from tqdm import tqdm

In [2]:
os.environ["http_proxy"] = "http://172.23.236.216:7890"
os.environ["https_proxy"] = "http://172.23.236.216:7890"

In [3]:
path = "/data1/cuimenglong/huggingface/models/opus-mt-de-en"
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModel.from_pretrained(path)

Some weights of the model checkpoint at /data1/cuimenglong/huggingface/models/opus-mt-de-en were not used when initializing MarianModel: ['final_logits_bias']
- This IS expected if you are initializing MarianModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MarianModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def collate_fn(data):
    de = [example['translation']['de'] for example in data]
    en = [example['translation']['en'] for example in data]
    data = tokenizer.batch_encode_plus(de, padding=True, truncation=True, max_length=128, return_tensors='pt')

    with tokenizer.as_target_tokenizer():
        data['labels'] = tokenizer.batch_encode_plus(en, padding=True, truncation=True, max_length=128, return_tensors='pt')['input_ids']

    data['decoder_input_ids'] = torch.full_like(data['labels'], tokenizer.get_vocab()['<pad>'])
    data['decoder_input_ids'][:,1:] = data['labels'][:,:-1]

    return data

In [5]:
dataset = load_from_disk("/data1/cuimenglong/huggingface/datasets/wmt16/de-en")
train_dataloader = DataLoader(dataset['train'], batch_size=128, shuffle=True, 
                            drop_last=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(dataset['validation'], batch_size=128, shuffle=True, 
                            drop_last=True, collate_fn=collate_fn)
test_dataloader = DataLoader(dataset['test'], batch_size=128, shuffle=True, 
                            drop_last=True, collate_fn=collate_fn)

In [6]:
class Model(nn.Module):
    def __init__(self) :
        super().__init__()
        self.backbone = AutoModel.from_pretrained(path)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(512, tokenizer.vocab_size)
    
    def forward(self, input_ids, attention_mask, decoder_input_ids):
        out = self.backbone(input_ids, attention_mask, decoder_input_ids)
        out = out.last_hidden_state
        out = self.fc(self.dropout(out))

        return out

In [7]:
epochs = 3
model = Model()
optimizer = AdamW([
    {"params": model.backbone.parameters(), 'lr': 2e-5},
    {"params": model.fc.parameters(), 'lr': 5e-4}
])
criterion = nn.CrossEntropyLoss()
device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')

Some weights of the model checkpoint at /data1/cuimenglong/huggingface/models/opus-mt-de-en were not used when initializing MarianModel: ['final_logits_bias']
- This IS expected if you are initializing MarianModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MarianModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
def train():
    print("training")
    model.train()
    model.to(device)
    for epoch in range(epochs):
        epoch_loss = 0
        for i, data in enumerate(train_dataloader):
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            decoder_input_ids = data['decoder_input_ids'].to(device)
            labels = data['labels'].to(device)
            
            out = model(input_ids, attention_mask, decoder_input_ids)
            output_dim = out.shape[-1]
            out = out.view(-1, output_dim)
            labels = labels.view(-1)
            
            loss = criterion(out, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

            if i % 100 == 0:
                loss_item = epoch_loss / (i + 1)
                print('epoch:{}, idx:{}, loss:{}, PPL:{}'.format(epoch+1, i, loss_item, math.exp(loss_item)))

In [9]:
train()

training




epoch:1, idx:0, loss:12.518072128295898, PPL:273230.79728152603
epoch:1, idx:100, loss:2.3383696811033947, PPL:10.364325625449487
epoch:1, idx:200, loss:1.74026701729096, PPL:5.698864915001722
epoch:1, idx:300, loss:1.4678010544507607, PPL:4.3396819171657315
epoch:1, idx:400, loss:1.3133274716629353, PPL:3.7185264404946388
epoch:1, idx:500, loss:1.203749087637294, PPL:3.3325876944469015
epoch:1, idx:600, loss:1.1277288805426853, PPL:3.0886338720463002
epoch:1, idx:700, loss:1.0732607699494898, PPL:2.924901397764071
epoch:1, idx:800, loss:1.0288018958175673, PPL:2.7977118756046706
epoch:1, idx:900, loss:0.9940123193867331, PPL:2.702054256295985
epoch:1, idx:1000, loss:0.9647195555351593, PPL:2.6240516525138773


In [10]:
def compute_bleu(predictions, references):
    references = [[i] for i in references]
    metric = evaluate.load('bleu')
    metric_out = metric.compute(predictions=predictions, references=references)
    return metric_out

In [11]:
def translate(sentence, max_length=128):  # 给定一个德语句子，返回其翻译
    data = tokenizer.encode_plus(sentence, padding=True, truncation=True, max_length=128, return_tensors='pt')
    input_ids = data['input_ids'].to(device)
    attention_mask = data['attention_mask'].to(device)
    reference = [tokenizer.get_vocab()['<pad>']]
    for i in range(max_length):
        decoder_input_ids = torch.tensor(reference).unsqueeze(0).to(device)
        out = model(input_ids, attention_mask, decoder_input_ids)
        pred_token = out.argmax(dim=-1)[:,-1].item()
        if pred_token == 0:
            break
        reference.append(pred_token)
    return tokenizer.decode(reference[1:])

In [12]:
valid_references, valid_predictions = [], []
for item in tqdm(dataset['validation']['translation']):
    pred = translate(item['de'])
    valid_predictions.append(pred)
    valid_references.append(item['en'])
valid_bleu = compute_bleu(valid_predictions, valid_references)
print('valid bleu: ', valid_bleu)

100%|██████████| 2169/2169 [09:59<00:00,  3.62it/s]


valid bleu:  {'bleu': 0.24595684927854625, 'precisions': [0.5760426666666667, 0.3077215586274773, 0.1822562824702038, 0.11327670323683102], 'brevity_penalty': 1.0, 'length_ratio': 1.009301724694787, 'translation_length': 46875, 'reference_length': 46443}


In [13]:
test_references, test_predictions = [], []
for item in tqdm(dataset['test']['translation']):
    pred = translate(item['de'])
    test_predictions.append(pred)
    test_references.append(item['en'])
test_bleu = compute_bleu(test_predictions, test_references)
print('test bleu: ', test_bleu)

100%|██████████| 2999/2999 [14:05<00:00,  3.55it/s]


test bleu:  {'bleu': 0.27688030895299387, 'precisions': [0.5984797271921523, 0.33883400453058143, 0.21203916194753092, 0.13668339962828507], 'brevity_penalty': 1.0, 'length_ratio': 1.038179010901605, 'translation_length': 66567, 'reference_length': 64119}


In [16]:
def teacher_forcing(model, dataloader):  # 上帝视角,在翻译过程中使用teacher-forcing的方式
    model.eval()
    model = model.to(device)
    predictions = []
    references = []
    for data in dataloader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        decoder_input_ids = data['decoder_input_ids'].to(device)
        labels = data['labels'].to(device)
        
        out = model(input_ids, attention_mask, decoder_input_ids)
        pred = tokenizer.batch_decode(out.argmax(dim=2), skip_special_tokens=True)
        label = tokenizer.batch_decode(labels, skip_special_tokens=True)
        predictions.extend(pred)
        references.extend(label)
    
    return predictions, references

predictions, references = teacher_forcing(model, valid_dataloader)
valid_bleu_x = compute_bleu(predictions, references)
predictions_, references_ = teacher_forcing(model, test_dataloader)
test_bleu_x = compute_bleu(predictions_, references_)
print('valid bleu: ', valid_bleu_x)
print('test bleu: ', valid_bleu_x)



valid bleu:  {'bleu': 0.318229880337491, 'precisions': [0.63511027324142, 0.3905524190666762, 0.25104550121453434, 0.16469594594594594], 'brevity_penalty': 1.0, 'length_ratio': 1.0024818980827908, 'translation_length': 44027, 'reference_length': 43918}
test bleu:  {'bleu': 0.318229880337491, 'precisions': [0.63511027324142, 0.3905524190666762, 0.25104550121453434, 0.16469594594594594], 'brevity_penalty': 1.0, 'length_ratio': 1.0024818980827908, 'translation_length': 44027, 'reference_length': 43918}
