In [1]:
!rm -rf ~/.cache/huggingface/datasets
!pip install datasets transformers fsspec
!pip install -U datasets huggingface-hub fsspec
!pip install matplotlib

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
[0m

In [2]:
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/transformer')
sys.path.append('/content/drive/MyDrive/Colab Notebooks/transformer/other')

In [3]:
from embedding.TransformerEmbedding import TransformerEmbedding
from model.Encoder import Encoder
from model.Decoder import Decoder
from model.Transformer import Transformer
from other.dataloader import DataLoaderHF
from other.BLEU import bleu_stats, bleu, get_bleu

I am __init__ of embedding.py
I am __init__ of model.py
I am __init__ of blocks.py
I am __init__ of layers.py
I am __init__ of other.py


In [4]:
# 初始化模型，优化器，调度器，损失函数、
import datasets
import transformers
import torch
import torch.nn as nn
import math
import time
import gc

from torch import nn, optim
from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup

In [5]:
####################################################################################################
# 初始化参数
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
# 初始化模型参数
d_model = 512
max_len = 256
batch_size = 50
n_head = 8
n_layers = 6
ffn_hidden = 2048
drop_prob = 0.1

# 初始化优化器参数
init_lr = 5e-4
factor = 0.9
adam_eps = 5e-9
patience = 10
warmup = 20
epoches = 100
clip = 1.0
weight_decay = 5e-4
inf = float('inf')

cuda


In [6]:
# 临时关闭代理
import os
for k in ('HTTP_PROXY', 'HTTPS_PROXY', 'ALL_PROXY', 'http_proxy', 'https_proxy', 'all_proxy'):
    os.environ.pop(k, None)

In [7]:
# one epoch time
def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - elapsed_mins * 60)
  return elapsed_mins, elapsed_secs

In [8]:
#######################################################################################################
# 首先初始化数据
model_name = 'Helsinki-NLP/opus-mt-de-en'

dataloader = DataLoaderHF(model_name, max_len, batch_size, "<s>")

tokenizer = dataloader.tokenizer
tokenizer.add_special_tokens({'additional_special_tokens':['<s>']})
voc_size = len(tokenizer.get_vocab())
pad_id = tokenizer.pad_token_id
train_data, valid_data, test_data = dataloader.make_dataset()
train_iter, valid_iter, test_iter = dataloader.make_iter(train_data, valid_data, test_data)



Dataset initializing start


Generating train split:   0%|          | 0/29000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1014 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset loaded


Map (num_proc=6):   0%|          | 0/29000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/1014 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/1000 [00:00<?, ? examples/s]

Data Initializing done


In [9]:
step_per_epoch = len(train_iter)
total_steps = step_per_epoch * epoches
warmup_steps = int(total_steps * 0.1)

In [10]:
########################################################################################################
def count_parameters(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)

def initialize_weights(m):
  if hasattr(m, 'weight') and m.weight.dim() > 1:
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)

    elif isinstance(m, nn.Embedding):
        nn.init.xavier_uniform_(m.weight)

In [11]:
##########################################################################################################
# 建立模型
model = Transformer(pad_idx=pad_id,
                    enc_voc_size=voc_size,
                    dec_voc_size=voc_size,
                    d_model=d_model,
                    max_len=max_len,
                    batch_size=batch_size,
                    n_head=n_head,
                    n_layers=n_layers,
                    ffn_hidden=ffn_hidden,
                    drop_prob=drop_prob,
                    device=device)

model.to(device)
model = torch.compile(model)
print('model has {0} parameters'.format(count_parameters(model)))
model.apply(initialize_weights)

# 创建更新参数的优化器
optimizer = optim.AdamW(params=model.parameters(),
                       lr=init_lr,
                       weight_decay=weight_decay,
                       eps=adam_eps)

# 创建更新优化器中的学习率的调优器
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

# 创建损失函数
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)

model has 192937718 parameters


In [12]:
#######################################################################################################
# 混合精度训练
def train(model, iterator, optimizer, criterion, clip):

    model.train()
    epoch_loss = 0
    scaler = torch.amp.GradScaler(device='cuda')

    for i, batch in enumerate(iterator):
        src = batch['input_ids'].to(device, non_blocking=True)
        trg = batch['labels'].to(device, non_blocking=True)

        optimizer.zero_grad()
        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            output = model(src, trg)
            output_reshape = output[:, :-1, :]
            output_reshape = output_reshape.contiguous().view(-1, output_reshape.shape[-1]).to(device)
            trg = trg[:, 1:]
            trg = trg.contiguous().view(-1).to(device)
            loss = criterion(output_reshape, trg)

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        scaler.step(optimizer)
        scaler.update()

        scheduler.step()

        epoch_loss += loss.item()
        if (i % 100 == 0):
            print('{0} Loss {1}'.format(i, loss))


    torch.cuda.empty_cache()
    gc.collect()

    return epoch_loss / len(train_iter)

In [13]:
def evaluate(model, valid_iter, criterion):
    model.eval()
    epoch_loss = 0
    batch_bleu = []
    with torch.no_grad():
        for i, batch in enumerate(valid_iter):
            src = batch['input_ids']
            trg = batch['labels']

            output = model(src, trg)
            output = output[:, :-1, :]
            output_reshape = output.contiguous().view(-1, output.shape[-1]).to(device, non_blocking=True)
            trg = trg[:, 1:]
            trg_reshape = trg.contiguous().view(-1).to(device, non_blocking=True)
            loss = criterion(output_reshape, trg_reshape)
            epoch_loss += loss.item()

            total_bleu = []
            output = output.argmax(dim=2)
            for j in range(len(src)):
                try:
                    trg_words = tokenizer.decode(trg[j], skip_special_tokens=True)
                    output_words = tokenizer.decode(output[j], skip_special_tokens=True)
                    bleu = get_bleu(hypothesis=output_words.split(), reference=trg_words.split())
                    total_bleu.append(bleu)
                except:
                    pass
              
            total_bleu = sum(total_bleu) / len(total_bleu)
            batch_bleu.append(total_bleu)
    
    batch_bleu = sum(batch_bleu) / len(batch_bleu)
    return epoch_loss / len(valid_iter), batch_bleu

In [14]:
# 运行函数
def run(total_epoch, best_loss):
    train_losses, test_losses, bleus = [], [], []
    for epoch in range(total_epoch):
        start_time = time.time()
        train_loss = train(model, train_iter, optimizer, criterion, clip)
        valid_loss, bleu = evaluate(model, valid_iter, criterion)
        end_time = time.time()

        print('{0}'.format(epoch))
        print('#########################################################################################')
        
        train_losses.append(train_loss)
        test_losses.append(valid_loss)
        bleus.append(bleu)
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_loss:
            best_loss = valid_loss

    return train_losses, test_losses, bleus

In [15]:
if __name__ == '__main__':
    train_losses, test_losses, bleus = [], [], []
    best_loss = float('inf')
    # for epoch in range(epoches):
    #     train_loss = train(model, train_iter, optimizer, criterion, clip)
    #     print('num:{0}'.format(epoch))
    #     print('################################################################')
    start_time = time.time()
    train_losses, test_losses, bleus = run(epoches, float('inf'))
    end_time = time.time()

    a, b = epoch_time(start_time, end_time)
    print(f"mins: {a}  secs: {b}")
        # valid_loss, bleu = evaluate(model, valid_iter, criterion)

        # if epoch > warmup:
        #     scheduler.step(valid_loss)

        # train_losses.append(train_loss)
        # test_losses.append(valid_loss)
        # bleu.append(bleu)

        # if best_loss > valid_loss:
        #     best_loss = valid_loss



0 Loss 11.004965782165527
100 Loss 10.249210357666016
200 Loss 9.033368110656738
300 Loss 7.255722522735596
400 Loss 5.813625335693359
500 Loss 5.064912796020508




0
#########################################################################################
0 Loss 4.761106491088867
100 Loss 4.260387420654297
200 Loss 4.205906391143799
300 Loss 3.8410725593566895
400 Loss 3.499314069747925
500 Loss 3.422848701477051
1
#########################################################################################
0 Loss 3.1548242568969727
100 Loss 2.9649691581726074
200 Loss 3.1091396808624268
300 Loss 2.8790011405944824
400 Loss 2.7820448875427246
500 Loss 2.6913652420043945
2
#########################################################################################
0 Loss 2.471972703933716
100 Loss 2.5867252349853516
200 Loss 2.8457043170928955
300 Loss 2.3746941089630127
400 Loss 2.279916286468506
500 Loss 2.3407182693481445
3
#########################################################################################
0 Loss 2.1909682750701904
100 Loss 2.1771204471588135
200 Loss 2.1777660846710205
300 Loss 2.2945873737335205
400 Loss 2.187124729156494
500 

In [16]:
print(bleus)

[0.0, 9.875795959874146, 3.3927837667034995, 5.050207600091111, 5.906831458639922, 4.226477049872762, 5.118012574900071, 6.219179837408326, 6.180805019792224, 6.085400044176935, 6.104925228122004, 6.76304883560243, 7.399110317428595, 6.946918384126898, 7.013807080225429, 9.585033308575175, 34.49558811712024, 24.53919732613385, 7.062972980490431, 7.483544211991229, 12.15255822632561, 8.712227286848904, 17.0466680890478, 27.965782767444267, 14.526812360777729, 16.282525004998224, 12.698538985284026, 15.516432560706289, 30.527147491055167, 20.635193494831594, 32.75562017662942, 30.469631556944353, 29.8021651956092, 17.61778309917739, 33.99050406539969, 11.807823828479496, 14.826931653872215, 7.860769271612874, 12.611568871041873, 12.699572439585731, 20.484728277580945, 14.99663255372749, 8.40060080542877, 7.69060088863895, 8.657148527785493, 16.250862279004554, 25.322462769292823, 11.774591490942054, 10.02996015985222, 18.9840071423246, 25.18133834810779, 11.56101577975911, 15.47429028580

In [29]:
batch = next(iter(test_iter))

a = batch['input_ids'].to(device)
b = batch['labels'].to(device)

decoder_output = torch.full((batch_size, max_len), pad_id)
decoder_output[:, 0] = 58101

src_mask = model.make_src_mask(a)


model.eval()
encoder_output = model.encoder(a, src_mask)
for j in range(1, max_len):
    trg_mask = model.make_trg_mask(decoder_output)
    output = model.decoder(encoder_output, decoder_output, src_mask, trg_mask)
    output = output.argmax(dim=2)
    output = output[:, j-1]
    decoder_output[:, j] = output


    
# print(decoder_output.shape)

print("真实数据：{0}".format(tokenizer.decode(b[0], skip_special_tokens=True)))
print("模型数据：{0}".format(tokenizer.decode(decoder_output[0], skip_special_tokens=True)))
print('\n')
bleu = get_bleu(hypothesis=tokenizer.decode(b[0], skip_special_tokens=True).split(), reference=tokenizer.decode(decoder_output[0], skip_special_tokens=True).split())
print("bleu : {0}".format(bleu))
print('#############################################################################################################################################')
print("真实数据：{0}".format(tokenizer.decode(b[1], skip_special_tokens=True)))
print("模型数据：{0}".format(tokenizer.decode(decoder_output[1], skip_special_tokens=True)))
print('\n')
bleu = get_bleu(hypothesis=tokenizer.decode(b[1], skip_special_tokens=True).split(), reference=tokenizer.decode(decoder_output[1], skip_special_tokens=True).split())
print("bleu : {0}".format(bleu))
print('#############################################################################################################################################')
print("真实数据：{0}".format(tokenizer.decode(b[2], skip_special_tokens=True)))
print("模型数据：{0}".format(tokenizer.decode(decoder_output[2], skip_special_tokens=True)))
print('\n')
bleu = get_bleu(hypothesis=tokenizer.decode(b[2], skip_special_tokens=True).split(), reference=tokenizer.decode(decoder_output[2], skip_special_tokens=True).split())
print("bleu : {0}".format(bleu))
print('#############################################################################################################################################')
print("真实数据：{0}".format(tokenizer.decode(b[3], skip_special_tokens=True)))
print("模型数据：{0}".format(tokenizer.decode(decoder_output[3], skip_special_tokens=True)))
print('\n')
bleu = get_bleu(hypothesis=tokenizer.decode(b[3], skip_special_tokens=True).split(), reference=tokenizer.decode(decoder_output[3], skip_special_tokens=True).split())
print("bleu : {0}".format(bleu))
print('#############################################################################################################################################')
print("真实数据：{0}".format(tokenizer.decode(b[4], skip_special_tokens=True)))
print("模型数据：{0}".format(tokenizer.decode(decoder_output[4], skip_special_tokens=True)))
print('\n')
bleu = get_bleu(hypothesis=tokenizer.decode(b[4], skip_special_tokens=True).split(), reference=tokenizer.decode(decoder_output[4], skip_special_tokens=True).split())
print("bleu : {0}".format(bleu))
print('#############################################################################################################################################')

真实数据：Two men pretend to be statutes while women look on.
模型数据：Two men are kidding around as a woman watches them.......


bleu : 0
#############################################################################################################################################
真实数据：Two workers spread cement onto a brick building.
模型数据：Two workers paint a brick wall......


bleu : 34.29235337074011
#############################################################################################################################################
真实数据：A man walking in front of a colorful wall mural.
模型数据：A man is walking past buildings with red paint.......


bleu : 0
#############################################################################################################################################
真实数据：Three men wearing brightly colored costumes take to the streets with wigs and crazy sunglasses.
模型数据：Three men in colorful costumes and hats walking down the street with children. and illumin

In [26]:
# 计算测试集上的bleu分数
bleu = []
model.eval()
for i, batch in enumerate(test_iter):
    src = batch['input_ids'].to(device)
    trg = batch['labels'].to(device)

    src_mask = model.make_src_mask(src)
    
    decoder_output = torch.full((batch_size, max_len), pad_id)
    decoder_output[:, 0] = 58101

    encoder_output = model.encoder(src, src_mask)
    
    for j in range(1, max_len):
        trg_mask = model.make_trg_mask(decoder_output)
        output = model.decoder(encoder_output, decoder_output, src_mask, trg_mask)
        output = output.argmax(dim=2)
        output = output[:, j-1]
        decoder_output[:, j] = output
        
    for j in range(batch_size):
        decoder_text = tokenizer.decode(decoder_output[j], skip_special_tokens=True)
        trg_text = tokenizer.decode(trg[j], skip_special_tokens=True)
        single_bleu = get_bleu(hypothesis=decoder_text.split(), reference=trg_text.split())
        bleu.append(single_bleu)

print(bleu)
print(sum(bleu) / len(bleu))

[47.482783017056825, 25.360745629471985, 16.16580722379085, 36.964048837543345, 30.004970808395854, 0, 0, 0, 0, 14.85913933100421, 20.308841203825523, 28.025329094731028, 0, 35.21828198797041, 0, 20.34287273403357, 0, 34.855395234395, 21.52740777704674, 0, 68.00191071751857, 18.84159053341141, 0, 14.943890275422648, 0, 44.37189172059723, 0, 0, 23.00508533175412, 0, 18.402907523917012, 34.52982612828641, 23.629224506448573, 15.672862348588998, 9.495628650423482, 17.219907932801828, 10.299424308520393, 0, 0, 0, 36.212588723389636, 19.861626912448344, 0, 0, 0, 22.895303546549677, 0, 21.710521901753456, 0, 0, 44.45335269913894, 33.16551097737016, 0, 0, 37.42104642239127, 0, 0, 88.81727772008662, 0, 8.307729562967992, 13.240291795544309, 25.659264599011884, 0, 36.739131889547174, 17.741584358515635, 27.396299140846576, 0, 20.74120516909554, 0, 53.966304749888316, 0, 0, 32.024939956638946, 0, 44.74935415497117, 20.05593651049209, 26.83217561385724, 0, 0, 43.300714581232825, 24.03003002974465