In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
import torch.nn.functional as F
import yaml
from llama.tokenizer import Tokenizer
from torch.utils.data import DataLoader, Dataset

In [2]:
def read_csv(file_path):
    """Return the entire content of a UTF-8 text file as a single string.

    Despite the name, no CSV parsing is performed: the file is read verbatim.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file content.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return handle.read()

def read_yaml(config_path):
    """Parse a UTF-8 YAML file and return the resulting Python object.

    Args:
        config_path: Path of the YAML file.

    Returns:
        Whatever ``yaml.load`` builds from the document (a dict for a mapping at top level).

    NOTE(review): the file is parsed with ``yaml.FullLoader``; ``yaml.safe_load`` would be the
    stricter choice if the config could ever come from an untrusted source.
    """
    with open(config_path, mode="r", encoding="utf-8") as stream:
        return yaml.load(stream, Loader=yaml.FullLoader)

def split_text2chunk(text, chunk_size=512):
    """Lazily cut a long sequence into consecutive chunks.

    Args:
        text: Any sliceable sequence with a length (a string, a list of token ids, ...).
        chunk_size: Maximum number of items per chunk.

    Yields:
        Slices ``text[i:i + chunk_size]`` for ``i = 0, chunk_size, 2 * chunk_size, ...``;
        the last slice may be shorter.
    """
    starts = range(0, len(text), chunk_size)
    yield from (text[begin:begin + chunk_size] for begin in starts)

# Load the tokenizer, the YAML config and the raw training text.
tokenizer = Tokenizer("./llama/tokenizer.model")
config = read_yaml("./config.yaml")
train_text = read_csv("./archive/train.csv")
print(len(train_text))  # number of characters in the raw training text
print(tokenizer.pad_id)  # pad id reported by the tokenizer

1003862
-1


In [3]:
import re
def split_text(text):
    """Split text into word tokens and stand-alone punctuation / whitespace marks.

    A token is either a run of word characters and apostrophes, or a single one of
    ``. , ! ? ; ( )``, newline or tab. Every other character (e.g. a space) is dropped.

    Args:
        text: The string to split.

    Returns:
        The list of tokens in their order of appearance.
    """
    token_pattern = re.compile(r"[\w']+|[.,!?;()\n\t]")
    return [match.group(0) for match in token_pattern.finditer(text)]
# Word-level view of the corpus. In the cells visible here it is only referenced by the
# disabled "split by words first" experiment.
word_list = split_text(train_text)
print(len(word_list))

251283


In [4]:
class TextDataset(Dataset):
    """Map-style dataset that exposes an indexable collection of samples unchanged."""

    def __init__(self, data):
        # Keep a reference to the samples; nothing is copied or transformed.
        self.data = data

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        sample = self.data[idx]
        return sample

    def __len__(self):
        """Return the number of samples in the wrapped collection."""
        return len(self.data)

# Strategy 1: tokenize first, then cut the token stream into fixed-size chunks.
# Keeps every input the same length, but chunk boundaries may fall in the middle of a sentence.
train_data = tokenizer.encode(train_text, bos=False, eos=False)
print("分词后token数:", len(train_data))
# Chunks hold seq_len + 1 tokens so that inputs and next-token labels can both be sliced from them.
# Chunks shorter than the longest one are padded.
# NOTE(review): pad_sequence is called without padding_value, so it pads with its default (0),
# whereas tokenizer.pad_id printed -1 above. The model's pad mask (x == pad_id) therefore never
# matches the padded positions and the loss counts them -- confirm whether that is intended.
train_data_list = list(split_text2chunk(train_data, config["model_config"]["seq_len"]+1))
padded_train_data = torch.nn.utils.rnn.pad_sequence([torch.tensor(ids) for ids in train_data_list], batch_first=True)
train_dataset = TextDataset(padded_train_data)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Strategy 2 (disabled, kept below as a string): chunk by words first, then tokenize each chunk.
# Keeps the text coherent, but the resulting token sequences do not all have the same length.

'''
train_chunks = list(split_text2chunk(word_list, config["model_config"]["seq_len"]))
train_data_list = []
for chunk in train_chunks:
    tokens = ' '.join(chunk)
    tokenized_chunk = tokenizer.encode(tokens, False, False)
    train_data_list.append(tokenized_chunk)
padded_train_data = torch.nn.utils.rnn.pad_sequence([torch.tensor(ids) for ids in train_data_list], batch_first=True, padding_value = tokenizer.pad_id)
'''

分词后token数: 330059


'\ntrain_chunks = list(split_text2chunk(word_list, config["model_config"]["seq_len"]))\ntrain_data_list = []\nfor chunk in train_chunks:\n    tokens = \' \'.join(chunk)\n    tokenized_chunk = tokenizer.encode(tokens, False, False)\n    train_data_list.append(tokenized_chunk)\npadded_train_data = torch.nn.utils.rnn.pad_sequence([torch.tensor(ids) for ids in train_data_list], batch_first=True, padding_value = tokenizer.pad_id)\n'

In [5]:
print(len(train_dataset))  # number of chunks; each chunk holds seq_len + 1 tokens
print(len(train_dataloader)) # number of batches per epoch; the DataLoader groups 4 chunks per batch (batch_size=4)

644
161


In [6]:
class tokenEmbedding(nn.Module):
    """Lookup table that turns token ids into ``d_model``-dimensional vectors."""

    def __init__(self, vocab_size, d_model):
        # d_model is the embedding width of a single token.
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Return the embedding vectors for the integer ids in ``x`` (one extra trailing dim of size d_model)."""
        vectors = self.embedding(x)
        return vectors

class RotationEmbedding(nn.Module):
    """Rotary position embedding (RoPE) for attention queries and keys.

    Adjacent feature pairs ``(x[2i], x[2i+1])`` of the token at position ``p`` are rotated by
    the angle ``p * base ** (-2i / d_model)``.

    Args:
        max_len: Number of positions covered by the cos/sin table.
        d_model: Size of the last dimension of the tensors to rotate; must be even because
            features are rotated in pairs.
        device: Default device for ``cal_theta`` / ``cal_cos_sin`` when none is given.
            ``forward`` always builds the table on the device of its inputs.

    Raises:
        ValueError: If ``d_model`` is odd.
    """

    def __init__(self, max_len, d_model, device = 'cuda' if torch.cuda.is_available() else 'cpu'):
        super(RotationEmbedding, self).__init__()
        if d_model % 2 != 0:
            raise ValueError(f"RoPE rotates feature pairs, so d_model must be even, got {d_model}")
        self.max_len = max_len
        self.d_model = d_model
        self.device = device

    def cal_theta(self, base=10000, device=None):
        """Return the per-feature rotation frequencies, shape ``(d_model,)``.

        Each frequency appears twice in a row so that both members of a feature pair share it.
        """
        device = self.device if device is None else device
        pair_index = torch.arange(0, self.d_model, step=2, device=device)
        theta = 1 / (base ** (pair_index / self.d_model))
        return torch.repeat_interleave(theta, repeats=2)

    def cal_cos_sin(self, device=None):
        """Return the cos/sin table, shape ``(1, 1, max_len, d_model, 2)``.

        Index 0 of the last dimension holds the cosines, index 1 the sines.
        """
        device = self.device if device is None else device
        pos = torch.arange(0, self.max_len, device=device).unsqueeze(1)  # (max_len, 1)
        angles = pos * self.cal_theta(device=device)  # broadcast to (max_len, d_model)
        table = torch.stack([torch.cos(angles), torch.sin(angles)], dim=-1)  # (max_len, d_model, 2)
        return table.unsqueeze(0).unsqueeze(0)

    @staticmethod
    def _rotate_pairs(x):
        """Map every feature pair ``(a, b)`` to ``(-b, a)`` along the last dimension."""
        return torch.stack([-x[..., 1::2], x[..., ::2]], dim=-1).reshape(x.shape)

    def forward(self, q, k):
        """Rotate queries and keys according to their positions.

        Keys are taken to sit at positions ``0 .. k_len - 1``. Queries are aligned with the
        END of the keys, i.e. at positions ``k_len - q_len .. k_len - 1`` (which is
        ``0 .. q_len - 1`` when both have the same length, and the position of the newest
        key when a single query attends over a longer key sequence).

        Args:
            q: Tensor of shape ``(batch, n_head, q_len, d_model)``.
            k: Tensor of shape ``(batch, n_head, k_len, d_model)``.

        Returns:
            The rotated ``(q, k)`` with unchanged shapes.

        Raises:
            ValueError: If a sequence is longer than ``max_len`` or the feature size is not ``d_model``.
        """
        q_len, k_len = q.shape[-2], k.shape[-2]
        if max(q_len, k_len) > self.max_len:
            raise ValueError(
                f"sequence length {max(q_len, k_len)} exceeds the RoPE table size max_len={self.max_len}"
            )
        if q.shape[-1] != self.d_model or k.shape[-1] != self.d_model:
            raise ValueError(
                f"expected feature size {self.d_model}, got q={q.shape[-1]} and k={k.shape[-1]}"
            )

        # Build the table where the inputs live instead of on the device chosen at construction.
        table = self.cal_cos_sin(device=q.device)
        cos_pos, sin_pos = table[..., 0], table[..., 1]  # each (1, 1, max_len, d_model)

        # Slice to the positions actually present so shorter sequences neither fail nor
        # silently broadcast up to max_len.
        q_start = max(k_len - q_len, 0)
        q_span = slice(q_start, q_start + q_len)
        k_span = slice(0, k_len)

        q = q * cos_pos[:, :, q_span] + self._rotate_pairs(q) * sin_pos[:, :, q_span]
        k = k * cos_pos[:, :, k_span] + self._rotate_pairs(k) * sin_pos[:, :, k_span]
        return q, k
    
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Width of the input and output features.
        d_hidden: Width of the hidden layer.
        drop_prob: Dropout probability applied after the activation.
    """

    def __init__(self, d_model, d_hidden=2048, drop_prob=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_hidden)
        self.linear2 = nn.Linear(d_hidden, d_model)
        self.activate = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

        projections = (self.linear1, self.linear2)
        # Biases start at zero.
        for layer in projections:
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)
        # Weights are re-drawn with Xavier-normal init, linear1 before linear2.
        for layer in projections:
            nn.init.xavier_normal_(layer.weight)

    def forward(self, x):
        """Apply the two projections to the last dimension of ``x``; the shape is preserved."""
        hidden = self.dropout(self.activate(self.linear1(x)))
        return self.linear2(hidden)
    
class MultiheadAttention(nn.Module):
    """Multi-head self-attention with rotary position embedding and a KV-cache decoding path.

    Args:
        d_model: Width of the input and output features.
        seq_len: Number of positions covered by the rotary embedding table.
        n_heads: Number of attention heads; each head has ``d_model // n_heads`` features.
    """

    def __init__(self, d_model, seq_len, n_heads):
        super(MultiheadAttention, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.seq_len = seq_len
        self.head_dim = d_model // n_heads

        self.values = nn.Linear(d_model, self.head_dim * n_heads, bias=False)
        self.keys = nn.Linear(d_model, self.head_dim * n_heads, bias=False)
        self.queries = nn.Linear(d_model, self.head_dim * n_heads, bias=False)
        self.fc_out = nn.Linear(n_heads * self.head_dim, d_model)
        self.RoPE = RotationEmbedding(max_len=seq_len, d_model=self.head_dim)
        nn.init.kaiming_normal_(self.values.weight)
        nn.init.kaiming_normal_(self.keys.weight)
        nn.init.kaiming_normal_(self.queries.weight)
        nn.init.xavier_normal_(self.fc_out.weight)
        if self.fc_out.bias is not None:
            nn.init.zeros_(self.fc_out.bias)

    def _project(self, x):
        """Project ``x`` (batch, len, d_model) to per-head q, k, v of shape (batch, n_heads, len, head_dim)."""
        bs, length = x.shape[0], x.shape[1]

        def split_heads(t):
            return t.view(bs, length, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        return split_heads(self.queries(x)), split_heads(self.keys(x)), split_heads(self.values(x))

    def _rotate_at(self, t, start):
        """Apply RoPE to ``t`` (batch, n_heads, len, head_dim) for positions ``start .. start + len - 1``."""
        table = self.RoPE.cal_cos_sin().to(t.device)  # (1, 1, max_len, head_dim, 2)
        stop = start + t.shape[2]
        if stop > table.shape[2]:
            raise ValueError(
                f"position {stop - 1} is outside the RoPE table of {table.shape[2]} positions"
            )
        cos = table[:, :, start:stop, :, 0]
        sin = table[:, :, start:stop, :, 1]
        # Every feature pair (a, b) becomes (-b, a); combined with cos/sin this is a 2-D rotation.
        paired = torch.stack([-t[..., 1::2], t[..., ::2]], dim=-1).reshape(t.shape)
        return t * cos + paired * sin

    def _attend(self, queries, keys, values, mask):
        """Scaled dot-product attention followed by head merge and the output projection."""
        # (bs, n, q_len, d_head) @ (bs, n, d_head, k_len) -> (bs, n, q_len, k_len)
        weights = torch.matmul(queries, keys.transpose(-2, -1))
        if mask is not None:
            # Positions where the mask is True/1 are blocked.
            weights = weights.masked_fill(mask == 1, float("-1e10"))
        attention = F.softmax(weights / math.sqrt(self.head_dim), dim=-1)

        out = torch.matmul(attention, values)
        bs, q_len = out.shape[0], out.shape[2]
        out = out.permute(0, 2, 1, 3).contiguous().view(bs, q_len, self.n_heads * self.head_dim)
        return self.fc_out(out)

    def forward(self, x, mask, use_rope=True):
        '''
        Self-attention over a full sequence.

        x : (batch_size, seq_len, d_model)
        mask : broadcastable to (batch_size, n_heads, seq_len, seq_len); True/1 marks blocked
               positions. May be None.
        use_rope : rotate queries and keys with the rotary embedding before scoring.

        Returns a tensor of shape (batch_size, seq_len, d_model).
        '''
        queries, keys, values = self._project(x)
        if use_rope:
            queries, keys = self.RoPE(queries, keys)
        return self._attend(queries, keys, values, mask)

    def predict(self, x, mask, k_cache, v_cache, use_rope=True):
        '''
        Incremental decoding step with a key/value cache.

        x : (batch_size, new_len, d_model), the tokens that are new in this step.
        mask : broadcastable to (batch_size, n_heads, new_len, past_len + new_len), or None.
        k_cache, v_cache : the keys / values returned by the previous call, each
                           (batch_size, n_heads, past_len, head_dim), or None on the first call.

        Returns (out, keys, values): out is (batch_size, new_len, d_model); keys and values
        cover past and new tokens and are meant to be passed back in as the next caches.

        The cached keys are already rotated, so only the new queries and keys are rotated
        here, at positions past_len .. past_len + new_len - 1, before being appended.
        Rotating the concatenated keys again would apply RoPE twice to every cached key.
        '''
        queries, keys, values = self._project(x)
        past_len = 0 if k_cache is None else k_cache.shape[2]

        if use_rope:
            queries = self._rotate_at(queries, past_len)
            keys = self._rotate_at(keys, past_len)

        if k_cache is not None:
            keys = torch.cat([k_cache, keys], dim=2)
            values = torch.cat([v_cache, values], dim=2)
            # [B, H, past_len + new_len, D]

        out = self._attend(queries, keys, values, mask)
        return out, keys, values
    
class DecoderLayer(nn.Module):
    """Pre-norm decoder block: self-attention then feed-forward, each inside a residual connection.

    Args:
        d_model: Feature width.
        seq_len: Sequence length handed to the attention module.
        n_heads: Number of attention heads.
        d_hidden: Hidden width of the feed-forward network.
        drop_prob: Dropout probability (shared by both sub-layers and the feed-forward network).
        device: Accepted for signature compatibility; not used by this block.
    """

    def __init__(self, d_model, seq_len, n_heads, d_hidden, drop_prob=0.1, device = 'cuda'):
        super().__init__()
        self.pre_norm = nn.LayerNorm(d_model)
        self.attn = MultiheadAttention(d_model, seq_len=seq_len, n_heads=n_heads)
        self.ffn = FeedForward(d_model, d_hidden=d_hidden, drop_prob=drop_prob)
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.norm_ffn = nn.LayerNorm(d_model)
        # Both LayerNorms start with gains drawn from N(0, 0.01^2) and zero bias.
        for norm in (self.pre_norm, self.norm_ffn):
            nn.init.normal_(norm.weight, mean=0, std=1e-2)
            nn.init.zeros_(norm.bias)

    def forward(self, x, mask):
        """Run the block on ``x`` with attention mask ``mask``; the output has the shape of ``x``."""
        # Attention sub-layer: norm -> attention -> dropout -> residual add.
        attn_out = self.dropout1(self.attn(self.pre_norm(x), mask))
        residual = attn_out + x
        # Feed-forward sub-layer: norm -> FFN -> dropout (same module as above) -> residual add.
        ffn_out = self.dropout1(self.ffn(self.norm_ffn(residual)))
        return ffn_out + residual

class DecoderOnlyTransformer(nn.Module):
    """Decoder-only transformer language model.

    Token ids are embedded, passed through ``n_layers`` decoder blocks under a causal
    (+ padding) attention mask, layer-normalised and projected to vocabulary logits.

    Args:
        pad_id: Token id that marks padding in the input.
        voc_size: Vocabulary size (embedding rows and output logits).
        d_model: Feature width.
        seq_len: Sequence length handed to every decoder block.
        n_heads: Number of attention heads per block.
        n_layers: Number of decoder blocks.
        d_hidden: Hidden width of the feed-forward networks.
        drop_prob: Dropout probability.
        device: Stored on the instance and forwarded to the decoder blocks.
    """

    def __init__(self, pad_id, voc_size, d_model, seq_len, n_heads, n_layers, d_hidden, drop_prob=0.1, device = 'cuda' if torch.cuda.is_available() else 'cpu'):
        super().__init__()
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, seq_len, n_heads, d_hidden, drop_prob, device) for _ in range(n_layers)])
        self.embed = tokenEmbedding(vocab_size=voc_size, d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.last_norm = nn.LayerNorm(d_model)
        self.last_linear = nn.Linear(d_model, voc_size)
        self.pad_id = pad_id
        self.device = device
        nn.init.xavier_uniform_(self.last_linear.weight)
        # NOTE(review): the final LayerNorm gain starts near 0 (N(0, 0.01^2)) rather than at 1.
        nn.init.normal_(self.last_norm.weight, mean=0, std=1e-2)

    def create_mask(self, x):
        """Build the attention mask for a batch of token ids.

        Args:
            x: Integer tensor of shape ``(batch_size, seq_len)``.

        Returns:
            Bool tensor of shape ``(batch_size, 1, seq_len, seq_len)`` indexed as
            ``[batch, head, query, key]``. ``True`` marks a key the query must NOT attend to:
            either a later position (strict upper triangle) or a padding token.
        """
        seq_len = x.shape[1]
        # Block padding KEYS for every query: (batch_size, 1, 1, seq_len).
        pad_mask = (x == self.pad_id).unsqueeze(1).unsqueeze(2)
        # diagonal=1 keeps the diagonal unmasked so each token can attend to itself. Masking
        # the diagonal would leave the first query with no visible key, and a fully masked row
        # turns into uniform attention over ALL positions, future ones included.
        # The mask is created on the input's device so it always matches the attention scores.
        causal_mask = torch.ones(seq_len, seq_len, device=x.device).triu(diagonal=1).bool()
        return torch.logical_or(pad_mask, causal_mask)

    def forward(self, x):
        """Return next-token logits of shape ``(batch_size, seq_len, voc_size)`` for token ids ``x``."""
        mask = self.create_mask(x)
        x = self.embed(x)
        x = self.dropout1(x)
        for layer in self.decoder_layers:
            x = layer(x, mask)
        x = self.last_norm(x)
        x = self.last_linear(x)
        return x

In [7]:
# Model hyper-parameters come from the tokenizer and the YAML config.
pad_id = tokenizer.pad_id
voc_size = tokenizer.n_words
d_model = config["model_config"]['d_model']
n_heads = config["model_config"]['n_heads']
d_hidden = config["model_config"]['d_hidden']
drop_prob1 = config["model_config"]['drop_prob1']
drop_prob2 = config["model_config"]['drop_prob2']
n_layers = config["model_config"]['n_layers']
# Read the sequence length from the same config entry that sized the training chunks, so the
# model can never drift out of sync with the data (it used to be hard-coded to 512 here).
seq_len = config["model_config"]["seq_len"]

model = DecoderOnlyTransformer(pad_id=pad_id, d_model=d_model, n_heads=n_heads, d_hidden=d_hidden, drop_prob=drop_prob1, voc_size=voc_size, seq_len=seq_len, n_layers=n_layers)

In [8]:
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

In [9]:
# Loss function and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=config["training_config"]["learning_rate"])

num_epochs = 100 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Total number of optimizer steps; the first 10% of them are learning-rate warm-up steps.
total_steps = num_epochs * len(train_dataloader)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
print(f"训练总steps:{total_steps}, warm-up steps:{warmup_steps}")

# Directory for checkpoints: the current working directory.
import os
checkpoint_dir = os.getcwd()
print(f"checkpoint 路径:{checkpoint_dir}")

训练总steps:16100, warm-up steps:1610
checkpoint 路径:D:\学习\研1\机器学习\assignment3-transformer


In [34]:
# Optionally resume from a checkpoint written by the save step at the end of an epoch.
init_checkpoint = None
if init_checkpoint is not None:
    # map_location restores the tensors onto the current `device`, so a checkpoint saved on a
    # GPU can also be loaded on a machine without one.
    checkpoint = torch.load(init_checkpoint, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

cur_steps = 0  
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for batch_data in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        # Each batch is a tensor of token ids holding one extra token per row.
        input_ids = batch_data.to(device)
        # Labels are the inputs shifted by one position: the token at t predicts the token at t + 1.
        labels = input_ids[:, 1:].contiguous()
        input_ids = input_ids[:, :-1].contiguous()
        
        # Reset gradients accumulated by the previous step.
        optimizer.zero_grad()
        # (mixed precision via torch.cuda.amp.autocast() is currently disabled)
        # Forward pass.
        logits = model(input_ids)

        # Token-level cross entropy over the flattened batch.
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        
        # Backward pass, parameter update and learning-rate schedule update.
        loss.backward()
        optimizer.step()
        scheduler.step()
        cur_steps += 1
        running_loss += loss.item()
    
    # Mean loss over the batches of this epoch.
    avg_loss = running_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
    
    # Save model, optimizer and scheduler state under checkpoint_dir, so the files land in
    # the configured directory even if the working directory changes during the run.
    torch.save({'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict()},
        os.path.join(checkpoint_dir, f"checkpoint_steps{cur_steps}.pth"))

Epoch 1/100:   0%|                                                                             | 0/161 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 250.00 MiB. GPU 

In [10]:
import torch

# Full allocator report for the current CUDA device.
print(torch.cuda.memory_summary(device=None, abbreviated=False))

# Current and peak allocated memory, then current and peak reserved memory.
for read_stat in (
    torch.cuda.memory_allocated,
    torch.cuda.max_memory_allocated,
    torch.cuda.memory_reserved,
    torch.cuda.max_memory_reserved,
):
    print(read_stat())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 201977 KiB | 201977 KiB | 201977 KiB |      0 B   |
|       from large pool | 177152 KiB | 177152 KiB | 177152 KiB |      0 B   |
|       from small pool |  24825 KiB |  24825 KiB |  24825 KiB |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         | 201977 KiB | 201977 KiB | 201977 KiB |      0 B   |
|       from large pool | 177152 KiB | 177152 KiB | 177152 KiB |      0 B   |
|       from small pool |  24825 KiB |  24825 KiB |  24825 KiB |      0 B   |
|---------------------------------------------------------------