In [1]:
import sentencepiece as spm
import random
import torch

class GPTDataset():
    def __init__(self, model_path, file_path):
        self.model_path = model_path
        self.file_path = file_path
        self.sp = self.load_tokenizer()
        self.train_data, self.val_data = self.load_data()

    # 加载tokenizer
    def load_tokenizer(self):
        sp = spm.SentencePieceProcessor()
        sp.Load(self.model_path)
        return sp

    # 加载数据
    def load_data(self, split_rate=0.8):
        with open(self.file_path, "r", encoding="utf-8") as f:
            text = f.read()
        train_data = text[:int(len(text) * split_rate)]
        val_data = text[int(len(text) * split_rate):]
        sp = self.load_tokenizer()
        train_data = sp.Encode(train_data, out_type=int)
        val_data = sp.Encode(val_data, out_type=int)
        return train_data, val_data

    # 生成数据迭代器
    def seq_data(self, data="train_data", win_len=128, batch_size=10):
        if data == "train_data":
            data = self.train_data
        elif data == "val_data":
            data = self.val_data
        data = data[random.randint(0, win_len-1):]
        num_subseqs = (len(data)-1)//win_len
        num_indexs = list(range(0, num_subseqs*win_len, win_len))
        random.shuffle(num_indexs)
        #生成迭代器
        num_batch = len(num_indexs)//batch_size
        for i in range(0, num_batch*batch_size, batch_size):
            x = [data[j:j+win_len] for j in num_indexs[i:i+batch_size]]
            y = [data[j+1:j+win_len+1] for j in num_indexs[i:i+batch_size]]
            yield x, y





In [2]:
import torch
from torch import nn
from torchinfo import summary
import torch.nn.functional as F
import math

class GPTconfig:
    vocab_size: int = 20000  # 词表大小
    seq_len: int = 128  # 序列长度,即模型将接收和处理的每个样本中的单词数量
    embed_dim: int = 128  # 嵌入维度
    n_head: int = 4  # 注意力头数
    n_layer: int = 4  # Transformer层数
    dropout: float = 0.0
    device = (torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))


class PositionalEncoding(nn.Module):
    """位置编码"""
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dropout = nn.Dropout(self.config.dropout)
        self.P = torch.zeros((1, config.seq_len, config.embed_dim)).to(device = self.config.device)
        X = torch.arange(config.seq_len, dtype=torch.float).reshape(-1, 1)/\
            torch.pow(10000, torch.arange(0, config.embed_dim, 2, dtype=torch.float)/config.embed_dim)
        self.P[:, :, 0::2] = torch.sin(X)
        self.P[:, :, 1::2] = torch.cos(X)

    def forward(self, x):
        #x: [batch_size, seq_len, embed_dim]
        x = x + self.P[:, :x.shape[1], :].requires_grad_(False) #位置编码不求梯度
        return self.dropout(x)


class LayerNorm(nn.Module):
    """层归一化"""
    def __init__(self, config):
        super().__init__()
        self.embed_dim = config.embed_dim
        self.LayerNorm = nn.LayerNorm(self.embed_dim)

    def forward(self, x):
        out = self.LayerNorm(x)
        return out


class SelfAttention(nn.Module):
    """自注意力机制"""
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.attn_dropout = nn.Dropout(config.dropout)

    def mask_softmax(self, attn):
        """掩码softmax"""
        seq_len = self.config.seq_len
        mask = torch.tril(torch.ones(seq_len, seq_len))\
            .view(1, seq_len, seq_len).requires_grad_(False)#创建一个下三角矩阵,不更新梯度
        mask = mask.to(device = self.config.device)
        attn = attn.masked_fill(mask[:, :seq_len, :seq_len] == 0, float('-inf'))
        attn = F.softmax(attn, dim=-1)
        return attn

    def forward(self, q, k, v):
        attn = torch.bmm(q, k.transpose(-2, -1))/math.sqrt(self.config.seq_len)#transpose() 函数在 PyTorch 中用于交换张量的两个维度        attn = attn
        attn = self.mask_softmax(attn)
        attn = torch.bmm(self.attn_dropout(attn), v)
        return attn


class MultiHeadAttention(nn.Module):
    """多头自注意力机制"""
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.wk = nn.Linear(config.embed_dim, config.embed_dim)
        self.wq = nn.Linear(config.embed_dim, config.embed_dim)
        self.wv = nn.Linear(config.embed_dim, config.embed_dim)
        self.selfattention = SelfAttention(config)
        self.wo = nn.Linear(config.embed_dim, config.embed_dim)

    def transpose_qkv(self, qkv):
        """将qkv的维度进行转换，分出n_head个头"""
        qkv = qkv.reshape(qkv.shape[0], qkv.shape[1], self.config.n_head, -1) #[batch_size, seq_len, embed_dim]->[batch_size, seq_len, n_head, embed_dim/n_head]
        qkv = qkv.permute(0, 2, 1, 3) #[batch_size, n_head, seq_len, embed_dim/n_head]
        qkv = qkv.reshape(-1, qkv.shape[2], qkv.shape[3]) #[batch_size*n_head, seq_len, embed_dim/n_head]
        return qkv

    def output_cat(self, attn):
        """将transpose_qkv操作反转"""
        attn = attn.reshape(-1, self.config.n_head, attn.shape[1], attn.shape[2]) #[batch_size*n_head, seq_len, embed_dim/n_head]->[batch_size, n_head, seq_len, embed_dim/n_head]
        attn = attn.permute(0, 2, 1, 3) #[batch_size, seq_len, n_head, embed_dim/n_head]
        attn = attn.reshape(attn.shape[0], attn.shape[1], -1) #[batch_size, seq_len, embed_dim]
        return attn

    def forward(self, x):
        k = self.transpose_qkv(self.wk(x))
        q = self.transpose_qkv(self.wq(x))
        v = self.transpose_qkv(self.wv(x))
        attn = self.selfattention(q, k, v) #[batch_size*n_head, seq_len, embed_dim/n_head]
        attn = self.output_cat(attn)
        return self.wo(attn)


class FeedForward(nn.Module):
    """两个全连接层的mlp"""
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.net = nn.Sequential(
            nn.Linear(config.embed_dim, 4 * config.embed_dim),
            nn.GELU(),
            nn.Linear(4 * config.embed_dim, config.embed_dim),
            nn.Dropout(config.dropout),
        )

    def forward(self, x):
        x = self.net(x)
        return x


class Block(nn.Module):
    """Transformer块"""
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.ln1 = LayerNorm(config)
        self.attn = MultiHeadAttention(config)
        self.ln2 = LayerNorm(config)
        self.ffn = FeedForward(config)

    def forward(self, x):
        out = self.attn(self.ln1(x))
        out = x + out
        out2 = self.ffn(self.ln2(out))
        out2 = out + out2
        return out2


class GPTmodel(nn.Module):
    """主模型"""
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embeding = nn.Embedding(config.vocab_size, config.embed_dim)
        self.position = PositionalEncoding(config)
        self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)])
        self.ln = LayerNorm(config)
        self.fc = nn.Linear(config.embed_dim, config.vocab_size)

    def forward(self, features, targets=None):
        x = self.position(self.embeding(features))
        x = self.blocks(x)
        x = self.ln(x)
        if targets is not None:
            x = self.fc(x) #x: [batch_size, seq_len, embed]->[batch_size, seq_len, vocab_size]
            x = x.view(-1, x.shape[-1])
        else:
            x = self.fc(x[:, -1, :])
            x = F.softmax(x, dim=-1)
        return x

    @torch.no_grad()
    def genarate(self, seq, max_new_tokens):
        for _ in range(max_new_tokens):
            x = seq[:, -self.config.seq_len:] #seq: [batch_size, seq_len]
            new_w = self.forward(x) #new_w: [batch_size, vocab_size]
            new_token_index = torch.multinomial(new_w, num_samples=1)#从新词概率分布中采样（抽取）一个新词
            seq = torch.cat((seq, new_token_index), dim=1)
        return seq


In [None]:

import torch
import torch.nn as nn
import torch.optim as optim
import csv
from torch.utils.tensorboard import SummaryWriter

writter = SummaryWriter(log_dir='./tf-logs') 

def training_loop(config, model, liu_gpt, optimizer, loss_fn, device, epochs, win_len, batch_size):
    loss_list = []
    loss_list_val = []
    for epoch in range(1, epochs+1):
        i_num = 0# 记录循环次数
        loss_sum = 0
        loss_sum_val = 0
        # 训练
        model.train()
        for train_x, train_y in liu_gpt.seq_data(data="train_data", win_len=win_len, batch_size=batch_size):
            train_x = torch.tensor(train_x).to(device=device)
            train_y = torch.tensor(train_y).to(device=device)
            x = model(train_x, train_y)
            loss = loss_fn(x, train_y.view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_sum += loss.item()
            i_num += 1
        loss_mean = loss_sum / i_num
        if epoch%60 == 0:
            print("epoch:", epoch, "loss:", loss_mean, "-----------train")
            loss_list.append(loss_mean)
        model.eval()
        with torch.no_grad():
            i_num_val = 0
            for test_x, test_y in liu_gpt.seq_data(data="val_data", win_len=win_len, batch_size=batch_size):
                test_x = torch.tensor(test_x).to(device=device)
                test_y = torch.tensor(test_y).to(device=device)
                x = model(test_x, test_y)
                loss_val = loss_fn(x, test_y.view(-1))
                loss_sum_val += loss_val.item()
                i_num_val += 1
            loss_mean_val = loss_sum_val / i_num_val
            if epoch%60 == 0:
                print("epoch:", epoch, "val loss:", loss_mean_val, "------val")
                loss_list_val.append(loss_mean_val)
        writter.add_scalars("train_val_loss", {"train_loss": loss_mean, "val_loss": loss_mean_val}, epoch)
    return loss_list, loss_list_val
    
            
        

def main():
    device = (torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))  # GPU设备
    config = GPTconfig()
    model = GPTmodel(config).to(device=device)
    liu_gpt = GPTDataset("my_bpe_model.model", "big_liu.txt")

    epochs = 12000
    optimizer = optim.SGD(model.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()
    win_len = 128
    batch_size = 300
    loss_list, loss_list_val = training_loop(config, model, liu_gpt, optimizer, loss_fn, device, epochs, win_len, batch_size)

    model.cpu()
    torch.save(model.state_dict(), "my_gpt_model.pt")
    torch.save(model, "my_gpt_model_net.pth")

    with open("loss_list.csv", "w") as csvfile:
        writer = csv.writer(csvfile)
        for i in range(len(loss_list)):
            writer.writerow([loss_list[i], loss_list_val[i]])
    print("数据写入:loss_list")    

main()

epoch: 60 loss: 9.311641502380372 -----------train
epoch: 60 val loss: 9.342051347096762 ------val
epoch: 120 loss: 9.127431983947753 -----------train
epoch: 120 val loss: 9.167149861653646 ------val
epoch: 180 loss: 9.02469596862793 -----------train
epoch: 180 val loss: 9.073141892751059 ------val
epoch: 240 loss: 8.946689682006836 -----------train
epoch: 240 val loss: 8.99762757619222 ------val
epoch: 300 loss: 8.886507606506347 -----------train
epoch: 300 val loss: 8.938309987386068 ------val
epoch: 360 loss: 8.835389328002929 -----------train
epoch: 360 val loss: 8.892326831817627 ------val
epoch: 420 loss: 8.789538345336915 -----------train
epoch: 420 val loss: 8.848799546559652 ------val
epoch: 480 loss: 8.748546714782714 -----------train
epoch: 480 val loss: 8.807405312856039 ------val
epoch: 540 loss: 8.71168571472168 -----------train
epoch: 540 val loss: 8.773582776387533 ------val
epoch: 600 loss: 8.677050704956054 -----------train
epoch: 600 val loss: 8.73988151550293 ------