In [7]:
from model import LLamaOnlyPERO
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
# from data import getDataLoader, getVocabSize
import matplotlib.pyplot as plt
import logging

In [2]:
from NewsDataLoader import getABatch, getVocabSize

数据加载完毕


In [98]:
n_embedding: int = 360  # 嵌入维度
# 注意力相关参数
n_heads: int = 4  # 注意力头
head_dim: int = n_embedding // n_heads  # 每个注意力头的维度
vocab_size: int = -1  # 词表大小
multiple_of: int = 4  # make SwiGLU hidden layer size multiple of large power of 2
batch_size: int = 128  # 一个批量大小
block_size: int = 512  # 一个批量中包含的字符数
dropout: int = 0.2
device: str = 'cuda:5' if torch.cuda.is_available() else 'cpu'
#device="cpu"
max_iter: int = 5

In [99]:
# 创建logger对象
logger = logging.getLogger('my_logger')
logger.setLevel(logging.DEBUG)

# 创建FileHandler并设置日志格式、保存路径等参数
file_handler = logging.FileHandler('log', mode='w')
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)

# 添加FileHandler到logger对象
logger.addHandler(file_handler)

In [100]:
@torch.no_grad()
def estimate_loss(model):
    model.eval()
    losses = []
    for x, y in getABatch('val', batch_size, block_size):
        x = x.to(device)
        y = y.to(device)
        logits, loss = model(x, y)
        losses.append(loss.item())
    out = np.mean(losses)
    model.train()
    return out


vocab_size = getVocabSize()
print(vocab_size)

8000


In [None]:
from typing import Tuple, Union
import torch
from torch import nn
import torch.nn.functional as F


def precompute_freqs_cis(dim: int, seq_len: int, theta: float = 10000.0):
    # 计算词向量元素两两分组之后，每组元素对应的旋转角度
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
    # 生成 token 序列索引 t = [0, 1,..., seq_len-1]
    t = torch.arange(seq_len, device=freqs.device)
    # freqs.shape = [seq_len, dim // 2]
    freqs = torch.outer(t, freqs).float()
    # torch.polar 的文档
    # https://pytorch.org/docs/stable/generated/torch.polar.html
    # 计算结果是个复数向量
    # 假设 freqs = [x, y]
    # 则 freqs_cis = [cos(x) + sin(x)i, cos(y) + sin(y)i]
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
    return freqs_cis


# PERO旋转位置嵌入
def apply_rotary_emb(
        xq: torch.Tensor,
        xk: torch.Tensor,
        freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    # xq.shape = [batch_size, seq_len, dim]
    # xq_.shape = [batch_size, seq_len, dim // 2, 2]
    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 2).to(device)
    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 2).to(device)

    # 转为复数域
    xq_ = torch.view_as_complex(xq_)
    xk_ = torch.view_as_complex(xk_)
    # 应用旋转操作，然后将结果转回实数域
    # xq_out.shape = [batch_size, seq_len, dim]
    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(2)
    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(2)
    return xq_out.type_as(xq), xk_out.type_as(xk)


class Head(nn.Module):
    """ 掩藏注意力 """
    freqs_cis = None

    def __init__(self, head_size, n_embedding=360, block_size=256, dropout=0.2, pos_embed_method="rope"):
        super().__init__()
        self.key = nn.Linear(n_embedding, head_size, bias=False)
        self.query = nn.Linear(n_embedding, head_size, bias=False)
        self.value = nn.Linear(n_embedding, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)
        self.pos_embed_method = pos_embed_method
        self.freqs_cis=precompute_freqs_cis(dim=head_size, seq_len=block_size).to(device)

    def forward(self, x):
        B, T, C = x.shape
        key = self.key(x)
        query = self.query(x)
        if self.pos_embed_method == 'rope':
            # Reformer相对位置编码
 #           if self.freqs_cis is None:
 #               self.freqs_cis = precompute_freqs_cis(dim=key.shape[-1], seq_len=T).to(key.device)
            xq, xk = apply_rotary_emb(key, query, self.freqs_cis)    
            query, key = xq, xk
            # key*value/(d**-0.5)
        wei = key @ query.transpose(-2, -1) * (key.shape[-1] ** -0.5)
        # 掩藏操作，使注意力只能看到前面数据
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, -1)
        wei = self.dropout(wei)
        value = self.value(x)
        outputs = wei @ value
        return outputs


class MultiHeadAttention(nn.Module):
    """ 多头注意力 """

    def __init__(self, num_heads, head_size, block_size=256,n_embedding=360, pos_embed_method="rope"):
        super().__init__()
        # 多头注意力由多个注意力叠加
        self.heads = nn.ModuleList([Head(head_size,n_embedding=360, block_size=256,pos_embed_method=pos_embed_method) for _ in range(num_heads)])
        self.linear = nn.Linear(n_embedding, n_embedding)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        # 拼接各个注意力的输出结果
        output = torch.cat([h(x) for h in self.heads], dim=-1)
        output = self.dropout(self.linear(output))
        return output


class RMSNorm(nn.Module):
    """ RMSNorm均方层归一化 """

    def __init__(self, n_emb, eps: float = 1e-6, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.n_embedding = n_emb
        self.weights = nn.Parameter(torch.ones(n_emb))
        self.eps = eps

    def _norm(self, x):
        return x / torch.sqrt(torch.mean(torch.square(x), dim=-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor):
        o = self._norm(x)
        return o * self.weights


def _Swish(x, beta=1):
    return x * (1 / 1 + torch.exp(-beta * x))


class SwishGLU(nn.Module):
    def __init__(self, dim, hidden_dim, beta=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.beta = beta
        self.w1 = nn.Linear(dim, hidden_dim)
        self.w2 = nn.Linear(dim, hidden_dim)

    def forward(self, x):
        return _Swish(self.w1(x), self.beta) * self.w2(x)


class FeedForwardWithRELU(nn.Module):
    """ 前馈神经网络 """

    def __init__(self, dim: int, hidden_dim, dropout: float):
        super().__init__()
        self.w3 = nn.Linear(dim, hidden_dim)
        self.swish = nn.ReLU()
        self.w4 = nn.Linear(hidden_dim, dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.dropout(self.w4(self.swish(self.w3(x))))


class FeedForward(nn.Module):
    """ 前馈神经网络 """

    def __init__(self, dim: int, hidden_dim: int, multiple_of: int = 4, dropout: float = 0.2, mode='swish'):
        super().__init__()
        self.mode = mode
        if self.mode == 'swish':
            # 4*(2*3*hidden_dim/4) 缩小为2*3倍
            hidden_dim = multiple_of * ((2 * hidden_dim // 3 + multiple_of - 1) // multiple_of)
            self.w3 = nn.Linear(hidden_dim, dim)
            self.swish = SwishGLU(dim, hidden_dim)
            self.dropout = nn.Dropout(dropout)
        elif self.mode == 'relu':
            self.relu = FeedForwardWithRELU(dim, hidden_dim, dropout)

    def forward(self, x):
        if self.mode == 'swish':
            return self.dropout(self.w3(self.swish(x)))
        elif self.mode == 'relu':
            return self.relu(x)


class Block(nn.Module):
    """
    feed_forward_mode "swish"-前馈神经网络使用swish激活函数，”relu“-使用relu激活函数
    norm ："RMS"-使用RMSNorm归一化，其他值则使用nn.LayerNorm
    """

    def __init__(self, n_emb, n_head, dropout=0.2, feed_forward_mode: str = "swish", norm="RMS",
                 pos_embed_method="repo"):
        super().__init__()
        self.norm = norm
        self.swish = feed_forward_mode
        head_size = n_emb // n_head
        self.heads = MultiHeadAttention(n_head, head_size, pos_embed_method=pos_embed_method)
        self.fb = FeedForward(n_emb, 4 * n_emb, 2, dropout, mode=feed_forward_mode)
        if norm == "RMS":
            self.l1 = RMSNorm(n_emb)
            self.l2 = RMSNorm(n_emb)
        else:
            self.l1 = nn.LayerNorm(n_emb)
            self.l2 = nn.LayerNorm(n_emb)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        if self.norm == "RMS":
            x = self.l1(x)
            x = x + self.heads(x)
            x = self.l2(x)
            x = x + self.fb(x)
        else:
            x = x + self.heads(x)
            x = self.l1(x)
            x = x + self.fb(x)
            x = self.l2(x)
        return x


class BlockOnlyPERO(nn.Module):
    def __init__(self, n_emb, n_head,block_size=256, n_embedding=360, dropout=0.2):
        super().__init__()
        head_size = n_emb // n_head
        self.heads = MultiHeadAttention(n_head, head_size,n_embedding=n_embedding, block_size=block_size)
        self.fb = FeedForwardWithRELU(n_emb, n_emb, dropout)
        self.l1 = nn.LayerNorm(n_emb)
        self.l2 = nn.LayerNorm(n_emb)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.l1(x)
        x = x + self.heads(x)
        x = self.l2(x)
        x = x + self.fb(x)
        return x


class LLama(nn.Module):
    def __init__(self, vocab_size, n_embedding=360, dropout=0.2):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, n_embedding)
        # self.position_emb = nn.Embedding(block_size, n_embedding)
        self.heads = nn.Sequential(
            Block(n_embedding, n_head=4),
            Block(n_embedding, n_head=4),
            Block(n_embedding, n_head=4),
            nn.Dropout(dropout)
        )
        self.l1 = torch.nn.Linear(n_embedding, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        word_embedding = self.embedding(idx)
        x = word_embedding
        x = self.heads(x)
        logits = self.l1(x)
        loss = None
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = torch.nn.functional.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens, block_size=16):
        p = 0
        # 生成文本
        for _ in range(max_new_tokens):
            # get the predictions
            idx_conv = idx[:, -block_size:]
            logits, loss = self(idx_conv)
            logits = logits[:, -1, :]
            # apply softmax to get the probability
            probs = torch.nn.functional.softmax(logits, dim=-1)
            top_probs = torch.topk(probs, 5, dim=-1)
            next_index = torch.multinomial(top_probs[0], num_samples=1)[0][0]
            idx_next = top_probs[1][0][next_index.item()].unsqueeze(0).unsqueeze(0)
            # idx_next = torch.argmax(probs, dim=-1).unsqueeze(0)
            p = p + torch.log(probs[0][idx_next[0][0]])
            idx = torch.cat((idx, idx_next), dim=1)
        zhi_xin_du = torch.pow(torch.e, -p / max_new_tokens)
        print(f"置信度：{zhi_xin_du}")
        return idx


class LLama1(nn.Module):
    """
    pos_embed_method 限制取值 "rope"|"sin"
    """

    def __init__(self, vocab_size, out_features=2, n_heads=4,
                 n_embedding=360, block_size=200, dropout=0.2,
                 pos_embed_method="rope"):
        super().__init__()
        self.vocab_size = vocab_size
        self.out_features = out_features
        self.n_heads = n_heads
        self.n_embedding = n_embedding
        self.pos_embed_method = pos_embed_method
        self.block_size = block_size
        self.embedding = torch.nn.Embedding(vocab_size, n_embedding)
        if pos_embed_method != "rope":
            self.position_emb = nn.Embedding(block_size, n_embedding)
        self.heads = nn.Sequential(
            Block(n_embedding, n_head=n_heads, pos_embed_method=pos_embed_method),
            Block(n_embedding, n_head=n_heads, pos_embed_method=pos_embed_method),
            Block(n_embedding, n_head=n_heads, pos_embed_method=pos_embed_method),
            nn.Dropout(dropout)
        )
        self.l1 = torch.nn.Linear(n_embedding, out_features)

    def forward(self, idx, targets=None):
        B, T = idx
        word_embedding = self.embedding(idx)
        x = word_embedding
        # pos_embed_method=="sin" ，使用sinmoid位置编码
        if self.pos_embed_method == "sin":
            pos_embedding = self.position_emb(torch.arange(0, T))
            pos_embedding = torch.repeat_interleave(pos_embedding, B)
            x = x + pos_embedding
        x = self.heads(x)
        logits = self.l1(x)
        loss = None
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits[:, -1, :]
            logits = logits.view(B, C)
            targets = targets.view(B)
            loss = torch.nn.functional.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens, block_size=16):
        # 生成文本
        for _ in range(max_new_tokens):
            # get the predictions
            idx_conv = idx[:, -block_size:]
            logits, loss = self(idx_conv)
            logits = logits[:, -1, :]
            # apply softmax to get the probability
            probs = torch.nn.functional.softmax(logits, dim=-1)
            top_probs = torch.topk(probs, 5, dim=-1)
            next_index = torch.multinomial(top_probs[0], num_samples=1)[0][0]
            idx_next = top_probs[1][0][next_index.item()].unsqueeze(0).unsqueeze(0)
            # idx_next = torch.argmax(probs, dim=-1).unsqueeze(0)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

    def pre(self, idx):
        # 预测分类
        logits, loss = self(idx)
        logits = logits[:, -1, :]
        output = torch.argmax(logits, -1)
        return output


class LLamaOnlyPERO(nn.Module):
    def __init__(self, vocab_size,out_features=2,block_size=256, n_embedding=360, dropout=0.2):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, n_embedding)
        self.heads = nn.Sequential(
            BlockOnlyPERO(n_embedding, n_head=4),
            BlockOnlyPERO(n_embedding, n_head=4),
            BlockOnlyPERO(n_embedding, n_head=4),
            nn.Dropout(dropout)
        )
        self.l1 = torch.nn.Linear(n_embedding, out_features)

    def forward(self, idx, targets=None):
        word_embedding = self.embedding(idx)
        x = word_embedding
        x = self.heads(x)
        logit = self.l1(x)
        loss = None
        if targets is None:
            loss = None
        else:
            B, T, C = logit.shape
            logit = logit[:, -1, :]
            logit = logit.view(B, C)
            targets = targets.view(B)
            loss = torch.nn.functional.cross_entropy(logit, targets)
        return logit, loss

    def pre(self, idx):
        logit, loss = self(idx)
        logit = logit[:, -1, :]
        output = torch.argmax(logit, -1)
        return output

    
    
#!/usr/bin/env python
# coding: utf-8

from typing import Tuple, Union
import torch
from torch import nn
import torch.nn.functional as F

def precompute_freqs_cis(dim: int, seq_len: int, theta: float = 10000.0):
    # 计算词向量元素两两分组之后，每组元素对应的旋转角度
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
    # 生成 token 序列索引 t = [0, 1,..., seq_len-1]
    t = torch.arange(seq_len, device=freqs.device)
    # freqs.shape = [seq_len, dim // 2]
    freqs = torch.outer(t, freqs).float()
    # torch.polar 的文档
    # https://pytorch.org/docs/stable/generated/torch.polar.html
    # 计算结果是个复数向量
    # 假设 freqs = [x, y]
    # 则 freqs_cis = [cos(x) + sin(x)i, cos(y) + sin(y)i]
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
    return freqs_cis


# PERO旋转位置嵌入
def apply_rotary_emb(
        xq: torch.Tensor,
        xk: torch.Tensor,
        freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    # xq.shape = [batch_size, seq_len, dim]
    # xq_.shape = [batch_size, seq_len, dim // 2, 2]
    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 2).to(device)
    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 2).to(device)

    # 转为复数域
    xq_ = torch.view_as_complex(xq_)
    xk_ = torch.view_as_complex(xk_)
    # 应用旋转操作，然后将结果转回实数域
    # xq_out.shape = [batch_size, seq_len, dim]
    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(2)
    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(2)
    return xq_out.type_as(xq), xk_out.type_as(xk)


class Head(nn.Module):
    """ 掩藏注意力 """
    freqs_cis = None

    def __init__(self, head_size, n_embedding,block_size, dropout=0.2, pos_embed_method="rope"):
        super().__init__()
        self.key = nn.Linear(n_embedding, head_size, bias=False)
        self.query = nn.Linear(n_embedding, head_size, bias=False)
        self.value = nn.Linear(n_embedding, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)
        self.pos_embed_method = pos_embed_method
        self.freqs_cis=precompute_freqs_cis(dim=head_size, seq_len=block_size).to(device)

    def forward(self, x):
        B, T, C = x.shape
        key = self.key(x)
        query = self.query(x)
        if self.pos_embed_method == 'rope':
            # Reformer相对位置编码
 #           if self.freqs_cis is None:
 #               self.freqs_cis = precompute_freqs_cis(dim=key.shape[-1], seq_len=T).to(key.device)
            xq, xk = apply_rotary_emb(key, query, self.freqs_cis)
            # key*value/(d**-0.5)
            query, key = xq, xk
        wei = key @ query.transpose(-2, -1) * (key.shape[-1] ** -0.5)
        # 掩藏操作，使注意力只能看到前面数据
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, -1)
        wei = self.dropout(wei)
        value = self.value(x)
        outputs = wei @ value
        return outputs


class MultiHeadAttention(nn.Module):
    """ 多头注意力 """

    def __init__(self, num_heads, head_size, block_size=256,n_embedding=360, pos_embed_method="rope"):
        super().__init__()
        # 多头注意力由多个注意力叠加
        self.heads = nn.ModuleList([Head(head_size, block_size=block_size,n_embedding=n_embedding,pos_embed_method=pos_embed_method) for _ in range(num_heads)])
        self.linear = nn.Linear(n_embedding, n_embedding)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        # 拼接各个注意力的输出结果
        output = torch.cat([h(x) for h in self.heads], dim=-1)
        output = self.dropout(self.linear(output))
        return output


class RMSNorm(nn.Module):
    """ RMSNorm均方层归一化 """

    def __init__(self, n_emb, eps: float = 1e-6, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.n_embedding = n_emb
        self.weights = nn.Parameter(torch.ones(n_emb))
        self.eps = eps

    def _norm(self, x):
        return x / torch.sqrt(torch.mean(torch.square(x), dim=-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor):
        o = self._norm(x)
        return o * self.weights


def _Swish(x, beta=1):
    return x * (1 / 1 + torch.exp(-beta * x))


class SwishGLU(nn.Module):
    def __init__(self, dim, hidden_dim, beta=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.beta = beta
        self.w1 = nn.Linear(dim, hidden_dim)
        self.w2 = nn.Linear(dim, hidden_dim)

    def forward(self, x):
        return _Swish(self.w1(x), self.beta) * self.w2(x)


class FeedForwardWithRELU(nn.Module):
    """ 前馈神经网络 """

    def __init__(self, dim: int, hidden_dim, dropout: float):
        super().__init__()
        self.w3 = nn.Linear(dim, hidden_dim)
        self.swish = nn.ReLU()
        self.w4 = nn.Linear(hidden_dim, dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.dropout(self.w4(self.swish(self.w3(x))))


class FeedForward(nn.Module):
    """ 前馈神经网络 """

    def __init__(self, dim: int, hidden_dim: int, multiple_of: int = 4, dropout: float = 0.2, mode='swish'):
        super().__init__()
        self.mode = mode
        if self.mode == 'swish':
            # 4*(2*3*hidden_dim/4) 缩小为2*3倍
            hidden_dim = multiple_of * ((2 * hidden_dim // 3 + multiple_of - 1) // multiple_of)
            self.w3 = nn.Linear(hidden_dim, dim)
            self.swish = SwishGLU(dim, hidden_dim)
            self.dropout = nn.Dropout(dropout)
        elif self.mode == 'relu':
            self.relu = FeedForwardWithRELU(dim, hidden_dim, dropout)

    def forward(self, x):
        if self.mode == 'swish':
            return self.dropout(self.w3(self.swish(x)))
        elif self.mode == 'relu':
            return self.relu(x)
        else:
            raise ValueError("feed forward mode must is 'swish' or 'relu'")


class Block(nn.Module):
    """
    feed_forward_mode "swish"-前馈神经网络使用swish激活函数，”relu“-使用relu激活函数
    norm ："RMS"-使用RMSNorm归一化，其他值则使用nn.LayerNorm
    """

    def __init__(self, n_emb, n_head, block_size,dropout=0.2, feed_forward_mode: str = "swish", norm="rms",
                 pos_embed_method="repo"):
        super().__init__()
        self.norm = norm
        self.swish = feed_forward_mode
        head_size = n_emb // n_head
        self.heads = MultiHeadAttention(n_head, head_size, block_size,n_emb,pos_embed_method=pos_embed_method)
        self.fb = FeedForward(n_emb, 4 * n_emb, 2, dropout, mode=feed_forward_mode)
        if norm == "rms":
            self.l1 = RMSNorm(n_emb)
            self.l2 = RMSNorm(n_emb)
        else:
            self.l1 = nn.LayerNorm(n_emb)
            self.l2 = nn.LayerNorm(n_emb)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        if self.norm == "rms":
            x = self.l1(x)
            x = x + self.heads(x)
            x = self.l2(x)
            x = x + self.fb(x)
        else:
            x = x + self.heads(x)
            x = self.l1(x)
            x = x + self.fb(x)
            x = self.l2(x)
        return x


class BlockOnlyPERO(nn.Module):
    def __init__(self, n_emb, n_head, block_size=256, dropout=0.2):
        super().__init__()
        head_size = n_emb // n_head
        self.heads = MultiHeadAttention(n_head, head_size,block_size=block_size,n_embedding=n_emb)
        self.fb = FeedForwardWithRELU(n_emb, n_emb, dropout)
        self.l1 = nn.LayerNorm(n_emb)
        self.l2 = nn.LayerNorm(n_emb)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.l1(x)
        x = x + self.heads(x)
        x = self.l2(x)
        x = x + self.fb(x)
        return x


class LLama(nn.Module):
    #完整的Llama模型
    def __init__(self, vocab_size, out_features=2, n_heads=4,
                 n_embedding=360, block_size=200, dropout=0.2,
                 feed_forward_mode: str = "swish", norm="rms",
                 pos_embed_method="repo"):
        super().__init__()
        self.vocab_size = vocab_size
        self.out_features = out_features
        self.n_heads = n_heads
        self.n_embedding = n_embedding
        self.pos_embed_method = pos_embed_method
        self.block_size = block_size
        self.embedding = torch.nn.Embedding(vocab_size, n_embedding)
        if pos_embed_method == "sin":
            self.position_emb = nn.Embedding(block_size, n_embedding)
        self.heads = nn.Sequential(
            Block(n_embedding, n_heads,block_size, pos_embed_method=pos_embed_method,feed_forward_mode = feed_forward_mode, norm=norm),
            Block(n_embedding, n_heads,block_size, pos_embed_method=pos_embed_method,feed_forward_mode = feed_forward_mode, norm=norm),
            Block(n_embedding, n_heads,block_size, pos_embed_method=pos_embed_method,feed_forward_mode = feed_forward_mode, norm=norm),
            nn.Dropout(dropout)
        )
        #self.l1 = torch.nn.Linear(n_embedding, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        word_embedding = self.embedding(idx)
        x = word_embedding
        if self.pos_embed_method == "sin":
            pos_embedding = self.position_emb(torch.arange(0, T))
            pos_embedding = torch.repeat_interleave(pos_embedding, B)
            x = x + pos_embedding
        x = self.heads(x)
        return x
        # logits = self.l1(x)
        # loss = None
        # if targets is None:
        #     loss = None
        # else:
        #     B, T, C = logits.shape
        #     logits = logits.view(B * T, C)
        #     targets = targets.view(B * T)
        #     loss = torch.nn.functional.cross_entropy(logits, targets)
        # return logits, loss

#     def generate(self, idx, max_new_tokens, block_size=16):
#         p = 0
#         # 生成文本
#         for _ in range(max_new_tokens):
#             # get the predictions
#             idx_conv = idx[:, -block_size:]
#             logits, loss = self(idx_conv)
#             logits = logits[:, -1, :]
#             # apply softmax to get the probability
#             probs = torch.nn.functional.softmax(logits, dim=-1)
#             top_probs = torch.topk(probs, 5, dim=-1)
#             next_index = torch.multinomial(top_probs[0], num_samples=1)[0][0]
#             idx_next = top_probs[1][0][next_index.item()].unsqueeze(0).unsqueeze(0)
#             # idx_next = torch.argmax(probs, dim=-1).unsqueeze(0)
#             p = p + torch.log(probs[0][idx_next[0][0]])
#             idx = torch.cat((idx, idx_next), dim=1)
#         zhi_xin_du = torch.pow(torch.e, -p / max_new_tokens)
#         print(f"置信度：{zhi_xin_du}")
#         return idx


class LLama1(nn.Module):
    """
    pos_embed_method 限制取值 "rope"|"sin"
    """

    def __init__(self, vocab_size, out_features=2, n_heads=4,
                 n_embedding=360, block_size=200, dropout=0.2,
                 feed_forward_mode: str = "swish", norm="rms",
                 pos_embed_method="repo"):
        super().__init__()
        # self.vocab_size = vocab_size
        # self.out_features = out_features
        # self.n_heads = n_heads
        # self.n_embedding = n_embedding
        # self.pos_embed_method = pos_embed_method
        # self.block_size = block_size
        # self.embedding = torch.nn.Embedding(vocab_size, n_embedding)
        # if pos_embed_method != "rope":
        #     self.position_emb = nn.Embedding(block_size, n_embedding)
        # self.heads = nn.Sequential(
        #     Block(n_embedding, n_heads,block_size, pos_embed_method=pos_embed_method),
        #     Block(n_embedding, n_heads,block_size, pos_embed_method=pos_embed_method),
        #     Block(n_embedding, n_heads,block_size, pos_embed_method=pos_embed_method),
        #     nn.Dropout(dropout)
        # )
        self.l1 = torch.nn.Linear(n_embedding, out_features)
        self.llama = LLama(vocab_size, out_features, n_heads,
                 n_embedding, block_size, dropout,
                 feed_forward_mode,norm,
                 pos_embed_method)

    def forward(self, idx, targets=None):
        # B, T = idx
        # word_embedding = self.embedding(idx)
        # x = word_embedding
        # # pos_embed_method=="sin" ，使用sinmoid位置编码
        # if self.pos_embed_method == "sin":
        #     pos_embedding = self.position_emb(torch.arange(0, T))
        #     pos_embedding = torch.repeat_interleave(pos_embedding, B)
        #     x = x + pos_embedding
        # x = self.heads(x)
        x = self.llama(idx, targets)
        logits = self.l1(x)
        loss = None
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits[:, -1, :]
            logits = logits.view(B, C)
            targets = targets.view(B)
            loss = torch.nn.functional.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens, block_size=16):
        # 生成文本
        for _ in range(max_new_tokens):
            # get the predictions
            idx_conv = idx[:, -block_size:]
            logits, loss = self(idx_conv)
            logits = logits[:, -1, :]
            # apply softmax to get the probability
            probs = torch.nn.functional.softmax(logits, dim=-1)
            top_probs = torch.topk(probs, 5, dim=-1)
            next_index = torch.multinomial(top_probs[0], num_samples=1)[0][0]
            idx_next = top_probs[1][0][next_index.item()].unsqueeze(0).unsqueeze(0)
            # idx_next = torch.argmax(probs, dim=-1).unsqueeze(0)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

    def pre(self, idx):
        # 预测分类
        logits, loss = self(idx)
        logits = logits[:, -1, :]
        output = torch.argmax(logits, -1)
        return output


class LLamaOnlyPERO(nn.Module):
    def __init__(self, vocab_size,out_features=2,block_size=256, n_embedding=360,n_heads=4, dropout=0.2):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, n_embedding)
        self.heads = nn.Sequential(
            BlockOnlyPERO(n_embedding, n_heads,block_size),
            BlockOnlyPERO(n_embedding, n_heads,block_size),
            BlockOnlyPERO(n_embedding, n_heads,block_size),
            nn.Dropout(dropout)
        )
        self.l1 = torch.nn.Linear(n_embedding, out_features)

    def forward(self, idx, targets=None):
        word_embedding = self.embedding(idx)
        x = word_embedding
        x = self.heads(x)
        logit = self.l1(x)
        loss = None
        if targets is None:
            loss = None
        else:
            B, T, C = logit.shape
            logit = logit[:, -1, :]
            logit = logit.view(B, C)
            targets = targets.view(B)
            loss = torch.nn.functional.cross_entropy(logit, targets)
        return logit, loss

    def pre(self, idx):
        logit, loss = self(idx)
        logit = logit[:, -1, :]
        output = torch.argmax(logit, -1)
        return output


In [None]:
feed_forward_mode="relu"
norm="none"
pos_embed_method="repo"
#14分类任务
m = LLama1(vocab_size, out_features=14, n_heads=4,
                 n_embedding=n_embedding, block_size=block_size, dropout=dropout,
                 feed_forward_mode = feed_forward_mode, norm=norm,
                 pos_embed_method=pos_embed_method)
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)
trainLosses = []
val_losses = []
count = 0
logger.info("start training")
print("start training")
params_num=sum(p.numel() for p in m.parameters()) / 1e6
print(sum(p.numel() for p in m.parameters()) / 1e6, 'M parameters')
for step in range(max_iter):
    m.to(device)
    trainLoss = []
    logger.info(f"The step is {step}")
    for X, Y in getABatch('train', batch_size, block_size):
        X, Y = X.to(device), Y.to(device)
        logits, loss = m(X,Y)
        trainLoss.append(loss.item())
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    if step != -1:
        val_loss = estimate_loss(m)
        t_loss = np.mean(trainLoss)
        trainLosses.append(t_loss)
        val_losses.append(val_loss)
        count += 1
        logger.info(f"step{step}: train loss {t_loss}, val loss {val_loss}")
    m.to("cpu")
    allCount = 0
    accCount = 0
    for x, y in getABatch('val', 512, block_size):
        x = x.to('cpu')
        y = y.to('cpu')
        allCount += 512
        output = m.pre(x)
        y=y.view(block_size)
        acc = y == output
        acc = acc.sum().item()
        accCount += acc
    print(f"step:{step},总数据{allCount}")
    print(f"准确分类数据{accCount}")
    print(f"准确率{accCount / allCount}")
    logger.info(f"step:{step},toal num:{allCount}")
    logger.info(f"acc num:{accCount}")
    logger.info(f"acc:{accCount / allCount}")
    torch.save(m,f'./output/mod-_{feed_forward_mode}_{norm}_{pos_embed_method}-{block_size}_{n_embedding}-{step}-{params_num}_{accCount / allCount}.pth')

In [None]:
#14分类任务
m = LLamaOnlyPERO(vocab_size, out_features=14, block_size=block_size,n_embedding=n_embedding, dropout=dropout)
m.to(device)
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)
trainLosses = []
val_losses = []
count = 0
logger.info("start training")
print("start training")
for step in range(max_iter):
    trainLoss = []
    logger.info(f"The step is {step}")
    for X, Y in getABatch('train', batch_size, block_size):
        X, Y = X.to(device), Y.to(device)
        logits, loss = m(X,Y)
        trainLoss.append(loss.item())
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    if step != -1:
        val_loss = estimate_loss(m)
        t_loss = np.mean(trainLoss)
        trainLosses.append(t_loss)
        val_losses.append(val_loss)
        count += 1
        logger.info(f"step{step}: train loss {t_loss}, val loss {val_loss}")
        print(f"step{step}: train loss {t_loss}, val loss {val_loss}")
    torch.save(m,f'./output/mod-{step}.pth')

In [None]:
torch.save(m,'./output/mod-last.pth')
print(sum(p.numel() for p in m.parameters()) / 1e6, 'M parameters')

In [46]:
m.to("cpu")
allCount = 0
accCount = 0
for x, y in getABatch('val', 512, block_size):
    x = x.to('cpu')
    y = y.to('cpu')
    allCount += 512
    output = m.pre(x)
    acc = y == output
    acc = acc.sum().item()
    accCount += acc
print(f"总数据{allCount}")
print(f"准确分类数据{accCount}")
print(f"准确率{accCount / allCount}")
logger.info(f"总数据{allCount}")
logger.info(f"准确分类数据{accCount}")
logger.info(f"准确率{accCount / allCount}")

总数据83456
准确分类数据5489356
准确率65.77545053680981


In [83]:
print(y)
print(output)
acc=output==y

tensor([10,  2,  3,  9,  6,  9,  8,  5,  7,  3,  3,  6,  5,  6, 10,  4,  9,  6,
        12,  0,  3,  5,  2, 11,  1,  9,  9, 12, 10,  6,  4,  1, 13,  0, 13,  3,
        13,  3,  3, 10,  5, 10, 13,  5,  3,  6,  6,  3,  3, 11,  3,  6,  6,  3,
         3, 10,  0,  5,  6,  2, 10, 10, 12, 10,  9,  9, 10,  3,  9,  6,  5,  7,
         6, 10, 13,  3,  3,  3, 13,  6, 12,  9,  9,  9, 13, 10,  3,  3, 10, 10,
         3,  2, 10,  6,  6,  0,  6,  9, 10,  3,  3,  6,  3,  7,  6,  3,  6,  5,
         6,  3, 12,  3, 10,  6,  6,  6, 10,  6,  3, 13,  3, 13, 13,  6, 10, 10,
         6,  3,  3, 12,  3,  3,  6,  6,  5,  9,  6, 10, 13,  3, 10,  9,  3, 12,
         6, 13,  6,  3, 10, 10, 13,  3,  7,  3, 13,  7,  3,  6, 10,  7,  6,  6,
         7,  9, 10,  3,  0,  6,  3,  0, 13,  3, 13, 12,  6,  6,  7, 10,  5,  6,
         3,  0, 10, 10,  3,  9,  2,  6,  8,  7,  7,  3, 13, 13, 12,  4, 13,  6,
        10, 10,  6, 10,  3,  3,  3,  5,  6,  3, 10,  3,  3,  3,  7,  6,  3,  3,
         9,  0,  3,  5, 13, 10,  7, 10, 

In [82]:
acc.sum().item()

488

In [85]:
for step in range(5,10):
    m.to(device)
    trainLoss = []
    logger.info(f"The step is {step}")
    for X, Y in getABatch('train', batch_size, block_size):
        X, Y = X.to(device), Y.to(device)
        logits, loss = m(X,Y)
        trainLoss.append(loss.item())
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    if step != -1:
        val_loss = estimate_loss(m)
        t_loss = np.mean(trainLoss)
        trainLosses.append(t_loss)
        val_losses.append(val_loss)
        count += 1
        logger.info(f"step{step}: train loss {t_loss}, val loss {val_loss}")
    m.to("cpu")
    allCount = 0
    accCount = 0
    for x, y in getABatch('val', 512, block_size):
        x = x.to('cpu')
        y = y.to('cpu')
        allCount += 512
        output = m.pre(x)
        y=y.view(block_size)
        acc = y == output
        acc = acc.sum().item()
        accCount += acc
    print(f"step:{step},总数据{allCount}")
    print(f"准确分类数据{accCount}")
    print(f"准确率{accCount / allCount}")
    logger.info(f"step:{step},toal num:{allCount}")
    logger.info(f"acc num:{accCount}")
    logger.info(f"acc:{accCount / allCount}")
    torch.save(m,f'./output/mod-_{feed_forward_mode}_{norm}_{pos_embed_method}-{block_size}_{n_embedding}-{step}-{params_num}_{accCount / allCount}.pth')

step:5,总数据83456
准确分类数据79369
准确率0.9510280866564417
step:6,总数据83456
准确分类数据79224
准确率0.9492906441717791
step:7,总数据83456
准确分类数据79387
准确率0.9512437691717791
step:8,总数据83456
准确分类数据79340
准确率0.9506805981595092
step:9,总数据83456
准确分类数据79411
准确率0.9515313458588958


In [97]:
class GQAHead(nn.Module):
    """ 掩藏注意力 """
    freqs_cis = None

    def __init__(self, head_size, n_embedding, block_size, dropout=0.2, pos_embed_method="rope"):
        super().__init__()
        self.query = nn.Linear(n_embedding, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)
        self.pos_embed_method = pos_embed_method
        self.freqs_cis = precompute_freqs_cis(dim=head_size, seq_len=block_size).to(device)

    def forward(self, x, key, value):
        B, T, C = x.shape
        query = self.query(x)
        if self.pos_embed_method == 'rope':
            # Reformer相对位置编码
            #           if self.freqs_cis is None:
            #               self.freqs_cis = precompute_freqs_cis(dim=key.shape[-1], seq_len=T).to(key.device)
            xq, xk = apply_rotary_emb(key, query, self.freqs_cis)
            # key*value/(d**-0.5)
            query, key = xq, xk
        wei = key @ query.transpose(-2, -1) * (key.shape[-1] ** -0.5)
        # 掩藏操作，使注意力只能看到前面数据
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, -1)
        wei = self.dropout(wei)
        outputs = wei @ value
        return outputs


class MultiQueryAttention(nn.Module):
    """ 多组查询注意力 """

    def __init__(self, num_heads, head_size, block_size=256, n_embedding=360, pos_embed_method="rope"):
        super().__init__()
        # 多头注意力由多个注意力叠加
        self.heads = nn.ModuleList(
            [GQAHead(head_size=head_size, block_size=block_size, n_embedding=n_embedding,
                     pos_embed_method=pos_embed_method) for _ in
             range(num_heads)])
        self.linear = nn.Linear(n_embedding, n_embedding)
        self.key = nn.Linear(n_embedding, head_size, bias=False)
        self.value = nn.Linear(n_embedding, head_size, bias=False)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        # 拼接各个注意力的输出结果
        key = self.key(x)
        value = self.value(x)
        output = torch.cat([h(x, key, value) for h in self.heads], dim=-1)
        output = self.dropout(self.linear(output))
        return output


class GQABlock(nn.Module):
    """
    feed_forward_mode "swish"-前馈神经网络使用swish激活函数，”relu“-使用relu激活函数
    norm ："RMS"-使用RMSNorm归一化，其他值则使用nn.LayerNorm
    """

    def __init__(self, n_emb, n_head, block_size, dropout=0.2, feed_forward_mode: str = "swish", norm="rms",
                 pos_embed_method="repo"):
        super().__init__()
        self.norm = norm
        self.swish = feed_forward_mode
        head_size = n_emb // n_head
        self.heads = MultiQueryAttention(n_head, head_size, block_size, n_emb, pos_embed_method=pos_embed_method)
        self.fb = FeedForward(n_emb, 4 * n_emb, 2, dropout, mode=feed_forward_mode)
        if norm == "rms":
            self.l1 = RMSNorm(n_emb)
            self.l2 = RMSNorm(n_emb)
        else:
            self.l1 = nn.LayerNorm(n_emb)
            self.l2 = nn.LayerNorm(n_emb)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        if self.norm == "rms":
            x = self.l1(x)
            x = x + self.heads(x)
            x = self.l2(x)
            x = x + self.fb(x)
        else:
            x = x + self.heads(x)
            x = self.l1(x)
            x = x + self.fb(x)
            x = self.l2(x)
        return x


class GQALLama(nn.Module):
    # 完整的Llama模型
    def __init__(self, vocab_size, out_features=2, n_heads=4,
                 n_embedding=360, block_size=200, dropout=0.2,
                 feed_forward_mode: str = "swish", norm="rms",
                 pos_embed_method="repo"):
        super().__init__()
        self.vocab_size = vocab_size
        self.out_features = out_features
        self.n_heads = n_heads
        self.n_embedding = n_embedding
        self.pos_embed_method = pos_embed_method
        self.block_size = block_size
        self.embedding = torch.nn.Embedding(vocab_size, n_embedding)
        if pos_embed_method == "sin":
            self.position_emb = nn.Embedding(block_size, n_embedding)
        self.heads = nn.Sequential(
            GQABlock(n_embedding, n_heads, block_size, pos_embed_method=pos_embed_method,
                     feed_forward_mode=feed_forward_mode, norm=norm),
            GQABlock(n_embedding, n_heads, block_size, pos_embed_method=pos_embed_method,
                     feed_forward_mode=feed_forward_mode, norm=norm),
            GQABlock(n_embedding, n_heads, block_size, pos_embed_method=pos_embed_method,
                     feed_forward_mode=feed_forward_mode, norm=norm),
            nn.Dropout(dropout)
        )
        # self.l1 = torch.nn.Linear(n_embedding, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        word_embedding = self.embedding(idx)
        x = word_embedding
        if self.pos_embed_method == "sin":
            pos_embedding = self.position_emb(torch.arange(0, T))
            pos_embedding = torch.repeat_interleave(pos_embedding, B)
            x = x + pos_embedding
        x = self.heads(x)
        return x
        # logits = self.l1(x)
        # loss = None
        # if targets is None:
        #     loss = None
        # else:
        #     B, T, C = logits.shape
        #     logits = logits.view(B * T, C)
        #     targets = targets.view(B * T)
        #     loss = torch.nn.functional.cross_entropy(logits, targets)
        # return logits, loss
        
        
class GQALLama1(nn.Module):
    """
    pos_embed_method 限制取值 "rope"|"sin"
    """

    def __init__(self, vocab_size, out_features=2, n_heads=4,
                 n_embedding=360, block_size=200, dropout=0.2,
                 feed_forward_mode: str = "swish", norm="rms",
                 pos_embed_method="repo"):
        super().__init__()
        self.l1 = torch.nn.Linear(n_embedding, out_features)
        self.llama = GQALLama(vocab_size, out_features, n_heads,
                              n_embedding, block_size, dropout,
                              feed_forward_mode, norm,
                              pos_embed_method)

    def forward(self, idx, targets=None):
        # B, T = idx
        # word_embedding = self.embedding(idx)
        # x = word_embedding
        # # pos_embed_method=="sin" ，使用sinmoid位置编码
        # if self.pos_embed_method == "sin":
        #     pos_embedding = self.position_emb(torch.arange(0, T))
        #     pos_embedding = torch.repeat_interleave(pos_embedding, B)
        #     x = x + pos_embedding
        # x = self.heads(x)
        x = self.llama(idx, targets)
        logits = self.l1(x)
        loss = None
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits[:, -1, :]
            logits = logits.view(B, C)
            targets = targets.view(B)
            loss = torch.nn.functional.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens, block_size=16):
        # 生成文本
        for _ in range(max_new_tokens):
            # get the predictions
            idx_conv = idx[:, -block_size:]
            logits, loss = self(idx_conv)
            logits = logits[:, -1, :]
            # apply softmax to get the probability
            probs = torch.nn.functional.softmax(logits, dim=-1)
            top_probs = torch.topk(probs, 5, dim=-1)
            next_index = torch.multinomial(top_probs[0], num_samples=1)[0][0]
            idx_next = top_probs[1][0][next_index.item()].unsqueeze(0).unsqueeze(0)
            # idx_next = torch.argmax(probs, dim=-1).unsqueeze(0)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

    def pre(self, idx):
        # 预测分类
        logits, loss = self(idx)
        logits = logits[:, -1, :]
        output = torch.argmax(logits, -1)
        return output

In [None]:
feed_forward_mode="relu"
norm="none"
pos_embed_method="repo"
#14分类任务
m = GQALLama(vocab_size, out_features=14, n_heads=4,
                 n_embedding=n_embedding, block_size=block_size, dropout=dropout,
                 feed_forward_mode = feed_forward_mode, norm=norm,
                 pos_embed_method=pos_embed_method)
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)
trainLosses = []
val_losses = []
count = 0
logger.info("start training")
print("start training")
params_num=sum(p.numel() for p in m.parameters()) / 1e6
print(sum(p.numel() for p in m.parameters()) / 1e6, 'M parameters')
for step in range(max_iter):
    m.to(device)
    trainLoss = []
    logger.info(f"The step is {step}")
    for X, Y in getABatch('train', batch_size, block_size):
        X, Y = X.to(device), Y.to(device)
        logits, loss = m(X,Y)
        trainLoss.append(loss.item())
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    if step != -1:
        val_loss = estimate_loss(m)
        t_loss = np.mean(trainLoss)
        trainLosses.append(t_loss)
        val_losses.append(val_loss)
        count += 1
        logger.info(f"step{step}: train loss {t_loss}, val loss {val_loss}")
    m.to("cpu")
    allCount = 0
    accCount = 0
    for x, y in getABatch('val', 512, block_size):
        x = x.to('cpu')
        y = y.to('cpu')
        allCount += 512
        output = m.pre(x)
        y=y.view(block_size)
        acc = y == output
        acc = acc.sum().item()
        accCount += acc
    print(f"step:{step},总数据{allCount}")
    print(f"准确分类数据{accCount}")
    print(f"准确率{accCount / allCount}")
    logger.info(f"step:{step},toal num:{allCount}")
    logger.info(f"acc num:{accCount}")
    logger.info(f"acc:{accCount / allCount}")
    torch.save(m,f'./output/GQA-qkv011-mod-_{feed_forward_mode}_{norm}_{pos_embed_method}-{block_size}_{n_embedding}-{step}-{params_num}_{accCount / allCount}.pth')