In [1]:
from transformers import PretrainedConfig
from typing import List

In [6]:
import math
import struct
import inspect
import time
from typing import Any, Optional, Tuple, List
import numpy as np
from torch import nn 
from transformers import PreTrainedModel 
import torch


In [7]:
class RMSNorm(nn.Module):
    """Root-mean-square layer normalisation over the last dimension.

    Computes ``weight * x / sqrt(mean(x**2, dim=-1) + eps)``.

    Args:
        dim: Size of the last dimension of the inputs; also the length of the
            learnable per-feature ``weight`` (initialised to ones).
        eps: Constant added to the mean square before the reciprocal square
            root, for numerical stability.
    """

    def __init__(self, dim: int, eps: float):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        """Normalise ``x`` along its last dimension and scale by ``weight``.

        The statistics are computed in float32 so that low-precision inputs
        (e.g. float16, where ``x**2`` overflows for |x| > ~255) do not produce
        inf/0 results. The normalised values are cast back to ``x``'s dtype
        before being multiplied by ``weight``.
        """
        x_fp32 = x.float()
        inv_rms = torch.rsqrt(x_fp32.pow(2).mean(-1, keepdim=True) + self.eps)
        return self.weight * (x_fp32 * inv_rms).type_as(x)


In [8]:
def precompute_pos_cis(dim: int, end: int = int(32*1024), theta: float = 1e6):
    """Precompute unit-magnitude complex rotation factors for rotary embeddings.

    Args:
        dim: Feature dimension being rotated; ``dim // 2`` frequencies are
            generated, one per (real, imag) pair.
        end: Number of positions to precompute (rows of the result).
        theta: Base of the geometric frequency progression.

    Returns:
        Complex tensor of shape ``(end, dim // 2)`` where entry ``[p, k]`` is
        ``exp(1j * p * theta ** (-2k / dim))``.
    """
    # Per-pair inverse frequencies: theta^(-2k/dim) for k = 0 .. dim/2 - 1.
    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))

    # Position indices, created as float32 on the same device as the frequencies.
    positions = torch.arange(end, device=inv_freq.device, dtype=torch.float32)

    # angles[p, k] = p * inv_freq[k]
    angles = torch.outer(positions, inv_freq).float()

    # Magnitude 1, phase = angle -> cos(angle) + 1j * sin(angle).
    pos_cis = torch.polar(torch.ones_like(angles), angles)
    return pos_cis


In [None]:
def apply_rotray_emb(xq, xk, pos_cis):
    """Rotate query and key tensors by precomputed complex position factors.

    The last dimension of ``xq`` / ``xk`` is read as consecutive
    (real, imag) pairs, multiplied element-wise by ``pos_cis`` and unpacked
    back into reals.

    Args:
        xq: Query tensor; its last dimension must be even.
        xk: Key tensor; it is multiplied by the same broadcast factors as
            ``xq``.
        pos_cis: Complex tensor whose shape must equal
            ``(xq.shape[1], xq.shape[-1] // 2)``.

    Returns:
        ``(xq_out, xk_out)``, each cast back to the dtype of its input.
        ``flatten(3)`` merges everything from dim 3 onward, which restores the
        input layout for 4-D inputs.
    """
    def _pairs_to_complex(t):
        # (..., 2k) -> (..., k, 2) -> complex (..., k); computed in float32.
        return torch.view_as_complex(t.float().reshape(*t.shape[:-1], -1, 2))

    q_cplx = _pairs_to_complex(xq)
    k_cplx = _pairs_to_complex(xk)

    # Reshape pos_cis so it broadcasts: keep dim 1 and the last dim, 1 elsewhere.
    rank = q_cplx.ndim
    assert 0 <= 1 < rank
    assert pos_cis.shape == (q_cplx.shape[1], q_cplx.shape[-1])
    last_axis = rank - 1
    broadcast_shape = [
        size if axis in (1, last_axis) else 1
        for axis, size in enumerate(q_cplx.shape)
    ]
    rotation = pos_cis.view(*broadcast_shape)

    q_rotated = torch.view_as_real(q_cplx * rotation).flatten(3)
    k_rotated = torch.view_as_real(k_cplx * rotation).flatten(3)
    return q_rotated.type_as(xq), k_rotated.type_as(xk)

In [None]:
def repeat_kv(x: torch.Tensor, n_rep: int):
    """Repeat each key/value head ``n_rep`` times along the head axis.

    Args:
        x: Tensor of shape ``(bs, slen, n_kv_heads, head_dim)``.
        n_rep: Number of copies of every head.

    Returns:
        ``x`` itself when ``n_rep == 1``; otherwise a tensor of shape
        ``(bs, slen, n_kv_heads * n_rep, head_dim)`` in which the copies of a
        head are adjacent (same ordering as
        ``torch.repeat_interleave(x, n_rep, dim=2)``).
    """
    bs, slen, n_kv_heads, head_dim = x.shape
    if n_rep == 1:
        return x
    # The new axis must sit AFTER the head axis so that expand() broadcasts
    # 1 -> n_rep per head; placing it before the head axis makes expand() fail
    # (or silently tile the heads when n_kv_heads == n_rep).
    return (
        x[:, :, :, None, :]
        .expand(bs, slen, n_kv_heads, n_rep, head_dim)
        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
    )

In [None]:
class LMConfig(PretrainedConfig):
    """Hyper-parameter container for the "miaodeeai" model type.

    Every constructor argument is stored unchanged as an attribute of the same
    name; any extra keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.

    NOTE(review): the code that consumes these values is not visible here, so
    the parameter descriptions below are inferred from the names and should be
    confirmed against the model implementation.

    Args:
        dim: Model width (presumably the hidden/embedding size).
        n_layers: Number of layers.
        n_heads: Number of attention heads.
        n_kv_heads: Number of key/value heads.
        vocab_size: Vocabulary size.
        hidden_dim: Optional feed-forward width; ``None`` by default.
        multiple_of: Rounding granularity (presumably for the feed-forward
            width when ``hidden_dim`` is ``None``).
        max_seq_len: Maximum sequence length.
        rope_theta: Rotary-embedding base (a float; default ``1e6``).
        dropout: Dropout probability.
        flash_attn: Whether to use flash attention.
        use_moe: Whether mixture-of-experts is enabled. The remaining
            arguments are only relevant when it is.
        num_experts_per_tok: Experts selected per token.
        num_routed_experts: Total number of routed experts.
        n_shared_experts: Shared-expert setting. NOTE(review): annotated as
            ``bool`` although the name suggests a count - confirm.
        scoring_func: Name of the routing score function.
        aux_loss_alpha: Weight of the auxiliary loss.
        seq_aux: Auxiliary-loss flag (presumably per-sequence computation).
        norm_topk_prob: Whether to normalise the top-k routing probabilities.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "miaodeeai"

    def __init__(
        self,
        dim: int = 512,
        n_layers: int = 1,
        n_heads: int = 8,
        n_kv_heads: int = 2,
        vocab_size: int = 6400,
        hidden_dim: Optional[int] = None,
        multiple_of: int = 64,
        max_seq_len: int = 8192,
        rope_theta: float = 1e6,
        dropout: float = 0.0,
        flash_attn: bool = True,
        # The parameters below are only needed when MoE is used.
        use_moe: bool = False,
        num_experts_per_tok: int = 2,
        num_routed_experts: int = 4,
        n_shared_experts: bool = True,
        scoring_func: str = 'softmax',
        aux_loss_alpha: float = 0.1,
        seq_aux: bool = True,
        norm_topk_prob: bool = True,
        **kwargs,
    ):
        self.dim = dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.multiple_of = multiple_of
        self.max_seq_len = max_seq_len
        self.rope_theta = rope_theta
        self.dropout = dropout
        self.flash_attn = flash_attn
        # MoE-specific settings.
        self.use_moe = use_moe
        self.num_experts_per_tok = num_experts_per_tok
        self.num_routed_experts = num_routed_experts
        self.n_shared_experts = n_shared_experts
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        self.norm_topk_prob = norm_topk_prob
        # Attributes are set before delegating so the base class sees them.
        super().__init__(**kwargs)

In [12]:
import json
import random
import re
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split
import os 
import ast


In [None]:
class PretrainDataset(Dataset):
    """Next-token-prediction dataset backed by a JSON-lines file.

    Each non-blank line of the file is parsed as a JSON object; the ``'text'``
    field of every object is tokenised on access.

    Args:
        data_path: Path to the UTF-8 encoded JSON-lines file.
        tokenizer: Callable tokenizer exposing ``bos_token``, ``eos_token`` and
            ``pad_token_id``; it is called with ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors='pt'`` and must return a
            mapping with an ``'input_ids'`` tensor.
        max_length: Length every sample is padded/truncated to.
    """

    def __init__(self, data_path: str, tokenizer, max_length: int = 512):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = self.load_data(data_path)

    def load_data(self, data_path: str):
        """Read ``data_path`` and return the list of parsed JSON records.

        Blank lines are skipped so stray empty lines (e.g. at the end of the
        file) do not abort loading. Malformed JSON still raises
        ``json.JSONDecodeError``.
        """
        samples = []
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                samples.append(json.loads(line))
        return samples

    def __len__(self):
        """Number of loaded records."""
        return len(self.samples)

    def __getitem__(self, index: int):
        """Return ``(X, Y, loss_mask)`` for the record at ``index``.

        ``X`` is the token sequence without its last element, ``Y`` the same
        sequence shifted left by one, and ``loss_mask`` is 1 where the target
        token in ``Y`` is not the pad token and 0 elsewhere. All three are
        ``torch.long`` tensors.
        """
        sample = self.samples[index]

        text = f"{self.tokenizer.bos_token}{str(sample['text'])}{self.tokenizer.eos_token}"

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading batch axis; a bare squeeze() would also remove
        # the sequence axis when it has length 1.
        input_ids = encoding['input_ids'].squeeze(0)
        loss_mask = (input_ids != self.tokenizer.pad_token_id)

        # Copy via detach().clone() rather than torch.tensor(tensor), which
        # emits a copy-construct UserWarning.
        X = input_ids[:-1].detach().clone().to(torch.long)
        Y = input_ids[1:].detach().clone().to(torch.long)
        loss_mask = loss_mask[1:].to(torch.long)
        return X, Y, loss_mask
    

    