In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import tiktoken

In [14]:
#超参数
batch_size = 4
context_length = 16
d_model = 64
num_heads = 4

In [15]:
# 指定文件夹路径
folder_path = '/Users/loki/Downloads/文本素材/'

# 遍历文件夹
for filename in os.listdir(folder_path):
    # 检查文件是否为.txt文件
    if filename.endswith('.txt'):
        # 构建完整文件路径
        file_path = os.path.join(folder_path, filename)
        
        # 读取文件内容
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            
            # 打印或处理每个文件的内容
            print(f'File: {filename}')
            print(content[:10])
            print('---')

File: 大奉打更人.txt
大奉打更人

作者：
---
File: 从斗罗开始签到女神.txt
从斗罗开始签到女神

---
File: 红楼梦.txt
红楼梦

作者：曹雪
---
File: 全球轮回：只有我知道剧情 2.txt
全球轮回：只有我知道
---
File: 完美世界.txt
完美世界

作者：辰
---
File: 洪荒之功德99999999.txt
洪荒之功德99999
---


In [16]:
len(content)

1708974

In [9]:
encoding = tiktoken.get_encoding("cl100k_base")

In [10]:
tokenized_content = encoding.encode(content)
tokenized_content = torch.tensor(tokenized_content)
max_token_length = tokenized_content.max().item()
max_token_length

100207

In [279]:
#切割数据集
train_index = int(len(tokenized_content)*0.9)
train_data = tokenized_content[:train_index]
valid_data = tokenized_content[train_index:]

In [280]:
valid_data[:10]

tensor([  111,  9039, 18184, 41920, 95598, 76208,  1811, 70277, 16175,   246])

In [281]:
data = train_data
idxs = torch.randint(len(data)-context_length, (batch_size,))
x_batch = torch.stack([data[idx:idx+context_length] for idx in idxs])
y_batch = torch.stack([data[idx+1:idx+context_length+1] for idx in idxs])

In [282]:
import pandas as pd

In [283]:
pd.DataFrame(x_batch.tolist())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,3922,81258,21043,44388,91985,9554,29207,226,161,99,247,3922,69636,3574,242,3922
1,33565,111,20022,240,19000,44388,162,241,224,55038,55030,17905,82696,82364,14276,236
2,23187,53901,21990,33764,75293,43511,80578,3922,74245,50667,28037,76505,26892,27384,17792,18904
3,72718,34547,56906,3490,30537,7261,46028,4996,231,239,7741,102,27384,161,99,244


In [284]:
input_embed = nn.Embedding(max_token_length, d_model)
x_batch_embed = input_embed(x_batch)
y_batch_embed = input_embed(y_batch)

In [285]:
#位置编码
import math
position_embed = torch.zeros(context_length, d_model)
position = torch.arange(0, context_length).unsqueeze (1)

div_term = torch.exp(torch.arange(0, d_model, 2).float()) * (-math.log (10000.0) / d_model)
position_embed[:, 0::2] = torch.sin(position * div_term)
position_embed[:, 1::2] = torch.cos(position * div_term)
position_embed = position_embed.unsqueeze(0).expand(batch_size, -1, -1)
position_embed.shape

torch.Size([4, 16, 64])

In [286]:
x = x_batch_embed + position_embed
y = y_batch_embed + position_embed

In [287]:
x.shape,y.shape

(torch.Size([4, 16, 64]), torch.Size([4, 16, 64]))

In [288]:
Wq = nn.Linear(d_model, d_model)
Wk = nn.Linear(d_model, d_model)
Wv = nn.Linear(d_model, d_model)

Q = Wq(x)
K = Wk(x)
V = Wv(x)

Q.shape

torch.Size([4, 16, 64])

In [289]:
#多头注意力
Q = Q.reshape(batch_size, context_length, num_heads, d_model//num_heads).permute(0, 2, 1, 3)
K = K.reshape(batch_size, context_length, num_heads, d_model//num_heads).permute(0, 2, 1, 3)
V = V.reshape(batch_size, context_length, num_heads, d_model//num_heads).permute(0, 2, 1, 3)

Q.shape

torch.Size([4, 4, 16, 16])

In [290]:
output = Q @ K.transpose(-2, -1)/math.sqrt(d_model//num_heads)
output.shape

torch.Size([4, 4, 16, 16])

In [291]:
#mask
mask = torch.triu(torch.ones(context_length, context_length),1).bool()
output = output.masked_fill(mask, float('-inf'))

In [292]:
#softmax 计算
attention_score = torch.softmax(output, -1)
pd.DataFrame(attention_score[0,0].tolist())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.676941,0.323059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.194312,0.360013,0.445674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.379475,0.17812,0.11157,0.330835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.276202,0.105721,0.22893,0.210585,0.178563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.100399,0.162897,0.173075,0.171999,0.158764,0.232867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.202224,0.068139,0.11797,0.084156,0.067519,0.210206,0.249784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.085517,0.072135,0.155761,0.152048,0.160089,0.207345,0.111607,0.055498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.060191,0.20496,0.089771,0.04763,0.072273,0.068249,0.12274,0.272737,0.061448,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.109734,0.060257,0.220894,0.036199,0.090397,0.096802,0.119268,0.04492,0.092359,0.12917,0.0,0.0,0.0,0.0,0.0,0.0


In [293]:
A = attention_score @ V

In [294]:
A = A.transpose(1,2).reshape(batch_size, -1, d_model)

In [295]:
Wo = nn.Linear(d_model, d_model)
output = Wo(A)

In [296]:
output = output + x

In [297]:
#层归一化
layer_norm = nn.LayerNorm(d_model)
layer_norm_output = layer_norm(output)

In [298]:
#前馈网络
output = nn.Linear(d_model, d_model*4)(layer_norm_output)
output = nn.ReLU()(output)
output = nn.Linear(d_model*4, d_model)(output)

In [299]:
#残差链接
output = output + layer_norm_output

In [300]:
#最终线性变换
output = nn.Linear(d_model, max_token_length)(output)
logit = F.log_softmax(output, -1)

In [301]:
#获取预测值
predicted_token = torch.argmax(logit[0,0]).item()

In [302]:
predicted_token

84300

In [305]:
encoding.decode([84300])

' canyon'

In [304]:
output.shape

torch.Size([4, 16, 100207])