In [130]:
import torch
from torch import nn
from gpt import GPTLanguageModel, get_batch, optimizer, estimate_loss
device = 'cuda' if torch.cuda.is_available() else 'cpu'
from torch.quantization import quantize_dynamic
from icecream import ic
# torch.set_printoptions(precision=40, sci_mode=False)


In [131]:
model_save_path = "fully_train.pth"

# Build the model and move it to the selected device; `m` is kept as an alias
# of the same module object.
model = GPTLanguageModel()
m = model.to(device)

# Model loading for generation
print("Loading the model for generation...")
# map_location remaps the saved tensors onto the current device, so a checkpoint
# written on a GPU machine still loads when only the CPU is available.
# weights_only=True restricts unpickling to tensors and plain containers.
checkpoint = torch.load(model_save_path, map_location=device, weights_only=True)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # Set the model to evaluation mode (last expression: displays the model)

Loading the model for generation...


GPTLanguageModel(
  (token_embedding_table): Embedding(65, 384)
  (position_embedding_table): Embedding(256, 384)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-5): 6 x Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedFoward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1536, out_features=384, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((384,), eps=1e-05, elementwise_affine=

In [132]:
# List the shape and dtype of every parameter, then inspect the token
# embedding table's state dict.
print("Model Parameters:")
for param_name, tensor in model.named_parameters():
    print(f"{param_name}: {tensor.shape}")
    print(f"{param_name}: {tensor.dtype}")

embedding_state = model._modules["token_embedding_table"].state_dict()
print(embedding_state["weight"].shape)
embedding_state  # last expression: rendered as the cell output


Model Parameters:
token_embedding_table.weight: torch.Size([65, 384])
token_embedding_table.weight: torch.float32
position_embedding_table.weight: torch.Size([256, 384])
position_embedding_table.weight: torch.float32
blocks.0.sa.heads.0.key.weight: torch.Size([64, 384])
blocks.0.sa.heads.0.key.weight: torch.float32
blocks.0.sa.heads.0.query.weight: torch.Size([64, 384])
blocks.0.sa.heads.0.query.weight: torch.float32
blocks.0.sa.heads.0.value.weight: torch.Size([64, 384])
blocks.0.sa.heads.0.value.weight: torch.float32
blocks.0.sa.heads.1.key.weight: torch.Size([64, 384])
blocks.0.sa.heads.1.key.weight: torch.float32
blocks.0.sa.heads.1.query.weight: torch.Size([64, 384])
blocks.0.sa.heads.1.query.weight: torch.float32
blocks.0.sa.heads.1.value.weight: torch.Size([64, 384])
blocks.0.sa.heads.1.value.weight: torch.float32
blocks.0.sa.heads.2.key.weight: torch.Size([64, 384])
blocks.0.sa.heads.2.key.weight: torch.float32
blocks.0.sa.heads.2.query.weight: torch.Size([64, 384])
blocks.0.sa

OrderedDict([('weight',
              tensor([[ 0.0091,  0.0415, -0.1269,  ..., -0.0714,  0.0073,  0.0025],
                      [ 0.0430,  0.0210, -0.0139,  ..., -0.0139, -0.0091,  0.0034],
                      [-0.0393,  0.0236,  0.0226,  ...,  0.0451, -0.0622, -0.0174],
                      ...,
                      [ 0.0940,  0.0103,  0.1219,  ..., -0.0038,  0.0331,  0.0075],
                      [ 0.0578, -0.0447,  0.0744,  ..., -0.0093, -0.0104,  0.0081],
                      [ 0.0590, -0.0205, -0.0779,  ..., -0.0641,  0.0432,  0.0550]],
                     device='cuda:0'))])

In [133]:
# # Print all parameters for half
# model.half()
# print("Model Parameters:")
# for name, param in list(model.named_parameters())[:3]:
#     print(f"{name}: {param.shape}")
#     print(f"{name}: {param.dtype}")

In [134]:
# print(model._modules["token_embedding_table"].state_dict()["weight"].shape)
# model._modules["token_embedding_table"].state_dict()

In [135]:
import numpy as np

# Toy demonstration of truncated-SVD compression on a random 10x20 matrix.
# A seeded generator makes the reported error reproducible across kernel
# restarts (the unseeded np.random.rand gave a different MAE on every run).
rng = np.random.default_rng(0)

# Original matrix, uniform in [0, 1)
T = rng.random((10, 20))

# Reduced SVD: T == U @ diag(S) @ Vt, with singular values in descending order
U, S, Vt = np.linalg.svd(T, full_matrices=False)

# Choose rank r: keep only the r largest singular values and their vectors
r = 8
U_r = U[:, :r]
S_r = np.diag(S[:r])
Vt_r = Vt[:r, :]

# Low-rank approximation
T_r = U_r @ S_r @ Vt_r

ic(T.shape)
ic(U_r.shape)
ic(S_r.shape)
ic(Vt_r.shape)

# Element-wise reconstruction error of the rank-r approximation
mae = np.mean(np.abs(T - T_r))
print(f"Mean Absolute Error (MAE): {mae:.4f}")


ic| T.shape: (10, 20)
ic| U_r.shape: (10, 8)
ic| S_r.shape: (8, 8)
ic| Vt_r.shape: (8, 20)


Mean Absolute Error (MAE): 0.0412


In [136]:
# Low-rank (truncated SVD) approximation of the learned token embedding table.
#
# Original matrix. Take a detached copy: the tensor returned by state_dict()
# shares storage with the live parameter, so a later in-place write to the
# embedding would also change T and make a re-run of this cell compare the
# approximation against an already-truncated matrix.
T = model._modules["token_embedding_table"].state_dict()["weight"].detach().clone()

# Compute the reduced SVD: T == U @ diag(S) @ Vh.
# torch.linalg.svd replaces the deprecated torch.svd and returns Vh (V
# transposed) instead of V, so V is recovered by transposing.
U, S, Vh = torch.linalg.svd(T, full_matrices=False)
V = Vh.T

# Choose rank r
r = 40
U_r = U[:, :r]
S_r = torch.diag(S[:r])
V_r = V[:, :r]

# Low-rank approximation
T_r = U_r @ S_r @ V_r.T

# Shapes
# NOTE: this rebinds `m`, which the model-loading cell used as an alias for the
# model; from here on `m` and `n` are the embedding table's dimensions.
m, n = T.shape
print(f"T shape: {T.shape}")
print(f"U_r shape: {U_r.shape}")
print(f"S_r shape: {S_r.shape}")
print(f"V_r shape: {V_r.shape}")

# Mean Absolute Error (MAE)
mae = torch.mean(torch.abs(T - T_r))
print(f"Mean Absolute Error (MAE): {mae:.4f}")


T shape: torch.Size([65, 384])
U_r shape: torch.Size([65, 40])
S_r shape: torch.Size([40, 40])
V_r shape: torch.Size([384, 40])
Mean Absolute Error (MAE): 0.0126


In [137]:
# Baseline: estimated losses before the token embedding table is replaced by
# its low-rank approximation in the next cell.
estimate_loss(model)

{'train': tensor(0.8562), 'val': tensor(1.5702)}

In [138]:
# Overwrite the token embedding table with its rank-r approximation T_r and
# re-evaluate. The tensor obtained from state_dict() shares storage with the
# live parameter, so the slice assignment changes the model itself. The change
# is permanent for this kernel session; reload the checkpoint to undo it.
embedding_weight = model._modules["token_embedding_table"].state_dict()["weight"]
embedding_weight[:, :] = T_r
estimate_loss(model)

{'train': tensor(1.0589), 'val': tensor(1.6623)}

In [139]:
def size_decrease(m, r, n):
    """Return the relative storage size of a rank-r factorization.

    Compares the number of stored values in the three factors of a truncated
    SVD -- an (m x r) matrix, an (r x r) matrix and an (r x n) matrix --
    against the m * n values of the original dense matrix.

    Despite the name, the result is a ratio (factored / dense), not a
    reduction: values below 1 mean the factorization is smaller, values
    above 1 mean it is larger.

    Args:
        m: Number of rows of the original matrix.
        r: Rank kept by the factorization.
        n: Number of columns of the original matrix.

    Returns:
        The ratio of factored size to dense size, as a float.

    Raises:
        ZeroDivisionError: If m * n is zero.
    """
    # The middle factor is counted as a full r x r matrix, not as r diagonal entries.
    factored_count = m * r + r * n + r * r
    dense_count = m * n
    return factored_count / dense_count

In [140]:
# m, n are the token embedding table's dimensions and r is the chosen rank,
# all set in the torch SVD cell above.
size_decrease(m,r,n)

0.7836538461538461