In [1]:
import scaled_model_config
# Alias the imported config module; the code below reads every setting through `my_config`.
my_config=scaled_model_config

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from model import BigramLanguageModel
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Hyperparameters ---------------------------------------------------------
# All settings come from `my_config`; the short names below are conveniences
# for the rest of the notebook.
assert my_config.head_size * my_config.num_heads == my_config.embed_size, (
    "head_size * num_heads must equal embed_size"
)
batch_size = my_config.batch_size
block_size = my_config.block_size
max_iters = my_config.max_iters
eval_interval = my_config.eval_interval
lr = my_config.lr
eval_iters = my_config.eval_iters
embed_size = my_config.embed_size
num_head = my_config.num_heads
n_blocks = my_config.n_blocks
dropout = my_config.dropout
vocab_size = my_config.vocab_size
# Snapshot of the config module's attributes (dunder entries skipped); it is
# stored inside the checkpoint written by the training loop.
model_config = {
    key: value
    for key, value in vars(my_config).items()
    if not key.startswith('__')
}

# --- Output locations --------------------------------------------------------
# Start with a sentinel larger than any loss the training loop will report.
best_val_loss = 1e9
out_text_folder = 'saved_text/'
out_model_folder = 'saved_models'
# exist_ok=True makes re-running the cell safe and avoids the check-then-create race.
os.makedirs(out_text_folder, exist_ok=True)
os.makedirs(out_model_folder, exist_ok=True)

torch.manual_seed(1337)

# --- Corpus and character-level tokenizer -------------------------------------
with open('output.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(set(text))
# NOTE(review): this replaces the vocab_size read from my_config above, while the
# model is constructed from my_config. Presumably my_config.vocab_size must be
# at least len(chars) -- confirm against the model definition.
vocab_size = len(chars)
char_to_id = {ch: idx for idx, ch in enumerate(chars)}
id_to_char = {idx: ch for idx, ch in enumerate(chars)}


def encode(s):
    """Return the list of integer ids for the characters of `s`, in order.

    Raises KeyError for a character that does not occur in the corpus.
    """
    return [char_to_id[ch] for ch in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids `l`.

    Raises KeyError for an id outside the corpus vocabulary.
    """
    return ''.join(id_to_char[idx] for idx in l)


# --- Train / validation split -------------------------------------------------
data = torch.tensor(encode(text), dtype=torch.long)
n = int(len(data) * 0.9)  # first 90% of the ids train, the remainder validates
train_data = data[:n]
val_data = data[n:]

# get_batch draws window starts with torch.randint(0, len(split) - block_size),
# which needs every split to be strictly longer than block_size. Fail here with
# a clear message instead of an opaque randint error at the first batch.
if min(len(train_data), len(val_data)) <= block_size:
    raise ValueError(
        f"corpus too short: train has {len(train_data)} ids and val has "
        f"{len(val_data)}, but each split must exceed block_size={block_size}"
    )

def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair (x, y) of tensors moved to `device`. Each holds `batch_size`
        stacked windows of `block_size` ids; y is x shifted one position to
        the right in the underlying sequence.
    """
    source = train_data if split == 'train' else val_data
    # The exclusive upper bound leaves room for the one-step-shifted target window.
    starts = torch.randint(0, len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    x = torch.stack(inputs)
    y = torch.stack(targets)
    return x.to(device), y.to(device)

@torch.no_grad()
def estimte_loss():
    """Average the model's loss over `eval_iters` random batches per split.

    The model is switched to eval mode for the measurement and put back into
    train mode before returning. Gradients are disabled by the decorator.

    Returns:
        dict with keys 'train_loss' and 'val_loss', each the mean of the
        per-batch losses as a 0-dim tensor.
    """
    model.eval()
    results = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for idx in range(eval_iters):
            xb, yb = get_batch(split)
            _, batch_loss = model(xb, yb)  # logits are not needed here
            batch_losses[idx] = batch_loss.item()
        results[f'{split}_loss'] = batch_losses.mean()
    model.train()
    return results

# Build the model from the config module and report the run's settings.
model = BigramLanguageModel(my_config).to(device)
print("model defined with the follwing paramters")
print('batch-size:', batch_size)
print('block-size:', block_size)
print('max-iters:', max_iters)
print('eval-interval:', eval_interval)
print('lr:', lr)
print('eval-iters:', eval_iters)
print('embed-size:', embed_size)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

checkpoint_path = f'{out_model_folder}/model.pt'

# The loop runs max_iters optimizer updates. It iterates one extra index so the
# fully trained model is also evaluated (and checkpointed if it is the best so
# far); previously, updates after the last multiple of eval_interval were never
# evaluated or saved. `step` is used instead of `iter` to avoid shadowing the
# builtin in the notebook namespace.
for step in range(max_iters + 1):
    if step % eval_interval == 0 or step == max_iters:
        losses = estimte_loss()
        print(f'Iter {step:5d} Train loss {losses["train_loss"]:.4f} Val loss {losses["val_loss"]:.4f}')
        if losses['val_loss'] < best_val_loss:
            best_val_loss = losses['val_loss']
            # Step 0 is the untrained model: record its loss but do not save it.
            if step > 0:
                torch.save({
                    'epoch': step,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': losses['val_loss'],
                    'config': model_config,
                }, checkpoint_path)
                print(f'Saving model at iter {step} with val loss {losses["val_loss"]:.4f}')

    if step == max_iters:
        # Final evaluation only; all max_iters updates have already been applied.
        break

    # One optimization step on a fresh random training batch.
    x, y = get_batch('train')
    logits, loss = model(x, y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

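# Generate text (currently disabled): uncomment the lines below to sample from the model and write the result to a file.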
# context=torch.tensor(encode('To be or not'),dtype=torch.long).unsqueeze(0).to(device)
# #context=torch.zeros(1,8,dtype=torch.long).to(device)
# generated_vector=decode(((model.generate(context,1000))[0]).tolist())

# #put the generated text in a file
# with open(f'{out_text_folder}/generated_text.txt', 'w',encoding='utf-8') as f:
#     f.write(generated_vector)
#     f.close()

model defined with the follwing paramters
batch-size: 64
block-size: 256
max-iters: 6500
eval-interval: 500
lr: 0.0003
eval-iters: 200
embed-size: 384
Iter     0 Train loss 4.7994 Val loss 4.7840
Iter   500 Train loss 1.7759 Val loss 1.8511
Saving model at iter 500 with val loss 1.8511
Iter  1000 Train loss 1.4834 Val loss 1.5689
Saving model at iter 1000 with val loss 1.5689
Iter  1500 Train loss 1.3569 Val loss 1.4285
Saving model at iter 1500 with val loss 1.4285
Iter  2000 Train loss 1.2891 Val loss 1.3622
Saving model at iter 2000 with val loss 1.3622
Iter  2500 Train loss 1.2188 Val loss 1.3043
Saving model at iter 2500 with val loss 1.3043
Iter  3000 Train loss 1.1801 Val loss 1.2708
Saving model at iter 3000 with val loss 1.2708
Iter  3500 Train loss 1.1481 Val loss 1.2395
Saving model at iter 3500 with val loss 1.2395
Iter  4000 Train loss 1.1229 Val loss 1.2156
Saving model at iter 4000 with val loss 1.2156
Iter  4500 Train loss 1.1047 Val loss 1.2079
Saving model at iter 450