In [1]:
import time
from datetime import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F

from model.Models import Transformer, Transformer2
from model.Optim import CosineWithRestarts
from model.Batch import create_masks
from utils import MyTokenizer, MyMasker
from data import TextDataset
from torch.utils.data import Dataset, DataLoader, random_split

ModuleNotFoundError: No module named 'Layers'

In [2]:
# Loading data
# bs is the batch size handed to the training/validation DataLoaders built in a later cell.
bs=512
dataset = TextDataset()
# 99% of the samples go to training; whatever is left after the integer
# truncation goes to the held-out set, so the two sizes always sum to len(dataset).
train_size = int(0.99*len(dataset))
test_size = len(dataset)-train_size

print(train_size, test_size)

225027 2273


In [3]:
# Helpers that hide letters of a word and turn words into token ids.
# 32 is the tokenizer's constructor argument (the same value is used as
# max_len for the model in a later cell).
masker = MyMasker()
tokenizer = MyTokenizer(32)

# Fixed-seed generator so the train/validation partition is identical on every run.
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(0),
)

In [4]:
# Mini-batch loaders used by train_model (both are reshuffled on every pass).
trainloader = DataLoader(
    train_dataset,
    batch_size=bs,
    shuffle=True,
    num_workers=0,
)
valloader = DataLoader(
    val_dataset,
    batch_size=bs,
    shuffle=True,
    num_workers=0,
)

In [5]:
# Loading Transformer model from scratch
max_len = 32
model = Transformer(src_vocab=28, d_model=128, max_seq_len=max_len, N=12, heads=8, dropout=0.1)

# Only move the model to the GPU when one is present, so this cell does not
# raise on CPU-only machines (same guard as the one used when loading PGN).
if torch.cuda.is_available():
    model.to('cuda')

# Xavier-initialise every parameter with more than one dimension (weight
# matrices); 1-D parameters such as biases keep their default initialisation.
# NOTE(review): this also re-initialises the embedding matrix, which overwrites
# the row reserved by padding_idx=0 — confirm that is intended.
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

In [6]:
# Re-create the masking / tokenising helpers, this time sized from max_len.
masker = MyMasker()
tokenizer = MyTokenizer(max_len)

# Adam over all model parameters; this global is the optimizer train_model steps.
optim = torch.optim.Adam(
    model.parameters(),
    lr=1e-3,
    betas=(0.9, 0.98),
    eps=1e-9,
)

In [9]:
def _encode_batch(words, device):
    """Build the model inputs for one batch of raw words.

    Relies on the notebook-level ``masker``, ``tokenizer`` and ``create_masks``.

    Returns
    -------
    tuple
        ``(src, trg, mask)`` moved to ``device``: the encoded masked words,
        the encoded complete words, and the first value returned by
        ``create_masks(src)`` (used as both source and target mask).
    """
    # src is the incomplete word, e.g. [m_zh__n, _s, _w_eso_e]
    src = tokenizer.encode(masker.mask(words, None))
    # trg is the complete word
    trg = tokenizer.encode(words)
    # e.g. [[1, 1, 0, 0], [1, 0, 0, 0], [1, 1, 1, 0]]
    mask, _ = create_masks(src)
    return src.to(device), trg.to(device), mask.to(device)


def _token_loss(preds, target):
    """Cross-entropy over all flattened positions, skipping targets equal to 0."""
    return F.cross_entropy(preds.view(-1, preds.size(-1)), target.contiguous().view(-1), ignore_index=0)


def _print_progress(start, epoch, percent, loss_value):
    """Rewrite the current console line with elapsed minutes, a 20-slot bar and a loss."""
    filled = percent // 5
    print("\r   %dm: epoch %d [%s%s]  %d%%  loss = %.3f" % ((time.time() - start)//60, epoch + 1, '#' * filled, ' ' * (20 - filled), percent, loss_value), end='')


def train_model(model, bs=128, epochs=10, printevery=100):
    """Train ``model`` on the global ``trainloader`` and validate on ``valloader``.

    The function reads the notebook-level ``trainloader``, ``valloader``,
    ``masker``, ``tokenizer``, ``create_masks`` and ``optim``.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(src, mask)``; its output is scored with cross-entropy
        against the encoded complete words.
    bs : int
        Kept for backward compatibility. Progress is now derived from the
        number of batches in each loader, so a value that disagrees with the
        loader's real batch size can no longer distort the progress bar.
    epochs : int
        Number of passes over ``trainloader``.
    printevery : int
        A progress line is printed every ``printevery`` batches.

    Returns
    -------
    int or None
        ``0`` when no GPU is available (nothing is trained), otherwise ``None``.
    """
    import os  # local import: only needed to make sure the checkpoint folder exists

    print("training model...")
    start = time.time()
    if torch.cuda.is_available():
        print('gpu detected!')
    else:
        print('no gpu detected')
        return 0

    device = 'cuda'
    save_every = 10  # a checkpoint is written every 10 training batches
    os.makedirs('./weights', exist_ok=True)  # torch.save does not create missing folders

    n_train_batches = len(trainloader)
    n_val_batches = len(valloader)

    for epoch in range(epochs):

        # ---- training pass -------------------------------------------------
        model.train()
        total_loss = 0
        window = 0                # batches accumulated since the last report
        avg_loss = float('nan')   # stays defined even if no report is ever printed

        for i, words in enumerate(trainloader):
            src, trg, mask = _encode_batch(words, device)

            preds = model(src, mask)

            optim.zero_grad()
            loss = _token_loss(preds, trg)
            loss.backward()
            optim.step()

            total_loss += loss.item()
            window += 1

            if (i + 1) % printevery == 0:
                p = int(100 * (i + 1) / n_train_batches)
                avg_loss = total_loss / window
                _print_progress(start, epoch, p, avg_loss)
                total_loss = 0
                window = 0

            if (i + 1) % save_every == 0:
                torch.save(model.state_dict(), f'./weights/model_weights_{datetime.today().strftime("%m%d%Y")}')

        # Batches left over after the last full reporting window still count
        # towards the loss shown in the end-of-epoch summary.
        if window:
            avg_loss = total_loss / window

        # ---- validation pass -----------------------------------------------
        model.eval()
        total_val_loss = 0
        sims = 0
        avg_val_loss = float('nan')

        # No gradients are needed for validation; skipping them avoids building
        # autograd graphs for every validation batch.
        with torch.no_grad():
            for i, words in enumerate(valloader):
                src, val, mask = _encode_batch(words, device)

                preds = model(src, mask)
                total_val_loss += _token_loss(preds, val).item()
                sims += 1

                if (i + 1) % printevery == 0:
                    p = int(100 * (i + 1) / n_val_batches)
                    _print_progress(start, epoch, p, total_val_loss / sims)

        # Average over every validation batch, independent of printevery.
        if sims:
            avg_val_loss = total_val_loss / sims

        print("\r   %dm: epoch %d [%s%s]  %d%%  loss = %.3f\nepoch %d complete, val loss = %.03f" %\
        ((time.time() - start)//60, epoch + 1, "".join('#'*(100//5)), "".join(' '*(20-(100//5))), 100, avg_loss, epoch + 1, avg_val_loss))

In [10]:
# train_model(model, bs=512, epochs=25, printevery=1)

training model...
gpu detected!
   1m: epoch 1 [####################]  100%  loss = 1.00766
epoch 1 complete, val loss = 0.966
   3m: epoch 2 [####################]  100%  loss = 1.02764
epoch 2 complete, val loss = 0.964
   5m: epoch 3 [####################]  100%  loss = 0.98839
epoch 3 complete, val loss = 0.939
   7m: epoch 4 [####################]  100%  loss = 1.06019
epoch 4 complete, val loss = 0.919
   10m: epoch 5 [####################]  100%  loss = 0.95580
epoch 5 complete, val loss = 0.880
   12m: epoch 6 [####################]  100%  loss = 1.01609
epoch 6 complete, val loss = 0.909
   14m: epoch 7 [####################]  100%  loss = 0.95580
epoch 7 complete, val loss = 0.880
   16m: epoch 8 [####################]  100%  loss = 0.87496
epoch 8 complete, val loss = 0.896
   18m: epoch 9 [####################]  100%  loss = 0.95094
epoch 9 complete, val loss = 0.894
   20m: epoch 10 [####################]  100%  loss = 0.90880
epoch 10 complete, val loss = 0.880
   22m: ep

In [11]:
# Lookup tables between characters and token ids:
#   'a'..'z' <-> 1..26, '_' <-> 27, and id 0 decodes to the empty string.
alphabets = {'_': 27}
ids = {27: '_', 0: ''}
for token_id, letter in enumerate(map(chr, range(ord('a'), ord('z') + 1)), start=1):
    alphabets[letter] = token_id
    ids[token_id] = letter

In [13]:
from agent import Agent
from Models import PGN


# Policy network whose inner transformer uses the same hyper-parameters as the
# Transformer trained earlier in this notebook.
pgn = PGN(src_vocab=28, d_model=128, max_seq_len=32, N=12, heads=8, dropout=0.1)

# map_location='cpu' lets a checkpoint that was saved from a CUDA model be
# loaded on a machine without a GPU; the network is moved to the GPU below
# when one is available.
pgn.transformer.load_state_dict(torch.load('./weights/model_weights_12082023', map_location='cpu'))

# Smaller alternative configuration, kept for reference:
# pgn = PGN(src_vocab=28, d_model=32, max_seq_len=32, N=2, heads=4, dropout=0.1)
# pgn.transformer.load_state_dict(torch.load('./weights/model_weights_lite_1'))

if torch.cuda.is_available():
    pgn.to('cuda')

# Switch to inference mode; as the cell's last expression this also displays the module.
pgn.eval()

PGN(
  (transformer): Transformer(
    (encoder): Encoder(
      (embed): Embedder(
        (embed): Embedding(28, 128, padding_idx=0)
      )
      (pe): PositionalEncoder(
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (layers): ModuleList(
        (0-11): 12 x EncoderLayer(
          (norm_1): Norm()
          (norm_2): Norm()
          (attn): MultiHeadAttention(
            (q_linear): Linear(in_features=128, out_features=128, bias=True)
            (v_linear): Linear(in_features=128, out_features=128, bias=True)
            (k_linear): Linear(in_features=128, out_features=128, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
            (out): Linear(in_features=128, out_features=128, bias=True)
          )
          (ff): FeedForward(
            (linear_1): Linear(in_features=128, out_features=2048, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
            (linear_2): Linear(in_features=2048, out_features=128, bias=True)
        

In [14]:
def mini_sim(sample, env):
    """Play one Hangman game with the global policy network and return its total reward.

    Relies on the notebook-level ``pgn``, ``masker``, ``tokenizer``,
    ``create_masks`` and ``ids``.

    Parameters
    ----------
    sample
        A batch holding a single word; ``sample[0]`` is handed to ``env.reset``.
    env
        Game environment exposing ``reset(word)`` and
        ``step(letter) -> (next_state, reward, done)``.

    Returns
    -------
    The sum of the rewards returned by ``env.step`` until ``done`` is true.
    """
    # Run on whatever device the network lives on instead of assuming CUDA,
    # so the simulation also works when pgn was left on the CPU.
    device = next(pgn.parameters()).device

    env.reset(sample[0])
    # Initial board; presumably every letter is hidden when the masking
    # fraction is 1 — TODO confirm against MyMasker.
    state = masker.mask(sample, 1)
    sample_mask, _ = create_masks(tokenizer.encode(sample))
    mask = sample_mask.to(device)

    # 1.0 for every token id that may still be guessed. Id 0 (decodes to '')
    # and the last id, 27 ('_'), are never valid guesses.
    left = torch.ones((1, 28), device=device)
    left[0,  0] = 0.
    left[0, -1] = 0.

    done = False
    cr = 0

    # Pure inference: no gradients are needed while playing.
    with torch.no_grad():
        while not done:
            encoded_state = tokenizer.encode(state).to(device)

            probs = pgn(encoded_state, mask)

            # Zero out letters that were already guessed, then renormalise.
            b_probs = torch.mul(probs, left)
            b_probs = b_probs / torch.sum(b_probs)

            # using a greedy approach
            guess_id = torch.argmax(b_probs).item()
            guess = ids[guess_id]

            next_state, r, done = env.step(guess)
            state = [''.join(next_state)]

            # Never propose the same letter twice.
            left[0, guess_id] = 0.

            cr += r

    return cr

In [15]:
from hangman import Hangman

def test_pgn(valloader):
    """Play one Hangman game per item of ``valloader`` and print running statistics.

    Each item is passed to ``mini_sim`` together with a ``Hangman`` environment
    created with ``n_episode=26``. A game counts as a win when its cumulative
    reward is greater than -6. After every game a single status line is
    rewritten in place; nothing is returned.
    """
    status_line = '\r  wins : %d \t total games : %d \t win rate : %.03f%% \t reward : %.03f \t average reward : %.03f '

    env = Hangman(n_episode=26)

    wins, reward, total_games = 0, 0, 0
    pgn.eval()
    for state in valloader:
        cr = mini_sim(state, env)

        wins += 1 if cr > - 6 else 0
        total_games += 1
        reward += cr

        stats = (wins, total_games, 100 * (wins / total_games), cr, reward / total_games)
        print(status_line % stats, end='')

In [16]:
# One word per batch, because each simulated game is played on a single word.
# Note: this rebinds the global trainloader/valloader, so train_model would
# also iterate one sample at a time if it were called after this cell.
trainloader = DataLoader(
    train_dataset,
    batch_size=1,
    shuffle=True,
    num_workers=0,
)
valloader = DataLoader(
    val_dataset,
    batch_size=1,
    shuffle=True,
    num_workers=0,
)

In [17]:
test_pgn(trainloader)

  wins : 758 	 total games : 1349 	 win rate : 56.190% 	 reward : -5.000 	 average reward : -5.503  

KeyboardInterrupt: 

In [18]:
test_pgn(valloader)

  wins : 1277 	 total games : 2273 	 win rate : 56.181% 	 reward : -7.000 	 average reward : -5.691  