In [1]:
!pip install pytorch-lightning wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-lightning
  Downloading pytorch_lightning-2.0.2-py3-none-any.whl (719 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m719.0/719.0 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb
  Downloading wandb-0.15.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m84.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lightning-utilities>=0.7.0
  Downloading lightning_utilities-0.8.0-py3-none-any.whl (20 kB)
Collecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting setproctitle
  Downloading setproctitle-1

In [2]:
!pip install pythainlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pythainlp
  Downloading pythainlp-4.0.0-py3-none-any.whl (13.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pythainlp
Successfully installed pythainlp-4.0.0


In [1]:
import random
import math
import numpy as np
import pandas as pd

import torch
import pytorch_lightning as pl

In [2]:
# Directory containing train.csv and val.csv, relative to the working directory.
data_path = 'data/'
# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Seed the Python, NumPy and PyTorch RNGs so that data shuffling, weight
# initialisation and the decoders' random initial hidden state are repeatable.
SEED = 42
pl.seed_everything(SEED, workers=True)

In [3]:
# Log in to Weights & Biases so the WandbLogger created below can upload metrics.
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33msahatsarin07[0m. Use [1m`wandb login --relogin`[0m to force relogin


## Word Vectorization

In [4]:
from pythainlp import word_tokenize
from torchtext import vocab
from collections import Counter

In [5]:
# Read the training and validation splits from CSV files located under `data_path`.
train, val = (pd.read_csv(data_path + csv_name) for csv_name in ('train.csv', 'val.csv'))

In [6]:
# Segment every raw text into a list of word tokens with pythainlp's word_tokenize.
train_tokens, val_tokens = (frame.text.apply(word_tokenize) for frame in (train, val))

In [7]:
def gen_len_mask(tokens, max_len):
  """Build a boolean mask selecting the sentences shorter than ``max_len`` tokens.

  Args:
    tokens: iterable of tokenized sentences (each element must support ``len``).
      When it is a pandas Series, its index is reused for the returned mask.
    max_len: exclusive upper bound on the sentence length.

  Returns:
    A boolean ``pd.Series`` with one entry per sentence, True where
    ``len(sent) < max_len``.
  """
  # Reuse the input's index so that `tokens[mask]` aligns row by row even when the
  # Series does not carry a default 0..n-1 index (e.g. after earlier filtering).
  # Plain lists and other iterables keep the default RangeIndex.
  index = tokens.index if isinstance(tokens, pd.Series) else None
  # dtype=bool keeps an empty mask usable for boolean indexing.
  return pd.Series([len(sent) < max_len for sent in tokens], index=index, dtype=bool)

In [8]:
# Keep only the sentences shorter than MAX_TOKENS tokens, together with their labels.
MAX_TOKENS = 300

train_mask = gen_len_mask(train_tokens, MAX_TOKENS)
train_tokens, train_labels = train_tokens[train_mask], train.label[train_mask].to_numpy()

val_mask = gen_len_mask(val_tokens, MAX_TOKENS)
val_tokens, val_labels = val_tokens[val_mask], val.label[val_mask].to_numpy()

In [9]:
# Count token frequencies in one streaming pass. `train_tokens.sum()` concatenates the
# per-sentence lists one by one (quadratic in corpus size) and returns 0 for an empty
# Series; the generator keeps the same first-seen order, so vocabulary indices are unchanged.
counter = Counter(token for sent in train_tokens for token in sent)
# Specials come first: "</s>" gets index 0 (the same value pad_sequence pads with
# later on) and "<unk>" gets index 1.
v1 = vocab.vocab(counter, specials=["</s>", "<unk>",])
v1.set_default_index(1)  # out-of-vocabulary tokens resolve to "<unk>"
print(v1["<unk>"])  # prints 1
print(v1['out of vocab'])  # prints 1 (falls back to the default index)

1
1


In [10]:
vocab_size = len(v1)
print(f"vocab size is {vocab_size}")


def text_pipeline(sent):
  """Map a list of tokens to their vocabulary indices (no "</s>" marker is appended)."""
  return v1.lookup_indices(sent)


def text_decoding(encoded):
  """Map a list of vocabulary indices back to tokens and join them without separators."""
  return "".join(v1.lookup_tokens(encoded))

vocal size is 20589


In [11]:
# Round-trip one training sentence through the vocabulary as a sanity check.
sample_tokens = train_tokens[10]
encoded_sent = text_pipeline(sample_tokens)
print("original:", "".join(sample_tokens))
print("encoded:", encoded_sent)
print("decoded:", text_decoding(encoded_sent))

original: ลงมาเป็นเสียงสามัญ ม้าอือ อย่างนี้ก็แย่ซิครับ คือเปลี่ยนทั้งพยัญชนะ เปลี่ยนทั้งสระเปลี่ยนทั้งวรรณยุกต์ด้วย หัวผมไม่ใช่คอมพิวเตอร์ จะได้แปลคำเหล่านี้มาเป็นคำไทยได้ทันเพราะฉะนั้น เมื่อพูดถึง ม้าอือ หม่าย ผมก็แปลไม่ออก
encoded: [226, 9, 227, 228, 4, 229, 230, 4, 231, 63, 232, 233, 234, 4, 16, 235, 236, 237, 4, 235, 236, 238, 235, 236, 239, 184, 4, 240, 62, 35, 241, 242, 4, 6, 117, 66, 243, 244, 54, 9, 243, 72, 117, 224, 245, 4, 124, 246, 4, 229, 230, 4, 247, 248, 63, 66, 35, 179]
decoded: ลงมาเป็นเสียงสามัญ ม้าอือ อย่างนี้ก็แย่ซิครับ คือเปลี่ยนทั้งพยัญชนะ เปลี่ยนทั้งสระเปลี่ยนทั้งวรรณยุกต์ด้วย หัวผมไม่ใช่คอมพิวเตอร์ จะได้แปลคำเหล่านี้มาเป็นคำไทยได้ทันเพราะฉะนั้น เมื่อพูดถึง ม้าอือ หม่าย ผมก็แปลไม่ออก


### Prepare Dataloader

In [12]:
# Length (in tokens) of the longest sentence in each split.
max_train_len = max(len(sent) for sent in train_tokens)
max_val_len = max(len(sent) for sent in val_tokens)

In [13]:
# Both maxima are shown because x_val is later padded up to the width of x_train.
print("Max train length:", max_train_len)
print("Max val length:", max_val_len)

Max train length: 285
Max val length: 285


In [14]:
import torch 
from torch.nn.utils.rnn import pad_sequence

# Encode every tokenized sentence as a LongTensor of vocabulary indices.
x_train = [torch.LongTensor(sentence) for sentence in train_tokens.apply(text_pipeline)]
x_val = [torch.LongTensor(sentence) for sentence in val_tokens.apply(text_pipeline)]

# Right-pad with 0 up to the longest sentence of each split -> (num_sentences, split_max_len).
x_train = pad_sequence(x_train, batch_first=True)
x_val = pad_sequence(x_val, batch_first=True)

# Pad x_val with extra zero columns so it has the same sequence length as x_train.
remaining_len = x_train.size(1) - x_val.size(1)
if remaining_len < 0:
    # torch.zeros would reject a negative size with an unrelated-looking error; fail clearly instead.
    raise ValueError(
        f"x_val is wider than x_train ({x_val.size(1)} > {x_train.size(1)} tokens); "
        "cannot pad the validation set to the training sequence length"
    )
remaining_mat = torch.zeros((x_val.size(0), remaining_len), dtype=torch.long)
x_val = torch.cat((x_val, remaining_mat), dim=1)

In [15]:
# Split the padded index matrices by class label (0 -> impolite, 1 -> polite).
impolite_train, polite_train = (x_train[train_labels == cls] for cls in (0, 1))

impolite_val, polite_val = (x_val[val_labels == cls] for cls in (0, 1))

In [16]:
from torch.utils.data import Dataset, DataLoader

class PolitenessDataset(Dataset):
    """Index-based access to (encoded sentence, label) pairs."""

    def __init__(self, data, labels):
        # Both containers are stored as given and are indexed with the same position.
        self.encoded, self.labels = data, labels

    def __len__(self):
        # The dataset size is determined by the encoded sentences alone.
        return len(self.encoded)

    def __getitem__(self, idx):
        sample = self.encoded[idx]
        target = self.labels[idx]
        return sample, target

class ConcatDataset(torch.utils.data.Dataset):
    """Zip several datasets: item ``i`` is a tuple holding item ``i`` of each dataset."""

    def __init__(self, *datasets):
        self.datasets = datasets

    def __getitem__(self, i):
        items = []
        for dataset in self.datasets:
            items.append(dataset[i])
        return tuple(items)

    def __len__(self):
        # Limited by the shortest member so that every index is valid for all datasets.
        return min(map(len, self.datasets))

In [17]:
from torch.nn.utils.rnn import pad_sequence

class PolitenessDataModule(pl.LightningDataModule):
    """Serves batches that pair impolite sentences with polite sentences.

    The loaders read the notebook-level tensors `impolite_train`, `polite_train`,
    `impolite_val` and `polite_val` at the time a loader is requested.

    Args:
        batch_size: number of (impolite, polite) pairs per batch.
        num_workers: number of DataLoader worker processes.
    """

    def __init__(self, batch_size, num_workers=0):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage: str):
        # Nothing to prepare here: the tensors are built by earlier notebook cells.
        pass

    def _paired_loader(self, impolite, polite, shuffle):
        """Build a DataLoader whose items are ((impolite_x, 0.), (polite_x, 1.)) pairs."""
        paired = ConcatDataset(
            PolitenessDataset(impolite, torch.zeros(impolite.shape[0])),
            # The label vector is sized from the polite tensor itself, so sentences and
            # labels always have matching lengths.
            PolitenessDataset(polite, torch.ones(polite.shape[0])),
        )
        return DataLoader(
            paired,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
        )

    def train_dataloader(self):
        """Shuffled loader over the training pairs."""
        return self._paired_loader(impolite_train, polite_train, shuffle=True)

    def val_dataloader(self):
        """Loader over the validation pairs, in fixed order."""
        return self._paired_loader(impolite_val, polite_val, shuffle=False)


In [18]:
# Each batch holds up to 64 impolite sentences and as many polite ones (see ConcatDataset).
batch_size = 64
# num_workers is forwarded to the DataLoaders built by the data module.
data_module = PolitenessDataModule(batch_size=batch_size, num_workers=2)

In [19]:
# Fetch a single training batch to inspect its layout:
# [[impolite_inputs, impolite_labels], [polite_inputs, polite_labels]].
train_loader = data_module.train_dataloader()
batch = next(iter(train_loader))
batch

[[tensor([[ 1769,  5522,   211,  ...,     0,     0,     0],
          [   54,  4577,  4578,  ...,     0,     0,     0],
          [   77,  1503,   510,  ...,     0,     0,     0],
          ...,
          [  360,   361,   211,  ...,     0,     0,     0],
          [10616,  1840,   530,  ...,     0,     0,     0],
          [ 2226,   308,   160,  ...,     0,     0,     0]]),
  tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])],
 [tensor([[   62, 11326,  9274,  ...,     0,     0,     0],
          [  397,    29,    37,  ...,     0,     0,     0],
          [  805,   611,   403,  ...,     0,     0,     0],
          ...,
          [   59,    62,  2354,  ...,     0,     0,     0],
          [  117,  4419,   153,  ...,     0,     0,     0],
          [  882,  

## Models

In [20]:
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

from pytorch_lightning import Trainer

In [21]:
# Number of decoding steps: MultiDecoderModel.forward runs its decoder Ty times, so the
# generated sequences are as long as the padded training inputs.
Ty = max_train_len

In [22]:
class Encoder(nn.Module):
    """Embeds a batch of token indices and summarises each sequence with a GRU."""

    def __init__(self, vocab_size, embeding_size, hidden_dim, n_layers=1):
        super().__init__()
        self.embedding_size = embeding_size
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.embedding_size,
        )
        self.gru = nn.GRU(
            input_size=self.embedding_size,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
        )

    def forward(self, x):
        """Return the final hidden state of the last GRU layer for every sequence in ``x``."""
        # The per-step outputs are discarded; only the final hidden states are used.
        _, final_hidden = self.gru(self.embedding(x))
        # final_hidden stacks one state per layer; keep the top layer only.
        top_layer_state = final_hidden[-1]
        return top_layer_state
    

class Decoder(nn.Module):
    """Single-step GRU decoder that projects its hidden state to vocabulary logits.

    Args:
        vocab_size: size of the output vocabulary (number of logits per step).
        input_dim: feature size of the per-step input ``x``.
        hidden_dim: size of the GRU cell's hidden state.
        dropout_rate: dropout probability applied to the hidden state before the
            output projection (active in training mode only).
    """

    def __init__(self, vocab_size, input_dim, hidden_dim, dropout_rate=0.3):
        super().__init__()
        self.vocab_size = vocab_size
        self.gru_cell = nn.GRUCell(input_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, self.vocab_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, hx):
        """Advance the GRU cell by one step.

        Returns:
            (logit, hx): vocabulary logits for this step and the new hidden state.
        """
        hx = self.gru_cell(x, hx)
        # Dropout regularises the projection input only; the recurrent state handed to
        # the next step is returned untouched so dropout noise does not accumulate.
        logit = self.fc(self.dropout(hx))
        return logit, hx

In [23]:
class MultiDecoderModel(pl.LightningModule):
    """Shared encoder with one decoder per politeness class and a classifier on the context.

    Decoder 0 reconstructs impolite sentences and decoder 1 polite ones; both are fed
    the encoder's context vector. A small classifier predicts the politeness label
    from that same context vector.

    Args:
        criterion1: loss applied to the decoders' logits against the input token ids.
        criterion2: loss applied to the classifier's probabilities against the labels.
        learning_rate: learning rate passed to the Adadelta optimizer.
    """

    def __init__(self, criterion1, criterion2, learning_rate):
        super().__init__()
        self.embedding_dim = 64
        self.hidden_dim = 64
        self.vocab_size = vocab_size
        self.encoder = Encoder(self.vocab_size, self.embedding_dim, self.hidden_dim)
        self.decoders = nn.ModuleList([Decoder(self.vocab_size, self.hidden_dim, self.hidden_dim) for i in range(2)])

        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_dim, 32),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

        self.learning_rate = learning_rate
        self.decoder_criterion = criterion1
        self.cls_criterion = criterion2

    def forward(self, src, label_idx):
        """Encode ``src`` and unroll decoder ``label_idx`` for ``Ty`` steps.

        Returns:
            (prediction, context): logits of shape (batch, Ty, vocab_size) and the
            encoder's context vector.
        """
        context = self.encoder(src)
        decoder = self.decoders[label_idx]
        batch = src.shape[0]

        # The decoder starts from a random hidden state; allocate it (and the output
        # buffer) directly on the device the context lives on.
        decoder_h = torch.randn(batch, self.hidden_dim, device=context.device)
        prediction = torch.zeros((batch, Ty, self.vocab_size), device=context.device)

        # The same context vector is the decoder input at every step.
        for t in range(Ty):
            out, decoder_h = decoder(context, decoder_h)
            prediction[:, t] = out
        return prediction, context

    def _negative_entropy(self, y_pred):
        """Sum of p*log2(p) + (1-p)*log2(1-p) over the batch (minus the entropy in bits)."""
        # Keep probabilities strictly inside (0, 1): a saturated sigmoid would
        # otherwise give 0 * log2(0) = NaN and poison the total loss.
        eps = torch.finfo(y_pred.dtype).eps
        p = y_pred.clamp(min=eps, max=1 - eps)
        return torch.sum(p * torch.log2(p) + (1 - p) * torch.log2(1 - p))

    def _shared_step(self, batch):
        """Compute every loss term for one paired (impolite, polite) batch.

        Returns:
            (loss, gen_loss, avr_loss1, avr_loss2): total loss, summed reconstruction
            loss of both decoders, classifier loss and negative entropy term.
        """
        (impolite_inputs, impolite_labels), (polite_inputs, polite_labels) = batch

        prediction_0, context_0 = self(impolite_inputs, 0)
        prediction_1, context_1 = self(polite_inputs, 1)

        contexts = torch.concat([context_0, context_1])
        labels = torch.concat([impolite_labels, polite_labels])

        # Classification from the context vector. squeeze(-1) drops only the feature
        # axis, so a batch of size one keeps its batch dimension.
        y_pred = self.classifier(contexts).squeeze(-1)
        avr_loss1 = self.cls_criterion(y_pred, labels)
        avr_loss2 = self._negative_entropy(y_pred)

        # Reconstruction loss of each decoder against its own input tokens.
        gen0_loss = self.decoder_criterion(prediction_0.reshape(-1, self.vocab_size), impolite_inputs.reshape(-1))
        gen1_loss = self.decoder_criterion(prediction_1.reshape(-1, self.vocab_size), polite_inputs.reshape(-1))
        gen_loss = gen1_loss + gen0_loss

        loss = gen0_loss + gen1_loss + avr_loss1 + avr_loss2
        return loss, gen_loss, avr_loss1, avr_loss2

    def training_step(self, batch, batch_idx):
        loss, gen_loss, avr_loss1, avr_loss2 = self._shared_step(batch)

        self.log("training_loss", loss)
        self.log("training_gen_loss", gen_loss)
        self.log("training_adversarial_loss_1", avr_loss1)
        self.log("training_adversarial_loss_2", avr_loss2)
        return loss

    def validation_step(self, batch, batch_idx):
        with torch.no_grad():
            loss, gen_loss, avr_loss1, avr_loss2 = self._shared_step(batch)

        self.log("val_loss", loss)
        self.log("val_gen_loss", gen_loss)
        # Distinct keys (mirroring the training metrics) so the two terms do not
        # overwrite each other under a single name.
        self.log("val_adversarial_loss_1", avr_loss1)
        self.log("val_adversarial_loss_2", avr_loss2)
        return loss

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Decode ``inputs`` with decoder 0, print the decoded text and return the token ids."""
        inputs, labels = batch
        with torch.no_grad():
            logits, _ = self(inputs, 0)
            # argmax over the logits picks the same token as argmax over softmax probabilities.
            prediction = torch.argmax(logits, dim=-1)
            for pred in prediction:
                # lookup_tokens is given a plain list of ints.
                print("".join(v1.lookup_tokens(pred.tolist())))
        return prediction

    def configure_optimizers(self):
        return optim.Adadelta(self.parameters(), lr=self.learning_rate)

In [24]:
# NOTE(review): this value is handed to optim.Adadelta in configure_optimizers. PyTorch's
# Adadelta default is lr=1.0, so 3e-4 scales every update down considerably — confirm intended.
lr = 3e-4
# Reconstruction loss over token logits, summed (not averaged) over all positions.
criterion1 = nn.CrossEntropyLoss(reduction='sum')
# Binary cross-entropy on the classifier's probabilities, likewise summed over the batch.
criterion2 = nn.BCELoss(reduction='sum')

model = MultiDecoderModel(criterion1, criterion2, lr)

In [25]:
import wandb
from pytorch_lightning.loggers import WandbLogger
# Logger handed to the Trainer below; runs are recorded under the "final_project" W&B project.
wandb_logger = WandbLogger(project="final_project")

[34m[1mwandb[0m: Currently logged in as: [33msahatsarin07[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [26]:
# Checkpoint callback that tracks the validation reconstruction loss and saves to 'best'.
best_checkpoint = pl.callbacks.ModelCheckpoint(filename='best', monitor='val_gen_loss')

trainer = Trainer(
    default_root_dir='checkpoints/multi_decoder/',
    max_epochs=15,
    devices=1,
    logger=wandb_logger,
    callbacks=[best_checkpoint],
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Train the model with the train/validation loaders supplied by data_module.
trainer.fit(model, data_module)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type             | Params
-------------------------------------------------------
0 | encoder           | Encoder          | 1.3 M 
1 | decoders          | ModuleList       | 2.7 M 
2 | classifier        | Sequential       | 2.1 K 
3 | decoder_criterion | CrossEntropyLoss | 0     
4 | cls_criterion     | BCELoss          | 0     
-------------------------------------------------------
4.1 M     Trainable params
0         Non-trainable params
4.1 M     Total params
16.285    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

In [None]:
# Mark the current W&B run as finished once training is done.
wandb.finish()

## Test Model

In [None]:
# Raw Thai sentences fed to the trained model in the prediction cells below.
EXAMPLES = ['แอคกูปะ','ตังออกวันไหน','ทำไมอินนิสฟรีที่สั่งในจมก.ยังไม่เริ่มจัดส่งอีกกกกก นานแล้วนะว้อยยย','นี่ถ้าเป็นนู๋เตรียมบัตรแล้วน่ะเนี้ย','ได้น้องแล้วค่ะ ตัวแน่นมากกกกก😣💓💓']

In [None]:
# Tokenize the examples with the same word tokenizer that produced the vocabulary.
# Splitting a sentence into single characters does not match the word-level tokens
# that `v1` was built from.
predict_data = []
for line in EXAMPLES:
    tokens = word_tokenize(line)
    # dtype is explicit so an example with no tokens still yields an integer tensor.
    predict_data.append(torch.tensor(v1(tokens), dtype=torch.long))

print(len(predict_data))


# Right-pad with 0 to the longest example; labels are placeholders and are not used
# by predict_step.
predict_data = nn.utils.rnn.pad_sequence(predict_data, batch_first = True)
predict_dataset = PolitenessDataset(predict_data, torch.zeros(len(predict_data)))
predict_loader = DataLoader(predict_dataset,
                            batch_size = 1,
                            shuffle = False,
                            num_workers = 0)

In [None]:
# Put the model in evaluation mode before predicting (affects layers such as nn.Dropout).
model.eval()

In [None]:
# Runs predict_step for every example: each call prints the decoded text and returns
# the argmax token ids produced by decoder 0.
output = trainer.predict(model, predict_loader)