In [3]:
# !pip install pytorch-lightning wandb

In [4]:
# !pip install pythainlp

In [1]:
import pickle
import numpy as np
import pandas as pd

import torch
import pytorch_lightning as pl

In [2]:
data_path = 'data/'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# !wandb login

In [4]:
# from pythainlp.corpus import download
# download('ltw2v_v1.0_5_window')

## Load data

In [3]:
train = pd.read_csv(data_path + 'train.csv')
val = pd.read_csv(data_path + 'val.csv')

In [4]:
train.shape, val.shape

((9507, 2), (3169, 2))

In [5]:
# remove sentences that longer than 100 characters
train_mask = train.text.str.len() <= 120
filter_train = train[train_mask]

val_mask = val.text.str.len() <= 120
filter_val = val[val_mask]


In [6]:
filter_train.shape, filter_val.shape

((5091, 2), (1738, 2))

### Word Vectorization

In [7]:
from pythainlp import word_tokenize
from torchtext import vocab
from collections import Counter

In [8]:
train_tokens = filter_train.text.apply(word_tokenize)
val_tokens = filter_val.text.apply(word_tokenize)

train_labels = filter_train.label.to_numpy()
val_labels = filter_val.label.to_numpy()

In [9]:
counter = Counter(train_tokens.sum())
unk_token = '<unk>'
default_index = -1
v1 = vocab.vocab(counter, specials=["</s>", "<unk>",])
v1.set_default_index(1)
print(v1["<unk>"]) #prints 0
print(v1['out of vocab']) #prints 0

1
1


In [10]:
vocab_size = len(v1)
print(f"vocal size is {vocab_size}")
text_pipeline = lambda sent: v1.lookup_indices(sent) # add </s> to the end of each sentence
text_decoding = lambda encoded: "".join(v1.lookup_tokens(encoded))

vocal size is 7909


In [11]:
encoded_sent = text_pipeline(train_tokens[2])
print("original:", "".join(train_tokens[2]))
print("encoded:", encoded_sent)
print("decoded:", text_decoding(encoded_sent))

original: ก็เพราะมันจะมาทำลายธุรกิจถูกกฎหมายของประเทศที่ทำงินมหาศาลในแต่ละปีไงล่ะจึงต้องตัดไฟเสียแต่ต้นลม
encoded: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]
decoded: ก็เพราะมันจะมาทำลายธุรกิจถูกกฎหมายของประเทศที่ทำงินมหาศาลในแต่ละปีไงล่ะจึงต้องตัดไฟเสียแต่ต้นลม


In [12]:
from pythainlp.corpus import get_corpus_path
model_path = get_corpus_path('ltw2v_v1.0_5_window')

In [13]:
from gensim.models import KeyedVectors

model5 = KeyedVectors.load_word2vec_format(model_path, binary=True, unicode_errors='ignore')

In [14]:
model5.vector_size

400

In [15]:
weights_matrix = np.zeros((vocab_size, 400))
words_found = 0

oov = []
for i, word in enumerate(v1.get_itos()):
    try: 
        weights_matrix[i] = model5[word]
        words_found += 1
    except KeyError:
        oov.append(i)
print("word_found =",words_found)
mean_vector = np.mean(weights_matrix, axis=0)
weights_matrix[oov] = mean_vector

word_found = 6772


In [16]:
weights_matrix = torch.FloatTensor(weights_matrix)

### Prepare Dataloader

In [17]:
max_train_len = len(max(train_tokens, key=len))
max_val_len = len(max(val_tokens, key=len))

In [18]:
print("Max train length:", max_train_len)
print("Max val length:", max_val_len)

Max train length: 49
Max val length: 46


In [19]:
from torch.nn.utils.rnn import pad_sequence

x_train = [torch.LongTensor(sentence) for sentence in train_tokens.apply(text_pipeline)]
x_val = [torch.LongTensor(sentence) for sentence in val_tokens.apply(text_pipeline)] 

x_train = pad_sequence(x_train, batch_first=True)
x_val = pad_sequence(x_val, batch_first=True)

# Pad the sequence length of x_test to be maxlen 
remaining_len = x_train.size(1) - x_val.size(1)
remaining_mat = torch.zeros((x_val.size(0), remaining_len), dtype=torch.long) 
x_val = torch.cat((x_val, remaining_mat), dim=1) 

In [23]:
impolite_train = x_train[train_labels == 0]
polite_train = x_train[train_labels == 1]

impolite_val = x_val[val_labels == 0]
polite_val = x_val[val_labels == 1]

In [24]:
from torch.utils.data import Dataset, DataLoader

class PolitenessDataset(Dataset):
    def __init__(self, data, labels):
        self.encoded = data
        self.labels = labels

    def __len__(self):
        return len(self.encoded)

    def __getitem__(self, idx):
        return self.encoded[idx], self.labels[idx]

class ConcatDataset(torch.utils.data.Dataset):
    def __init__(self, *datasets):
        self.datasets = datasets

    def __getitem__(self, i):
        return tuple(d[i] for d in self.datasets)

    def __len__(self):
        return min(len(d) for d in self.datasets)

In [25]:
from torch.nn.utils.rnn import pad_sequence

class PolitenessDataModule(pl.LightningDataModule):
  def __init__(self, batch_size, num_workers=0):
      super().__init__()
      self.batch_size = batch_size
      self.num_workers = num_workers

  def setup(self, stage: str):
      pass

  def train_dataloader(self):
      train_loader = DataLoader(ConcatDataset(
                          PolitenessDataset(impolite_train, torch.zeros(impolite_train.shape[0])),
                          PolitenessDataset(polite_train, torch.ones(polite_train.shape[0]))
                      ),
                      batch_size = self.batch_size, 
                      shuffle = True, 
                      num_workers = self.num_workers)

      return train_loader
  
  def val_dataloader(self):
      val_loader = DataLoader(ConcatDataset(
                          PolitenessDataset(impolite_val, torch.zeros(impolite_val.shape[0])),
                          PolitenessDataset(polite_val, torch.ones(impolite_val.shape[0]))
                      ),
                      batch_size = self.batch_size, 
                      shuffle = False, 
                      num_workers = self.num_workers)
      return val_loader


In [26]:
batch_size = 64
data_module = PolitenessDataModule(batch_size=batch_size)

In [26]:
# batch = next(iter(data_module.train_dataloader()))
# batch

## Models

In [27]:
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

from pytorch_lightning import Trainer

In [28]:
Ty = max_train_len

In [29]:
def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

In [30]:
class Encoder(nn.Module):
    def __init__(self, weights_matrix, hidden_dim, n_layers=1):
        super().__init__()
        embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix)
        self.vocab_size = num_embeddings
        self.embedding_size = embedding_dim
        
        self.embedding = embedding
        self.lstm = nn.LSTM(self.embedding_size, hidden_dim, n_layers, batch_first=True)
       
    def forward(self, x):
        embedded_x = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded_x)
        return hidden[-1]
    

class Decoder(nn.Module):
    def __init__(self, vocab_size, input_dim, hidden_dim, dropout_rate=0.3):
        super().__init__()
        self.vocab_size = vocab_size
        self.lstm_cell = nn.LSTMCell(input_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, self.vocab_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, hx, cx):
        hx, cx = self.lstm_cell(x, (hx, cx))
        logit = self.fc(hx)
        return logit, hx, cx

In [31]:
class MultiDecoderModel(pl.LightningModule):
    def __init__(self, weights_matrix, criterion1, criterion2, learning_rate):
        super().__init__()
        self.vocab_size = weights_matrix.shape[0]
        self.embedding_dim = weights_matrix.shape[1]
        
        self.hidden_dim = 64
        self.encoder = Encoder(weights_matrix, self.hidden_dim)
        self.decoders = nn.ModuleList([Decoder(self.vocab_size, self.hidden_dim, self.hidden_dim) for i in range(2)])

        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_dim, 32),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

        self.learning_rate = learning_rate
        self.decoder_criterion = criterion1
        self.cls_criterion = criterion2

    def forward(self, src, label_idx):
        context = self.encoder(src)
        decoder_h = torch.randn(src.shape[0], self.hidden_dim).to(self.decoders[label_idx].lstm_cell.weight_ih.device)
        decoder_c = torch.randn(src.shape[0], self.hidden_dim).to(self.decoders[label_idx].lstm_cell.weight_ih.device)

        prediction = torch.zeros((src.shape[0], Ty, self.vocab_size)).to(self.decoders[label_idx].lstm_cell.weight_ih.device)
        
        # Iterate until max_output_length
        for t in range(Ty):
            out, decoder_h, decoder_c = self.decoders[label_idx](context, decoder_h, decoder_c)

            prediction[:, t] = out
        return prediction, context
    
    def training_step(self, batch, batch_idx):
        impolite_inputs, impolite_labels = batch[0]
        polite_inputs, polite_labels = batch[1]
        
        # create context and predction for each style
        prediction_0, context_0 = self(impolite_inputs, 0)
        prediction_1, context_1 = self(polite_inputs, 1)

        contexts = torch.concat([context_0, context_1])
        labels = torch.concat([impolite_labels, polite_labels])

        # classification from context vector
        y_pred = self.classifier(contexts)
        y_pred = y_pred.squeeze()
        adv_loss1 = self.cls_criterion(y_pred, labels)
        adv_loss2 = torch.mean(y_pred * torch.log2(y_pred) + (1 - y_pred) * torch.log2(1 - y_pred)) # negative of entropy

        # calculate loss of each decoder networks
        gen0_loss = self.decoder_criterion(prediction_0.reshape(-1, vocab_size), impolite_inputs.reshape(-1)) # match impolite prediction to impolite inputs
        gen1_loss = self.decoder_criterion(prediction_1.reshape(-1, vocab_size), polite_inputs.reshape(-1)) # match polite prediction to polite inputs
    
        # total lostt
        loss = gen0_loss + gen1_loss + adv_loss1 + adv_loss2

        self.log("training_loss", loss)
        self.log("training_gen_loss", gen1_loss + gen0_loss)
        self.log("training_adversarial_loss_1", adv_loss1)
        self.log("training_adversarial_loss_2", adv_loss2)
        return loss

    def validation_step(self, batch, batch_idx):
        impolite_inputs, impolite_labels = batch[0]
        polite_inputs, polite_labels = batch[1]

        with torch.no_grad():
            prediction_0, context_0 = self(impolite_inputs, 0)
            prediction_1, context_1 = self(polite_inputs, 1)

            contexts = torch.concat([context_0, context_1])
            labels = torch.concat([impolite_labels, polite_labels])

            # classification from context vector
            y_pred = self.classifier(contexts)
            y_pred = y_pred.squeeze()
            adv_loss1 = self.cls_criterion(y_pred, labels)
            adv_loss2 = torch.mean(y_pred * torch.log2(y_pred) + (1 - y_pred) * torch.log2(1 - y_pred)) # negative of entropy

            # calculate loss of each decoder networks
            gen0_loss = self.decoder_criterion(prediction_0.reshape(-1, vocab_size), impolite_inputs.reshape(-1))
            gen1_loss = self.decoder_criterion(prediction_1.reshape(-1, vocab_size), polite_inputs.reshape(-1))
        
            # total lostt
            loss = gen0_loss + gen1_loss + adv_loss1 + adv_loss2

        self.log("val_loss", loss)
        self.log("val_gen_loss", gen1_loss + gen0_loss)
        self.log("val_adversarial_loss", adv_loss1)
        self.log("val_adversarial_loss", adv_loss2)
        return loss
    
    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        inputs, labels = batch
        with torch.no_grad():
          prediction, _ = self(inputs, 1)
          prediction = F.softmax(prediction, dim=-1)
          prediction = torch.argmax(prediction, dim=-1)
          for pred in prediction:
            print("".join(v1.lookup_tokens(pred.cpu().numpy())))
        return prediction
    
    def configure_optimizers(self):
        return optim.Adadelta(self.parameters(), lr=self.learning_rate)

In [28]:
lr = 1e-2
criterion1 = nn.CrossEntropyLoss()
criterion2 = nn.BCELoss()

model = MultiDecoderModel(weights_matrix, criterion1, criterion2, lr)

NameError: name 'weights_matrix' is not defined

In [33]:
import wandb
from pytorch_lightning.loggers import WandbLogger
wandb_logger = WandbLogger(project="final_project")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msahatsarin07[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

In [34]:
trainer = Trainer(
    default_root_dir='checkpoints/multi_decoder/round_2/',
    max_epochs=50,
    devices=1,
    logger=wandb_logger,
    callbacks=[pl.callbacks.ModelCheckpoint(filename='mymodel-epoch{epoch:02d}', dirpath='checkpoints/multi_decoder/round_2/', monitor='val_loss')]
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [35]:
trainer.fit(model, data_module)

You are using a CUDA device ('NVIDIA GeForce RTX 3060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type             | Params
-------------------------------------------------------
0 | encoder           | Encoder          | 3.3 M 
1 | decoders          | ModuleList       | 1.1 M 
2 | classifier        | Sequential       | 2.1 K 
3 | decoder_criterion | CrossEntropyLoss | 0     
4 | cls_criterion     | BCELoss          | 0     
-------------------------------------------------------
4.4 M     Trainable params
0         Non-trainable params
4.4 M     Total params
17.519    To

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=50` reached.


In [None]:
with open('checkpoints/multi_decoder/round_2/' + '\\vocab.pkl', 'wb') as f:
    pickle.dump(v1, f)

In [None]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▇▇▇▇███
trainer/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇███
training_adversarial_loss_1,█▂▁▁▇▂
training_adversarial_loss_2,█▂▁▂▂▃
training_gen_loss,█▇▆▅▃▁
training_loss,█▇▆▅▃▁
val_adversarial_loss,█▆▅▃▂▂▁▁▁▁▁▁▁▁▂▂▂▂▂▂
val_gen_loss,███▇▇▇▇▆▆▆▆▅▅▄▄▄▃▂▂▁
val_loss,███▇▇▇▇▆▆▆▆▅▅▄▄▄▃▂▂▁

0,1
epoch,19.0
trainer/global_step,319.0
training_adversarial_loss_1,0.69316
training_adversarial_loss_2,-0.99998
training_gen_loss,17.47872
training_loss,17.1719
val_adversarial_loss,-0.15342
val_gen_loss,17.39444
val_loss,17.0876


# Test Model

In [51]:
vocab_path = 'checkpoints/multi_decoder/round_2/vocab.pkl'
target_vocab = pickle.load(open(vocab_path, "rb"))

In [52]:
vocab_size = len(target_vocab)
vocab_size

7909

In [53]:
EXAMPLES = ['แอคกูปะ','ตังออกวันไหน','ทำไมอินนิสฟรีที่สั่งในจมก.ยังไม่เริ่มจัดส่งอีกกกกก นานแล้วนะว้อยยย','นี่ถ้าเป็นนู๋เตรียมบัตรแล้วน่ะเนี้ย','ได้น้องแล้วค่ะ ตัวแน่นมากกกกก😣💓💓']

In [54]:
predict_data = []
for line in EXAMPLES:
    line = [l for l in line] #change from string to list
    predict_data.append(torch.tensor(target_vocab(line)))

print(len(predict_data))


predict_data = nn.utils.rnn.pad_sequence(predict_data, batch_first = True)
predict_dataset = PolitenessDataset(predict_data, torch.zeros(len(predict_data)))
predict_loader = DataLoader(predict_dataset,
                            batch_size = 1,
                            shuffle = False,
                            num_workers = 0)

5


In [55]:
checkpoint = torch.load('checkpoints\multi_decoder\\round_2\mymodel-epochepoch=19.ckpt')
print(checkpoint.keys())

dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers'])


In [56]:
lr = 5e-3
criterion1 = nn.CrossEntropyLoss()
criterion2 = nn.BCELoss()

test_model = MultiDecoderModel(weights_matrix, criterion1, criterion2, lr)
test_model.load_state_dict(checkpoint['state_dict'])
test_model.eval()

MultiDecoderModel(
  (encoder): Encoder(
    (embedding): Embedding(7909, 400)
    (lstm): LSTM(400, 64, batch_first=True)
  )
  (decoders): ModuleList(
    (0): Decoder(
      (lstm_cell): LSTMCell(64, 64)
      (fc): Linear(in_features=64, out_features=7909, bias=True)
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (1): Decoder(
      (lstm_cell): LSTMCell(64, 64)
      (fc): Linear(in_features=64, out_features=7909, bias=True)
      (dropout): Dropout(p=0.3, inplace=False)
    )
  )
  (classifier): Sequential(
    (0): Linear(in_features=64, out_features=32, bias=True)
    (1): Linear(in_features=32, out_features=1, bias=True)
    (2): Sigmoid()
  )
  (decoder_criterion): CrossEntropyLoss()
  (cls_criterion): BCELoss()
)

In [57]:
trainer = Trainer(
    devices=1,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [58]:
output = trainer.predict(test_model, predict_loader)

You are using a CUDA device ('NVIDIA GeForce RTX 3060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

ชุมชนMazda</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>
การรัดย่างเฟร</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>
villeี้เจ้NCAPNCAPฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะ
KULOVKULOVแฮปปี้แฮปปี้</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>
นึงงิ้วงิ้ว</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>


In [59]:
for i in range(len(EXAMPLES)):
    original = EXAMPLES[i]
    pred = output[i][output[i] > 0]
    output_sent = "".join(target_vocab.lookup_tokens(pred.cpu().numpy()))
    print(original, ":", output_sent)

แอคกูปะ : ชุมชนMazda
ตังออกวันไหน : การรัดย่างเฟร
ทำไมอินนิสฟรีที่สั่งในจมก.ยังไม่เริ่มจัดส่งอีกกกกก นานแล้วนะว้อยยย : villeี้เจ้NCAPNCAPฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะฮะ
นี่ถ้าเป็นนู๋เตรียมบัตรแล้วน่ะเนี้ย : KULOVKULOVแฮปปี้แฮปปี้
ได้น้องแล้วค่ะ ตัวแน่นมากกกกก😣💓💓 : นึงงิ้วงิ้ว


In [60]:
test = pd.read_csv(data_path + 'test.csv')

impolite_test = test[test.label == 0].text.tolist()

test_data = []
for line in impolite_test:
    line = [l for l in line] #change from string to list
    test_data.append(torch.tensor(target_vocab(line)))

print(len(test_data))


test_data = nn.utils.rnn.pad_sequence(test_data, batch_first = True)
test_dataset = PolitenessDataset(test_data, torch.zeros(len(test_data)))
test_loader = DataLoader(test_dataset,
                            batch_size = 1,
                            shuffle = False,
                            num_workers = 0)

1585


In [None]:
output = trainer.predict(test_model, test_loader)

In [62]:
len(output)

1585

In [63]:
predict_sents = []
for i in range(len(output)):
    pred = output[i][output[i] > 0]
    pred_sent = "".join(target_vocab.lookup_tokens(pred.cpu().numpy()))
    predict_sents.append(pred_sent)

In [64]:
sentence_pair = pd.DataFrame(
    {'impolite': impolite_test,
     'polite': predict_sents}
)

In [65]:
sentence_pair.to_csv("result.csv", index=False)