SKENARIO 1

LOAD DATA

In [None]:
import pandas as pd

df = pd.read_csv("/content/Skenario 1 (errant).csv")
df = df.dropna()
df = df[['input', 'target']]


Preprocessing

In [None]:
from sklearn.model_selection import train_test_split
import sentencepiece as spm
import re

# Preprocessing fungsi
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Menghilangkan tanda baca
    text = re.sub(r'\d+', '', text)  # Menghilangkan angka
    text = re.sub(r'\s+', ' ', text)  # Menghilangkan spasi berlebih
    return text.strip()

# Kita terapkan preprocessing
df['input'] = df['input'].astype(str).apply(clean_text)
df['target'] = df['target'].astype(str).apply(clean_text)

df['input'].to_csv("src.txt", index=False, header=False)
df['target'].to_csv("tgt.txt", index=False, header=False)

# Latih tokenizer SentencePiece
spm.SentencePieceTrainer.train(
    input='src.txt,tgt.txt',
    model_prefix='tokenizer',
    vocab_size=1603,
    pad_id=0,
    bos_id=1,
    eos_id=2,
    unk_id=3
)

# Load tokenizer
sp = spm.SentencePieceProcessor(model_file='tokenizer.model')


Mempersiapkan data teks (input dan target) dalam format tensor numerik agar bisa digunakan oleh model Transformer (atau Seq2Seq lainnya).

In [None]:
import torch
from torch.utils.data import Dataset

class GrammarDataset(Dataset):
    def __init__(self, src_texts, tgt_texts, tokenizer, max_len=64):
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        src_ids = [1] + self.tokenizer.encode(self.src_texts[idx]) + [2]
        tgt_ids = [1] + self.tokenizer.encode(self.tgt_texts[idx]) + [2]

        src_ids = src_ids[:self.max_len] + [0] * (self.max_len - len(src_ids))
        tgt_ids = tgt_ids[:self.max_len] + [0] * (self.max_len - len(tgt_ids))

        return {
            'src_ids': torch.tensor(src_ids),
            'trg_ids': torch.tensor(tgt_ids)
        }

# Split data
train_src, val_src, train_tgt, val_tgt = train_test_split(df['input'], df['target'], test_size=0.1)

# Dataset
train_data = GrammarDataset(train_src.tolist(), train_tgt.tolist(), sp)
val_data = GrammarDataset(val_src.tolist(), val_tgt.tolist(), sp)

train_loader = torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=32)

Arsitektur model Transformer berbasis Seq2Seq

In [None]:
import torch.nn as nn

class Seq2SeqTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, num_heads=8, num_layers=3, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.transformer = nn.Transformer(d_model=embed_dim, nhead=num_heads, num_encoder_layers=num_layers,
                                          num_decoder_layers=num_layers, dropout=dropout)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, src, tgt):
        src_mask = self.transformer.generate_square_subsequent_mask(src.size(1)).to(src.device)
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)

        src_emb = self.embedding(src)
        tgt_emb = self.embedding(tgt)

        out = self.transformer(src_emb.permute(1, 0, 2), tgt_emb.permute(1, 0, 2),
                               src_mask=src_mask, tgt_mask=tgt_mask)
        return self.fc(out.permute(1, 0, 2))  # (batch, seq, vocab)

In [None]:
import torch.optim as optim

model = Seq2SeqTransformer(vocab_size=2059).to('cuda')
criterion = nn.CrossEntropyLoss(ignore_index=0)
# Use optim.Adam instead of torch.optim.Adam for clarity and consistency, though torch.optim works
optimizer = optim.Adam(model.parameters(), lr=0.0001)

def train_epoch(model, loader):
    model.train()
    total_loss = 0
    for batch in loader:
        src = batch['src_ids'].to('cuda')
        tgt = batch['trg_ids'].to('cuda')

        optimizer.zero_grad()
        output = model(src, tgt[:, :-1])
        loss = criterion(output.reshape(-1, 2059), tgt[:, 1:].reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate_loss(model, loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in loader:
            src = batch['src_ids'].to('cuda')
            tgt = batch['trg_ids'].to('cuda')
            output = model(src, tgt[:, :-1])
            loss = criterion(output.reshape(-1, 2059), tgt[:, 1:].reshape(-1))
            total_loss += loss.item()
    return total_loss / len(loader)

for epoch in range(25):
    print(f"\nEpoch {epoch+1}")
    train_loss = train_epoch(model, train_loader)
    val_loss = evaluate_loss(model, val_loader)
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

def correct(model, sentence):
    model.eval()
    with torch.no_grad():
        src_ids = [1] + sp.encode(sentence) + [2]
        src_ids = src_ids[:64] + [0]*(64 - len(src_ids))
        src_tensor = torch.tensor([src_ids]).to('cuda')

        tgt_ids = [1]
        for _ in range(64):
            tgt_tensor = torch.tensor([tgt_ids]).to('cuda')
            output = model(src_tensor, tgt_tensor)
            next_token = output[0, -1].argmax(-1).item()
            if next_token == 2 or len(tgt_ids) >= 64:
                break
            tgt_ids.append(next_token)

        return sp.decode(tgt_ids[1:])




Epoch 1
Train Loss: 6.6164 | Val Loss: 5.9745

Epoch 2
Train Loss: 5.6819 | Val Loss: 5.3361

Epoch 3
Train Loss: 5.1113 | Val Loss: 4.9198

Epoch 4
Train Loss: 4.6587 | Val Loss: 4.5637

Epoch 5
Train Loss: 4.2716 | Val Loss: 4.2821

Epoch 6
Train Loss: 3.9345 | Val Loss: 4.0369

Epoch 7
Train Loss: 3.6537 | Val Loss: 3.8417

Epoch 8
Train Loss: 3.3762 | Val Loss: 3.6578

Epoch 9
Train Loss: 3.1627 | Val Loss: 3.4903

Epoch 10
Train Loss: 2.9441 | Val Loss: 3.3650

Epoch 11
Train Loss: 2.7715 | Val Loss: 3.2314

Epoch 12
Train Loss: 2.5683 | Val Loss: 3.1249

Epoch 13
Train Loss: 2.4134 | Val Loss: 3.0063

Epoch 14
Train Loss: 2.2504 | Val Loss: 2.8917

Epoch 15
Train Loss: 2.0988 | Val Loss: 2.7944

Epoch 16
Train Loss: 1.9556 | Val Loss: 2.7073

Epoch 17
Train Loss: 1.8238 | Val Loss: 2.6414

Epoch 18
Train Loss: 1.6991 | Val Loss: 2.5498

Epoch 19
Train Loss: 1.5809 | Val Loss: 2.5049

Epoch 20
Train Loss: 1.4634 | Val Loss: 2.4447

Epoch 21
Train Loss: 1.3397 | Val Loss: 2.3945



In [None]:
def correct(model, sentence):
    model.eval()
    with torch.no_grad():
        src_ids = [1] + sp.encode(sentence) + [2]
        src_ids = src_ids[:64] + [0]*(64 - len(src_ids))
        src_tensor = torch.tensor([src_ids]).to('cuda')

        tgt_ids = [1]
        for _ in range(64):
            tgt_tensor = torch.tensor([tgt_ids]).to('cuda')
            output = model(src_tensor, tgt_tensor)
            next_token = output[0, -1].argmax(-1).item()
            if next_token == 2 or len(tgt_ids) >= 64:
                break
            tgt_ids.append(next_token)

        return sp.decode(tgt_ids[1:])


In [None]:
!pip install rouge_score
import nltk
# Remove the incorrect exception handling for DownloadError
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    print("wordnet not found. Attempting to download wordnet...")
    nltk.download('wordnet')

from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def evaluate_metrics(model, data, tokenizer, max_len=64):
    bleu_scores, meteor_scores, em_scores = [], [], []
    r1, r2, rl = [], [], []

    for idx in range(len(data)):
        # Get sample from dataset
        batch = data[idx]

        # Dekode dan bersihkan kalimat target dan kalimat yang diprediksi
        # Dengan asumsi item batch sudah berupa tensor, ubah ke daftar sebelum mendekode
        src = tokenizer.decode(batch['src_ids'].tolist()).replace('<pad>', '').strip()
        tgt = tokenizer.decode(batch['trg_ids'].tolist()).replace('<pad>', '').strip()

        pred = correct(model, src).strip()

        # Tokenize for metrics that require it
        tgt_tokens = tgt.split()
        pred_tokens = pred.split()
        bleu_scores.append(sentence_bleu([tgt_tokens], pred_tokens))
        meteor_scores.append(meteor_score([tgt_tokens], pred_tokens))
        em_scores.append(int(tgt == pred))

        scores = scorer.score(tgt, pred)
        r1.append(scores['rouge1'].fmeasure)
        r2.append(scores['rouge2'].fmeasure)
        rl.append(scores['rougeL'].fmeasure)

    # Handle potential division by zero if data is empty
    num_samples = len(bleu_scores)
    if num_samples == 0:
        return {
            "BLEU": 0.0,
            "METEOR": 0.0,
            "ROUGE-1": 0.0,
            "ROUGE-2": 0.0,
            "ROUGE-L": 0.0,
            "Exact Match": 0.0,
        }

    return {
        "BLEU": sum(bleu_scores)/num_samples,
        "METEOR": sum(meteor_scores)/num_samples,
        "ROUGE-1": sum(r1)/num_samples,
        "ROUGE-2": sum(r2)/num_samples,
        "ROUGE-L": sum(rl)/num_samples,
        "Exact Match": sum(em_scores)/num_samples,
    }

metrics = evaluate_metrics(model, val_data, sp)
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=ce0a9baae81f436a41aa68932733f3a84a04c6a620076083702f9444bcb70659
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
wordnet not found. Attempting to download wordnet...


[nltk_data] Downloading package wordnet to /root/nltk_data...
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-

BLEU: 0.3673
METEOR: 0.5490
ROUGE-1: 0.6183
ROUGE-2: 0.4652
ROUGE-L: 0.5967
Exact Match: 0.2012


SKENARIO 2

LOAD DATA

In [None]:
import pandas as pd

df = pd.read_csv("/content/Skenario 2 (shadow labeling & augmentasi).csv")
df = df.dropna()
df = df[['input', 'target']]


Preprocessing

In [None]:
from sklearn.model_selection import train_test_split
import sentencepiece as spm
import re

# Preprocessing fungsi
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Menghilangkan tanda baca
    text = re.sub(r'\d+', '', text)  # Menghilangkan angka
    text = re.sub(r'\s+', ' ', text)  # Menghilangkan spasi berlebih
    return text.strip()

# Kita terapkan preprocessing
df['input'] = df['input'].astype(str).apply(clean_text)
df['target'] = df['target'].astype(str).apply(clean_text)

df['input'].to_csv("src.txt", index=False, header=False)
df['target'].to_csv("tgt.txt", index=False, header=False)

# Latih tokenizer SentencePiece
spm.SentencePieceTrainer.train(
    input='src.txt,tgt.txt',
    model_prefix='tokenizer',
    vocab_size=1603,
    pad_id=0,
    bos_id=1,
    eos_id=2,
    unk_id=3
)

# Load tokenizer
sp = spm.SentencePieceProcessor(model_file='tokenizer.model')


Mempersiapkan data teks (input dan target) dalam format tensor numerik agar bisa digunakan oleh model Transformer (atau Seq2Seq lainnya).

In [None]:
import torch
from torch.utils.data import Dataset

class GrammarDataset(Dataset):
    def __init__(self, src_texts, tgt_texts, tokenizer, max_len=64):
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        src_ids = [1] + self.tokenizer.encode(self.src_texts[idx]) + [2]
        tgt_ids = [1] + self.tokenizer.encode(self.tgt_texts[idx]) + [2]

        src_ids = src_ids[:self.max_len] + [0] * (self.max_len - len(src_ids))
        tgt_ids = tgt_ids[:self.max_len] + [0] * (self.max_len - len(tgt_ids))

        return {
            'src_ids': torch.tensor(src_ids),
            'trg_ids': torch.tensor(tgt_ids)
        }

# Split data
train_src, val_src, train_tgt, val_tgt = train_test_split(df['input'], df['target'], test_size=0.1)

# Dataset
train_data = GrammarDataset(train_src.tolist(), train_tgt.tolist(), sp)
val_data = GrammarDataset(val_src.tolist(), val_tgt.tolist(), sp)

train_loader = torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=32)

Arsitektur model Transformer berbasis Seq2Seq

In [None]:
import torch.nn as nn

class Seq2SeqTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, num_heads=8, num_layers=3, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.transformer = nn.Transformer(d_model=embed_dim, nhead=num_heads, num_encoder_layers=num_layers,
                                          num_decoder_layers=num_layers, dropout=dropout)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, src, tgt):
        src_mask = self.transformer.generate_square_subsequent_mask(src.size(1)).to(src.device)
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)

        src_emb = self.embedding(src)
        tgt_emb = self.embedding(tgt)

        out = self.transformer(src_emb.permute(1, 0, 2), tgt_emb.permute(1, 0, 2),
                               src_mask=src_mask, tgt_mask=tgt_mask)
        return self.fc(out.permute(1, 0, 2))  # (batch, seq, vocab)


In [None]:
import torch.optim as optim # Import optim explicitly

model = Seq2SeqTransformer(vocab_size=2059).to('cuda')
criterion = nn.CrossEntropyLoss(ignore_index=0)
# Use optim.Adam instead of torch.optim.Adam for clarity and consistency, though torch.optim works
optimizer = optim.Adam(model.parameters(), lr=0.0001)

def train_epoch(model, loader):
    model.train()
    total_loss = 0
    for batch in loader:
        src = batch['src_ids'].to('cuda')
        tgt = batch['trg_ids'].to('cuda')

        optimizer.zero_grad()
        output = model(src, tgt[:, :-1])
        loss = criterion(output.reshape(-1, 2059), tgt[:, 1:].reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate_loss(model, loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in loader:
            src = batch['src_ids'].to('cuda')
            tgt = batch['trg_ids'].to('cuda')
            output = model(src, tgt[:, :-1])
            loss = criterion(output.reshape(-1, 2059), tgt[:, 1:].reshape(-1))
            total_loss += loss.item()
    return total_loss / len(loader)

for epoch in range(25):
    print(f"\nEpoch {epoch+1}")
    train_loss = train_epoch(model, train_loader)
    val_loss = evaluate_loss(model, val_loader)
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

def correct(model, sentence):
    model.eval()
    with torch.no_grad():
        src_ids = [1] + sp.encode(sentence) + [2]
        src_ids = src_ids[:64] + [0]*(64 - len(src_ids))
        src_tensor = torch.tensor([src_ids]).to('cuda')

        tgt_ids = [1]
        for _ in range(64):
            tgt_tensor = torch.tensor([tgt_ids]).to('cuda')
            output = model(src_tensor, tgt_tensor)
            next_token = output[0, -1].argmax(-1).item()
            if next_token == 2 or len(tgt_ids) >= 64:
                break
            tgt_ids.append(next_token)

        return sp.decode(tgt_ids[1:])




Epoch 1
Train Loss: 6.5647 | Val Loss: 5.9907

Epoch 2
Train Loss: 5.7174 | Val Loss: 5.3385

Epoch 3
Train Loss: 5.0977 | Val Loss: 4.8774

Epoch 4
Train Loss: 4.6114 | Val Loss: 4.4473

Epoch 5
Train Loss: 4.2123 | Val Loss: 4.1463

Epoch 6
Train Loss: 3.8609 | Val Loss: 3.8631

Epoch 7
Train Loss: 3.5563 | Val Loss: 3.6145

Epoch 8
Train Loss: 3.2730 | Val Loss: 3.3958

Epoch 9
Train Loss: 3.0168 | Val Loss: 3.1983

Epoch 10
Train Loss: 2.7844 | Val Loss: 3.0144

Epoch 11
Train Loss: 2.5685 | Val Loss: 2.8478

Epoch 12
Train Loss: 2.3628 | Val Loss: 2.7028

Epoch 13
Train Loss: 2.1852 | Val Loss: 2.5687

Epoch 14
Train Loss: 2.0204 | Val Loss: 2.4479

Epoch 15
Train Loss: 1.8551 | Val Loss: 2.3530

Epoch 16
Train Loss: 1.7151 | Val Loss: 2.2428

Epoch 17
Train Loss: 1.5774 | Val Loss: 2.1491

Epoch 18
Train Loss: 1.4497 | Val Loss: 2.0567

Epoch 19
Train Loss: 1.3349 | Val Loss: 2.0007

Epoch 20
Train Loss: 1.2203 | Val Loss: 1.9071

Epoch 21
Train Loss: 1.1069 | Val Loss: 1.8631



In [None]:
def correct(model, sentence):
    model.eval()
    with torch.no_grad():
        src_ids = [1] + sp.encode(sentence) + [2]
        src_ids = src_ids[:64] + [0]*(64 - len(src_ids))
        src_tensor = torch.tensor([src_ids]).to('cuda')

        tgt_ids = [1]
        for _ in range(64):
            tgt_tensor = torch.tensor([tgt_ids]).to('cuda')
            output = model(src_tensor, tgt_tensor)
            next_token = output[0, -1].argmax(-1).item()
            if next_token == 2 or len(tgt_ids) >= 64:
                break
            tgt_ids.append(next_token)

        return sp.decode(tgt_ids[1:])


In [None]:
!pip install rouge_score
import nltk
# Remove the incorrect exception handling for DownloadError
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    print("wordnet not found. Attempting to download wordnet...")
    nltk.download('wordnet')

from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def evaluate_metrics(model, data, tokenizer, max_len=64):
    bleu_scores, meteor_scores, em_scores = [], [], []
    r1, r2, rl = [], [], []

    for idx in range(len(data)):
        # Get sample from dataset
        batch = data[idx]

        # Dekode dan bersihkan kalimat target dan kalimat yang diprediksi
        # Dengan asumsi item batch sudah berupa tensor, ubah ke daftar sebelum mendekode
        src = tokenizer.decode(batch['src_ids'].tolist()).replace('<pad>', '').strip()
        tgt = tokenizer.decode(batch['trg_ids'].tolist()).replace('<pad>', '').strip()

        pred = correct(model, src).strip()

        # Tokenize for metrics that require it
        tgt_tokens = tgt.split()
        pred_tokens = pred.split()
        bleu_scores.append(sentence_bleu([tgt_tokens], pred_tokens))
        meteor_scores.append(meteor_score([tgt_tokens], pred_tokens))
        em_scores.append(int(tgt == pred))

        scores = scorer.score(tgt, pred)
        r1.append(scores['rouge1'].fmeasure)
        r2.append(scores['rouge2'].fmeasure)
        rl.append(scores['rougeL'].fmeasure)

    # Handle potential division by zero if data is empty
    num_samples = len(bleu_scores)
    if num_samples == 0:
        return {
            "BLEU": 0.0,
            "METEOR": 0.0,
            "ROUGE-1": 0.0,
            "ROUGE-2": 0.0,
            "ROUGE-L": 0.0,
            "Exact Match": 0.0,
        }

    return {
        "BLEU": sum(bleu_scores)/num_samples,
        "METEOR": sum(meteor_scores)/num_samples,
        "ROUGE-1": sum(r1)/num_samples,
        "ROUGE-2": sum(r2)/num_samples,
        "ROUGE-L": sum(rl)/num_samples,
        "Exact Match": sum(em_scores)/num_samples,
    }

metrics = evaluate_metrics(model, val_data, sp)
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

wordnet not found. Attempting to download wordnet...


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use Smoot

BLEU: 0.4598
METEOR: 0.6636
ROUGE-1: 0.7075
ROUGE-2: 0.5816
ROUGE-L: 0.6948
Exact Match: 0.3894


SKENARIO 3

In [None]:
import pandas as pd

df = pd.read_csv("/content/Skenario 3 (pure dari dataset grammar correction).csv")
df = df.dropna()
df = df[['input', 'target']]


Preprocessing

In [None]:
from sklearn.model_selection import train_test_split
import sentencepiece as spm
import re

# Preprocessing fungsi
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Menghilangkan tanda baca
    text = re.sub(r'\d+', '', text)  # Menghilangkan angka
    text = re.sub(r'\s+', ' ', text)  # Menghilangkan spasi berlebih
    return text.strip()

# Kita terapkan preprocessing
df['input'] = df['input'].astype(str).apply(clean_text)
df['target'] = df['target'].astype(str).apply(clean_text)

df['input'].to_csv("src.txt", index=False, header=False)
df['target'].to_csv("tgt.txt", index=False, header=False)

# Latih tokenizer SentencePiece
spm.SentencePieceTrainer.train(
    input='src.txt,tgt.txt',
    model_prefix='tokenizer',
    vocab_size=1603,
    pad_id=0,
    bos_id=1,
    eos_id=2,
    unk_id=3
)

# Load tokenizer
sp = spm.SentencePieceProcessor(model_file='tokenizer.model')


Mempersiapkan data teks (input dan target) dalam format tensor numerik agar bisa digunakan oleh model Transformer (atau Seq2Seq lainnya).

In [None]:
import torch
from torch.utils.data import Dataset

class GrammarDataset(Dataset):
    def __init__(self, src_texts, tgt_texts, tokenizer, max_len=64):
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        src_ids = [1] + self.tokenizer.encode(self.src_texts[idx]) + [2]
        tgt_ids = [1] + self.tokenizer.encode(self.tgt_texts[idx]) + [2]

        src_ids = src_ids[:self.max_len] + [0] * (self.max_len - len(src_ids))
        tgt_ids = tgt_ids[:self.max_len] + [0] * (self.max_len - len(tgt_ids))

        return {
            'src_ids': torch.tensor(src_ids),
            'trg_ids': torch.tensor(tgt_ids)
        }

# Split data
train_src, val_src, train_tgt, val_tgt = train_test_split(df['input'], df['target'], test_size=0.1)

# Dataset
train_data = GrammarDataset(train_src.tolist(), train_tgt.tolist(), sp)
val_data = GrammarDataset(val_src.tolist(), val_tgt.tolist(), sp)

train_loader = torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=32)

Arsitektur model Transformer berbasis Seq2Seq

In [None]:
import torch.nn as nn

class Seq2SeqTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, num_heads=8, num_layers=3, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.transformer = nn.Transformer(d_model=embed_dim, nhead=num_heads, num_encoder_layers=num_layers,
                                          num_decoder_layers=num_layers, dropout=dropout)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, src, tgt):
        src_mask = self.transformer.generate_square_subsequent_mask(src.size(1)).to(src.device)
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)

        src_emb = self.embedding(src)
        tgt_emb = self.embedding(tgt)

        out = self.transformer(src_emb.permute(1, 0, 2), tgt_emb.permute(1, 0, 2),
                               src_mask=src_mask, tgt_mask=tgt_mask)
        return self.fc(out.permute(1, 0, 2))  # (batch, seq, vocab)


In [None]:
import torch.optim as optim

model = Seq2SeqTransformer(vocab_size=2059).to('cuda')
criterion = nn.CrossEntropyLoss(ignore_index=0)
# Use optim.Adam instead of torch.optim.Adam for clarity and consistency, though torch.optim works
optimizer = optim.Adam(model.parameters(), lr=0.0001)

def train_epoch(model, loader):
    model.train()
    total_loss = 0
    for batch in loader:
        src = batch['src_ids'].to('cuda')
        tgt = batch['trg_ids'].to('cuda')

        optimizer.zero_grad()
        output = model(src, tgt[:, :-1])
        loss = criterion(output.reshape(-1, 2059), tgt[:, 1:].reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate_loss(model, loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in loader:
            src = batch['src_ids'].to('cuda')
            tgt = batch['trg_ids'].to('cuda')
            output = model(src, tgt[:, :-1])
            loss = criterion(output.reshape(-1, 2059), tgt[:, 1:].reshape(-1))
            total_loss += loss.item()
    return total_loss / len(loader)

for epoch in range(25):
    print(f"\nEpoch {epoch+1}")
    train_loss = train_epoch(model, train_loader)
    val_loss = evaluate_loss(model, val_loader)
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

def correct(model, sentence):
    model.eval()
    with torch.no_grad():
        src_ids = [1] + sp.encode(sentence) + [2]
        src_ids = src_ids[:64] + [0]*(64 - len(src_ids))
        src_tensor = torch.tensor([src_ids]).to('cuda')

        tgt_ids = [1]
        for _ in range(64):
            tgt_tensor = torch.tensor([tgt_ids]).to('cuda')
            output = model(src_tensor, tgt_tensor)
            next_token = output[0, -1].argmax(-1).item()
            if next_token == 2 or len(tgt_ids) >= 64:
                break
            tgt_ids.append(next_token)

        return sp.decode(tgt_ids[1:])




Epoch 1
Train Loss: 6.5965 | Val Loss: 5.9687

Epoch 2
Train Loss: 5.6902 | Val Loss: 5.3024

Epoch 3
Train Loss: 5.0949 | Val Loss: 4.8349

Epoch 4
Train Loss: 4.6400 | Val Loss: 4.4632

Epoch 5
Train Loss: 4.2560 | Val Loss: 4.1601

Epoch 6
Train Loss: 3.9515 | Val Loss: 3.9216

Epoch 7
Train Loss: 3.6725 | Val Loss: 3.6929

Epoch 8
Train Loss: 3.4115 | Val Loss: 3.4952

Epoch 9
Train Loss: 3.1834 | Val Loss: 3.3245

Epoch 10
Train Loss: 2.9597 | Val Loss: 3.1358

Epoch 11
Train Loss: 2.7496 | Val Loss: 2.9890

Epoch 12
Train Loss: 2.5797 | Val Loss: 2.8514

Epoch 13
Train Loss: 2.3850 | Val Loss: 2.7358

Epoch 14
Train Loss: 2.2352 | Val Loss: 2.6228

Epoch 15
Train Loss: 2.1079 | Val Loss: 2.5282

Epoch 16
Train Loss: 1.9453 | Val Loss: 2.4163

Epoch 17
Train Loss: 1.8241 | Val Loss: 2.3436

Epoch 18
Train Loss: 1.7132 | Val Loss: 2.2467

Epoch 19
Train Loss: 1.5666 | Val Loss: 2.1899

Epoch 20
Train Loss: 1.4610 | Val Loss: 2.1211

Epoch 21
Train Loss: 1.3656 | Val Loss: 2.0726



In [None]:
def correct(model, sentence):
    model.eval()
    with torch.no_grad():
        src_ids = [1] + sp.encode(sentence) + [2]
        src_ids = src_ids[:64] + [0]*(64 - len(src_ids))
        src_tensor = torch.tensor([src_ids]).to('cuda')

        tgt_ids = [1]
        for _ in range(64):
            tgt_tensor = torch.tensor([tgt_ids]).to('cuda')
            output = model(src_tensor, tgt_tensor)
            next_token = output[0, -1].argmax(-1).item()
            if next_token == 2 or len(tgt_ids) >= 64:
                break
            tgt_ids.append(next_token)

        return sp.decode(tgt_ids[1:])


In [None]:
!pip install rouge_score
import nltk
# Remove the incorrect exception handling for DownloadError
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    print("wordnet not found. Attempting to download wordnet...")
    nltk.download('wordnet')

from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def evaluate_metrics(model, data, tokenizer, max_len=64):
    bleu_scores, meteor_scores, em_scores = [], [], []
    r1, r2, rl = [], [], []

    for idx in range(len(data)):
        # Get sample from dataset
        batch = data[idx]

        # Dekode dan bersihkan kalimat target dan kalimat yang diprediksi
        # Dengan asumsi item batch sudah berupa tensor, ubah ke daftar sebelum mendekode
        src = tokenizer.decode(batch['src_ids'].tolist()).replace('<pad>', '').strip()
        tgt = tokenizer.decode(batch['trg_ids'].tolist()).replace('<pad>', '').strip()

        pred = correct(model, src).strip()

        # Tokenize for metrics that require it
        tgt_tokens = tgt.split()
        pred_tokens = pred.split()
        bleu_scores.append(sentence_bleu([tgt_tokens], pred_tokens))
        meteor_scores.append(meteor_score([tgt_tokens], pred_tokens))
        em_scores.append(int(tgt == pred))

        scores = scorer.score(tgt, pred)
        r1.append(scores['rouge1'].fmeasure)
        r2.append(scores['rouge2'].fmeasure)
        rl.append(scores['rougeL'].fmeasure)

    # Handle potential division by zero if data is empty
    num_samples = len(bleu_scores)
    if num_samples == 0:
        return {
            "BLEU": 0.0,
            "METEOR": 0.0,
            "ROUGE-1": 0.0,
            "ROUGE-2": 0.0,
            "ROUGE-L": 0.0,
            "Exact Match": 0.0,
        }

    return {
        "BLEU": sum(bleu_scores)/num_samples,
        "METEOR": sum(meteor_scores)/num_samples,
        "ROUGE-1": sum(r1)/num_samples,
        "ROUGE-2": sum(r2)/num_samples,
        "ROUGE-L": sum(rl)/num_samples,
        "Exact Match": sum(em_scores)/num_samples,
    }

metrics = evaluate_metrics(model, val_data, sp)
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

wordnet not found. Attempting to download wordnet...


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use Smoot

BLEU: 0.3431
METEOR: 0.5491
ROUGE-1: 0.6039
ROUGE-2: 0.4488
ROUGE-L: 0.5845
Exact Match: 0.1890
