In [5]:
!pip install sentencepiece --upgrade



In [45]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import sentencepiece as spm
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction



In [47]:

# Parameters
BATCH_SIZE = 64
EMBED_SIZE = 256
HIDDEN_SIZE = 512
NUM_LAYERS = 1
LR = 0.001
VOCAB_SIZE = 8000       # subword vocab size for each language
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [48]:

# 1. Load corpus
data = pd.read_csv('sentences.csv')  # columns: 'src','tgt', optional prosody cols

# 2. Prepare and train SentencePiece models (once)
#    Extract columns to plain text files for reliability, then train.
with open('src.txt', 'w', encoding='utf-8') as f_src, open('tgt.txt', 'w', encoding='utf-8') as f_tgt:
    for s, t in zip(data['src'], data['tgt']):
        f_src.write(s.replace('\n',' ').strip() + '\n')
        f_tgt.write(t.replace('\n',' ').strip() + '\n')


#    Train SentencePiece on the extracted files
#    This generates src_spm.model / src_spm.vocab and tgt_spm.model / tgt_spm.vocab
spm.SentencePieceTrainer.Train(
    f"--input=src.txt --model_prefix=src_spm --vocab_size={VOCAB_SIZE} "
    "--model_type=bpe --unk_id=0 --pad_id=1 --bos_id=2 --eos_id=3"
)
spm.SentencePieceTrainer.Train(
    f"--input=tgt.txt --model_prefix=tgt_spm --vocab_size={VOCAB_SIZE} "
    "--model_type=bpe --unk_id=0 --pad_id=1 --bos_id=2 --eos_id=3"
)

# 3. Load SP models
src_sp = spm.SentencePieceProcessor(model_file='src_spm.model')
tgt_sp = spm.SentencePieceProcessor(model_file='tgt_spm.model')
SRC_VOCAB = src_sp.get_piece_size()
TGT_VOCAB = tgt_sp.get_piece_size()


In [49]:
def add_special(ids, sos_id, eos_id):
    return [sos_id] + ids + [eos_id]

class MTDataset(Dataset):
    def __init__(self, df, src_sp, tgt_sp, prosody_cols=None):
        self.src = df['src'].tolist()
        self.tgt = df['tgt'].tolist()
        self.prosody_cols = prosody_cols
        if prosody_cols:
            self.prosody = df[prosody_cols].values.astype(float)
        else:
            self.prosody = None
        self.src_sp = src_sp
        self.tgt_sp = tgt_sp
        # special IDs
        self.src_sos, self.src_eos = src_sp.bos_id(), src_sp.eos_id()
        self.tgt_sos, self.tgt_eos = tgt_sp.bos_id(), tgt_sp.eos_id()

    def __len__(self):
        return len(self.src)

    def encode(self, text, sp):
        return sp.encode(text, out_type=int)

    def __getitem__(self, idx):
        src_ids = add_special(self.encode(self.src[idx], self.src_sp), self.src_sos, self.src_eos)
        tgt_ids = add_special(self.encode(self.tgt[idx], self.tgt_sp), self.tgt_sos, self.tgt_eos)
        pros = self.prosody[idx] if self.prosody is not None else None
        return torch.tensor(src_ids), torch.tensor(tgt_ids), pros

from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    src_batch, tgt_batch, pros_batch = zip(*batch)
    src_pad = pad_sequence(src_batch, padding_value=src_sp.pad_id(), batch_first=True)
    tgt_pad = pad_sequence(tgt_batch, padding_value=tgt_sp.pad_id(), batch_first=True)
    pros = torch.tensor(pros_batch, dtype=torch.float) if pros_batch[0] is not None else None
    return src_pad.to(DEVICE), tgt_pad.to(DEVICE), pros



# Split into train/test
dataset = MTDataset(data, src_sp, tgt_sp, prosody_cols=None)
train_size = int(0.98 * len(dataset))
test_size = len(dataset) - train_size
train_data, test_data = random_split(dataset, [train_size, test_size])
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=1, shuffle=True, collate_fn=collate_fn)


In [50]:
# Print 5 samples from the training set
print("Training Set Samples:")
for i, (src, tgt, _) in enumerate(train_loader):
    if i >= 5:
        break
    print(f"Sample {i+1}:")
    print(f"Source: {src_sp.decode(src[0].tolist())}")
    print(f"Target: {tgt_sp.decode(tgt[0].tolist())}")
    print("---")

# Print 5 samples from the testing set
print("\nTesting Set Samples:")
for i, (src, tgt, _) in enumerate(test_loader):
    if i >= 5:
        break
    print(f"Sample {i+1}:")
    print(f"Source: {src_sp.decode(src[0].tolist())}")
    print(f"Target: {tgt_sp.decode(tgt[0].tolist())}")
    print("---")


Training Set Samples:
Sample 1:
Source: The report of the Committee of the House of Commons painted so black a picture of Newgate as then conducted, that the Corporation were roused in very shame
Target: హౌస్ ఆఫ్ కామన్స్ కమిటీ నివేదిక న్యూగేట్ యొక్క చిత్రాన్ని చాలా నల్లగా చిత్రీకరించింది, కార్పొరేషన్ చాలా అవమానానికి గురైంది
---
Sample 2:
Source: but his sporting operations did not prosper, and he became a needy man, always driven to desperate straits for cash.
Target: కానీ అతని క్రీడా కార్యకలాపాలు అభివృద్ధి చెందలేదు మరియు అతను నిరుపేద వ్యక్తి అయ్యాడు, ఎల్లప్పుడూ నగదు కోసం తీరని కష్టాలకు నెట్టబడ్డాడు.
---
Sample 3:
Source: of more depraved and systematic criminals.
Target: మరింత చెడిపోయిన మరియు క్రమబద్ధమైన నేరస్థులు.
---
Sample 4:
Source: This was the question which presented itself to the fertile brain of one Pierce,
Target: ఇది ఒక పియర్స్ యొక్క సారవంతమైన మెదడుకు అందించిన ప్రశ్న,
---
Sample 5:
Source: Few of the Newgate notorieties of late years show any marked peculiarities;
Target: చ

In [58]:

# 5. Encoder
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        embedded = self.embedding(x)
        outputs, hidden = self.gru(embedded)
        return outputs, hidden

# 6. Decoder
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        x = x.unsqueeze(1)
        embedded = self.embedding(x)
        output, hidden = self.gru(embedded, hidden)
        pred = self.fc_out(output.squeeze(1))
        return pred, hidden

# 7. Seq2Seq
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size, tgt_len = tgt.size()
        outputs = torch.zeros(batch_size, tgt_len, TGT_VOCAB).to(DEVICE)
        _, hidden = self.encoder(src)
        input = tgt[:, 0]
        for t in range(1, tgt_len):
            out, hidden = self.decoder(input, hidden)
            outputs[:, t, :] = out
            input = tgt[:, t] if torch.rand(1).item() < teacher_forcing_ratio else out.argmax(1)
        return outputs


In [67]:
def evaluate_bleu(model, dataloader):
    model.eval()
    refs, hyps = [], []

    # print("Sample translations:\n")
    with torch.no_grad():
        for i, (src, tgt, _) in enumerate(dataloader):
            _, hidden = model.encoder(src)
            input = torch.tensor([tgt_sp.bos_id()] * src.size(0)).to(DEVICE)
            outputs = []
            for _ in range(50):
                out, hidden = model.decoder(input, hidden)
                input = out.argmax(1)
                outputs.append(input)
            outputs = torch.stack(outputs, dim=1).cpu().tolist()
            tgt = tgt.cpu().tolist()
            for ref, hyp in zip(tgt, outputs):
                ref_tokens = [t for t in ref[1:] if t not in [tgt_sp.pad_id(), tgt_sp.eos_id()]]
                hyp_tokens = [t for t in hyp if t not in [tgt_sp.pad_id(), tgt_sp.eos_id()]]
                refs.append([ref_tokens])
                hyps.append(hyp_tokens)
                if i < 5:
                    ref_text = tgt_sp.decode(ref_tokens)
                    hyp_text = tgt_sp.decode(hyp_tokens)
    bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method4)
    print(f"BLEU Score: {bleu:.4f}")

In [61]:
def train(model, dataloader, optimizer, criterion, epochs=30):
    model.to(DEVICE)
    for ep in range(epochs):
        model.train()
        total = 0
        for src_batch, tgt_batch, pros in dataloader:
            optimizer.zero_grad()
            preds = model(src_batch, tgt_batch)
            out = preds[:, 1:].reshape(-1, preds.size(-1))
            tgt = tgt_batch[:, 1:].reshape(-1)
            loss = criterion(out, tgt)
            loss.backward()
            optimizer.step()
            total += loss.item()
        print(f"Epoch {ep+1}, Loss: {total/len(dataloader):.4f}")

enc = Encoder(SRC_VOCAB, EMBED_SIZE, HIDDEN_SIZE, NUM_LAYERS)
# attn = Attention(HIDDEN_SIZE)
dec = Decoder(TGT_VOCAB, EMBED_SIZE, HIDDEN_SIZE, NUM_LAYERS)
model = Seq2Seq(enc, dec)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(ignore_index=src_sp.pad_id())

if __name__ == '__main__':
    train(model, train_loader, optimizer, criterion)



Epoch 1, Loss: 7.5393
Epoch 2, Loss: 7.1086
Epoch 3, Loss: 6.8686
Epoch 4, Loss: 6.6067
Epoch 5, Loss: 6.3074
Epoch 6, Loss: 6.0260
Epoch 7, Loss: 5.7120
Epoch 8, Loss: 5.4271
Epoch 9, Loss: 5.1301
Epoch 10, Loss: 4.8723
Epoch 11, Loss: 4.5836
Epoch 12, Loss: 4.2979
Epoch 13, Loss: 4.1173
Epoch 14, Loss: 3.7761
Epoch 15, Loss: 3.5722
Epoch 16, Loss: 3.2240
Epoch 17, Loss: 2.9612
Epoch 18, Loss: 2.7236
Epoch 19, Loss: 2.3499
Epoch 20, Loss: 2.0534
Epoch 21, Loss: 1.8165
Epoch 22, Loss: 1.5074
Epoch 23, Loss: 1.2438
Epoch 24, Loss: 0.9970
Epoch 25, Loss: 0.7740
Epoch 26, Loss: 0.5935
Epoch 27, Loss: 0.4263
Epoch 28, Loss: 0.3003
Epoch 29, Loss: 0.2148
Epoch 30, Loss: 0.1529


In [68]:
evaluate_bleu(model, test_loader)


BLEU Score: 0.2056
