In [1]:
!pip install sentencepiece --upgrade



In [11]:
# English
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
!gunzip cc.en.300.vec.gz

# Telugu
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.te.300.vec.gz
!gunzip cc.te.300.vec.gz


--2025-05-05 19:45:34--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.171.198.46, 3.171.198.102, 3.171.198.8, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.171.198.46|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1325960915 (1.2G) [binary/octet-stream]
Saving to: ‘cc.en.300.vec.gz.1’


2025-05-05 19:45:39 (262 MB/s) - ‘cc.en.300.vec.gz.1’ saved [1325960915/1325960915]

--2025-05-05 19:46:30--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.te.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.165.75.91, 3.165.75.95, 3.165.75.59, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.165.75.91|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1117025794 (1.0G) [binary/octet-stream]
Saving to: ‘cc.te.300.vec.gz.1’


2025-05-05 19:46:36 (186 MB/s) - ‘cc.te.300.vec.gz.1’ saved [111702

In [12]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import sentencepiece as spm
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import numpy as np



In [13]:
BATCH_SIZE = 64
EMBED_SIZE = 300  # updated for FastText
HIDDEN_SIZE = 512
NUM_LAYERS = 1
LR = 0.001
VOCAB_SIZE = 8000
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load pretrained embeddings

def load_pretrained_embeddings(sp_model, embedding_path, embedding_dim):
    embeddings_index = {}
    with open(embedding_path, 'r', encoding='utf-8', errors='ignore') as f:
        next(f)
        for line in f:
            values = line.rstrip().split(' ')
            word = values[0]
            vec = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vec

    vocab_size = sp_model.get_piece_size()
    embedding_matrix = np.random.normal(0, 1, (vocab_size, embedding_dim)).astype('float32')

    for i in range(vocab_size):
        token = sp_model.id_to_piece(i)
        clean_token = token[1:] if token.startswith('▁') else token
        if clean_token in embeddings_index:
            embedding_matrix[i] = embeddings_index[clean_token]
    return torch.tensor(embedding_matrix)

In [14]:

# 1. Load corpus
data = pd.read_csv('sentences.csv')  # columns: 'src','tgt', optional prosody cols

# 2. Prepare and train SentencePiece models (once)
#    Extract columns to plain text files for reliability, then train.
with open('src.txt', 'w', encoding='utf-8') as f_src, open('tgt.txt', 'w', encoding='utf-8') as f_tgt:
    for s, t in zip(data['src'], data['tgt']):
        f_src.write(s.replace('\n',' ').strip() + '\n')
        f_tgt.write(t.replace('\n',' ').strip() + '\n')


#    Train SentencePiece on the extracted files
#    This generates src_spm.model / src_spm.vocab and tgt_spm.model / tgt_spm.vocab
spm.SentencePieceTrainer.Train(
    f"--input=src.txt --model_prefix=src_spm --vocab_size={VOCAB_SIZE} "
    "--model_type=bpe --unk_id=0 --pad_id=1 --bos_id=2 --eos_id=3"
)
spm.SentencePieceTrainer.Train(
    f"--input=tgt.txt --model_prefix=tgt_spm --vocab_size={VOCAB_SIZE} "
    "--model_type=bpe --unk_id=0 --pad_id=1 --bos_id=2 --eos_id=3"
)

# 3. Load SP models
src_sp = spm.SentencePieceProcessor(model_file='src_spm.model')
tgt_sp = spm.SentencePieceProcessor(model_file='tgt_spm.model')
SRC_VOCAB = src_sp.get_piece_size()
TGT_VOCAB = tgt_sp.get_piece_size()


src_embed_weights = load_pretrained_embeddings(src_sp, "cc.en.300.vec", EMBED_SIZE)
tgt_embed_weights = load_pretrained_embeddings(tgt_sp, "cc.te.300.vec", EMBED_SIZE)


In [15]:
def add_special(ids, sos_id, eos_id):
    return [sos_id] + ids + [eos_id]

class MTDataset(Dataset):
    def __init__(self, df, src_sp, tgt_sp, prosody_cols=None):
        self.src = df['src'].tolist()
        self.tgt = df['tgt'].tolist()
        self.prosody_cols = prosody_cols
        if prosody_cols:
            self.prosody = df[prosody_cols].values.astype(float)
        else:
            self.prosody = None
        self.src_sp = src_sp
        self.tgt_sp = tgt_sp
        # special IDs
        self.src_sos, self.src_eos = src_sp.bos_id(), src_sp.eos_id()
        self.tgt_sos, self.tgt_eos = tgt_sp.bos_id(), tgt_sp.eos_id()

    def __len__(self):
        return len(self.src)

    def encode(self, text, sp):
        return sp.encode(text, out_type=int)

    def __getitem__(self, idx):
        src_ids = add_special(self.encode(self.src[idx], self.src_sp), self.src_sos, self.src_eos)
        tgt_ids = add_special(self.encode(self.tgt[idx], self.tgt_sp), self.tgt_sos, self.tgt_eos)
        pros = self.prosody[idx] if self.prosody is not None else None
        return torch.tensor(src_ids), torch.tensor(tgt_ids), pros

from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    src_batch, tgt_batch, pros_batch = zip(*batch)
    src_pad = pad_sequence(src_batch, padding_value=src_sp.pad_id(), batch_first=True)
    tgt_pad = pad_sequence(tgt_batch, padding_value=tgt_sp.pad_id(), batch_first=True)
    pros = torch.tensor(pros_batch, dtype=torch.float) if pros_batch[0] is not None else None
    return src_pad.to(DEVICE), tgt_pad.to(DEVICE), pros



# Split into train/test
dataset = MTDataset(data, src_sp, tgt_sp, prosody_cols=None)
train_size = int(0.98 * len(dataset))
test_size = len(dataset) - train_size
train_data, test_data = random_split(dataset, [train_size, test_size])
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=1, shuffle=True, collate_fn=collate_fn)


In [16]:
# Print 5 samples from the training set
print("Training Set Samples:")
for i, (src, tgt, _) in enumerate(train_loader):
    if i >= 5:
        break
    print(f"Sample {i+1}:")
    print(f"Source: {src_sp.decode(src[0].tolist())}")
    print(f"Target: {tgt_sp.decode(tgt[0].tolist())}")
    print("---")

# Print 5 samples from the testing set
print("\nTesting Set Samples:")
for i, (src, tgt, _) in enumerate(test_loader):
    if i >= 5:
        break
    print(f"Sample {i+1}:")
    print(f"Source: {src_sp.decode(src[0].tolist())}")
    print(f"Target: {tgt_sp.decode(tgt[0].tolist())}")
    print("---")


Training Set Samples:
Sample 1:
Source: While they were in a state of insensibility the murder was committed.
Target: వారు అపస్మారక స్థితిలో ఉండగానే ఈ హత్య జరిగింది.
---
Sample 2:
Source: The Fleet, which stood in Farringdon Street,
Target: ఫారింగ్ డన్ స్ట్రీట్ లో ఉన్న ఫ్లీట్,
---
Sample 3:
Source: While thus engaged, Howard thrust the poker into the fire.
Target: ఈ విధంగా నిశ్చితార్థం చేస్తున్నప్పుడు, హోవార్డ్ పేకాటను మంటల్లోకి నెట్టాడు.
---
Sample 4:
Source: It should be peremptorily forbidden to the keeper or any officer to make a pecuniary profit out of the supplies of food, fuel, or other necessaries.
Target: ఆహారం, ఇంధనం లేదా ఇతర అవసరాల సామాగ్రి నుండి డబ్బు లాభం పొందడం కీపర్ లేదా ఏ అధికారికి ఖచ్చితంగా నిషేధించబడాలి.
---
Sample 5:
Source: convicted of obtaining jewelery under the false pretense of making silly women "beautiful for ever."
Target: వెర్రి స్త్రీలను "ఎప్పటికీ అందంగా" తీర్చిదిద్దుతాననే తప్పుడు నెపంతో నగలు పొందినందుకు దోషిగా నిర్ధారించబడింది.
---

Testing Set Samples:
S

In [22]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, pretrained=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=1)
        if pretrained is not None:
            self.embedding.weight.data.copy_(pretrained)
            self.embedding.weight.requires_grad = False
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, hidden_size)

    def forward(self, x):
        embedded = self.embedding(x)
        outputs, hidden = self.gru(embedded)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))).unsqueeze(0)
        return outputs, hidden

# Decoder without attention
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, pretrained=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=1)
        if pretrained is not None:
            self.embedding.weight.data.copy_(pretrained)
            self.embedding.weight.requires_grad = False
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        x = x.unsqueeze(1)
        embedded = self.embedding(x)
        output, hidden = self.gru(embedded, hidden)
        output = self.fc_out(output.squeeze(1))
        return output, hidden

# Seq2Seq
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size, tgt_len = tgt.size()
        outputs = torch.zeros(batch_size, tgt_len, TGT_VOCAB).to(DEVICE)
        _, hidden = self.encoder(src)
        input = tgt[:, 0]
        for t in range(1, tgt_len):
            out, hidden = self.decoder(input, hidden)
            outputs[:, t, :] = out
            input = tgt[:, t] if torch.rand(1).item() < teacher_forcing_ratio else out.argmax(1)
        return outputs


In [25]:
def evaluate_bleu(model, dataloader):
    model.eval()
    refs, hyps = [], []
    print("Sample translations:\n")
    with torch.no_grad():
        for i, (src, tgt, _) in enumerate(dataloader):
            _, hidden = model.encoder(src)
            input = torch.tensor([tgt_sp.bos_id()] * src.size(0)).to(DEVICE)
            outputs = []
            for _ in range(50):
                out, hidden = model.decoder(input, hidden)
                input = out.argmax(1)
                outputs.append(input)
            outputs = torch.stack(outputs, dim=1).cpu().tolist()
            tgt = tgt.cpu().tolist()
            src = src.cpu().tolist()
            for ref, hyp, src_seq in zip(tgt, outputs, src):
                ref_tokens = [t for t in ref[1:] if t not in [tgt_sp.pad_id(), tgt_sp.eos_id()]]
                hyp_tokens = [t for t in hyp if t not in [tgt_sp.pad_id(), tgt_sp.eos_id()]]
                src_tokens = [t for t in src_seq[1:] if t not in [src_sp.pad_id(), src_sp.eos_id()]]
                refs.append([ref_tokens])
                hyps.append(hyp_tokens)
                if i < 5:
                    src_text = src_sp.decode(src_tokens)
                    ref_text = tgt_sp.decode(ref_tokens)
                    hyp_text = tgt_sp.decode(hyp_tokens)
                    print(f"Source   : {src_text}")
                    print(f"Reference: {ref_text}")
                    print(f"Predicted: {hyp_text}\n")
    bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method4)
    print(f"BLEU Score: {bleu:.4f}")

In [28]:
def train(model, dataloader, optimizer, criterion, epochs=20):
    model.to(DEVICE)
    for ep in range(epochs):
        model.train()
        total = 0
        for src_batch, tgt_batch, pros in dataloader:
            optimizer.zero_grad()
            preds = model(src_batch, tgt_batch)
            out = preds[:, 1:].reshape(-1, preds.size(-1))
            tgt = tgt_batch[:, 1:].reshape(-1)
            loss = criterion(out, tgt)
            loss.backward()
            optimizer.step()
            total += loss.item()
        print(f"Epoch {ep+1}, Loss: {total/len(dataloader):.4f}")

# Initialize
enc = Encoder(SRC_VOCAB, EMBED_SIZE, HIDDEN_SIZE, NUM_LAYERS, pretrained=src_embed_weights)
# attn = Attention(HIDDEN_SIZE)
# dec = Decoder(TGT_VOCAB, EMBED_SIZE, HIDDEN_SIZE, NUM_LAYERS, attn,  pretrained=tgt_embed_weights)
dec = Decoder(TGT_VOCAB, EMBED_SIZE, HIDDEN_SIZE, NUM_LAYERS,  pretrained=tgt_embed_weights)
model = Seq2Seq(enc, dec)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(ignore_index=src_sp.pad_id())

if __name__ == '__main__':
    train(model, train_loader, optimizer, criterion)
    evaluate_bleu(model, test_loader)




Epoch 1, Loss: 7.6037
Epoch 2, Loss: 7.2617
Epoch 3, Loss: 7.1743
Epoch 4, Loss: 7.0952
Epoch 5, Loss: 7.0151
Epoch 6, Loss: 6.9422
Epoch 7, Loss: 6.8735
Epoch 8, Loss: 6.7922
Epoch 9, Loss: 6.7272
Epoch 10, Loss: 6.6401
Epoch 11, Loss: 6.5754
Epoch 12, Loss: 6.4946
Epoch 13, Loss: 6.4122
Epoch 14, Loss: 6.3331
Epoch 15, Loss: 6.2742
Epoch 16, Loss: 6.2498
Epoch 17, Loss: 6.0939
Epoch 18, Loss: 6.0349
Epoch 19, Loss: 5.9660
Epoch 20, Loss: 5.9026
Sample translations:

Source   : seemed to strike her mind with horror and consternation, to the exclusion of all power of recollectedness in preparation for the approaching awful moment.
Reference: ఆసన్నమైన భయంకర క్షణానికి సన్నాహకంగా స్మరించుకునే శక్తిని మినహాయించి, ఆమె మనస్సును భయానక మరియు దిగ్భ్రాంతితో కొట్టినట్లు అనిపించింది.
Predicted: మరియు,,,,,,,,,,,,

Source   : All privacy was impossible under the circumstances.
Reference: పరిస్థితులలో అన్ని గోప్యత అసాధ్యం.
Predicted: మరియు,,,,,,,,,,,,

Source   : the Exchequer, the Commissioners of b

In [26]:
evaluate_bleu(model, test_loader)


Sample translations:

Source   : A desperate and deadly struggle must have taken place in the carriage, and the stain of a bloody hand marked the door.
Reference: క్యారేజ్ లో తీరని మరియు ఘోరమైన పోరాటం జరిగి ఉండాలి మరియు రక్తపు చేతి మరక తలుపును గుర్తించింది.
Predicted: మరియు,,,,,,,,,,,,,,,.

Source   : The inquiry was most searching and complete, and the committee spoke plainly in its report.
Reference: విచారణ చాలా శోధించబడింది మరియు పూర్తి చేయబడింది మరియు కమిటీ తన నివేదికలో స్పష్టంగా మాట్లాడింది.
Predicted: మరియు,,,,,,,,,,,,,,,.

Source   : In this way he formed the acquaintance of Watson and others, with whom he was arraigned for treasonable practices, and imprisoned.
Reference: ఈ విధంగా అతను వాట్సన్ మరియు ఇతరులతో పరిచయాన్ని ఏర్పరచుకున్నాడు, వీరితో అతను దేశద్రోహ చర్యలకు పాల్పడ్డాడు మరియు జైలు శిక్ష అనుభవించాడు.
Predicted: మరియు,,,,,,,,,,,,,,,.

Source   : some, especially of low stature, found it difficult to remain standing, and several, although held up for some time by the men near