In [1]:
import torch
# Check for MPS (Apple Silicon GPU) availability
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

device = torch.device("cpu")
print("Using device:", device)


Using device: cpu


In [14]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

hidden_size = 1000
PAD = "<PAD>"
EOS = "<EOS>"

def read_data():
    s_lines = open("news-commentary-v13.zh-en.en", 'r').readlines()
    t_lines = open("news-commentary-v13.zh-en.zh", 'r').readlines()

    print(len(s_lines), len(t_lines))
    assert len(s_lines) == len(t_lines), "src target lines not matching"
    return s_lines, t_lines

def create_voc(lines, is_target=False):
    if is_target:
        lines = [[EOS] + list(line) for line in lines]
    else:
        lines = [list(line) for line in lines]

    voc = set()
    for line in lines:
        for c in line:
            voc.add(c)

    voc = sorted(list(voc))
    voc = [PAD] + voc

    itoc, ctoi = {}, {}
    for i, c in enumerate(voc):
        itoc[i] = c
        ctoi[c] = i

    res = []
    for line in lines:
        cur = []
        for c in line:
            cur.append(ctoi[c])

        res.append(cur)

    return res, itoc, ctoi

ss, ts = read_data()

source, source_itoc, _ = create_voc(ss)
target, target_itoc, _ = create_voc(ts, True)

source_voc_size = len(source_itoc)
target_voc_size = len(target_itoc)

def collate_fn(batch):
    source_batch, target_batch = zip(*batch)
    source_padded = pad_sequence([torch.tensor(s) for s in source_batch], padding_value=0)  # Assuming 0 is the PAD index
    target_padded = pad_sequence([torch.tensor(t) for t in target_batch], padding_value=0)
    return source_padded, target_padded

class MTDataset(Dataset):
    def __init__(self, source, target):
        print("len = ", len(source), len(target))
        self.source = source
        self.target = target

    def __getitem__(self, item):
        return self.source[item], self.target[item]

    def __len__(self):
        return len(self.source)

batch_size = 32
training_loader = DataLoader(MTDataset(source, target), batch_size=batch_size, collate_fn=collate_fn, drop_last=True)



252777 252777
len =  252777 252777


In [15]:
print(source_itoc)

{0: '<PAD>', 1: '\t', 2: '\n', 3: ' ', 4: '!', 5: '"', 6: '#', 7: '$', 8: '%', 9: '&', 10: "'", 11: '(', 12: ')', 13: '*', 14: '+', 15: ',', 16: '-', 17: '.', 18: '/', 19: '0', 20: '1', 21: '2', 22: '3', 23: '4', 24: '5', 25: '6', 26: '7', 27: '8', 28: '9', 29: ':', 30: ';', 31: '=', 32: '?', 33: '@', 34: 'A', 35: 'B', 36: 'C', 37: 'D', 38: 'E', 39: 'F', 40: 'G', 41: 'H', 42: 'I', 43: 'J', 44: 'K', 45: 'L', 46: 'M', 47: 'N', 48: 'O', 49: 'P', 50: 'Q', 51: 'R', 52: 'S', 53: 'T', 54: 'U', 55: 'V', 56: 'W', 57: 'X', 58: 'Y', 59: 'Z', 60: '[', 61: '\\', 62: ']', 63: '_', 64: '`', 65: 'a', 66: 'b', 67: 'c', 68: 'd', 69: 'e', 70: 'f', 71: 'g', 72: 'h', 73: 'i', 74: 'j', 75: 'k', 76: 'l', 77: 'm', 78: 'n', 79: 'o', 80: 'p', 81: 'q', 82: 'r', 83: 's', 84: 't', 85: 'u', 86: 'v', 87: 'w', 88: 'x', 89: 'y', 90: 'z', 91: '\x80', 92: '\x91', 93: '\x9d', 94: '\xa0', 95: '£', 96: '¥', 97: '\xad', 98: '°', 99: '±', 100: '´', 101: '·', 102: 'º', 103: '¼', 104: 'Á', 105: 'Ã', 106: 'Å', 107: 'Ç', 108: 'É

In [12]:

for x,y in training_loader:
    print(x[0], y[0])
    break

tensor([20, 50, 35, 54, 54, 39, 40, 49, 43, 59, 54, 35, 43, 40, 49, 40, 36, 53,
        39, 59, 43, 35, 49, 40, 36, 57, 57, 35, 54, 57, 57, 36]) tensor([  19, 1366,  233, 1114, 2834, 2223, 2567, 1463,  299, 2567,  252, 3275,
          19, 1268, 1463, 4475,  242,  517, 2258, 3275, 3245,  391, 1658,  335,
         381,  406, 1463, 4048,   19, 1658,   20,  373])


In [28]:
from torch import nn
class TranslateModel(nn.Module):
    def __init__(self):
        super().__init__()

        self.source_embedding = nn.Embedding(source_voc_size, hidden_size)
        self.target_embedding = nn.Embedding(target_voc_size, hidden_size)

        self.encoder = nn.LSTM(hidden_size, hidden_size, num_layers=2)
        self.decoder = nn.LSTM(target_voc_size, hidden_size, num_layers=2)
        self.linear = nn.Linear(hidden_size, target_voc_size)
    def forward(self, source_sentences, target_sentences):
        #return self.encoder(x)

        emb = self.source_embedding(source_sentences)
        _, (h, c) = self.encoder(emb)


        output = self.decoder(self.target_embedding(target_sentences), (h, c))


        return self.linear(output)




In [29]:
from torch.optim import Adam, SGD
from torch.nn import CrossEntropyLoss
from torch.nn.utils import clip_grad_norm_


model = TranslateModel().to(device)

criterion = CrossEntropyLoss()
optimizer= Adam(model.parameters(), lr=1)


for epoch in range(10):
    model.train()
    print("epoch", epoch)
    for train_x, train_y in training_loader:
        train_x, train_y = train_x.to(device), train_y.to(device)
        optimizer.zero_grad()

        r = model.forward(train_x, train_y)

        loss = criterion(r, train_y)
        loss.backward()

        print(loss.item())

        norm = clip_grad_norm_(model.parameters(), 5)

        optimizer.step()


epoch 0


TypeError: TranslateModel.forward() missing 1 required positional argument: 'target_sentences'