In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import build_vocab_from_iterator
import ast


In [2]:
data = pd.read_csv("./data/cleaned.csv")
data_src = pd.read_csv("./data/source.csv")

In [5]:
# Assume data is loaded and vocab is built as before
class CBHG(nn.Module):
    def __init__(self, input_dim, embedding_dim, conv_channels, max_filter_width, highway_layers, gru_units):
        super(CBHG, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)  # Embedding layer
        self.conv1d_banks = nn.ModuleList([nn.Conv1d(embedding_dim, conv_channels, kernel_size=k, padding=k//2) for k in range(1, max_filter_width + 1)])
        self.max_pool = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)
        self.conv1d_proj1 = nn.Conv1d(conv_channels * max_filter_width, conv_channels, kernel_size=3, padding=1)
        self.conv1d_proj2 = nn.Conv1d(conv_channels, embedding_dim, kernel_size=3, padding=1)
        self.highway = nn.ModuleList([nn.Linear(embedding_dim, embedding_dim) for _ in range(highway_layers)])
        self.bidirectional_GRU = nn.GRU(embedding_dim, gru_units, batch_first=True, bidirectional=True)

    def forward(self, x):
        x = self.embedding(x)  # Embed input indices
        x = x.transpose(1, 2)  # (batch_size, embedding_dim, sequence_length)
        
        # Convolutional bank
        out = torch.cat([F.relu(conv(x)) for conv in self.conv1d_banks], dim=1)
        out = self.max_pool(out)
        
        # First projection
        out = F.relu(self.conv1d_proj1(out))
        
        # Second projection with residual connection
        residual = self.conv1d_proj2(out)
        out = residual + x[:,:,:residual.size(2)]  # Adjust the residual dimension if necessary

        # Highway networks
        out = out.transpose(1, 2)  # (batch_size, sequence_length, embedding_dim)
        for layer in self.highway:
            H = F.relu(layer(out))
            T = torch.sigmoid(layer(out))
            out = H * T + out * (1.0 - T)

        # Bi-directional GRU
        outputs, _ = self.bidirectional_GRU(out)
        return outputs

# Initialize model
input_dim = len(vocab)  # Total number of characters
embedding_dim = 128  # Size of embedding vector
cbhg = CBHG(input_dim, embedding_dim, conv_channels=128, max_filter_width=16, highway_layers=4, gru_units=128)


In [4]:
# Define a function to safely convert string representations of lists back to actual lists
def str_to_list(char_list_str):
    try:
        return ast.literal_eval(char_list_str)
    except ValueError:
        return []  # Return an empty list in case of error

# Apply this function to your character tokens column
data['char_tokens'] = data['char_tokens'].apply(str_to_list)
data_src['char_tokens'] = data_src['char_tokens'].apply(str_to_list)

# Build the vocabulary from the character lists
def yield_tokens(data):
    for tokens in data:
        yield tokens

vocab = build_vocab_from_iterator(yield_tokens(data['char_tokens']), specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<unk>"])  # Handle unknown characters


In [6]:
# Prepare inputs and targets
def encode_and_pad(data):
    batch_encoded = [torch.tensor(vocab(tokens)) for tokens in data]
    batch_padded = pad_sequence(batch_encoded, batch_first=True, padding_value=vocab['<pad>'])
    return batch_padded

def prepare_data(data, data_src):
    inputs = encode_and_pad(data['char_tokens'])
    targets = encode_and_pad(data_src['char_tokens'])
    return inputs, targets

inputs, targets = prepare_data(data, data_src)


In [None]:
inputs.shape

torch.Size([52362, 26056])

In [7]:
# Define optimizer and criterion
optimizer = torch.optim.Adam(cbhg.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()


In [8]:

# Training function
def train(model, inputs, targets, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs = outputs.view(-1, outputs.shape[-1])  # Flatten output for loss calculation
        targets = targets.view(-1)  # Flatten targets
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')


# Run training
train(cbhg, inputs, targets, criterion, optimizer)


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 698544267264 bytes.

In [None]:

# Save the model
torch.save(cbhg.state_dict(), 'cbhg_model.pth')

# Load the model
cbhg.load_state_dict(torch.load('cbhg_model.pth'))
cbhg.eval()