In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import random

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [16]:
# Read the CSV file (adjust the filename if necessary)
df = pd.read_csv("data.csv")
print(df.head())

# Combine the desired columns (Name, Email, School) into one string per record.
def combine_fields(row):
    return f"{row['Name']}|{row['Email']}|{row['School']}"

data = df.apply(combine_fields, axis=1).tolist()

# Build a character-level vocabulary from the data
all_text = "\n".join(data)
chars = sorted(list(set(all_text)))
print(f"Found {len(chars)} unique characters.")

# Create mappings from characters to indices and back
char2idx = {ch: i for i, ch in enumerate(chars)}
idx2char = {i: ch for i, ch in enumerate(chars)}

# Add special tokens for start-of-sequence, end-of-sequence, and padding.
special_tokens = ["<SOS>", "<EOS>", "<PAD>"]
for token in special_tokens:
    if token not in char2idx:
        idx = len(char2idx)
        char2idx[token] = idx
        idx2char[idx] = token

vocab_size = len(char2idx)
print("Vocabulary size (including special tokens):", vocab_size)


                           Name  \
0  Dr. Robert M. Gamper, Ed. D.   
1           Julianne Huettinger   
2                 Robert Wright   
3             Grace Biancorosso   
4               Krista Kersting   

                                               Title  \
0                          Superintendent of Schools   
1     Administrative Assistant to the Superintendent   
2             Business Administrator/Board Secretary   
3  Administrative Assistant to the Business Admin...   
4                   Assistant Business Administrator   

                                    Email         School  
0        robertgamper@parkridge.k12.nj.us  Park Ridge HS  
1  juliannehuettinger@parkridge.k12.nj.us  Park Ridge HS  
2        robertwright@parkridge.k12.nj.us  Park Ridge HS  
3    gracebiancorosso@parkridge.k12.nj.us  Park Ridge HS  
4      kristakersting@parkridge.k12.nj.us  Park Ridge HS  
Found 63 unique characters.
Vocabulary size (including special tokens): 66


In [17]:
class TextDataset(Dataset):
    def __init__(self, texts, char2idx):
        self.texts = texts
        self.char2idx = char2idx

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        # Prepend <SOS> and append <EOS>
        tokens = ["<SOS>"] + list(text) + ["<EOS>"]
        indices = [self.char2idx.get(token, self.char2idx["<PAD>"]) for token in tokens]
        # For reconstruction: input is tokens[:-1] and target is tokens[1:]
        input_tensor = torch.tensor(indices[:-1], dtype=torch.long)
        target_tensor = torch.tensor(indices[1:], dtype=torch.long)
        return input_tensor, target_tensor

def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=char2idx["<PAD>"])
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=char2idx["<PAD>"])
    return inputs_padded.to(device), targets_padded.to(device)

# Create dataset and dataloader
dataset = TextDataset(data, char2idx)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)


In [24]:
# Hyperparameters
embedding_dim = 128
hidden_dim = 256
latent_dim = 2

class VAE(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, latent_dim):
        super(VAE, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Encoder LSTM
        self.encoder_rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden_to_mu = nn.Linear(hidden_dim, latent_dim)
        self.hidden_to_logvar = nn.Linear(hidden_dim, latent_dim)

        # Decoder LSTM
        self.latent_to_hidden = nn.Linear(latent_dim, hidden_dim)
        self.decoder_rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.outputs_to_vocab = nn.Linear(hidden_dim, vocab_size)

    def encode(self, x):
        embedded = self.embedding(x)
        _, (h_n, _) = self.encoder_rnn(embedded)
        h_n = h_n.squeeze(0)
        mu = self.hidden_to_mu(h_n)
        logvar = self.hidden_to_logvar(h_n)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z, targets=None, max_length=100, teacher_forcing_ratio=0.5):
        batch_size = z.size(0)
        hidden = self.latent_to_hidden(z).unsqueeze(0)
        cell = torch.zeros_like(hidden).to(device)
        input_token = torch.full((batch_size, 1), char2idx["<SOS>"], dtype=torch.long).to(device)
        outputs = []
        for t in range(max_length):
            embedded = self.embedding(input_token)
            output, (hidden, cell) = self.decoder_rnn(embedded, (hidden, cell))
            logits = self.outputs_to_vocab(output.squeeze(1))
            outputs.append(logits.unsqueeze(1))
            if targets is not None and random.random() < teacher_forcing_ratio and t < targets.size(1):
                input_token = targets[:, t].unsqueeze(1)
            else:
                input_token = logits.argmax(dim=1).unsqueeze(1)
        outputs = torch.cat(outputs, dim=1)
        return outputs

    def forward(self, x, targets=None, teacher_forcing_ratio=0.5):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        max_length = targets.size(1) if targets is not None else 100
        outputs = self.decode(z, targets, max_length, teacher_forcing_ratio)
        return outputs, mu, logvar

model = VAE(vocab_size, embedding_dim, hidden_dim, latent_dim).to(device)


In [25]:
def loss_fn(recon_logits, target, mu, logvar):
    # Reconstruction loss: cross-entropy ignoring padding
    recon_loss = nn.functional.cross_entropy(
        recon_logits.view(-1, recon_logits.size(-1)),
        target.view(-1),
        ignore_index=char2idx["<PAD>"]
    )
    # KL divergence loss
    kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) / mu.size(0)
    return recon_loss + kl_loss

optimizer = optim.Adam(model.parameters(), lr=1e-3)
num_epochs = 1000  # Adjust epochs as needed

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        optimizer.zero_grad()
        outputs, mu, logvar = model(inputs, targets, teacher_forcing_ratio=0.5)
        loss = loss_fn(outputs, targets, mu, logvar)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}")


Epoch 1/1000 - Loss: 4.1495
Epoch 2/1000 - Loss: 3.6121
Epoch 3/1000 - Loss: 3.4016
Epoch 4/1000 - Loss: 3.3183
Epoch 5/1000 - Loss: 3.2661
Epoch 6/1000 - Loss: 3.2028
Epoch 7/1000 - Loss: 3.1046
Epoch 8/1000 - Loss: 3.0501
Epoch 9/1000 - Loss: 3.0287
Epoch 10/1000 - Loss: 2.9439
Epoch 11/1000 - Loss: 2.9087
Epoch 12/1000 - Loss: 2.8265
Epoch 13/1000 - Loss: 2.7771
Epoch 14/1000 - Loss: 2.7773
Epoch 15/1000 - Loss: 2.7415
Epoch 16/1000 - Loss: 2.6018
Epoch 17/1000 - Loss: 2.5933
Epoch 18/1000 - Loss: 2.5844
Epoch 19/1000 - Loss: 2.4463
Epoch 20/1000 - Loss: 2.4581
Epoch 21/1000 - Loss: 2.4388
Epoch 22/1000 - Loss: 2.3522
Epoch 23/1000 - Loss: 2.3040
Epoch 24/1000 - Loss: 2.2114
Epoch 25/1000 - Loss: 2.1653
Epoch 26/1000 - Loss: 2.1230
Epoch 27/1000 - Loss: 2.1812
Epoch 28/1000 - Loss: 2.0778
Epoch 29/1000 - Loss: 2.0028
Epoch 30/1000 - Loss: 2.0962
Epoch 31/1000 - Loss: 2.1389
Epoch 32/1000 - Loss: 2.0977
Epoch 33/1000 - Loss: 2.0624
Epoch 34/1000 - Loss: 1.9116
Epoch 35/1000 - Loss: 1

KeyboardInterrupt: 

In [26]:
def generate_samples(model, num_samples=5, max_length=100):
    model.eval()
    samples = []
    with torch.no_grad():
        z = torch.randn(num_samples, latent_dim).to(device)
        outputs = model.decode(z, targets=None, max_length=max_length, teacher_forcing_ratio=0.0)
        tokens = outputs.argmax(dim=-1)
        for i in range(num_samples):
            token_list = tokens[i].cpu().numpy().tolist()
            generated = []
            for token_idx in token_list:
                token = idx2char[token_idx]
                if token == "<EOS>":
                    break
                if token in ["<SOS>", "<PAD>"]:
                    continue
                generated.append(token)
            samples.append("".join(generated))
    return samples

# Display generated samples
print("\nGenerated Samples:")
for sample in generate_samples(model, num_samples=5, max_length=100):
    print(sample)



Generated Samples:
Macha Goran|suramon@plps.org|Pompton Lakes
Macha Goran|suramon@plps.org|Pompton Lakes
Macha Goran|suramon@plps.org|Pompton Lakes
Macha Goran|suramon@plps.org|Pompton Lakes
Macha Goran|suramon@plps.org|Pompton Lakes
