In [22]:
import random
import pandas as pd

# Generate segment data
num_samples = 3000
segments_df = pd.DataFrame({
    'Age': [random.randint(18, 70) for _ in range(num_samples)],
    'Income': [random.choice(['low', 'medium', 'high']) for _ in range(num_samples)],
    'Gender': [random.choice(['male', 'female']) for _ in range(num_samples)],
    'Behavior': [random.choice(['tech-savvy', 'traditional', 'budget-conscious'])
                for _ in range(num_samples)]
})

# Generate corresponding messages
def generate_message(row):
    if row['Behavior'] == 'tech-savvy':
        return f"Discover the latest technology tailored for {row['Gender']} {row['Age']} years old!"
    elif row['Behavior'] == 'traditional':
        return f"Check out our classic products for {row['Gender']} aged {row['Age']}."
    else:
        return f"Budget-friendly options for {row['Gender']} at {row['Age']} years!"

# Create final dataset
segments_df['Message'] = segments_df.apply(generate_message, axis=1)
final_dataset = segments_df[['Age', 'Income', 'Gender', 'Behavior', 'Message']]

In [23]:
# Extract segments and messages
segment_columns = ['Age', 'Income', 'Gender', 'Behavior']
message_column = 'Message'

# Create segment-message pairs
segment_message_pairs = []
for index, row in final_dataset.iterrows():
    segment = tuple(row[segment_columns])
    message = row[message_column]
    segment_message_pairs.append((segment, message))

In [24]:
# Define vocabulary (alphabet, numbers, spaces, and punctuation)
# Define character set and vocabulary
chars = sorted(set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ,.!-()\''))
vocab_size = len(chars) + 1  # Add 1 for padding token
stoi = {ch:i+1 for i,ch in enumerate(chars)}  # Shift all indices by 1
itos = {i+1:ch for i,ch in enumerate(chars)}  # Shift all indices by 1
#itos[0] = ''  # Padding token

# Extract segments and messages separately from the pairs
segments = [str(pair[0]) for pair in segment_message_pairs]  # Get segments
messages = [pair[1] for pair in segment_message_pairs]  # Get messages

# Calculate maximum lengths
seg_max = max(len(segment) for segment in segments)
message_max = max(len(message) for message in messages)

def encode(sentence, length=seg_max):
    indicies = []
    for c in sentence:
        indicies.append(stoi[c])

    # Add padding

    if length is not None and len(indicies) < length:
        indicies += [0] * (length - len(indicies))

    return indicies

def decode(sequence):
    sentence = []
    for i in sequence:
        if i != 0:  # Skip padding tokens
            sentence.append(itos[i])
    return ''.join(sentence)



encoded_pairs = []
for segment, message in segment_message_pairs:
    # Convert segment tuple to string
    segment_str = str(segment)  # Convert tuple to string

    # Encode both segment and message
    segment_encoded = encode(segment_str, seg_max)
    message_encoded = encode(message, message_max)

    encoded_pairs.append((segment_encoded, message_encoded))

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math


class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config["n_embed"] % config["n_heads"] == 0

        self.c_attn = nn.Linear(config["n_embed"], 3 * config["n_embed"])
        self.c_proj = nn.Linear(config["n_embed"], config["n_embed"])
        self.attn_dropout = nn.Dropout(config["dropout"])
        self.resid_dropout = nn.Dropout(config["dropout"])

        # Assign config values to instance variables
        self.n_heads = config["n_heads"]
        self.n_embed = config["n_embed"]
        self.seg_length = config["seg_length"]
        self.message_length = config["message_length"]
        self.block_size = config["block_size"]

        # Fixed causal mask for training
        total_length = self.seg_length + self.message_length
        mask = torch.zeros(total_length, total_length)

        # Full 1s for the segment portion
        mask[:self.seg_length, :self.seg_length] = 1

        # Lower triangular 1s for the message portion
        for i in range(self.seg_length, total_length):
            mask[i, :i + 1] = 1

        # Reshape to 4D for attention (1, 1, total_length, total_length)
        mask = mask.view(1, 1, total_length, total_length)

        # Register the mask as a buffer
        self.register_buffer("fixed_causal_mask", mask)

    def forward(self, x):
        B, T, _ = x.size()

        # Linear projections for query, key, value
        q, k, v = self.c_attn(x).split(self.n_embed, dim=2)

        # Reshape projections for multi-head attention
        k = k.view(B, T, self.n_heads, self.n_embed // self.n_heads).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_heads, self.n_embed // self.n_heads).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_heads, self.n_embed // self.n_heads).transpose(1, 2)  # (B, nh, T, hs)

        # Compute attention scores
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))

        # Apply the fixed causal mask during training or slice for generation
        mask = self.fixed_causal_mask[:, :, :T, :T]  # Dynamically slice the mask for the current sequence length
        att = att.masked_fill(mask == 0, float('-inf'))

        # Compute attention probabilities and apply dropout
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)

        # Weighted sum of values
        y = att @ v

        # Reshape and project back to original embedding size
        y = y.transpose(1, 2).contiguous().view(B, T, self.n_embed)
        y = self.resid_dropout(self.c_proj(y))

        return y


In [26]:
import torch.nn.functional as F

class MLP(nn.Module):
  def __init__(self,config):
    super().__init__()
    self.c_fc = nn.Linear(config["n_embed"], 4*config["n_embed"])
    self.c_proj=nn.Linear(4*config["n_embed"], config["n_embed"])
    self.drop= nn.Dropout(config["dropout"])

  def forward(self,x):
    x = self.c_fc(x)
    x = F.gelu(x)
    x = self.c_proj(x)
    x = self.drop(x)
    return x

In [27]:
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config)
        self.ln_1 = nn.LayerNorm(config["n_embed"])
        self.ln_2 = nn.LayerNorm(config["n_embed"])
        self.ff = MLP(config)  # Using the updated MLP class name

        # Optional: Initialize layer norm parameters
        self._init_weights()

    def _init_weights(self):
        # Initialize layer norm weights and biases
        self.ln_1.bias.data.zero_()
        self.ln_1.weight.data.fill_(1.0)
        self.ln_2.bias.data.zero_()
        self.ln_2.weight.data.fill_(1.0)

    def forward(self, x):
        # Residual connections maintain device of input tensor
        x = x + self.attn(self.ln_1(x))  # First residual connection
        x = x + self.ff(self.ln_2(x))    # Second residual connection
        return x

In [28]:
class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config["vocab_size"] is not None
        assert config["block_size"] is not None
        self.config = config

        self.transformer = nn.ModuleDict({
            'wte': nn.Embedding(config["vocab_size"], config["n_embed"]),
            'wpe': nn.Embedding(config["block_size"], config["n_embed"]),
            'drop': nn.Dropout(config["dropout"]),
            'h': nn.ModuleList([Block(config) for _ in range(config["n_layers"])]),
            'ln_f': nn.LayerNorm(config["n_embed"])
        })

        self.lm_head = nn.Linear(config["n_embed"], config["vocab_size"], bias=False)

        # Tie weights between embedding and output layer
        self.transformer.wte.weight = self.lm_head.weight

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        # Initialize embedding layers
        nn.init.normal_(self.transformer.wte.weight, std=0.02)
        nn.init.normal_(self.transformer.wpe.weight, std=0.02)

        # Initialize layer norm
        nn.init.normal_(self.transformer.ln_f.weight, std=0.02)
        nn.init.zeros_(self.transformer.ln_f.bias)

    def forward(self, segment, message=None):
        device = segment.device

        # Combine segment and message
        if message is not None:

            idx = torch.cat([segment, message], dim=1)

        else:

            idx = segment


        B, T = idx.size()
        # Block size check
        assert T <= self.config["block_size"], f"Cannot forward sequence of length {T}, block size is {self.config['block_size']}"

        # Create position tensor directly on correct device
        pos = torch.arange(0, T, device=device).unsqueeze(0)

        # Embeddings
        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(tok_emb + pos_emb)

        # Transformer blocks
        for block in self.transformer.h:
            x = block(x)

        # Final layer norm and projection
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        # Loss computation
        loss = None
        if message is not None:
            # Get message logits and compute loss
            message_start = segment.size(1)
            logits_message = logits[:, message_start:, :]

            loss = F.cross_entropy(
                logits_message.reshape(-1, logits.size(-1)),
                message.reshape(-1),
                ignore_index=-1
            )

        return logits, loss

    def generate(self, segment, max_new_tokens, temperature=1.0, top_k=None):
        self.eval()  # Set to evaluation mode

        with torch.no_grad():
            seg_len = segment.size(1)  # Store original segment length
            for _ in range(max_new_tokens):
                if segment.size(1) > self.config["block_size"]:
                    idx_cond = segment[:, -self.config["block_size"]:]
                else:
                    idx_cond = segment

                # Get predictions
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :] / temperature  # Take the logits for the last token

                # Apply top-k sampling if specified
                if top_k is not None:
                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                    logits[logits < v[:, [-1]]] = float('-inf')

                # Sample next token
                probs = F.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

                # Concatenate to segment
                segment = torch.cat((segment, next_token), dim=1)

                # Optional: Stop if end token is generated
                if next_token.item() == self.config.get("eos_token", -1):
                    break

            # Return generated sequence excluding input segment
            return segment[:, seg_len:]


In [29]:
import torch
import torch.nn as nn
import torch.optim as optim

chars = sorted(set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ,.!-()\''))
vocab_size = len(chars) + 1

config = {
    "vocab_size":vocab_size,
    "n_embed": 512,
    "n_heads": 8,
    "seg_length": 44,
    "message_length": 70,
    "dropout": 0.1,
    "block_size": 44 + 70,
    "n_layers": 5,
    "batch_size": 128,
    "n_epochs": 10
}


segments = []
messages = []

# Extract and convert each segment and message to a tensor
for segment_encoded, message_encoded in encoded_pairs:
    segments.append(torch.tensor(segment_encoded))  # Convert to tensor
    messages.append(torch.tensor(message_encoded))  # Convert to tensor

# Stack the padded segments and messages into tensors (batch first)
segments_tensor = torch.stack(segments)  # Shape: (batch_size, seg_length)
messages_tensor = torch.stack(messages)  # Shape: (batch_size, msg_length)

batch_size = 32  # Number of rows per batch
total_samples = segments_tensor.size(0)  # Number of rows in the dataset


def train_model(model, segments_tensor, messages_tensor, config):
    device = next(model.parameters()).device
    # Training setup
    optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.1)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config["n_epochs"])

    # Calculate total batches
    total_samples = len(segments_tensor)
    batch_size = config["batch_size"]
    n_batches = (total_samples + batch_size - 1) // batch_size

    model.train()
    for epoch in range(config["n_epochs"]):
        total_loss = 0

        for i in range(0, total_samples, batch_size):
            # Get batch
            batch_segments = segments_tensor[i:i+batch_size]
            batch_messages = messages_tensor[i:i+batch_size]

            # Move to GPU
            segment = batch_segments.to(device, non_blocking=True)
            message = batch_messages.to(device, non_blocking=True)

            # Forward pass
            optimizer.zero_grad(set_to_none=True)  # More efficient than zero_grad()
            logits, loss = model(segment, message)

            # Backward pass
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()

            # Accumulate loss
            total_loss += loss.item()

            # Print progress
            batch_idx = i // batch_size
            if batch_idx % 10 == 0:
                avg_loss = total_loss / (batch_idx + 1)
                print(f"Epoch {epoch+1}/{config['n_epochs']}, "
                      f"Batch {batch_idx}/{n_batches}, "
                      f"Loss: {loss.item():.4f}, "
                      f"Avg Loss: {avg_loss:.4f}")

        # Step the learning rate scheduler
        scheduler.step()

        # Calculate epoch statistics
        epoch_loss = total_loss / n_batches
        print(f"Epoch {epoch+1} completed. Average Loss: {epoch_loss:.4f}")

In [30]:


model = GPT(config).to("cuda")

# Train the model
train_model(model, segments_tensor, messages_tensor, config)

Epoch 1/10, Batch 0/24, Loss: 4.2607, Avg Loss: 4.2607
Epoch 1/10, Batch 10/24, Loss: 4.1493, Avg Loss: 4.1957
Epoch 1/10, Batch 20/24, Loss: 4.0191, Avg Loss: 4.1396
Epoch 1 completed. Average Loss: 4.1211
Epoch 2/10, Batch 0/24, Loss: 3.9674, Avg Loss: 3.9674
Epoch 2/10, Batch 10/24, Loss: 3.8383, Avg Loss: 3.9037
Epoch 2/10, Batch 20/24, Loss: 3.6873, Avg Loss: 3.8337
Epoch 2 completed. Average Loss: 3.8110
Epoch 3/10, Batch 0/24, Loss: 3.6185, Avg Loss: 3.6185
Epoch 3/10, Batch 10/24, Loss: 3.4404, Avg Loss: 3.5309
Epoch 3/10, Batch 20/24, Loss: 3.2404, Avg Loss: 3.4354
Epoch 3 completed. Average Loss: 3.4054
Epoch 4/10, Batch 0/24, Loss: 3.1558, Avg Loss: 3.1558
Epoch 4/10, Batch 10/24, Loss: 2.9646, Avg Loss: 3.0598
Epoch 4/10, Batch 20/24, Loss: 2.7731, Avg Loss: 2.9631
Epoch 4 completed. Average Loss: 2.9341
Epoch 5/10, Batch 0/24, Loss: 2.6978, Avg Loss: 2.6978
Epoch 5/10, Batch 10/24, Loss: 2.5450, Avg Loss: 2.6200
Epoch 5/10, Batch 20/24, Loss: 2.3967, Avg Loss: 2.5438
Epoch

In [31]:
def generate_marketing_messagez(model, segment_text, max_length=50, temperature=0.7, top_k=50):
    """
    Generate a marketing message using the trained GPT model.

    Args:
        model: Trained GPT model
        segment_text: String containing the customer segment description
        max_length: Maximum length of generated message
        temperature: Controls randomness (0.7 = balanced, <0.7 = focused, >0.7 = creative)
        top_k: Number of top tokens to consider for sampling
    """
    # Tokenize the segment
    #model.eval()
    segment_tokens = encode(segment_text, config['seg_length'])
    segment_tensor = torch.tensor(segment_tokens).unsqueeze(0).to('cuda')

    # Generate message tokens
    generated_tokens = model.generate(
        segment=segment_tensor,
        max_new_tokens=max_length,
        temperature=temperature,
        top_k=top_k
    )

    # Decode the generated tolist())

    return message

segment = "Young urban man"
message = generate_marketing_messagez(
    model=model,
    segment_text=segment,
    max_length=64,
    temperature=0.7,
    top_k=50
    )

print(f"Segment: {segment}")
print(f"Generated Message: {message}")

Segment: Young urban man
Generated Message: Discover the latest technology tailored for male 51 years old!
