In [82]:
import numpy as np
import random
from pprint import pprint
from faker import Faker

In [83]:
class SimpleTransformer:
    def __init__(self, vocab, embed_dim=32):
        self.vocab = vocab
        self.vocab_size = len(vocab)
        self.token_to_idx = {token: idx for idx, token in enumerate(vocab)}
        self.idx_to_token = {idx: token for token, idx in self.token_to_idx.items()}
        self.embed_dim = embed_dim

        self.embeddings = np.random.randn(self.vocab_size, embed_dim)
        self.W_q = np.random.randn(embed_dim, embed_dim)
        self.W_k = np.random.randn(embed_dim, embed_dim)
        self.W_v = np.random.randn(embed_dim, embed_dim)
        self.W_out = np.random.randn(embed_dim, self.vocab_size)

    def softmax(self, x):
        e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return e_x / e_x.sum(axis=-1, keepdims=True)

    def encode(self, tokens):
        return [self.token_to_idx.get(token, 0) for token in tokens]

    def decode(self, indices):
        return [self.idx_to_token.get(idx, "<unk>") for idx in indices]

    def self_attention(self, x):
        Q = x @ self.W_q
        K = x @ self.W_k
        V = x @ self.W_v
        scores = Q @ K.transpose((0, 2, 1)) / np.sqrt(self.embed_dim)
        weights = self.softmax(scores)
        return weights @ V

    def forward(self, prompt_tokens, output_length=4):
        idxs = self.encode(prompt_tokens)
        x = np.array([self.embeddings[idx] for idx in idxs])[np.newaxis, :, :]
        attn_output = self.self_attention(x)
        pooled = attn_output.mean(axis=1)
        generated = []
        for _ in range(output_length):
            logits = pooled @ self.W_out
            idx = np.argmax(logits, axis=-1)[0]
            generated.append(idx)
            pooled += self.embeddings[idx]
        return self.decode(generated)

In [84]:
vocab = [
    "bank", "school", "sport", "hospital", "company",
    "account_number", "name", "balance", "currency",
    "student_id", "email", "grade",
    "member_id", "sport", "membership_status",
    "patient_id", "diagnosis", "doctor_name",
    "employee_id", "position", "salary"
]

In [85]:
fake = Faker()
FAKE_VALUE_FUNCTIONS = {
    "name": lambda: fake.name(),
    "email": lambda: fake.email(),
    "account_number": lambda: fake.iban(),
    "balance": lambda: round(fake.pyfloat(left_digits=5, right_digits=2), 2),
    "currency": lambda: fake.currency_code(),
    "student_id": lambda: fake.uuid4(),
    "grade": lambda: random.choice(["A", "B", "C", "D", "F"]),
    "sport": lambda: random.choice(["football", "tennis", "basketball"]),
    "membership_status": lambda: random.choice(["active", "inactive"]),
    "patient_id": lambda: fake.uuid4(),
    "diagnosis": lambda: random.choice(["flu", "allergy", "injury"]),
    "doctor_name": lambda: fake.name(),
    "employee_id": lambda: fake.uuid4(),
    "position": lambda: random.choice(["manager", "developer", "analyst"]),
    "salary": lambda: round(fake.pyfloat(left_digits=5, right_digits=2), 2)
}

In [86]:
def generate_fake_data(columns, n=5):
    rows = []
    for _ in range(n):
        row = [FAKE_VALUE_FUNCTIONS.get(col, lambda: fake.word())() for col in columns]
        rows.append(row)
    return {"columns": columns, "rows": rows}

In [87]:
prompt = "generate a school dataset"
prompt_tokens = prompt.lower().split()

In [88]:
transformer = SimpleTransformer(vocab)
predicted_columns = transformer.forward(prompt_tokens, output_length=4)

In [89]:
columns = list(dict.fromkeys([
    token for token in predicted_columns
    if token in vocab and token not in prompt_tokens
]))

if not columns:
    print("⚠️ Transformer could not determine valid schema.")
else:
    dataset = generate_fake_data(columns, n=5)
    print("Generated Columns:", columns)
    pprint(dataset)

Generated Columns: ['patient_id', 'member_id']
{'columns': ['patient_id', 'member_id'],
 'rows': [['9a61aca4-436d-4542-8507-ccbafdeaa6e9', 'newspaper'],
          ['8cbcd555-3c40-4b40-8586-c05efbb6ea93', 'exactly'],
          ['fd621e50-3545-42d5-9ffb-e2a29bc7f3be', 'suffer'],
          ['381b67fa-5f0f-4fe6-81db-c6c8330bfa61', 'front'],
          ['15aa8b52-eb4a-46d3-a736-e7cda98eac99', 'have']]}
