# Custom Token Definition Guide

This notebook demonstrates how to create custom token definitions with minimal code using the OpenToken notebook helpers.

## Setup

First, define the demo secrets used throughout this notebook and import the necessary modules:

In [None]:
# Imports come first (PEP 8); the notebook-wide constants follow them.
from opentoken_pyspark.notebook_helpers import (
    TokenBuilder,
    CustomTokenDefinition,
    create_token_generator,
    quick_token,
    list_attributes,
    expression_help,
)
from opentoken.attributes.general.record_id_attribute import RecordIdAttribute
from opentoken.attributes.person.first_name_attribute import FirstNameAttribute
from opentoken.attributes.person.last_name_attribute import LastNameAttribute
from opentoken.attributes.person.birth_date_attribute import BirthDateAttribute
from opentoken.attributes.person.sex_attribute import SexAttribute
from opentoken.attributes.person.social_security_number_attribute import SocialSecurityNumberAttribute
from opentoken.attributes.person.postal_code_attribute import PostalCodeAttribute

# Constants for token generation, shared by every example below.
# These are throwaway demo values: never put real secrets in a notebook.
HASHING_SECRET = "my-hashing-secret-key"
# This literal is exactly 32 characters long; presumably that is the key length
# the encryption step expects (TODO confirm against the library docs).
ENCRYPTION_KEY = "my-encryption-key-32-characters!"

## View Available Attributes

Check what attributes are available:

In [None]:
# The helper's result is used as a mapping whose keys are the attribute names.
# Iterating a mapping yields its keys, so no explicit .keys() call is needed.
attrs = list_attributes()
print("Available attributes:")
for name in attrs:
    print(f"  - {name}")

## Expression Syntax Help

Get help on expression syntax:

In [None]:
# Fetch the expression-syntax reference from the helper, then display it.
syntax_help = expression_help()
print(syntax_help)

## Method 1: Quick Token (Simplest)

Create a custom T6 token with a single `quick_token()` call, using the rule:
`U(last-name)|U(first-name)|birth-date|postal-code-3|U(sex)`

In [None]:
# The T6 rule as (attribute name, expression) pairs, listed in token order.
t6_rule = [
    ("last_name", "T|U"),
    ("first_name", "T|U"),
    ("birth_date", "T|D"),
    ("postal_code", "T|S(0,3)"),
    ("sex", "T|U"),
]

# A single helper call turns the rule and the two secrets into a generator.
generator = quick_token("T6", t6_rule, HASHING_SECRET, ENCRYPTION_KEY)

# Sample person record keyed by attribute class; later cells reuse it.
person_attrs = {
    RecordIdAttribute: "1",
    FirstNameAttribute: "John",
    LastNameAttribute: "Doe",
    BirthDateAttribute: "1990-01-15",
    SexAttribute: "Male",
    PostalCodeAttribute: "98101",
}

# Generate every defined token for the sample record and show the T6 entry.
result = generator.get_all_tokens(person_attrs)
t6_value = result.tokens.get("T6")
print(f"T6 Token: {t6_value}")

## Method 2: Token Builder (More Flexible)

Use the fluent TokenBuilder API for more control:

In [None]:
# Assemble the T6 rule one attribute at a time with the fluent builder.
t6_token = (
    TokenBuilder("T6")
    .add("last_name", "T|U")
    .add("first_name", "T|U")
    .add("birth_date", "T|D")
    .add("postal_code", "T|S(0,3)")
    .add("sex", "T|U")
    .build()
)

# A definition holding just that one token.
custom_definition = CustomTokenDefinition().add_token(t6_token)

# Generator configured with the two secrets and the custom definition.
generator = create_token_generator(HASHING_SECRET, ENCRYPTION_KEY, custom_definition)

# Run the generator on the sample record defined in the quick-token cell.
result = generator.get_all_tokens(person_attrs)
t6_value = result.tokens.get("T6")
print(f"T6 Token: {t6_value}")

## Method 3: Multiple Custom Tokens

Define multiple tokens in one definition:

In [None]:
# T6 rule: U(last-name)|U(first-name)|birth-date|postal-code-3|U(sex)
t6_token = (
    TokenBuilder("T6")
    .add("last_name", "T|U")
    .add("first_name", "T|U")
    .add("birth_date", "T|D")
    .add("postal_code", "T|S(0,3)")
    .add("sex", "T|U")
    .build()
)

# T7 rule: U(last-name-3)|U(first-name-3)|birth-date
t7_token = (
    TokenBuilder("T7")
    .add("last_name", "T|S(0,3)|U")
    .add("first_name", "T|S(0,3)|U")
    .add("birth_date", "T|D")
    .build()
)

# One definition carrying both tokens, added in T6-then-T7 order.
custom_definition = CustomTokenDefinition().add_token(t6_token).add_token(t7_token)

# Generator built from the shared secrets and the two-token definition.
generator = create_token_generator(HASHING_SECRET, ENCRYPTION_KEY, custom_definition)

# A single pass over the sample record yields both tokens.
result = generator.get_all_tokens(person_attrs)
generated = result.tokens
print(f"T6 Token: {generated.get('T6')}")
print(f"T7 Token: {generated.get('T7')}")

## Using Custom Tokens with PySpark DataFrames

The key is to pass your `CustomTokenDefinition` to `OpenTokenProcessor` through its `token_definition` argument:

In [None]:
from pyspark.sql import SparkSession
from opentoken_pyspark import OpenTokenProcessor
from opentoken_pyspark.notebook_helpers import TokenBuilder, CustomTokenDefinition

# Start (or reuse) the Spark session used by the DataFrame examples.
spark = SparkSession.builder.appName("CustomTokens").getOrCreate()

# The sample rows carry ALL standard columns, including ones the custom token
# never reads: OpenTokenProcessor currently validates every standard column
# regardless of which custom tokens are defined.
columns = [
    "RecordId",
    "FirstName",
    "LastName",
    "BirthDate",
    "Sex",
    "PostalCode",
    "SocialSecurityNumber",
]
data = [
    ("1", "John", "Doe", "1990-01-15", "Male", "98101", "123-45-6789"),
    ("2", "Jane", "Smith", "1985-06-20", "Female", "94105", "987-65-4321"),
]
df = spark.createDataFrame(data, columns)

# Step 1: describe the custom T6 token.
t6_token = (
    TokenBuilder("T6")
    .add("last_name", "T|U")
    .add("first_name", "T|U")
    .add("birth_date", "T|D")
    .add("postal_code", "T|S(0,3)")
    .add("sex", "T|U")
    .build()
)

# Step 2: wrap it in a custom token definition.
custom_definition = CustomTokenDefinition().add_token(t6_token)

# Step 3: hand the definition to the processor together with both secrets.
processor = OpenTokenProcessor(
    token_definition=custom_definition,
    hashing_secret=HASHING_SECRET,
    encryption_key=ENCRYPTION_KEY,
)

# Step 4: tokenize the DataFrame.
tokens_df = processor.process_dataframe(df)

# Inspect the output - it contains T6 tokens instead of T1-T5.
print("Custom T6 tokens generated:")
print("DataFrame structure:")
tokens_df.printSchema()
tokens_df.show(truncate=False)

### Multiple Custom Tokens with PySpark

You can also use multiple custom tokens:

In [None]:
# First custom token: T6.
t6_token = (
    TokenBuilder("T6")
    .add("last_name", "T|U")
    .add("first_name", "T|U")
    .add("birth_date", "T|D")
    .add("postal_code", "T|S(0,3)")
    .add("sex", "T|U")
    .build()
)

# Second custom token: T7.
t7_token = (
    TokenBuilder("T7")
    .add("last_name", "T|S(0,3)|U")
    .add("first_name", "T|S(0,3)|U")
    .add("birth_date", "T|D")
    .build()
)

# Definition holding both tokens; the processing-modes cell reuses it.
multi_definition = CustomTokenDefinition().add_token(t6_token).add_token(t7_token)

# Processor configured with both secrets and the two-token definition.
processor_multi = OpenTokenProcessor(
    token_definition=multi_definition,
    hashing_secret=HASHING_SECRET,
    encryption_key=ENCRYPTION_KEY,
)

# Tokenize the sample DataFrame from the previous cell (T6 and T7 together).
multi_tokens_df = processor_multi.process_dataframe(df)

print("Multiple custom tokens (T6 and T7):")
multi_tokens_df.show(truncate=False)

## Experiment with Different Rules

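Try different combinations of attributes and expressions. A worked example (a minimal token and a full token) is in the code cell just before the Cleanup section.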

## Token Processing Options

OpenTokenProcessor supports three modes for token generation:

1. **Hash + Encrypt** (Production): Most secure, tokens are hashed then encrypted
2. **Hash Only** (Experimentation): Faster, tokens are hashed but not encrypted  
3. **Plain Text** (Debugging): Raw concatenated strings like `DOE|JOHN|1990-01-15|981|MALE`

Choose the mode by leaving out arguments: omit `encryption_key` for hash-only tokens, or omit both `hashing_secret` and `encryption_key` for plain-text tokens:

In [None]:
# Reload the modules to pick up the latest changes.
# Development convenience: lets edits to the library source take effect in a
# running kernel without restarting it.
import importlib
import opentoken_pyspark.token_processor
import opentoken.tokens.token_generator
# The generator module is reloaded before the processor module; presumably the
# processor imports from the generator, so the dependency is refreshed first
# (TODO confirm).
importlib.reload(opentoken.tokens.token_generator)
importlib.reload(opentoken_pyspark.token_processor)
# Re-bind the name so the cells below use the class from the reloaded module
# rather than the one imported earlier in the notebook.
from opentoken_pyspark.token_processor import OpenTokenProcessor

In [None]:
# Option 1: Hash + Encrypt (most secure - production use): both secrets given.
processor_encrypted = OpenTokenProcessor(
    token_definition=multi_definition,
    hashing_secret=HASHING_SECRET,
    encryption_key=ENCRYPTION_KEY,
)

# Option 2: Hash only (faster for experimentation): no encryption key.
processor_hashed = OpenTokenProcessor(
    token_definition=multi_definition,
    hashing_secret=HASHING_SECRET,
)

# Option 3: Plain text (debugging only): neither secret is supplied.
processor_plain = OpenTokenProcessor(token_definition=multi_definition)

# Run the same DataFrame through the three processors, in the order above.
encrypted_df, hashed_df, plain_df = (
    mode_processor.process_dataframe(df)
    for mode_processor in (processor_encrypted, processor_hashed, processor_plain)
)

# Display each result under its own banner.
print("=== Encrypted Tokens (Hash + Encrypt) ===")
encrypted_df.show(truncate=False)

print("\n=== Hashed Tokens (Hash only) ===")
hashed_df.show(truncate=False)

print("\n=== Plain Tokens (Raw concatenated strings) ===")
print("Example: DOE|JOHN|1990-01-15|981|MALE")
plain_df.show(truncate=False)

In [None]:
# Minimal token: last name plus the first initial only.
minimal_token = (
    TokenBuilder("MINIMAL")
    .add("last_name", "T|U")
    .add("first_name", "T|S(0,1)|U")
    .build()
)

# Full token: every attribute, including the SSN (which therefore has to be
# present in the record passed to the generator).
full_token = (
    TokenBuilder("FULL")
    .add("last_name", "T|U")
    .add("first_name", "T|U")
    .add("birth_date", "T|D")
    .add("sex", "T|U")
    .add("postal_code", "T|S(0,5)")
    .add("ssn", "T")
    .build()
)

# Definition carrying both experimental tokens.
definition = CustomTokenDefinition().add_token(minimal_token).add_token(full_token)

generator = create_token_generator(HASHING_SECRET, ENCRYPTION_KEY, definition)

# Extend a copy of the sample record with an SSN; the original dict is untouched.
person_attrs_with_ssn = {
    **person_attrs,
    SocialSecurityNumberAttribute: "234-56-7890",  # Valid format
}

result = generator.get_all_tokens(person_attrs_with_ssn)
generated = result.tokens
print(f"Minimal Token: {generated.get('MINIMAL')}")
print(f"Full Token: {generated.get('FULL')}")

## Cleanup

In [None]:
# Stop the Spark session that was created for the DataFrame examples.
# Run this cell last: `spark` is the session the PySpark cells above rely on.
spark.stop()

## Summary

Three ways to create custom tokens:

1. **`quick_token()`** - Fastest, one-liner approach for simple cases
2. **`TokenBuilder`** - Fluent API for readable, flexible definitions
3. **Manual classes** - Full control (not demonstrated in this notebook; see the previous examples)

Choose the method that best fits your workflow!