# Privacy-Preserving Techniques for LLM Code Completion

This notebook measures and plots the privacy-utility trade-off for code completion using the HumanEval dataset.

## 1. Setup and Imports

In [None]:
import sys
sys.path.insert(0, '.')

from src.data import HumanEvalDataLoader
from src.obfuscation import LowObfuscator, HighObfuscator
from src.models import CodeCompletionModel
from src.evaluation import UtilityEvaluator, PrivacyEvaluator
from src.visualization import PrivacyUtilityPlotter

import numpy as np
import matplotlib.pyplot as plt

## 2. Load HumanEval Dataset

Load the first 20 examples from the openai/openai_humaneval test split.

In [None]:
# Build a loader limited to 20 examples and pull the dataset in.
data_loader = HumanEvalDataLoader(num_examples=20)
data_loader.load_dataset()

# Prompts are the model inputs; canonical solutions are the references
# that the utility cell later compares completions against.
prompts, canonical_solutions = (
    data_loader.get_prompts(),
    data_loader.get_canonical_solutions(),
)

# Sanity output: example count plus a truncated look at the first prompt.
first_prompt_preview = prompts[0][:300]
print(f"Loaded {len(prompts)} examples")
print(f"\nExample prompt (first 300 chars):\n{first_prompt_preview}...")

## 3. Define Obfuscation Functions

Two levels of obfuscation:
- **Low**: Rename variables to generic names (var1, var2, etc.)
- **High**: Replace all identifiers with placeholders and strip comments

In [None]:
# One obfuscator per level; each exposes obfuscate(prompt).
low_obfuscator = LowObfuscator()
high_obfuscator = HighObfuscator()

# Obfuscate every prompt at both levels, keeping the original order so
# index i refers to the same problem in all three lists.
low_obfuscated_prompts = list(map(low_obfuscator.obfuscate, prompts))
high_obfuscated_prompts = list(map(high_obfuscator.obfuscate, prompts))

# Show the first prompt at each level, truncated to 200 characters.
for heading, sample in (
    ("Original prompt (first 200 chars):", prompts[0]),
    ("\nLow obfuscated (first 200 chars):", low_obfuscated_prompts[0]),
    ("\nHigh obfuscated (first 200 chars):", high_obfuscated_prompts[0]),
):
    print(heading)
    print(sample[:200])

## 4. Initialize Code Completion Model

Using Salesforce/codet5-small for code completion.

In [None]:
# Instantiate the completion model on CPU; load_model() is called before
# any generation cell uses `model`.
model = CodeCompletionModel(
    model_name="Salesforce/codet5-small",
    device="cpu",
)
model.load_model()
print("Model loaded successfully")

## 5. Generate Completions

Generate 60 completions total: 20 original, 20 low-obfuscated, 20 high-obfuscated.

In [None]:
# Run the model once per obfuscation level. The three result lists are
# consumed by the utility cell, so their names are part of the notebook state.
print("Generating completions for original prompts...")
original_completions = model.generate_completions_batch(prompts)

print("Generating completions for low-obfuscated prompts...")
low_completions = model.generate_completions_batch(low_obfuscated_prompts)

print("Generating completions for high-obfuscated prompts...")
high_completions = model.generate_completions_batch(high_obfuscated_prompts)

# Total across all three levels.
total_completions = sum(
    len(batch)
    for batch in (original_completions, low_completions, high_completions)
)
print(f"\nGenerated {total_completions} total completions")

## 6. Compute Utility Scores

Using ROUGE-L F1 score to compare completions to canonical solutions.

In [None]:
utility_evaluator = UtilityEvaluator()


def _utility_scores(completions):
    """Score each completion against the canonical solution at the same index."""
    return [
        utility_evaluator.get_utility_score(candidate, reference)
        for candidate, reference in zip(completions, canonical_solutions)
    ]


# Per-example utility for each obfuscation level.
original_utility = _utility_scores(original_completions)
low_utility = _utility_scores(low_completions)
high_utility = _utility_scores(high_completions)

# Mean / standard deviation per level.
print(f"Original - Mean Utility: {np.mean(original_utility):.4f}, Std: {np.std(original_utility):.4f}")
print(f"Low Obf  - Mean Utility: {np.mean(low_utility):.4f}, Std: {np.std(low_utility):.4f}")
print(f"High Obf - Mean Utility: {np.mean(high_utility):.4f}, Std: {np.std(high_utility):.4f}")

## 7. Compute Privacy Scores

Privacy is measured as the normalized Levenshtein distance between each obfuscated prompt and its original prompt; unobfuscated prompts are assigned a privacy score of 0.

In [None]:
privacy_evaluator = PrivacyEvaluator()

# Unobfuscated prompts are the baseline: one 0.0 score per prompt.
original_privacy = [0.0 for _ in range(len(prompts))]

# Obfuscated levels are scored against the untouched prompts.
low_privacy = privacy_evaluator.compute_privacy_batch(
    prompts, low_obfuscated_prompts
)
high_privacy = privacy_evaluator.compute_privacy_batch(
    prompts, high_obfuscated_prompts
)

# Mean / standard deviation per level.
print(f"Original - Mean Privacy: {np.mean(original_privacy):.4f}, Std: {np.std(original_privacy):.4f}")
print(f"Low Obf  - Mean Privacy: {np.mean(low_privacy):.4f}, Std: {np.std(low_privacy):.4f}")
print(f"High Obf - Mean Privacy: {np.mean(high_privacy):.4f}, Std: {np.std(high_privacy):.4f}")

## 8. Create Scatter Plot

Visualize the privacy-utility trade-off.

In [None]:
# Flatten the three levels into parallel sequences for the scatter plot.
# Unpacking (rather than `+`) concatenates correctly whether a batch score
# is a plain list or some other iterable such as a NumPy array, where `+`
# would add elementwise instead of concatenating.
all_privacy = [*original_privacy, *low_privacy, *high_privacy]
all_utility = [*original_utility, *low_utility, *high_utility]

# Derive label counts from the data instead of hard-coding 20, so the labels
# stay aligned if the loader returns a different number of examples.
all_labels = (
    ['None'] * len(original_privacy)
    + ['Low'] * len(low_privacy)
    + ['High'] * len(high_privacy)
)

# Every point needs exactly one x, one y and one label.
assert len(all_privacy) == len(all_utility) == len(all_labels), (
    "privacy, utility and label sequences must have the same length"
)

plotter = PrivacyUtilityPlotter(figure_size=(10, 8))
fig = plotter.create_scatter_plot(
    all_privacy,
    all_utility,
    labels=all_labels,
    title="Privacy-Utility Trade-off by Obfuscation Level",
    xlabel="Privacy Score (Normalized Levenshtein Distance)",
    ylabel="Utility Score (ROUGE-L F1)"
)
plt.show()

## 9. Analysis

Brief analysis of the observed privacy-utility trade-off.

In [None]:
# Summary table: mean privacy and mean utility for each obfuscation level.
print("="*60)
print("PRIVACY-UTILITY TRADE-OFF ANALYSIS")
print("="*60)
print("\nSummary Statistics:")
print("-"*40)
print(f"{'Level':<10} {'Privacy Mean':>15} {'Utility Mean':>15}")
print("-"*40)
print(f"{'None':<10} {np.mean(original_privacy):>15.4f} {np.mean(original_utility):>15.4f}")
print(f"{'Low':<10} {np.mean(low_privacy):>15.4f} {np.mean(low_utility):>15.4f}")
print(f"{'High':<10} {np.mean(high_privacy):>15.4f} {np.mean(high_utility):>15.4f}")
print("-"*40)

# Fixed narrative text; it is not derived from the numbers above.
print("\nObservations:")
print("1. No obfuscation (None) has privacy=0 since prompts are unchanged.")
print("2. Low obfuscation provides moderate privacy with variable renaming.")
print("3. High obfuscation maximizes privacy but may reduce utility.")
print("4. The trade-off shows that increased privacy often comes at the cost")
print("   of reduced utility, as the model receives less semantic information.")

# Deltas between the high-obfuscation level and the unobfuscated baseline.
privacy_increase = np.mean(high_privacy) - np.mean(original_privacy)
utility_change = np.mean(high_utility) - np.mean(original_utility)
print("\nHigh vs None obfuscation:")
# Use the `+` format flag for both deltas so the sign always comes from the
# value itself; a literal "+" prefix would render a negative delta as "+-0.1234".
print(f"  Privacy increase: {privacy_increase:+.4f}")
print(f"  Utility change: {utility_change:+.4f}")

## 10. Save Results

In [None]:
import json


def _as_float_list(scores):
    """Return the scores as a list of built-in floats so json can serialise them."""
    return [float(score) for score in scores]


# Per-example scores keyed by obfuscation level. Scores are coerced to plain
# floats because json.dump raises TypeError on NumPy arrays and on NumPy
# scalars that are not `float` subclasses; for plain float lists this is a no-op.
results = {
    'none': {
        'privacy_scores': _as_float_list(original_privacy),
        'utility_scores': _as_float_list(original_utility),
    },
    'low': {
        'privacy_scores': _as_float_list(low_privacy),
        'utility_scores': _as_float_list(low_utility),
    },
    'high': {
        'privacy_scores': _as_float_list(high_privacy),
        'utility_scores': _as_float_list(high_utility),
    },
}

# Write the raw scores, then the figure created by the scatter-plot cell.
with open('results.json', 'w') as f:
    json.dump(results, f, indent=2)

plotter.save_figure(fig, 'privacy_utility_scatter.png')
print("Results saved to results.json and privacy_utility_scatter.png")