<a href="https://colab.research.google.com/github/mars137/synthetic-data/blob/main/docs/notebooks/synthetic_data_uber_differential_privacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A differentially private, synthetic ride-share dataset

This blueprint uses Gretel's SDKs to create a synthetic version of your own data. The SDKs create automatic data validators to help ensure that the generated data has the same semantics as the source data. Additionally, the SDKs perform automatic header clustering to help maintain statistical relationships between columns.

In [1]:
%%capture
!pip install gretel-client 

In [2]:
# Load your Gretel API key. You can acquire this from the Gretel Console 
# @ https://console.gretel.cloud

import pandas as pd
from gretel_client import configure_session

# Do not truncate long cell values when DataFrames are displayed.
pd.set_option("max_colwidth", None)

# "prompt" requests the API key interactively instead of storing it in the
# notebook; the resulting configuration is cached and validated.
configure_session(
    api_key="prompt",
    cache="yes",
    validate=True,
)

Gretel Api Key··········
Caching Gretel config to disk.
Using endpoint https://api.gretel.cloud
Logged in as atif.tahir13@gmail.com ✅


In [3]:
# Load the original ride-share data; canary values are inserted in a later cell.
dataset_path = "https://gretel-public-website.s3.amazonaws.com/datasets/uber_scooter_rides_1day.csv"
df = pd.read_csv(
    dataset_path,
    names=["hour", "bike_id", "src_lat", "src_lon", "dst_lat", "dst_lon"],
)
# Round numeric values to five decimal places.
df = df.round(5)
df.head()

Unnamed: 0,hour,bike_id,src_lat,src_lon,dst_lat,dst_lon
0,3,HRN405,37.75406,-122.39296,37.7552,-122.41225
1,18,TQY725,30.28496,-97.74205,30.28349,-97.739
2,6,32179,36.94995,-122.0557,36.95979,-122.05451
3,13,HVA276,30.20272,-97.71802,30.29068,-97.74482
4,18,31354,38.56024,-121.76195,38.5374,-121.75214


In [4]:
from numpy.random import uniform
import numpy as np
from numpy.random import choice
 
# Secret values (canaries) to plant in the training set, together with the
# relative probability of picking each one.
secrets = [85.31243, 80.71705, 84.98992, 63.20242]
weights = np.array([0.05, 0.15, 0.30, 0.50])

def create_canaries(df: pd.DataFrame, secrets, weights, frac=0.01) -> pd.DataFrame:
    """Sample rows from ``df`` and plant one secret in each sampled row.

    For every sampled row, one of the four location columns is picked at
    random and overwritten with a secret drawn according to ``weights``.
    These values should never be repeated by the model.

    Args:
        df: Source frame; must contain ``src_lon``, ``src_lat``, ``dst_lon``
            and ``dst_lat`` columns. It is not modified.
        secrets: Sequence of canary values to insert.
        weights: Relative weight of each secret (same length as ``secrets``).
            Any array-like of numbers is accepted; it is normalised to sum
            to 1 internally and the caller's object is left untouched.
        frac: Fraction of ``df`` rows to sample as canary rows.

    Returns:
        A new DataFrame holding only the sampled rows, each with exactly one
        location value replaced by a secret. The original index labels are kept.
    """
    # Normalise into a fresh float array: an in-place ``/=`` would silently
    # rewrite the caller's array and fails for lists / integer arrays.
    probabilities = np.asarray(weights, dtype=float)
    probabilities = probabilities / probabilities.sum()
    cols = ['src_lon', 'src_lat', 'dst_lon', 'dst_lat']

    # The row sample is fixed by random_state; ``sample`` returns a new frame,
    # so writing into it does not touch ``df``.
    canaries = df.sample(frac=frac, random_state=42)
    for i in canaries.index:
        canaries.at[i, choice(cols)] = choice(secrets, p=probabilities)
    return canaries
        
 
# Turn 1% of the rows into canary rows and preview them.
canaries = create_canaries(df, secrets, weights, frac=0.01)
canaries.head()

Unnamed: 0,hour,bike_id,src_lat,src_lon,dst_lat,dst_lon
14219,7,41527,39.7394,84.98992,39.7334,-104.98664
6657,21,OBD546,38.86269,80.71705,38.8628,-77.05456
6108,17,30703,38.56437,63.20242,38.56196,-121.42373
10438,15,LWV239,37.76182,-122.4205,37.77581,85.31243
4639,11,37474,38.90238,-77.01677,84.98992,-77.00587


In [5]:
from sklearn.utils import shuffle

# DataFrame.append is deprecated and was removed in pandas 2.0;
# pd.concat is the supported way to stack the two frames.
train_df = pd.concat([df, canaries], ignore_index=True)
# Shuffle the training dataset with the appended canary values before training the model.
train_df = shuffle(train_df, random_state=42).reset_index(drop=True)
# Save the dataset as a CSV file to train the model with.
train_df.to_csv("train.csv", index=False)
train_df.head()


  train_df = df.append(canaries,ignore_index= True)


Unnamed: 0,hour,bike_id,src_lat,src_lon,dst_lat,dst_lon
0,21,QJW109,38.91102,-77.04164,38.92249,-77.04323
1,16,9535,38.56972,-121.48163,38.57382,-121.50359
2,14,54778,37.77358,-122.42262,37.7714,-122.42182
3,20,19811,38.89204,-76.99562,38.8853,-76.99606
4,1,30642,38.57157,-121.46589,38.54668,-121.44412


In [6]:
from gretel_client.projects.models import read_model_config

# Start from the "synthetics/default" template and customise it below.
config = read_model_config("synthetics/default")

# Short aliases into the nested config; edits through them modify `config`.
synthetics = config["models"][0]["synthetics"]
params = synthetics["params"]

# Training hyper-parameters.
params.update({
    "vocab_size": 0,
    "epochs": 50,
    "learning_rate": 0.001,  # set low to demonstrate gradient clipping
    "batch_size": 4,
    "predict_batch_size": 1,
})

# Enable differential privacy.
params.update({
    "dp": True,
    "dp_noise_multiplier": 0.001,
    "dp_l2_norm_clip": 1.5,
})

# Turn the privacy filters off, since DP is already in use.
synthetics["privacy_filters"]["outliers"] = None
synthetics["privacy_filters"]["similarity"] = None

# Configure a seed task over the hour and bike_id columns.
seed_columns = ["hour", "bike_id"]
task = {"type": "seed", "attrs": {"fields": seed_columns}}
synthetics["task"] = task

# Summary of the resulting training / DP parameter settings.
data = params
pd.DataFrame.from_dict(data, orient="index", columns=["values"])



Unnamed: 0,values
epochs,50
vocab_size,0
learning_rate,0.001
validation_split,False
batch_size,4
predict_batch_size,1
dp,True
dp_noise_multiplier,0.001
dp_l2_norm_clip,1.5


In [None]:
# Create a project
from gretel_client.helpers import poll
from gretel_client.projects import create_or_get_unique_project

# Get (or create) the project and register a model that trains on train.csv.
project = create_or_get_unique_project(name="ride-share-DP-Model")
model = project.create_model_obj(
    model_config=config,
    data_source="train.csv",
)

# Submit the job to Gretel Cloud and follow its status output.
model.submit_cloud()
poll(model)

INFO: Starting poller


{
    "uid": "641ec7d13301d921f934f4a8",
    "guid": "model_2NVANCN9GYCpgGIuhZEUH2QkjXW",
    "model_name": "default-config",
    "runner_mode": "cloud",
    "user_id": "61779c3ebff62105d3757a71",
    "user_guid": "user_26hlyPRrQXap2t6NhfbC1G7JA0l",
    "billing_domain": null,
    "billing_domain_guid": null,
    "project_id": "641ec7cae5bc51838c29c8c9",
    "project_guid": "proj_2NVAMKwfnbawZrk4Y4xxI64KOw9",
    "status_history": {
        "created": "2023-03-25T10:07:13.942446Z"
    },
    "last_modified": "2023-03-25T10:07:14.127975Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "annotations": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:0e0d8d352d355d498b9da449f6ffb4bb33e87f530380a98afe157718f66877d1",
    "container_image_version": "2.10.41",
    "model_type": "synthetics",
    "model_type_alias": null,
    "config"

INFO: Status is created. Model creation has been queued.
INFO: Status is pending. A Gretel Cloud worker is being allocated to begin model creation.
INFO: Status is active. A worker has started creating your model!
2023-03-25T10:07:24.462195Z  Analyzing input data and checking for auto-params...
2023-03-25T10:07:48.716920Z  Starting synthetic model training
2023-03-25T10:07:48.718904Z  Loading training data
2023-03-25T10:07:48.726543Z  Running pre-flight data checks on input data

	1 field with surrounding whitespaces: 'bike_id' contains whitespaces at the beginning or the end for at least 20% of its values. Leading and trailing whitespaces reduce model performance.
2023-03-25T10:07:51.890611Z  Training data loaded.
{
    "record_count": 27385,
    "field_count": 6,
    "upsample_count": 0
}
2023-03-25T10:07:59.416046Z  Creating semantic validators and preparing training data
2023-03-25T10:08:05.835170Z  Beginning ML model training
2023-03-25T10:08:12.931447Z  Running training on 1 batc

In [None]:
# Load the gzip-compressed "data_preview" artifact of the trained model:
# the synthetic records created by the conditioned synthetic data model.
preview_link = model.get_artifact_link("data_preview")
synthetic_df = pd.read_csv(preview_link, compression="gzip")
synthetic_df.head()

In [None]:
# Find the canaries that were replayed by our model
def find_canaries(df, secrets):
    """Count textual occurrences of each secret in ``df``.

    The frame is rendered once with ``DataFrame.to_string()`` and each
    secret, converted with ``str()``, is counted as a substring of that
    text. Matching is therefore purely textual and spans every column.

    Args:
        df: DataFrame to search.
        secrets: Iterable of values to look for.

    Returns:
        A list of occurrence counts, in the same order as ``secrets``.
    """
    rendered = df.to_string()
    return [rendered.count(str(value)) for value in secrets]

# Compare how often each secret was inserted into the training data with how
# often it shows up in the synthetic output.
insertion_counts = find_canaries(train_df, secrets)
replay_counts = find_canaries(synthetic_df, secrets)

results = pd.DataFrame(
    {
        "Secret value": secrets,
        "Insertion count": insertion_counts,
        "Repetition by synthetic model": replay_counts,
    }
)

results
