<a href="https://colab.research.google.com/github/miteshkotak/demos/blob/master/docs/notebooks/create_synthetic_data_from_a_dataframe_or_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create synthetic data with the Python SDK

This notebook walks you through creating synthetic data from a CSV file or a DataFrame of your choosing, using Gretel's Python SDK and the `tabular-actgan` model.

This notebook will take about 5 minutes to run end to end. You will need an API key from the Gretel console, at https://console.gretel.cloud.

In [1]:
!pip install -Uqq gretel-client

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.2/312.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.6/96.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.1/148.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.6/58.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.9/143.9 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

To get started with your project, you'll need to set up the following parameters:

- `DATASET_PATH`: Specify the path to your dataset that you want to use for training and generation.
- `GRETEL_PROJECT_NAME`: Define the name of your Gretel project where you'll store the trained model and its results. This should be a unique and descriptive name.

In [2]:
import pandas as pd

# Path to the CSV that is previewed and passed to the model as its data source.
# The trailing `# @param` markers are Colab form annotations; keep them on the
# same line as the assignment.
DATASET_PATH = "/content/re-training 17.06.24.csv" # @param {type:"string"}
# Name of the Gretel project that stores the trained model and its results.
GRETEL_PROJECT_NAME = "synthetic-data" # @param {type:"string"}

In [3]:
# Authenticate against Gretel. With api_key="prompt" the key is requested
# interactively, so it never appears in the notebook source.

from gretel_client import configure_session

# Use the fully qualified option name: abbreviated names are resolved by
# pattern matching and can become ambiguous if pandas adds similar options.
pd.set_option("display.max_colwidth", None)

configure_session(api_key="prompt", cache="yes", validate=True)


Gretel API Key: ··········
Caching Gretel config to disk.
Using endpoint https://api.gretel.cloud
Logged in as mitesh@clareandme.com ✅


In [4]:
# Create a project (or fetch the existing one) using the name configured in
# GRETEL_PROJECT_NAME, so that editing the form field actually takes effect.

from gretel_client.projects import create_or_get_unique_project

project = create_or_get_unique_project(name=GRETEL_PROJECT_NAME)


## Create the synthetic data configuration

Load the default configuration template. This template will work well for most datasets. View other templates at https://github.com/gretelai/gretel-blueprints/tree/main/config_templates/gretel/synthetics


In [5]:
import json

from gretel_client.projects.models import read_model_config

# Start from the stock ACTGAN template.
config = read_model_config("synthetics/tabular-actgan")

# Override the training epochs and the number of records generated after training.
actgan_settings = config["models"][0]["actgan"]
actgan_settings["params"]["epochs"] = "auto"
actgan_settings["generate"]["num_records"] = 5000

# Pretty-print the final configuration for inspection.
config_json = json.dumps(config, indent=2)
print(f"Model configuration:\n{config_json}")


Model configuration:
{
  "schema_version": "1.0",
  "name": "tabular-actgan",
  "models": [
    {
      "actgan": {
        "data_source": "__tmp__",
        "params": {
          "epochs": "auto",
          "generator_dim": [
            1024,
            1024
          ],
          "discriminator_dim": [
            1024,
            1024
          ],
          "generator_lr": 0.0001,
          "discriminator_lr": 0.00033,
          "batch_size": "auto",
          "auto_transform_datetimes": false
        },
        "generate": {
          "num_records": 5000
        },
        "privacy_filters": {
          "outliers": null,
          "similarity": null
        }
      }
    }
  ]
}


## Load and preview the source dataset

Specify a data source to train the model on. This can be a local file, web location, or HDFS file.


In [None]:
# Read the training CSV and display it as the cell output.

pd.read_csv(filepath_or_buffer=DATASET_PATH)

## Train the synthetic model

In this step, we will task the worker running in the Gretel cloud, or locally, to train a synthetic model on the source dataset.


In [None]:
# Submit a training job to Gretel Cloud, wait for it, then preview the output.

from gretel_client.helpers import poll

model = project.create_model_obj(model_config=config, data_source=DATASET_PATH)
model.submit_cloud()

console_url = project.get_console_url()
print(f"Follow along with training in the console: {console_url}")

# Block until the job finishes, without streaming verbose logs.
poll(model, verbose=False)

# The preview artifact is a gzip-compressed CSV.
preview_link = model.get_artifact_link("data_preview")
synthetic_df = pd.read_csv(preview_link, compression="gzip")
synthetic_df

## View the synthetic data quality report


In [None]:
# Generate report that shows the statistical performance between the training and synthetic data

import IPython

# Import under an alias so the builtin `open` is not shadowed for the rest of
# the notebook.
from smart_open import open as smart_open_file

# Read the report inside a context manager so the handle is closed promptly.
with smart_open_file(model.get_artifact_link("report")) as report_file:
    report_html = report_file.read()

IPython.display.HTML(data=report_html, metadata=dict(isolated=True))


## Generate unlimited synthetic data

You can now use the trained synthetic model to generate as much synthetic data as you like.


In [None]:
# Ask the trained model for an additional batch of synthetic records.

generation_params = {"num_records": 240, "max_invalid": 500}
record_handler = model.create_record_handler_obj(params=generation_params)
record_handler.submit_cloud()

# Wait for the generation job to complete.
poll(record_handler, verbose=False)

# The generated data artifact is a gzip-compressed CSV.
generated_link = record_handler.get_artifact_link("data")
synthetic_df = pd.read_csv(generated_link, compression="gzip")
synthetic_df

In [None]:
# Load and preview real-world data

# Reuse the configured dataset path instead of repeating the literal, so the
# quality comparison always reads the same file the model was trained on.
real_data = DATASET_PATH

real_df = pd.read_csv(real_data)
real_df

In [None]:
# Location of the synthetic CSV to compare against the real data.
# NOTE(review): this file is not written by any cell shown here — presumably
# it was downloaded/uploaded manually; confirm before a fresh run.
synth_data = "/content/tabular-actgan-6671aadfa4a509ac785cc3fb-data.csv"

synth_df = pd.read_csv(filepath_or_buffer=synth_data)
synth_df


In [13]:
from gretel_client.evaluation.quality_report import QualityReport

# Score the synthetic file against the real file used as the reference.
report = QualityReport(ref_data=real_data, data_source=synth_data)
report.run()

In [14]:
# Show the headline quality summary (raw score, grade, and score).
report.peek()

{'raw_score': 71.91111111111111, 'grade': 'Good', 'score': 71}

In [15]:

import IPython

# Render the report's HTML inline, flagged as isolated via the display metadata.
IPython.display.HTML(report.as_html, metadata={"isolated": True})

0,1,2,3,4,5
How to interpret your SQS,Excellent,Good,Moderate,Poor,Very Poor
Suitable for machine learning or statistical analysis,,,,,
Suitable for balancing or augmenting machine learning data sources,,,,,
Suitable for pre-production testing environments,,,,,
Suitable for demo environments or mock data,,,,,
Improve your model using our tips and advice,,,,,
Significant tuning required to improve model,,,,,

0,1,2,3,4,5
Data Sharing Use Case,Excellent,Very Good,Good,Normal,Poor
"Internally, within the same team",,,,,
"Internally, across different teams",,,,,
"Externally, with trusted partners",,,,,
"Externally, public availability",,,,,

Unnamed: 0,Training Data,Synthetic Data
Row Count,14,14
Column Count,28,28
Training Lines Duplicated,--,0

Default Privacy Protections,Advanced Protections

Field,Unique,Missing,Ave. Length,Type,Distribution Stability
boundaries,2,0,1.0,Binary,Excellent
sleep,1,0,1.0,Categorical,Excellent
blowing balloons,2,0,1.0,Binary,Excellent
positive experience,2,0,1.0,Binary,Excellent
stress enhancing thoughts,2,0,1.0,Binary,Excellent
text,14,0,360.71,Text,
body scan,2,0,1.0,Binary,Excellent
breathing,2,0,1.0,Binary,Excellent
Inner strength,2,0,1.0,Binary,Excellent
perfectionism,1,0,1.0,Categorical,Excellent


In [16]:
# Display the full report as a dictionary (row/column counts and per-field details).
report.as_dict


{'left_rows': 14,
 'left_cols': 28,
 'right_rows': 14,
 'right_cols': 28,
 'fields': [{'name': 'imaginary friend',
   'left_field_features': {'name': 'imaginary friend',
    'type': 'binary',
    'count': 14,
    'unique_count': 2,
    'unique_percent': 14.2857,
    'missing_count': 0,
    'missing_percent': 0.0,
    'min_str_length': 1,
    'max_str_length': 1,
    'avg_str_length': 1.0},
   'right_field_features': {'name': 'imaginary friend',
    'type': 'binary',
    'count': 14,
    'unique_count': 2,
    'unique_percent': 14.2857,
    'missing_count': 0,
    'missing_percent': 0.0,
    'min_str_length': 1,
    'max_str_length': 1,
    'avg_str_length': 1.0},
   'left_distribution': {'1': 28.571428571428573, '0': 71.42857142857143},
   'right_distribution': {'1': 35.714285714285715, '0': 64.28571428571429},
   'distribution_distance': 0.06499963403257181,
   'distribution_stability': {'raw_score': 0.06499963403257181,
    'grade': 'Excellent',
    'score': 92}},
  {'name': 'lonelin