In [1]:
!pip install boto3 sagemaker --upgrade



In [2]:
import boto3
import json
import sagemaker
import pandas as pd
import json
import re
import numpy as np

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [3]:
sm_runtime = boto3.client('sagemaker-runtime')

In [4]:
import pandas as pd

# Load UCI Adult dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]

df = pd.read_csv(url, header=None, names=columns, na_values=" ?", skipinitialspace=True)

# Drop missing values for simplicity (optional)
df.dropna(inplace=True)

print("Dataset loaded. Shape:", df.shape)
df.head()

Dataset loaded. Shape: (32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
from ctgan import CTGAN

# Define categorical columns
categorical_columns = df.select_dtypes(include='object').columns.tolist()

# Initialize CTGAN model
model = CTGAN(epochs=300, verbose = True)  # Adjust epochs based on testing speed vs. quality tradeoff

# Fit model with defined categorical columns
model.fit(df, discrete_columns=categorical_columns)

print("CTGAN model training completed.")

Gen. (-0.53) | Discrim. (-0.01): 100%|██████████| 300/300 [28:59<00:00,  5.80s/it]

CTGAN model training completed.





In [6]:
baseline_synthetic = model.sample(len(df))
baseline_synthetic.to_csv('../data/processed/synthetic_adult_ctgan.csv', index=False)

print("✅ CTGAN baseline synthetic data saved successfully.")

✅ CTGAN baseline synthetic data saved successfully.


In [7]:
# Generate schema text dynamically from dataframe
schema = "\n".join([f"{col}: {df[col].dtype}" for col in df.columns])

prompt_json = f"""
You are an expert data scientist.

Analyze the following dataset schema to support realistic synthetic data generation.

---

**Instructions for Output:**

1. **Important Features:** Identify the top features that most influence data realism and relationships. Choose features based on your analysis of this dataset schema that:

   - Have strong correlations with other features
   - Are important for downstream machine learning tasks
   - Define key demographic or behavioral properties

2. **Logical Constraints:** Identify as many real-life domain rules as you can come up with among the dataset features to maintain data realism. These should be derived based on your understanding of the schema, and these rules can be from any of the 3 categories below:

   - **Range constraints:** e.g. age must be between realistic working age ranges
   - **Conditional constraints:** e.g. if education level is PhD, age must be at least 25 realisticlly 
   - **Relationships:** e.g. hours-per-week correlates with income levels if everything else is same

**Note:** The examples above are illustrative. Generate **original, realistic constraints relevant to this dataset** rather than repeating the examples.

3. **Augmentation Suggestions:** Suggest engineered features that enhance data realism or GAN training, such as combining related fields or normalizing ratios.

4. **Rare Category Handling:** Suggest strategies for handling skewed or rare categories in categorical features.

---

**Provide ONLY valid JSON output in the following structure:**

{{
  "important_features": ["feature1", "feature2", ...],
  "logical_constraints": [
    {{
      "rule": "description of the constraint",
      "type": "range | conditional | relationship",
      "features_involved": ["feature1", "feature2", ...]
    }}
  ],
  "augmentation_suggestions": ["suggestion1", "suggestion2", ...],
  "rare_category_handling": ["strategy1", "strategy2", ...]
}}

---

Dataset schema:
{schema}

Do not include any explanation, comments, or formatting outside this JSON structure.
"""

In [8]:
payload = {
    "inputs": prompt_json,
    "parameters": {
        "max_new_tokens": 1024,
        "temperature": 0.3
    }
}

# Convert to JSON string
payload_json = json.dumps(payload)

In [9]:
response = sm_runtime.invoke_endpoint(
    EndpointName='jumpstart-dft-hf-llm-falcon-3-3b-in-20250706-040539',  # replace with your endpoint name if different
    ContentType='application/json',
    Body=payload_json
)

# Parse response
result = json.loads(response['Body'].read().decode('utf-8'))

# Print output
print(result)

{'generated_text': '<|assistant|>\n```json\n{\n  "important_features": ["age", "capital-gain", "hours-per-week", "education-num", "capital-loss"],\n  "logical_constraints": [\n    {\n      "rule": "If education level is PhD, age must be at least 25.",\n      "type": "conditional",\n      "features_involved": ["education", "age"]\n    },\n    {\n      "rule": "Hours per week worked should be between 0 and 60 for realistic scenarios.",\n      "type": "range",\n      "features_involved": ["hours-per-week"]\n    },\n    {\n      "rule": "Income should be higher for those with more education and higher work hours.",\n      "type": "relationship",\n      "features_involved": ["education", "hours-per-week", "income"]\n    }\n  ],\n  "augmentation_suggestions": [\n    "Combining \'education\' and \'education-num\' to create a more granular educational level feature.",\n    "Normalizing \'hours-per-week\' by \'capital-gain\' to capture work-life balance impact on income."\n  ],\n  "rare_categor

In [10]:
# Save LLM output to a text file for future reference
with open('../data/processed/falcon_feature_insights.txt', 'w') as f:
    f.write(result['generated_text'])

print("Feature insights saved to ../data/processed/falcon_feature_insights.txt")

Feature insights saved to ../data/processed/falcon_feature_insights.txt


In [11]:
with open('../data/processed/falcon_feature_insights.txt', 'r') as f:
    print(f.read())

<|assistant|>
```json
{
  "important_features": ["age", "capital-gain", "hours-per-week", "education-num", "capital-loss"],
  "logical_constraints": [
    {
      "rule": "If education level is PhD, age must be at least 25.",
      "type": "conditional",
      "features_involved": ["education", "age"]
    },
    {
      "rule": "Hours per week worked should be between 0 and 60 for realistic scenarios.",
      "type": "range",
      "features_involved": ["hours-per-week"]
    },
    {
      "rule": "Income should be higher for those with more education and higher work hours.",
      "type": "relationship",
      "features_involved": ["education", "hours-per-week", "income"]
    }
  ],
  "augmentation_suggestions": [
    "Combining 'education' and 'education-num' to create a more granular educational level feature.",
    "Normalizing 'hours-per-week' by 'capital-gain' to capture work-life balance impact on income."
  ],
  "rare_category_handling": [
    "Impute rare categories in 'race' 

In [12]:
# Extract JSON part from the LLM output
raw_output = result['generated_text']

# Use regex to extract JSON between ```json and ```
json_match = re.search(r"```json(.*?)```", raw_output, re.DOTALL)

if json_match:
    json_str = json_match.group(1).strip()
else:
    raise ValueError("No valid JSON found in LLM output.")

llm_output = json.loads(json_str)

# Extract conditioning features
conditioning_features = llm_output['important_features']

print("Conditioning features extracted successfully:", conditioning_features)

Conditioning features extracted successfully: ['age', 'capital-gain', 'hours-per-week', 'education-num', 'capital-loss']


In [13]:
def freedman_diaconis_bins(data):
    data = data.dropna()
    q75, q25 = np.percentile(data, [75 ,25])
    iqr = q75 - q25
    bin_width = 2 * iqr * len(data) ** (-1/3)
    
    if bin_width == 0:
        return 1
    
    bins = int(np.ceil((data.max() - data.min()) / bin_width))
    return max(1, bins)

In [14]:
synthetic_datasets = []

for feature in conditioning_features:
    if np.issubdtype(df[feature].dtype, np.number):
        optimal_bins = freedman_diaconis_bins(df[feature])
        print(f"Using {optimal_bins} bins for feature '{feature}' based on Freedman-Diaconis rule.")
        
        df[feature + '_binned'], bins = pd.qcut(df[feature], q=optimal_bins, retbins=True, duplicates='drop')
        conditioning_feature_to_use = feature + '_binned'
    else:
        conditioning_feature_to_use = feature
    
    unique_values = df[conditioning_feature_to_use].dropna().unique()
    
    for value in unique_values:
        print(f"Generating synthetic data conditioned on {conditioning_feature_to_use} = {value}")
        
        samples_collected = pd.DataFrame()
        attempts = 0
        max_attempts = 10
        
        while len(samples_collected) < 500 and attempts < max_attempts:
            samples_batch = model.sample(1000)
            
            if np.issubdtype(df[feature].dtype, np.number):
                samples_batch[feature + '_binned'] = pd.cut(samples_batch[feature], bins=bins, include_lowest=True)
            
            filtered_batch = samples_batch[samples_batch[conditioning_feature_to_use] == value]
            
            samples_collected = pd.concat([samples_collected, filtered_batch], ignore_index=True)
            attempts += 1
        
        if len(samples_collected) < 500:
            print(f"Warning: Only {len(samples_collected)} samples generated for {conditioning_feature_to_use}={value} after {max_attempts} attempts.")
        
        samples_final = samples_collected.head(500)
        samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"
        
        synthetic_datasets.append(samples_final)


Using 59 bins for feature 'age' based on Freedman-Diaconis rule.
Generating synthetic data conditioned on age_binned = (38.0, 39.0]
Generating synthetic data conditioned on age_binned = (49.0, 50.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (37.0, 38.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (52.0, 53.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (27.0, 28.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (36.0, 37.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (48.0, 49.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (51.0, 52.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (30.0, 31.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (41.0, 42.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (29.0, 30.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (22.0, 23.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (31.0, 32.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (39.0, 40.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (33.0, 34.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (24.0, 25.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (42.0, 43.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (53.0, 55.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (34.0, 35.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (58.0, 59.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (55.0, 56.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (18.0, 19.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (19.0, 20.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (44.0, 45.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (21.0, 22.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (47.0, 48.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (20.0, 21.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (23.0, 24.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (56.0, 58.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (43.0, 44.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (40.0, 41.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (28.0, 29.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (16.999, 18.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (46.0, 47.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (45.0, 46.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (35.0, 36.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (70.0, 90.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (26.0, 27.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (66.0, 70.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (32.0, 33.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (59.0, 61.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (63.0, 66.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (50.0, 51.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (25.0, 26.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on age_binned = (61.0, 63.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Using 1 bins for feature 'capital-gain' based on Freedman-Diaconis rule.
Generating synthetic data conditioned on capital-gain_binned = (-0.001, 99999.0]
Using 313 bins for feature 'hours-per-week' based on Freedman-Diaconis rule.
Generating synthetic data conditioned on hours-per-week_binned = (38.0, 40.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_featu

Generating synthetic data conditioned on hours-per-week_binned = (12.0, 13.0]
Generating synthetic data conditioned on hours-per-week_binned = (15.0, 16.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (44.0, 45.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (48.0, 50.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (75.0, 80.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (28.0, 30.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (32.0, 35.0]
Generating synthetic data conditioned on hours-per-week_binned = (58.0, 60.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (18.0, 20.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (51.0, 52.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (43.0, 44.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (13.0, 15.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (24.0, 25.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (37.0, 38.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (42.0, 43.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (54.0, 55.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (46.0, 48.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (56.0, 58.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (30.0, 32.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (65.0, 70.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (0.999, 4.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (20.0, 23.741]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (55.0, 56.0]
Generating synthetic data conditioned on hours-per-week_binned = (40.0, 41.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (26.0, 28.0]
Generating synthetic data conditioned on hours-per-week_binned = (35.0, 36.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (23.741, 24.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (45.0, 46.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (41.0, 42.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (10.0, 12.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (60.0, 65.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (8.0, 10.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (70.0, 75.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (94.974, 99.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (52.0, 54.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (6.0, 8.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (4.0, 6.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (16.0, 18.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (36.0, 37.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (25.0, 26.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (80.0, 94.974]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on hours-per-week_binned = (50.0, 51.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Using 80 bins for feature 'education-num' based on Freedman-Diaconis rule.
Generating synthetic data conditioned on education-num_binned = (12.0, 13.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_featu

Generating synthetic data conditioned on education-num_binned = (8.0, 9.0]
Generating synthetic data conditioned on education-num_binned = (6.0, 7.0]
Generating synthetic data conditioned on education-num_binned = (13.0, 14.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on education-num_binned = (4.0, 5.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on education-num_binned = (9.0, 10.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on education-num_binned = (11.0, 12.0]
Generating synthetic data conditioned on education-num_binned = (10.0, 11.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on education-num_binned = (3.0, 4.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on education-num_binned = (15.0, 16.0]
Generating synthetic data conditioned on education-num_binned = (14.0, 15.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on education-num_binned = (0.999, 3.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on education-num_binned = (5.0, 6.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Generating synthetic data conditioned on education-num_binned = (7.0, 8.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


Using 1 bins for feature 'capital-loss' based on Freedman-Diaconis rule.
Generating synthetic data conditioned on capital-loss_binned = (-0.001, 4356.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples_final['conditioned_on'] = f"{conditioning_feature_to_use}={value}"


In [15]:
# Combine all conditioned synthetic data
synthetic_combined = pd.concat(synthetic_datasets, ignore_index=True)

# Save to processed data folder
synthetic_combined.to_csv('../data/processed/synthetic_adult_ctgan_conditioned.csv', index=False)

print("✅ SYNTITAN conditioned synthetic data saved successfully. File: synthetic_adult_ctgan_conditioned.csv")


✅ SYNTITAN conditioned synthetic data saved successfully. File: synthetic_adult_ctgan_conditioned.csv


In [17]:
# Load conditioned synthetic data
synthetic_df = pd.read_csv('../data/processed/synthetic_adult_ctgan_conditioned.csv')

# Parse logical constraints from LLM output JSON
logical_constraints = llm_output['logical_constraints']

# Apply each constraint
for constraint in logical_constraints:
    rule_type = constraint['type']
    features = constraint['features_involved']
    
    if rule_type == 'range':
        # Robust extraction of min and max values using regex
        nums = re.findall(r"[-+]?\d*\.\d+|\d+", constraint['rule'])
        if len(nums) >= 2:
            min_val = float(nums[0])
            max_val = float(nums[1])
            feature = features[0]
            
            print(f"Applying range constraint on {feature}: {min_val} to {max_val}")
            
            synthetic_df = synthetic_df[
                (synthetic_df[feature] >= min_val) &
                (synthetic_df[feature] <= max_val)
            ]
        else:
            print(f"⚠️ Warning: Could not extract numeric min/max from rule: {constraint['rule']}")
    
    elif rule_type == 'conditional':
        # Simple hardcoded example for education PhD constraint
        if 'education' in features and 'age' in features:
            print(f"Applying conditional constraint: {constraint['rule']}")
            synthetic_df = synthetic_df[
                ~((synthetic_df['education'] == 'PhD') & (synthetic_df['age'] < 25))
            ]
        else:
            print(f"⚠️ Warning: Conditional constraint logic not implemented for: {constraint['rule']}")
    
    elif rule_type == 'relationship':
        # Placeholder for relationship constraints
        print(f"⚠️ Skipping relationship constraint (not implemented yet): {constraint['rule']}")

print("✅ Logical constraints enforcement completed.")

# Save final logical constraints enforced dataset
synthetic_df.to_csv('../data/processed/synthetic_adult_ctgan_conditioned_logical.csv', index=False)
print("✅ Saved as synthetic_adult_ctgan_conditioned_logical.csv")


Applying conditional constraint: If education level is PhD, age must be at least 25.
Applying range constraint on hours-per-week: 0.0 to 60.0
⚠️ Skipping relationship constraint (not implemented yet): Income should be higher for those with more education and higher work hours.
✅ Logical constraints enforcement completed.
✅ Saved as synthetic_adult_ctgan_conditioned_logical.csv


In [18]:
real_df = df.copy()  # Assuming 'df' is your real dataset already loaded

ctgan_df = pd.read_csv('../data/processed/synthetic_adult_ctgan.csv')
syntitan1_df = pd.read_csv('../data/processed/synthetic_adult_ctgan_conditioned.csv')
syntitan2_df = pd.read_csv('../data/processed/synthetic_adult_ctgan_conditioned_logical.csv')


In [19]:
from scipy.stats import ks_2samp
from scipy.spatial.distance import jensenshannon

def evaluate_ks(real, synthetic, numeric_features):
    results = {}
    for col in numeric_features:
        ks_stat, ks_p = ks_2samp(real[col].dropna(), synthetic[col].dropna())
        results[col] = {'ks_statistic': ks_stat, 'p_value': ks_p}
    return results

def evaluate_js(real, synthetic, categorical_features):
    results = {}
    for col in categorical_features:
        real_dist = real[col].value_counts(normalize=True)
        synth_dist = synthetic[col].value_counts(normalize=True)
        combined_index = real_dist.index.union(synth_dist.index)
        js = jensenshannon(real_dist.reindex(combined_index, fill_value=0),
                           synth_dist.reindex(combined_index, fill_value=0))
        results[col] = js
    return results

In [20]:
numeric_features = real_df.select_dtypes(include='number').columns.tolist()
categorical_features = real_df.select_dtypes(include='object').columns.tolist()

In [21]:
print("🔷 Evaluating CTGAN Baseline")
ctgan_ks = evaluate_ks(real_df, ctgan_df, numeric_features)
ctgan_js = evaluate_js(real_df, ctgan_df, categorical_features)

print("🔷 Evaluating Syntitan1 (Conditioned)")
syntitan1_ks = evaluate_ks(real_df, syntitan1_df, numeric_features)
syntitan1_js = evaluate_js(real_df, syntitan1_df, categorical_features)

print("🔷 Evaluating Syntitan2 (Conditioned + Logical Constraints)")
syntitan2_ks = evaluate_ks(real_df, syntitan2_df, numeric_features)
syntitan2_js = evaluate_js(real_df, syntitan2_df, categorical_features)


🔷 Evaluating CTGAN Baseline
🔷 Evaluating Syntitan1 (Conditioned)
🔷 Evaluating Syntitan2 (Conditioned + Logical Constraints)


In [22]:
import pprint

pp = pprint.PrettyPrinter(indent=4)

print("✅ CTGAN KS Test Results")
pp.pprint(ctgan_ks)

print("\n✅ CTGAN JS Divergence Results")
pp.pprint(ctgan_js)

print("\n✅ Syntitan1 KS Test Results")
pp.pprint(syntitan1_ks)

print("\n✅ Syntitan1 JS Divergence Results")
pp.pprint(syntitan1_js)

print("\n✅ Syntitan2 KS Test Results")
pp.pprint(syntitan2_ks)

print("\n✅ Syntitan2 JS Divergence Results")
pp.pprint(syntitan2_js)


✅ CTGAN KS Test Results
{   'age': {   'ks_statistic': 0.08921716163508492,
               'p_value': 3.323152194014496e-113},
    'capital-gain': {'ks_statistic': 0.6892294462700777, 'p_value': 0.0},
    'capital-loss': {'ks_statistic': 0.24916310924111668, 'p_value': 0.0},
    'education-num': {   'ks_statistic': 0.04683517090998432,
                         'p_value': 1.7980783711575165e-31},
    'fnlwgt': {   'ks_statistic': 0.0646171800620374,
                  'p_value': 1.5335662880299151e-59},
    'hours-per-week': {   'ks_statistic': 0.11160590890943156,
                          'p_value': 4.428482961068991e-177}}

✅ CTGAN JS Divergence Results
{   'education': 0.06646422301732209,
    'income': 0.03982275980369237,
    'marital-status': 0.07725520789457134,
    'native-country': 0.12390046422403223,
    'occupation': 0.14555593984399764,
    'race': 0.12798613750765028,
    'relationship': 0.06659978172144149,
    'sex': 0.0006920970911365942,
    'workclass': 0.061091818486

In [23]:
# Prepare KS data
ks_summary = []

for feature in numeric_features:
    ks_summary.append({
        'Feature': feature,
        'CTGAN KS Statistic': ctgan_ks[feature]['ks_statistic'],
        'Syntitan1 KS Statistic': syntitan1_ks[feature]['ks_statistic'],
        'Syntitan2 KS Statistic': syntitan2_ks[feature]['ks_statistic']
    })

ks_df = pd.DataFrame(ks_summary)
ks_df = ks_df.sort_values(by='Feature').reset_index(drop=True)

print("🔷 KS Test Summary Table")
display(ks_df)


🔷 KS Test Summary Table


Unnamed: 0,Feature,CTGAN KS Statistic,Syntitan1 KS Statistic,Syntitan2 KS Statistic
0,age,0.089217,0.08306,0.071515
1,capital-gain,0.689229,0.665126,0.675386
2,capital-loss,0.249163,0.234324,0.237912
3,education-num,0.046835,0.082158,0.087704
4,fnlwgt,0.064617,0.073211,0.067949
5,hours-per-week,0.111606,0.116283,0.140388


In [24]:
# Prepare JS Divergence data
js_summary = []

for feature in categorical_features:
    js_summary.append({
        'Feature': feature,
        'CTGAN JS Divergence': ctgan_js[feature],
        'Syntitan1 JS Divergence': syntitan1_js[feature],
        'Syntitan2 JS Divergence': syntitan2_js[feature]
    })

js_df = pd.DataFrame(js_summary)
js_df = js_df.sort_values(by='Feature').reset_index(drop=True)

print("🔷 JS Divergence Summary Table")
display(js_df)


🔷 JS Divergence Summary Table


Unnamed: 0,Feature,CTGAN JS Divergence,Syntitan1 JS Divergence,Syntitan2 JS Divergence
0,education,0.066464,0.110042,0.101587
1,income,0.039823,0.043805,0.029747
2,marital-status,0.077255,0.08258,0.089707
3,native-country,0.1239,0.128136,0.129112
4,occupation,0.145556,0.150225,0.151668
5,race,0.127986,0.121603,0.125125
6,relationship,0.0666,0.070316,0.083174
7,sex,0.000692,0.009997,0.02037
8,workclass,0.061092,0.058776,0.066504


In [25]:
def calculate_logical_violation_rate(df, logical_constraints):
    total_rows = len(df)
    violating_rows = pd.Series(False, index=df.index)

    for constraint in logical_constraints:
        rule_type = constraint['type']
        features = constraint['features_involved']
        
        if rule_type == 'range':
            # Robust extraction of min and max using regex
            import re
            nums = re.findall(r"[-+]?\d*\.\d+|\d+", constraint['rule'])
            if len(nums) >= 2:
                min_val = float(nums[0])
                max_val = float(nums[1])
                feature = features[0]
                violation = ~df[feature].between(min_val, max_val)
                violating_rows = violating_rows | violation
        
        elif rule_type == 'conditional':
            if 'education' in features and 'age' in features:
                violation = (df['education'] == 'PhD') & (df['age'] < 25)
                violating_rows = violating_rows | violation
        
        # Relationship constraints skipped for now
    
    violation_rate = violating_rows.sum() / total_rows
    return violation_rate

In [26]:
def calculate_correlation_mae(real_df, synthetic_df, numeric_features):
    real_corr = real_df[numeric_features].corr()
    synthetic_corr = synthetic_df[numeric_features].corr()
    mae = np.mean(np.abs(real_corr - synthetic_corr).fillna(0))
    return mae

In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

def calculate_minimum_distance(real_df, synthetic_df, numeric_features):
    scaler = StandardScaler()
    real_scaled = scaler.fit_transform(real_df[numeric_features].dropna())
    synth_scaled = scaler.transform(synthetic_df[numeric_features].dropna())
    
    nn = NearestNeighbors(n_neighbors=1)
    nn.fit(real_scaled)
    distances, _ = nn.kneighbors(synth_scaled)
    
    min_dist = distances.min()
    mean_dist = distances.mean()
    return min_dist, mean_dist

In [28]:
def calculate_rare_category_coverage(real_df, synthetic_df, categorical_features, threshold=0.01):
    coverage_scores = {}
    
    for feature in categorical_features:
        real_counts = real_df[feature].value_counts(normalize=True)
        rare_categories = real_counts[real_counts < threshold].index.tolist()
        
        synth_counts = synthetic_df[feature].value_counts()
        covered = [cat for cat in rare_categories if cat in synth_counts.index]
        
        coverage = len(covered) / len(rare_categories) if rare_categories else 1.0
        coverage_scores[feature] = coverage
    
    avg_coverage = np.mean(list(coverage_scores.values()))
    return avg_coverage, coverage_scores

In [29]:
# Logical Violation Rate
ctgan_violation = calculate_logical_violation_rate(ctgan_df, logical_constraints)
syntitan1_violation = calculate_logical_violation_rate(syntitan1_df, logical_constraints)
syntitan2_violation = calculate_logical_violation_rate(syntitan2_df, logical_constraints)

print(f"CTGAN Logical Violation Rate: {ctgan_violation:.4f}")
print(f"Syntitan1 Logical Violation Rate: {syntitan1_violation:.4f}")
print(f"Syntitan2 Logical Violation Rate: {syntitan2_violation:.4f}")

# Correlation MAE
ctgan_corr_mae = calculate_correlation_mae(real_df, ctgan_df, numeric_features)
syntitan1_corr_mae = calculate_correlation_mae(real_df, syntitan1_df, numeric_features)
syntitan2_corr_mae = calculate_correlation_mae(real_df, syntitan2_df, numeric_features)

print(f"CTGAN Correlation MAE: {ctgan_corr_mae:.4f}")
print(f"Syntitan1 Correlation MAE: {syntitan1_corr_mae:.4f}")
print(f"Syntitan2 Correlation MAE: {syntitan2_corr_mae:.4f}")

# Privacy (Minimum Distance)
ctgan_privacy = calculate_minimum_distance(real_df, ctgan_df, numeric_features)
syntitan1_privacy = calculate_minimum_distance(real_df, syntitan1_df, numeric_features)
syntitan2_privacy = calculate_minimum_distance(real_df, syntitan2_df, numeric_features)

print(f"CTGAN Privacy (Min Dist, Mean Dist): {ctgan_privacy}")
print(f"Syntitan1 Privacy (Min Dist, Mean Dist): {syntitan1_privacy}")
print(f"Syntitan2 Privacy (Min Dist, Mean Dist): {syntitan2_privacy}")

# Rare Category Coverage
ctgan_coverage, _ = calculate_rare_category_coverage(real_df, ctgan_df, categorical_features)
syntitan1_coverage, _ = calculate_rare_category_coverage(real_df, syntitan1_df, categorical_features)
syntitan2_coverage, _ = calculate_rare_category_coverage(real_df, syntitan2_df, categorical_features)

print(f"CTGAN Rare Category Coverage: {ctgan_coverage:.4f}")
print(f"Syntitan1 Rare Category Coverage: {syntitan1_coverage:.4f}")
print(f"Syntitan2 Rare Category Coverage: {syntitan2_coverage:.4f}")


CTGAN Logical Violation Rate: 0.0512
Syntitan1 Logical Violation Rate: 0.0664
Syntitan2 Logical Violation Rate: 0.0000
CTGAN Correlation MAE: 0.0314
Syntitan1 Correlation MAE: 0.0378
Syntitan2 Correlation MAE: 0.0283
CTGAN Privacy (Min Dist, Mean Dist): (0.0001367257404724488, 0.16762662125237177)
Syntitan1 Privacy (Min Dist, Mean Dist): (0.00014345365018432238, 0.1951970495941703)
Syntitan2 Privacy (Min Dist, Mean Dist): (0.00014345365018432238, 0.17681027527899712)
CTGAN Rare Category Coverage: 1.0000
Syntitan1 Rare Category Coverage: 1.0000
Syntitan2 Rare Category Coverage: 1.0000
