In [None]:
import os
from dotenv import load_dotenv

# Read the key/value pairs from the local `api.env` file into the process
# environment so that no credential is hardcoded in the notebook itself.
load_dotenv('api.env')

# Indexing os.environ raises KeyError immediately if a key is missing,
# which is preferable to failing later inside an API call.
INDOX_API_KEY = os.environ['INDOX_API_KEY']
NVIDIA_API_KEY = os.environ['NVIDIA_API_KEY']

# Deliberately no trailing `NVIDIA_API_KEY` expression: a bare name as the last
# line of a cell is rendered as output and saved with the notebook, leaking the secret.

In [None]:
from indoxGen.llms import OpenAi, IndoxApi

# Client authenticated with the Indox key (passed as `judge_llm` further down).
indox = IndoxApi(api_key=INDOX_API_KEY)

# Client for the Nemotron model, built with the OpenAi wrapper but pointed at
# NVIDIA's endpoint via `base_url` (passed as `generator_llm` further down).
nemotron = OpenAi(
    api_key=NVIDIA_API_KEY,
    model="nvidia/nemotron-4-340b-instruct",
    base_url="https://integrate.api.nvidia.com/v1",
)


In [None]:
import pandas as pd

# Ten hand-written example records. Every record carries the same five fields,
# in the same order: age, income, years_of_experience, job_title, remarks.
sample_data = [
    dict(age=25, income=45.5, years_of_experience=3,
         job_title='Junior Developer',
         remarks='Looking to grow my career.'),
    dict(age=32, income=60.0, years_of_experience=7,
         job_title='Developer',
         remarks='Experienced professional.'),
    dict(age=45, income=80.2, years_of_experience=20,
         job_title='Lead Developer',
         remarks='Seasoned expert in the field.'),
    dict(age=28, income=50.1, years_of_experience=5,
         job_title='Developer',
         remarks='Eager to take on new challenges.'),
    dict(age=38, income=70.0, years_of_experience=15,
         job_title='Senior Developer',
         remarks='Dedicated to delivering quality software.'),
    dict(age=23, income=40.0, years_of_experience=2,
         job_title='Junior Developer',
         remarks='Passionate about coding and learning.'),
    dict(age=50, income=90.5, years_of_experience=25,
         job_title='Lead Developer',
         remarks='Expert in software architecture.'),
    dict(age=29, income=55.3, years_of_experience=6,
         job_title='Developer',
         remarks='Enjoys collaborating with teams.'),
    dict(age=35, income=65.0, years_of_experience=10,
         job_title='Senior Developer',
         remarks='Focused on backend development.'),
    dict(age=27, income=48.7, years_of_experience=4,
         job_title='Developer',
         remarks='Interested in front-end technologies.'),
]

# Tabular view of the same records: one row per dict, one column per key.
data = pd.DataFrame(sample_data)

# Last expression of the cell, so the frame is rendered as the cell output.
data


In [None]:
# Column groups used by the rest of the notebook.
numerical_columns = ['age', 'income', 'years_of_experience']
text_columns = ['job_title', 'remarks']
# Subset of the numerical columns that hold whole numbers.
integer_columns = ['age', 'years_of_experience']

# Numerical columns first, followed by the text columns.
all_columns = [*numerical_columns, *text_columns]

# One dict per row of `data`, restricted to (and ordered by) `all_columns`.
example_data = data.loc[:, all_columns].to_dict(orient='records')

In [None]:
# Import from the installed `indoxGen` package, the same one the LLM wrappers
# come from (`from indoxGen.llms import ...`). The previous repo-relative path
# (`libs.indoxGen.indoxGen...`) only resolved when the notebook was run from the
# repository root and could load the package under a second module tree.
from indoxGen.hybrid_synth import TextTabularSynth, initialize_gan_synth, initialize_llm_synth

# --- Text side: LLM-based generation for the free-text columns ---------------
llm_setup = initialize_llm_synth(
    generator_llm=nemotron,
    judge_llm=indox,
    columns=text_columns,
    example_data=example_data,
    user_instruction="Generate realistic and diverse text data based on the numerical context provided.",
    diversity_threshold=0.3,  # Adjusted for higher diversity
    max_diversity_failures=30,
    verbose=1
)

# --- Tabular side: GAN-based generation for the numerical columns ------------
# Independent frame holding only the numerical columns of the sample data.
numerical_data = data[numerical_columns].copy()

gan_setup = initialize_gan_synth(
    input_dim=200,
    generator_layers=[128, 256, 512],
    discriminator_layers=[512, 256, 128],
    learning_rate=2e-4,
    beta_1=0.5,
    beta_2=0.9,
    batch_size=64,
    epochs=1,  # NOTE(review): a single epoch looks like a quick demo setting; confirm before relying on output quality
    n_critic=5,
    categorical_columns=[],  # no categorical columns in the numerical subset
    mixed_columns={},
    integer_columns=integer_columns,
    data=numerical_data
)

# Combine both generators into one pipeline.
synth_pipeline = TextTabularSynth(tabular=gan_setup, text=llm_setup)

# Number of synthetic samples to generate
num_samples = 10

# Generate synthetic data
synthetic_data = synth_pipeline.generate(num_samples)

# Preview the synthetic data (the last expression is rendered as the cell output).
print("\nSynthetic Data:")
synthetic_data.head()