In [4]:
import os
from dotenv import load_dotenv
from indoxGen.llms import IndoxApi
load_dotenv()
INDOX_API_KEY = os.getenv("INDOX_API_KEY")

## Generate data from LLm without data

In [5]:
from indoxGen.synthCore.SynthDataGen import DataFromPrompt
from indoxGen.synthCore.SynthDataGen import DataGenerationPrompt


user_prompt = "Generate a dataset with 3 column and 3 row about astronomy."

LLM = IndoxApi(api_key=INDOX_API_KEY)
instruction = DataGenerationPrompt.get_instruction(user_prompt)

data_generator = DataFromPrompt(
    prompt_name="Generate Research Paper Abstracts",
    args={
        "llm": LLM,
        "n": 1,
       "instruction": instruction,
    },
    outputs={"generations": "generate"},

)

generated_df = data_generator.run()

print(generated_df)
data_generator.save_to_excel("output_data.xlsx")


[32mINFO[0m: [1mGenerated DataFrame with shape: (3, 1)[0m
                                                data
0  {'Celestial_Object': 'Quasar', 'Distance_Light...
1  {'Celestial_Object': 'Neutron Star', 'Distance...
2  {'Celestial_Object': 'Exoplanet', 'Distance_Li...
[32mINFO[0m: [1mDataFrame saved to Excel file at: output_data.xlsx[0m


In [6]:
generated_df

Unnamed: 0,data
0,"{'Celestial_Object': 'Quasar', 'Distance_Light..."
1,"{'Celestial_Object': 'Neutron Star', 'Distance..."
2,"{'Celestial_Object': 'Exoplanet', 'Distance_Li..."


## Generate data with uploaded data

In [7]:
from indoxGen.synthCore.SynthDataGen import DataFromPrompt
from indoxGen.synthCore.SynthDataGen import DataGenerationPrompt
from indoxGen.synthCore.SynthDataGen import Excel

dataset_file_path = "output_data.xlsx"

excel_loader = Excel(dataset_file_path) 
df = excel_loader.load()  
user_prompt = " based on given dataset generate one unique row about astronomy"
LLM = IndoxApi(api_key=INDOX_API_KEY)

instruction = DataGenerationPrompt.get_instruction(user_prompt)

dataset = DataFromPrompt(
    prompt_name="Generate New Column",
    args={
        "llm": LLM,
        "n": 1,
        "instruction": instruction,
    },
    outputs={"generations": "generate"},
    dataframe=df
)
updated_df = dataset.run()
print(updated_df)


Error loading Excel file: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.
[32mINFO[0m: [1mGenerated DataFrame with shape: (1, 9)[0m
  Celestial_Object  Distance_Light_Years  Redshift  \
0           Quasar                 12000       3.2   

                    Luminosity Galaxy_Type  Discovery_Year  \
0  10^14 times that of the Sun  Lenticular            1985   

                   Location                    Mass  \
0  Constellation Ursa Major  1 billion solar masses   

                                          New_Column  
0  [High-energy emissions, Supermassive black hol...  


In [8]:
updated_df

Unnamed: 0,Celestial_Object,Distance_Light_Years,Redshift,Luminosity,Galaxy_Type,Discovery_Year,Location,Mass,New_Column
0,Quasar,12000,3.2,10^14 times that of the Sun,Lenticular,1985,Constellation Ursa Major,1 billion solar masses,"[High-energy emissions, Supermassive black hol..."


## Generate Data with Few-shot input output Examples

In [11]:
from indoxGen.synthCore.SynthDataGen import FewShotPrompt

examples = [
    {
        "input": "Generate a dataset with 3 columns and 2 rows about biology.",
        "output": '[{"Species": "Human", "Cell Count": 37.2, "Age": 30}, {"Species": "Mouse", "Cell Count": 3.2, "Age": 2}]'
    },
    {
        "input": "Generate a dataset with 3 columns and 2 rows about chemistry.",
        "output": '[{"Element": "Hydrogen", "Atomic Number": 1, "Weight": 1.008}, {"Element": "Oxygen", "Atomic Number": 8, "Weight": 15.999}]'
    }
]

user_prompt = "Generate a dataset with 3 columns and 2 rows about astronomy."
#instruction = DataGenerationPrompt.get_instruction(user_prompt)
LLM = IndoxApi(api_key=INDOX_API_KEY)

data_generator= FewShotPrompt(
    prompt_name="Generate Astronomy Dataset",
    args={
        "llm": LLM,
        "n": 1,  
        "instruction": user_prompt,  
    },
    outputs={"generations": "generate"},
    examples=examples  
)

generated_df = data_generator.run()

print(generated_df)
data_generator.save_to_excel("output_data.xlsx",df)


  Celestial Body  Diameter (km)  Distance from Sun (million km)
0          Earth          12742                           149.6
1           Mars           6779                           227.9
[32mERROR[0m: [31m[1mDataFrame is empty. Cannot save to Excel.[0m
[31mERROR[0m: [31m[1mDataFrame is empty. Cannot save to Excel.[0m


ValueError: DataFrame is empty. Cannot save to Excel.

In [10]:
df

## Generate Data with Attribute

In [12]:
from indoxGen.synthCore.SynthDataGen import DataFromAttributedPrompt
LLM = IndoxApi(api_key=INDOX_API_KEY)

args = {
    "instruction": "Generate a {adjective} sentence that is {length}.",
    "attributes": {
        "adjective": ["serious", "funny"],
        "length": ["short", "long"]
    },
    "llm":LLM
}

dataset = DataFromAttributedPrompt(prompt_name="ExamplePrompt",
                                            args=args,
                                            outputs={})

df = dataset .run()

print(df)

[32mINFO[0m: [1mGenerated 4 prompts from attributes.[0m
[32mINFO[0m: [1mRunning prompt: Generate a serious sentence that is short.[0m
[32mINFO[0m: [1mRunning prompt: Generate a serious sentence that is long.[0m
[32mINFO[0m: [1mRunning prompt: Generate a funny sentence that is short.[0m
[32mINFO[0m: [1mRunning prompt: Generate a funny sentence that is long.[0m
[32mINFO[0m: [1mGenerated DataFrame with 4 records.[0m
                                            response
0                             Time waits for no one.
1  In the quiet stillness of the early morning, a...
2  I told my computer I needed a break, and now i...
3  As the overly ambitious squirrel, wearing a ti...


In [None]:
df