In [1]:
import os
from dotenv import load_dotenv
from indoxGen.llms import IndoxApi
# Pull variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Read the API key and fail fast with an actionable message: os.getenv returns
# None for a missing variable, and that None would otherwise be handed to
# IndoxApi(api_key=...) in the cells below.
INDOX_API_KEY = os.getenv("INDOX_API_KEY")
if not INDOX_API_KEY:
    raise RuntimeError(
        "INDOX_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

## Generate data from an LLM without example data

In [2]:
from indoxGen.synthCore import DataFromPrompt

# Plain-language description of the table the model should produce.
user_prompt = "Generate a dataset with 3 column and 3 row about soccer."

# Language model client, configured with the API key read in the first cell.
LLM = IndoxApi(api_key=INDOX_API_KEY)

# Only an instruction is passed here; no example data is supplied.
data_generator = DataFromPrompt(
    llm=LLM,
    user_instruction=user_prompt,
    verbose=1,
)

# Run the generation; the result is displayed in the next cell.
generated_df = data_generator.generate_data()

# Save the generated data to an Excel file. The same file is loaded again as
# example data in the "uploaded data" section further down.
data_generator.save_to_excel("output_dataFromPrompt.xlsx")


[32mINFO[0m: [1mVerbose mode activated.[0m
[32mINFO[0m: [1mGenerated data: {
  "soccer_data": [
    {
      "player_name": "Lionel Messi",
      "team": "Paris Saint-Germain",
      "goals_scored": 672
    },
    {
      "player_name": "Cristiano Ronaldo",
      "team": "Al Nassr",
      "goals_scored": 674
    },
    {
      "player_name": "Neymar Jr.",
      "team": "Al Hilal",
      "goals_scored": 400
    }
  ]
}[0m
[32mINFO[0m: [1mData saved to Excel file at: output_dataFromPrompt.xlsx[0m


In [3]:
# Show the result of data_generator.generate_data() as this cell's output.
generated_df

Unnamed: 0,player_name,team,goals_scored
0,Lionel Messi,Paris Saint-Germain,672
1,Cristiano Ronaldo,Al Nassr,674
2,Neymar Jr.,Al Hilal,400


## Generate data with uploaded data

In [4]:
from indoxGen.synthCore import DataFromPrompt
from indoxGen.utils import Excel

# Load the workbook written by the first DataFromPrompt example.
dataset_file_path = "output_dataFromPrompt.xlsx"
excel_loader = Excel(dataset_file_path)
df = excel_loader.load()

user_prompt = " based on given dataset generate one unique row about soccer"
LLM = IndoxApi(api_key=INDOX_API_KEY)

# This time the loaded sheet is handed to the generator as example_data,
# alongside the instruction.
row_generator = DataFromPrompt(
    llm=LLM,
    user_instruction=user_prompt,
    example_data=df,
    verbose=1,
)
added_row = row_generator.generate_data()
print(added_row)



[32mINFO[0m: [1mVerbose mode activated.[0m
[32mINFO[0m: [1mGenerated data: {
  "player_name": "Kylian Mbapp\u00e9",
  "team": "Paris Saint-Germain",
  "goals_scored": 250
}[0m
     player_name                 team  goals_scored
0  Kylian Mbappé  Paris Saint-Germain           250


In [5]:
# Show the newly generated row as this cell's output.
added_row

Unnamed: 0,player_name,team,goals_scored
0,Kylian Mbappé,Paris Saint-Germain,250


## Generate Data with Few-shot Input-Output Examples

In [6]:
from indoxGen.synthCore import FewShotPrompt

# Language model client used for this generation task.
LLM = IndoxApi(api_key=INDOX_API_KEY)

# The task itself; the model is asked to reply as JSON under the key 'description'.
user_prompt = "Describe the formation of stars in simple terms. Return the result in JSON format, with the key 'description'."

# Few-shot demonstrations written as (input, output) pairs; they are expanded
# just below into the list-of-dicts form passed to FewShotPrompt.
example_pairs = [
    (
        "Describe the process of photosynthesis.",
        "Photosynthesis is the process by which green plants use sunlight to synthesize food from carbon dioxide and water.",
    ),
    (
        "Explain the water cycle.",
        "The water cycle is the process by which water circulates between the earth's oceans, atmosphere, and land, involving precipitation, evaporation, and condensation.",
    ),
]
examples = [
    {"input": question, "output": answer}
    for question, answer in example_pairs
]

# Assemble the few-shot generator from the model, the instruction and the examples.
data_generator = FewShotPrompt(
    llm=LLM,
    user_instruction=user_prompt,
    examples=examples,
    verbose=1,        # verbosity level (optional)
    max_tokens=8000,  # max tokens for generation (optional)
)

# Run the generation; the result is displayed in the next cell.
df = data_generator.generate_data()


In [7]:
# Show the few-shot generation result as this cell's output.
df

Unnamed: 0,description
0,The formation of stars begins in a cloud of ga...


Stars form from clouds of gas and dust in space. When parts of these clouds collapse under their own gravity, they heat up and create a dense core. As the core gets hotter and denser, nuclear fusion begins, turning hydrogen into helium and releasing energy, which makes the star shine.

In [5]:
# Save `df` to an Excel file. Unlike the DataFromPrompt example, the DataFrame
# is passed explicitly here. When the notebook is run top to bottom,
# `data_generator` is the FewShotPrompt instance and `df` is its result.
data_generator.save_to_excel("output_data.xlsx",df)

[32mINFO[0m: [1mDataFrame saved to Excel file at: output_data.xlsx[0m


## Generate Data with Attributes

In [2]:
from indoxGen.synthCore import DataFromAttributedPrompt

LLM = IndoxApi(api_key=INDOX_API_KEY)

# Instruction template with one {placeholder} per attribute name below.
user_instruction = "Generate a {adjective} sentence that is {length}."

# Candidate values for each placeholder. With two values per attribute, the
# run log reports 4 prompts generated from these attributes.
attributes = {
    "adjective": ["serious", "funny"],
    "length": ["short", "long"],
}

dataset = DataFromAttributedPrompt(
    llm=LLM,
    user_instruction=user_instruction,
    attributes=attributes,
    verbose=1,
)

# Run the generation; the result is displayed in the next cell.
df = dataset.generate_data()

[32mINFO[0m: [1mDataFromAttributedPrompt initialized.[0m
[32mINFO[0m: [1mGenerated 4 prompts from attributes in 0.00 seconds.[0m
[32mINFO[0m: [1mGenerated 4 data points in 12.00 seconds.[0m


In [3]:
# Show the attribute-driven generation result as this cell's output.
df

Unnamed: 0,sentence
0,"The storm approached rapidly, darkening the sky."
1,In an era characterized by rapid technological...
2,"I told my computer I needed a break, and now i..."
3,"As the overly ambitious squirrel, sporting a t..."
