In [1]:
import os
from dotenv import load_dotenv

load_dotenv()
INDOX_API_KEY = os.environ['INDOX_API_KEY']
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
NVIDIA_API_KEY = os.environ['NVIDIA_API_KEY']

In [3]:
from openai import OpenAI
import os
import re
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import json
import matplotlib

In [4]:
from indox.llms import OpenAi

nemotron = OpenAi(api_key=NVIDIA_API_KEY, model="nvidia/nemotron-4-340b-instruct",
                  base_url="https://integrate.api.nvidia.com/v1")
openai = OpenAi(api_key=OPENAI_API_KEY, model="gpt-4o-mini")

[32mINFO[0m: [1mInitializing OpenAi with model: nvidia/nemotron-4-340b-instruct[0m
[32mINFO[0m: [1mOpenAi initialized successfully[0m
[32mINFO[0m: [1mInitializing OpenAi with model: gpt-4o-mini[0m
[32mINFO[0m: [1mOpenAi initialized successfully[0m


In [4]:
datagen_model = "gpt-4o-mini"
question = """
Create a CSV file with 10 rows of housing data.
Each row should include the following fields:
 - id (incrementing integer starting at 1)
 - house size (m^2)
 - house price
 - location
 - number of bedrooms

Make sure that the numbers make sense (i.e. more rooms is usually bigger size, more expensive locations increase price. more size is usually higher price etc. make sure all the numbers make sense). Also only respond with the CSV.
"""

response = client.chat.completions.create(
    model=datagen_model,
    messages=[
        {"role": "system", "content": "You are a helpful assistant designed to generate synthetic data."},
        {"role": "user", "content": question}
    ]
)
res = response.choices[0].message.content
print(res)

```csv
id,house_size_m2,house_price_location_USD,location,number_of_bedrooms
1,50,150000,"Downtown",1
2,75,200000,"City Suburb",2
3,100,300000,"Popular Neighborhood",3
4,120,350000,"City Suburb",3
5,150,500000,"Upscale Area",4
6,180,600000,"Luxury District",4
7,200,750000,"Waterfront",5
8,250,900000,"Downtown",5
9,300,1200000,"Luxury District",6
10,400,1500000,"Exclusive Area",7
```


In [5]:
question = """
Create a Python program to generate 100 rows of housing data.
I want you to at the end of it output a pandas dataframe with 100 rows of data.
Each row should include the following fields:
 - id (incrementing integer starting at 1)
 - house size (m^2)
 - house price
 - location
 - number of bedrooms

Make sure that the numbers make sense (i.e. more rooms is usually bigger size, more expensive locations increase price. more size is usually higher price etc. make sure all the numbers make sense).
"""

response = client.chat.completions.create(
    model=datagen_model,
    messages=[
        {"role": "system", "content": "You are a helpful assistant designed to generate synthetic data."},
        {"role": "user", "content": question}
    ]
)
res = response.choices[0].message.content
print(res)

Certainly! Below is a Python program that generates 100 rows of synthetic housing data. The program uses the Pandas library to create a DataFrame with the specified structure and considerations regarding the relationships between the features.

```python
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Constants for generating data
num_rows = 100
location_multiplier = {
    'city_center': 1.5,
    'suburbs': 1.0,
    'rural': 0.5
}

# Generate sample data
data = []
for i in range(1, num_rows + 1):
    location = np.random.choice(['city_center', 'suburbs', 'rural'])
    
    # Define number of bedrooms based on location
    if location == 'city_center':
        bedrooms = np.random.randint(1, 5)  # more expensive areas tend to have smaller properties
    elif location == 'suburbs':
        bedrooms = np.random.randint(2, 6)  # suburbs usually have bigger houses
    else:  # rural
        bedrooms = np.random.randint(3, 7)  # rural house

In [5]:
question = f"""
  I am creating input output training pairs to fine tune my gpt model. I want the input to be product name and category and output to be description. the category should be things like: mobile phones, shoes, headphones, laptop, electronic toothbrush, etc. and also more importantly the categories should come under 4 main topics: vehicle, clothing, toiletries, food)
  After the number of each example also state the topic area. The format should be of the form:
  1. topic_area
  Input: product_name, category
  Output: description

  Do not add any extra characters around that formatting as it will make the output parsing break.

  Here are some helpful examples so you get the style of output correct.

  1) clothing
  Input: "Shoe Name, Shoes"
  Output: "Experience unparalleled comfort. These shoes feature a blend of modern style and the traditional superior cushioning, perfect for those always on the move."
  """


In [6]:
output_string = ""
for i in range(3):
    response = nemotron.chat(prompt=question,
                                  system_prompt="You are a helpful assistant designed to generate synthetic data.")
    output_string += response + "\n" + "\n"


[32mINFO[0m: [1mGenerating response[0m
[32mINFO[0m: [1mResponse generated successfully[0m
[32mINFO[0m: [1mGenerating response[0m
[32mINFO[0m: [1mResponse generated successfully[0m
[32mINFO[0m: [1mGenerating response[0m
[32mINFO[0m: [1mResponse generated successfully[0m


In [17]:
print(output_string[:1000])  #displaying truncated response

Sure, I'd be happy to help you generate some input-output training pairs for your GPT model. Here are some examples that follow the format you provided:

1. vehicle
Input: "Tesla Model S, Electric Cars"
Output: "The Tesla Model S is a premium electric sedan that combines sleek design, impressive performance, and advanced technology. With a range of up to 400 miles on a single charge, it offers a sustainable and luxurious driving experience."

2. clothing
Input: "Levi's 501 Jeans, Denim Pants"
Output: "Levi's 501 Jeans are a timeless classic, featuring a straight-leg fit and durable denim construction. These jeans are perfect for any casual occasion and are built to last, making them a staple in any wardrobe."

3. toiletries
Input: "Oral-B Pro 1000 Electric Toothbrush, Electronic Toothbrushes"
Output: "The Oral-B Pro 1000 Electric Toothbrush provides a superior clean compared to manual toothbrushes. With its oscillating and rotating technology, it removes more plaque and promotes health

In [18]:
correct_output_string = output_string[154:]

In [22]:
print(correct_output_string[:1000])

1. vehicle
Input: "Tesla Model S, Electric Cars"
Output: "The Tesla Model S is a premium electric sedan that combines sleek design, impressive performance, and advanced technology. With a range of up to 400 miles on a single charge, it offers a sustainable and luxurious driving experience."

2. clothing
Input: "Levi's 501 Jeans, Denim Pants"
Output: "Levi's 501 Jeans are a timeless classic, featuring a straight-leg fit and durable denim construction. These jeans are perfect for any casual occasion and are built to last, making them a staple in any wardrobe."

3. toiletries
Input: "Oral-B Pro 1000 Electric Toothbrush, Electronic Toothbrushes"
Output: "The Oral-B Pro 1000 Electric Toothbrush provides a superior clean compared to manual toothbrushes. With its oscillating and rotating technology, it removes more plaque and promotes healthier gums, leaving your mouth feeling fresh and clean."

4. food
Input: " KIND Dark Chocolate Nuts & Sea Salt Bars, Snack Bars"
Output: "KIND Dark Chocolat

In [23]:
pattern = re.compile(r'(\d+)\.\s*(\w+)\s*Input:\s*"(.+?),\s*(.+?)"\s*Output:\s*"(.*?)"', re.DOTALL)
matches = pattern.findall(correct_output_string)

topics = []
products = []
categories = []
descriptions = []

for match in matches:
    number, topic, product, category, description = match
    topics.append(topic)
    products.append(product)
    categories.append(category)
    descriptions.append(description)

In [25]:
products

['Tesla Model S',
 "Levi's 501 Jeans",
 'Oral-B Pro 1000 Electric Toothbrush',
 ' KIND Dark Chocolate Nuts & Sea Salt Bars',
 'Apple MacBook Pro',
 'Adidas Ultraboost 21 Running Shoes',
 "Gillette Fusion5 ProGlide Men's Razor",
 "Annie's Organic Shells & White Cheddar Macaroni & Cheese",
 'Sony WH-1000XM4 Wireless Noise-Cancelling Headphones',
 'Honda Civic',
 'Tesla Model S',
 "Levi's 501 Jeans",
 'Oral-B Pro 1000 Electric Toothbrush',
 ' KIND Dark Chocolate Nuts & Sea Salt Bars',
 'Apple MacBook Pro',
 'Adidas Ultraboost 21 Running Shoes',
 "Gillette Fusion5 ProGlide Men's Razor",
 "Annie's Organic Shells & White Cheddar Macaroni & Cheese",
 'Sony WH-1000XM4 Wireless Noise-Cancelling Headphones',
 'Honda Civic',
 'Tesla Model S',
 "Levi's 501 Jeans",
 'Oral-B Pro 1000 Electric Toothbrush',
 ' KIND Dark Chocolate Nuts & Sea Salt Bars',
 'Apple MacBook Pro',
 'Adidas Ultraboost 21 Running Shoes',
 "Gillette Fusion5 ProGlide Men's Razor",
 "Annie's Organic Shells & White Cheddar Macaron

In [26]:
data = {
    'Product': products,
    'Category': categories,
    'Description': descriptions
}

df = pd.DataFrame(data)