In [1]:
import asyncio 
import json
import random
import os
from collections import Counter

import factory
import factory.fuzzy
import factory.random
from faker import Faker
from faker_vehicle import VehicleProvider
import pandas as pd
from openai import OpenAI, AsyncOpenAI

# Fixed seed so the synthetic data set can be regenerated from a fresh kernel.
SEED = 43
random.seed(SEED)
factory.random.reseed_random(SEED)

# NOTE(review): `fake` is not seeded explicitly in this cell — confirm that the
# factory reseed above also covers Faker before relying on reproducible output.
fake = Faker()
# Register the vehicle provider on `fake`; VehicleFactory calls
# fake.vehicle_object() / fake.machine_object() further down.
fake.add_provider(VehicleProvider)



In [2]:
# Never commit a real key in the notebook: take it from the environment.
# Falls back to "" (the previous hardcoded value) when the variable is unset.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")

In [3]:
# Export the key for the OpenAI client, but only when one was actually
# provided: an empty string must not clobber a key already in the environment.
if openai_api_key:
    os.environ["OPENAI_API_KEY"] = openai_api_key

## 1. Create random vehicle data with factory_boy and Faker

In [4]:
# Number of synthetic records to generate.
k = 10_000

In [5]:
# Exterior colours a generated record can be assigned.
colors = [
    "Green",
    "Black",
    "White",
    "Red",
    "Blue",
]
# Record kinds; VehicleFactory uses this to pick between
# fake.vehicle_object() and fake.machine_object().
type_vehicle = [
    "Vehicle",
    "Machine",
]

In [6]:
class VehicleFactory(factory.DictFactory):
    """Build one synthetic vehicle/machine record as a plain dict.

    The resulting dict has the keys ``type``, ``color``, ``brand``, ``model``,
    ``category``, ``year`` and ``price``. ``_vehicle`` is a helper declaration
    that the other fields read from; it is excluded from the output.
    """

    class Meta:
        # Must be a collection of names: with a bare string the membership
        # check becomes a substring test, so a field whose name happens to be
        # a substring of "_vehicle" would be dropped silently.
        exclude = ("_vehicle",)

    type = factory.fuzzy.FuzzyChoice(type_vehicle)
    color = factory.fuzzy.FuzzyChoice(colors)

    # Source object for the derived fields below, chosen by the record type.
    _vehicle = factory.LazyAttribute(
        lambda o: fake.vehicle_object()
        if o.type == "Vehicle"
        else fake.machine_object()
    )
    brand = factory.LazyAttribute(lambda o: o._vehicle["Make"])
    model = factory.LazyAttribute(lambda o: o._vehicle["Model"])
    category = factory.LazyAttribute(lambda o: o._vehicle["Category"])
    year = factory.LazyAttribute(lambda o: o._vehicle["Year"])

    # Random price between 1000 and 10000 in steps of 100.
    price = factory.fuzzy.FuzzyInteger(1000, 10000, 100)

In [7]:
# Build a single record to eyeball the fields the factory produces.
VehicleFactory()

{'type': 'Vehicle',
 'color': 'White',
 'brand': 'Volkswagen',
 'model': 'Tiguan',
 'category': 'SUV',
 'year': 2019,
 'price': 9900}

In [8]:
# Generate the full synthetic data set: k independent factory builds.
vehicle = [VehicleFactory() for _ in range(k)]
len(vehicle)

10000

## 2. Generate descriptions and facets with an LLM

In [9]:
# Caps the number of OpenAI requests in flight at the same time.
semaphore = asyncio.Semaphore(value=5)

In [10]:
# Delimiters the prompt asks the model to wrap its JSON answer in.
_JSON_START = "{JSON START}"
_JSON_END = "{JSON END}"


def _extract_json_payload(text):
    """Return the JSON object text embedded in a model reply.

    Looks between the ``{JSON START}`` / ``{JSON END}`` delimiters when both
    are present (in that order); otherwise searches the whole reply. In either
    case the result is trimmed to the outermost ``{ ... }`` so surrounding
    noise such as Markdown code fences is ignored.

    Raises:
        ValueError: if no ``{ ... }`` span can be found.
    """
    start = text.find(_JSON_START)
    # Only look for the end marker after the start marker; find() returns -1
    # when a marker is missing, which must not be used as a slice offset.
    end = text.find(_JSON_END, start + len(_JSON_START)) if start != -1 else -1
    if start != -1 and end != -1:
        candidate = text[start + len(_JSON_START):end]
    else:
        candidate = text.replace(_JSON_START, "").replace(_JSON_END, "")

    first = candidate.find("{")
    last = candidate.rfind("}")
    if first == -1 or last < first:
        raise ValueError("no JSON object found in model response")
    return candidate[first:last + 1]


def _fallback_result(item_with_defaults):
    """Build the placeholder record returned when the reply cannot be parsed."""
    facets = dict(item_with_defaults)
    # Successful replies use "exterior_color"; keep the fallback on the same
    # facet schema instead of leaking the input key "color".
    facets["exterior_color"] = facets.pop("color")
    return {
        "description": "Parsing error - check response format",
        "facets": facets,
    }


# Define the function to generate a description and JSON facets with OpenAI Chat API
async def generate_description_and_facets(item):
    """Ask the chat model for a description and facet JSON for one item.

    Uses the notebook-level ``client`` (chat completions API) and
    ``semaphore`` (concurrency limit), which must exist when this coroutine
    is awaited.

    Args:
        item: dict describing the vehicle/machine. Every field is read with
            ``item.get`` so missing keys are replaced by placeholder defaults.

    Returns:
        dict with a ``"description"`` string and a ``"facets"`` dict. When the
        reply is empty, is not parseable JSON, or has no ``"facets"`` object,
        a fallback record built from the input item (with defaults) is
        returned and the parsing error is printed.

    Raises:
        Whatever ``client.chat.completions.create`` raises; API errors are not
        caught here.
    """
    # Fill in missing fields with default values to avoid errors
    item_with_defaults = {
        "type": item.get("type", "vehicle"),
        "brand": item.get("brand", "Unknown Brand"),
        "model": item.get("model", "Unknown Model"),
        "year": item.get("year", "Unknown Year"),
        "engine_type": item.get("engine_type", "N/A"),
        "engine_size": item.get("engine_size", "Unknown Size"),
        "transmission": item.get("transmission", "N/A"),
        "drive_type": item.get("drive_type", "N/A"),
        "fuel_type": item.get("fuel_type", "N/A"),
        "color": item.get("color", "Unknown Color"),
        "interior_color": item.get("interior_color", "Standard"),
        "trim_level": item.get("trim_level", "Standard"),
        "notable_features": item.get("notable_features", ["Standard Features"]),
        "safety_features": item.get("safety_features", ["Basic Safety Features"]),
        "price": item.get("price", "N/A"),
        "category": item.get("category", "Unknown Category"),
    }

    # Define the prompt with explicit instructions for JSON output
    prompt = f"""
    You are creating entries for a **synthetic dataset of vehicles** for a **fake car e-commerce search engine** on Elasticsearch. Each vehicle entry needs a **short, engaging description** without directly mentioning the price, but instead using adjectives like "affordable," "premium," or "budget-friendly" where appropriate. Additionally, provide a **structured JSON output for standardized facet fields**.

    For any missing fields, do not leave placeholders like "N/A" or "Unknown." Instead, infer realistic and typical values based on the vehicle's type, brand, model, year, and category. Use complete and natural values for each field, ensuring no field is left blank or with placeholder text. Use the following guidelines to infer missing details:

    - **Drive Types**: Use standardized terms like "FWD" for Front Wheel Drive, "RWD" for Rear Wheel Drive, "AWD" for All Wheel Drive, and "4WD" for Four Wheel Drive, inferred based on the vehicle type and category.
    - **Fuel Types**: Choose "Gasoline," "Diesel," "Electric," or "Hybrid" based on the vehicle’s age, category, and brand.
    - **Trim Levels**: Include a realistic trim level such as "XLE," "Sport," "SE," or "Standard" that aligns with the vehicle model and brand.
    - **Engine Type and Size**: Specify a typical engine type and size (e.g., "V6 3.5L" or "I4 2.0L") that would be common for the vehicle’s brand, model, and year.
    - **Transmission**: Use terms like "Automatic" or "Manual" for vehicles. Inferred based on vehicle type and era if not provided.
    - **Safety Features**: For vehicles, include common safety features like "ABS," "Airbags," and "Lane-keeping assist." For machinery, use "Stability control" or "Reinforced structure" where applicable.
    - **Notable Features**: List standout features, such as "Leather seats," "Navigation," "Backup camera," or "Bluetooth," inferred based on typical configurations for the brand and model.

    Below are the item details. Avoid directly referencing the price in the description, but include the `price` in the structured JSON output.

    Item details:
    - Type: {item.get("type", "vehicle")}
    - Make and Model: {item.get("brand", "Unknown Brand")} {item.get("model", "Unknown Model")}
    - Year: {item.get("year", "Unknown Year")}
    - Engine Type and Size: {item.get("engine_type", "")}
    - Transmission: {item.get("transmission", "")}
    - Drive Type: {item.get("drive_type", "")}
    - Fuel Type: {item.get("fuel_type", "")}
    - Exterior Color: {item.get("color", "Unknown Color")}
    - Interior Color: {item.get("interior_color", "Standard")}
    - Trim Level: {item.get("trim_level", "Standard")}
    - Notable Features: {', '.join(item.get("notable_features", []))}
    - Safety Features: {', '.join(item.get("safety_features", []))}
    - Price: {item.get("price", "N/A")}
    - Category: {item.get("category", "Unknown Category")}

    Please respond in **JSON format only** with the following structure, using `{{JSON START}}` and `{{JSON END}}` as delimiters around the JSON:

    {{JSON START}}
    {{
        "description": "<description_text>",
        "facets": {{
            "type": "{item_with_defaults['type']}",
            "category": "{item_with_defaults['category']}",
            "brand": "{item_with_defaults['brand']}",
            "model": "{item_with_defaults['model']}",
            "year": "{item_with_defaults['year']}",
            "engine_type": "{item_with_defaults['engine_type']}",
            "engine_size": "{item_with_defaults['engine_size']}",
            "transmission": "{item_with_defaults['transmission']}",
            "drive_type": "{item_with_defaults['drive_type']}",
            "fuel_type": "{item_with_defaults['fuel_type']}",
            "exterior_color": "{item_with_defaults['color']}",
            "interior_color": "{item_with_defaults['interior_color']}",
            "trim_level": "{item_with_defaults['trim_level']}",
            "notable_features": ["{', '.join(item_with_defaults['notable_features'])}"],
            "safety_features": ["{', '.join(item_with_defaults['safety_features'])}"],
            "price": "{item_with_defaults['price']}"
        }}
    }}
    {{JSON END}}
    """
    # Execute the API call within the semaphore context
    async with semaphore:
        response = await client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
            model="gpt-4o-mini",
            # NOTE(review): a reply cut off at this limit will not parse and
            # ends up in the fallback branch below — raise it if that happens often.
            max_tokens=300,
            temperature=0.5,
        )

    # The message content can be None; treat that like an empty reply so it
    # goes through the fallback path instead of raising AttributeError.
    text_response = (response.choices[0].message.content or "").strip()
    try:
        data = json.loads(_extract_json_payload(text_response))
        # Later cells index result["facets"][...], so reject any other shape here.
        if not isinstance(data, dict) or not isinstance(data.get("facets"), dict):
            raise ValueError("response JSON has no 'facets' object")
        return data
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print("Error parsing response:", e)
        return _fallback_result(item_with_defaults)

In [11]:
# Async OpenAI client used by generate_description_and_facets.
# Passing the key explicitly mirrors the client's default and can be omitted.
client = AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


In [12]:
# Send only the first 5000 generated records to the LLM.
items = vehicle[:5000]

In [13]:
processed_data = await asyncio.gather(*[generate_description_and_facets(item) for item in items])

In [14]:
# Preview only the first few records; echoing the whole list floods the
# notebook output and bloats the saved file.
processed_data[:3]

[{'description': 'Experience the thrill of driving with this 2008 Lotus Exige S, a budget-friendly sports coupe that combines lightweight design with exhilarating performance. With its striking red exterior and agile handling, this vehicle promises an unforgettable ride. Perfect for enthusiasts seeking a unique driving experience without breaking the bank.',
  'facets': {'type': 'Vehicle',
   'category': 'Coupe',
   'brand': 'Lotus',
   'model': 'Exige S',
   'year': '2008',
   'engine_type': 'I4',
   'engine_size': '1.8L',
   'transmission': 'Manual',
   'drive_type': 'RWD',
   'fuel_type': 'Gasoline',
   'exterior_color': 'Red',
   'interior_color': 'Standard',
   'trim_level': 'Standard',
   'notable_features': ['Leather seats', 'Navigation', 'Bluetooth'],
   'safety_features': ['ABS', 'Airbags', 'Stability control'],
   'price': '5700'}},
 {'description': 'Experience the future of driving with the 2014 BMW i3, a compact hatchback that combines sustainability with style. This budget

In [15]:
# Sanity check: there should be one result per entry in `items`.
len(processed_data)

5000

In [16]:
# Tally how many processed records fall under each facet type.
type_counts = Counter(
    record["facets"]["type"]
    for record in processed_data
)
type_counts

Counter({'Vehicle': 2500, 'Machine': 2500})

In [17]:
# One row per processed record, with "description" and "facets" columns.
df = pd.DataFrame(data=processed_data)


In [18]:
# Write the records to disk. The target directory is created first so the
# export also works on a fresh checkout where ./data does not exist yet.
output_path = "./data/processed_vehicle_machine_data.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_json(output_path, orient="records", indent=2)