# Generate Mock Data

This notebook demonstrates how to use the Intelligence Toolkit library to generate mock data, both structured records and unstructured texts.

See the [README](https://github.com/microsoft/intelligence-toolkit/blob/main/app/workflows/generate_mock_data/README.md) for more details.

In [1]:
# Standard-library imports
import json
import os
import sys

# Add the parent directory to the module search path before importing the
# toolkit (presumably so the local intelligence_toolkit package resolves when
# the notebook is run from its own folder; confirm against the repo layout).
sys.path.append("..")

# Intelligence Toolkit imports
from intelligence_toolkit.AI.openai_configuration import OpenAIConfiguration
from intelligence_toolkit.generate_mock_data.api import GenerateMockData

In [2]:
# Create the workflow object
gmd = GenerateMockData()

# Set the AI configuration. The API key is read from the environment rather
# than hardcoded; a missing OPENAI_API_KEY raises KeyError at this point.
ai_configuration = OpenAIConfiguration(
    {
        "api_type": "OpenAI",
        "api_key": os.environ["OPENAI_API_KEY"],
        "model": "gpt-4o",
    }
)
gmd.set_ai_configuration(ai_configuration)

# Load the data schema. The context manager guarantees the file handle is
# closed even if parsing fails, and the explicit UTF-8 encoding keeps the read
# independent of the platform's default locale encoding.
schema_path = "../example_outputs/generate_mock_data/customer_complaints/customer_complaints_schema.json"
with open(schema_path, "r", encoding="utf-8") as schema_file:
    json_schema = json.load(schema_file)

# Set the schema
gmd.set_schema(json_schema)
print("Loaded data schema")
print(json_schema)

Loaded data schema
{'$schema': 'http://json-schema.org/draft/2020-12/schema', 'title': 'Customer complaints', 'description': 'An example schema storing an array of customer complaints', 'type': 'object', 'properties': {'customer_complaints': {'type': 'array', 'description': 'The list of customers and their complaints', 'items': {'type': 'object', 'properties': {'name': {'type': 'string', 'description': 'The name of the customer'}, 'street': {'type': 'string', 'description': 'The street of the customer, including property name/number'}, 'city': {'type': 'string', 'description': 'The city of the customer'}, 'age': {'type': 'number', 'description': 'The age of the customer'}, 'email': {'type': 'string', 'description': 'The email address of the customer'}, 'price_issue': {'type': 'boolean', 'description': 'The complaint is a price issue'}, 'quality_issue': {'type': 'boolean', 'description': 'The complaint is a quality issue'}, 'service_issue': {'type': 'boolean', 'description': 'The compla

In [3]:
# Generate mock data records
await gmd.generate_data_records(
    num_records_overall=100,
    records_per_batch=10,
    duplicate_records_per_batch=1,
    related_records_per_batch=1,
)
print("Generated data records")

# 28s

100%|██████████| 9/9 [00:10<00:00,  1.15s/it]

Generated data records





In [4]:
# Inspect the generated data in its JSON-object form
generated_json = gmd.json_object
print(generated_json)

{'customer_complaints': [{'name': 'John Doe', 'street': '123 Elm Street', 'city': 'Springfield', 'age': 35, 'email': 'john.doe@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2023-Q1'}, {'name': 'Jane Smith', 'street': '456 Oak Avenue', 'city': 'Riverton', 'age': 28, 'email': 'jane.smith@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'B', 'quarter': '2022-Q4'}, {'name': 'Alice Johnson', 'street': '789 Pine Road', 'city': 'Laketown', 'age': 42, 'email': 'alice.johnson@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'C', 'quarter': '2021-Q3'}, {'name': 'Bob Brown', 'street': '101 Maple Street', 'city': 'Hill Valley', 'age': 50, 'email': 'bob.brown@example.com', 'p

In [5]:
# Inspect the generated data as dataframes (one per array field),
# printed as a single mapping
generated_frames = gmd.array_dfs
print(generated_frames)

{'customer_complaints':              name               street           city  age  \
0        John Doe       123 Elm Street    Springfield   35   
1      Jane Smith       456 Oak Avenue       Riverton   28   
2   Alice Johnson        789 Pine Road       Laketown   42   
3       Bob Brown     101 Maple Street    Hill Valley   50   
4     Carol White       202 Birch Lane      Sunnydale   37   
..            ...                  ...            ...  ...   
95    Emily White        321 Pine Road     Coast City   42   
96  Michael Green      654 Cedar Drive      Bludhaven   39   
97     Laura Blue  987 Birch Boulevard  Keystone City   50   
98   Henry Silver   159 Spruce Terrace     Smallville   33   
99    Olivia Gold    753 Cherry Circle  National City   37   

                        email  price_issue  quality_issue  service_issue  \
0        john.doe@example.com         True          False          False   
1      jane.smith@example.com        False           True          False   
2  

In [6]:
# Use the customer_complaints dataframe to generate mock text data (first 10 records only)
df = gmd.array_dfs["customer_complaints"][:10]
await gmd.generate_text_data(df)
print("Generated text data")

100%|██████████| 10/10 [00:09<00:00,  1.10it/s]

Generated text data





In [7]:
# Inspect the generated texts as a dataframe
generated_texts = gmd.text_df
print(generated_texts)

                                           mock_text
0  **Customer Service Report**\n\n**Customer Info...
1  **Customer Feedback Report**\n\n**Customer Inf...
2  **Customer Service Report**\n\n**Customer Info...
3  **Customer Feedback Report**\n\n**Customer Inf...
4  ---\n\n**Customer Feedback Report**\n\n**Name:...
5  **Customer Complaint Report**\n\n**Customer In...
6  **Customer Service Report**\n\n**Customer Info...
7  **Customer Feedback Report**\n\n**Customer Inf...
8  **Customer Feedback Report**\n\n**Customer Inf...
9  **Customer Feedback Report**\n\n**Customer Inf...
