# Generate Mock Data

Demonstrates use of the Intelligence Toolkit library to generate mock data, both structured records and unstructured texts.

See the [README](https://github.com/microsoft/intelligence-toolkit/blob/main/app/workflows/generate_mock_data/README.md) for more details.

In [None]:
import sys

sys.path.append("..")
import os
from intelligence_toolkit.generate_mock_data.api import GenerateMockData
from intelligence_toolkit.AI.openai_configuration import OpenAIConfiguration
import json

In [3]:
# Create the workflow object
gmd = GenerateMockData()
# Set the AI configuration. The API key is read from the OPENAI_API_KEY
# environment variable, so a missing variable raises KeyError here.
ai_configuration = OpenAIConfiguration(
    {
        "api_type": "OpenAI",
        "api_key": os.environ["OPENAI_API_KEY"],
        "model": "gpt-4o",
    }
)
gmd.set_ai_configuration(ai_configuration)
# Load the data schema. The context manager closes the file handle as soon as
# parsing finishes, and the explicit UTF-8 encoding keeps the read independent
# of the platform's default locale encoding.
schema_path = "../example_outputs/generate_mock_data/customer_complaints/customer_complaints_schema.json"
with open(schema_path, "r", encoding="utf-8") as schema_file:
    json_schema = json.load(schema_file)
# Set the schema
gmd.set_schema(json_schema)
print("Loaded data schema")
print(json_schema)

Loaded data schema
{'$schema': 'http://json-schema.org/draft/2020-12/schema', 'title': 'Customer complaints', 'description': 'An example schema storing an array of customer complaints', 'type': 'object', 'properties': {'customer_complaints': {'type': 'array', 'description': 'The list of customers and their complaints', 'items': {'type': 'object', 'properties': {'name': {'type': 'string', 'description': 'The name of the customer'}, 'street': {'type': 'string', 'description': 'The street of the customer, including property name/number'}, 'city': {'type': 'string', 'description': 'The city of the customer'}, 'age': {'type': 'number', 'description': 'The age of the customer'}, 'email': {'type': 'string', 'description': 'The email address of the customer'}, 'price_issue': {'type': 'boolean', 'description': 'The complaint is a price issue'}, 'quality_issue': {'type': 'boolean', 'description': 'The complaint is a quality issue'}, 'service_issue': {'type': 'boolean', 'description': 'The compla

In [4]:
# Generate mock data records
await gmd.generate_data_records(
    num_records_overall=100,
    records_per_batch=10,
    duplicate_records_per_batch=1,
    related_records_per_batch=1,
)
print("Generated data records")

100%|██████████| 10/10 [00:16<00:00,  1.63s/it]

Generated data records





In [5]:
# Show the generated data held on the workflow object's `json_object` attribute
generated_json = gmd.json_object
print(generated_json)

{'customer_complaints': [{'name': 'Alicia Johnson', 'street': '123 Maple St.', 'city': 'Springfield', 'age': 34, 'email': 'alicia.j@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2021-Q3'}, {'name': 'Bob Johnson', 'street': '125 Maple Street', 'city': 'Springfield', 'age': 36, 'email': 'bob.johnson@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2021-Q3'}, {'name': 'Charlie Brown', 'street': '456 Oak Avenue', 'city': 'Rivertown', 'age': 29, 'email': 'charlie.brown@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'B', 'quarter': '2021-Q4'}, {'name': 'Diana Prince', 'street': '789 Pine Road', 'city': 'Metropolis', 'age': 42, 'email': 'diana.prince@ex

In [6]:
# Show the generated data as dataframes: `array_dfs` maps each array field
# name to its dataframe (one per array field)
dataframes_by_field = gmd.array_dfs
print(dataframes_by_field)

{'customer_complaints':               name            street         city  age  \
0   Alicia Johnson     123 Maple St.  Springfield   34   
1      Bob Johnson  125 Maple Street  Springfield   36   
2    Charlie Brown    456 Oak Avenue    Rivertown   29   
3     Diana Prince     789 Pine Road   Metropolis   42   
4       Ethan Hunt    321 Birch Lane  Springfield   37   
..             ...               ...          ...  ...   
94      Daniel Lee     23 Elm Street  Springfield   39   
95    Laura Wilson     89 Birch Lane      Seaview   27   
96  James Martinez   56 Cedar Avenue      Hilltop   45   
97    Olivia Davis    34 Willow Road   Brookfield   38   
98   Sophia Garcia     67 Ash Street    Ridgewood   33   

                         email  price_issue  quality_issue  service_issue  \
0         alicia.j@example.com        False           True          False   
1      bob.johnson@example.com        False           True           True   
2    charlie.brown@example.com         True     

In [7]:
# Use the customer_complaints dataframe to generate mock text data (first 10 records only)
df = gmd.array_dfs["customer_complaints"][:10]
await gmd.generate_text_data(df)
print("Generated text data")

100%|██████████| 10/10 [00:15<00:00,  1.57s/it]

Generated text data





In [8]:
# Show the generated texts, held as a dataframe on `text_df`
generated_texts = gmd.text_df
print(generated_texts)

                                           mock_text
0  **Customer Feedback Report**\n\n**Customer Inf...
1  **Customer Feedback Report**\n\n**Customer Inf...
2  **Customer Complaint Report**\n\n**Customer In...
3  **Customer Service Report**\n\n**Customer Info...
4  **Customer Service Report**\n\n**Customer Info...
5  **Customer Service Report**\n\n**Customer Info...
6  **Customer Feedback Report**\n\n**Customer Inf...
7  **Customer Feedback Report**\n\n**Customer Inf...
8  **Customer Feedback Report**\n\n**Customer Inf...
9  **Customer Feedback Report**\n\n**Customer Det...
