# Generate Mock Data

This notebook demonstrates how to use the Intelligence Toolkit library to generate mock data, both structured records and unstructured texts.

See the [README](https://github.com/microsoft/intelligence-toolkit/blob/main/app/workflows/generate_mock_data/README.md) for more details.

In [1]:
import sys
sys.path.append("..")
import os
from toolkit.generate_mock_data import GenerateMockData
from toolkit.AI.openai_configuration import OpenAIConfiguration
import pandas as pd
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create the workflow object
gmd = GenerateMockData()
# Set the AI configuration. The API key is read from the OPENAI_API_KEY
# environment variable; os.environ[...] raises KeyError if it is not set.
ai_configuration = OpenAIConfiguration(
    {
        "api_type": "OpenAI",
        "api_key": os.environ["OPENAI_API_KEY"],
        "model": "gpt-4o",
    }
)
gmd.set_ai_configuration(ai_configuration)
# Load the data schema. The context manager closes the file handle as soon as
# the JSON has been parsed, and the explicit encoding avoids depending on the
# platform's default text encoding.
schema_path = "../example_outputs/generate_mock_data/customer_complaints/customer_complaints_schema.json"
with open(schema_path, "r", encoding="utf-8") as schema_file:
    json_schema = json.load(schema_file)
# Set the schema
gmd.set_schema(json_schema)
print("Loaded data schema")
print(json_schema)

Loaded data schema
{'$schema': 'http://json-schema.org/draft/2020-12/schema', 'title': 'Customer complaints', 'description': 'An example schema storing an array of customer complaints', 'type': 'object', 'properties': {'customer_complaints': {'type': 'array', 'description': 'The list of customers and their complaints', 'items': {'type': 'object', 'description': 'An object list item', 'properties': {'name': {'type': 'string', 'description': 'The name of the customer'}, 'street': {'type': 'string', 'description': 'The street of the customer, including property name/number'}, 'city': {'type': 'string', 'description': 'The city of the customer'}, 'age': {'type': 'number', 'description': 'The age of the customer'}, 'email': {'type': 'string', 'description': 'The email address of the customer'}, 'price_issue': {'type': 'boolean', 'description': 'The complaint is a price issue'}, 'quality_issue': {'type': 'boolean', 'description': 'The complaint is a quality issue'}, 'service_issue': {'type':

In [3]:
# Generate mock data records
await gmd.generate_data_records(
    num_records_overall=100,
    records_per_batch=10,
    duplicate_records_per_batch=1,
    related_records_per_batch=1,
)
print("Generated data records")

100%|██████████| 10/10 [00:20<00:00,  2.08s/it]

Generated data records





In [4]:
# Show the generated data in its JSON (dict) form
generated_json = gmd.json_object
print(generated_json)

{'customer_complaints': [{'name': 'Alicia Johnson', 'street': '123 Maple St.', 'city': 'Springfield', 'age': 34, 'email': 'alicia.johnson@example.com', 'price_issue': True, 'quality_issue': False, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2021-Q3'}, {'name': 'Robert Johnson', 'street': '124 Maple Street', 'city': 'Springfield', 'age': 36, 'email': 'robert.johnson@example.com', 'price_issue': False, 'quality_issue': False, 'service_issue': True, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2021-Q3'}, {'name': 'Emily Clark', 'street': '456 Oak Avenue', 'city': 'Riverside', 'age': 28, 'email': 'emily.clark@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'B', 'quarter': '2021-Q4'}, {'name': 'Michael Brown', 'street': '789 Pine Road', 'city': 'Greenfield', 'age': 42, 'email': 'mich

In [5]:
# Show the generated data as dataframes; there is one per array field
generated_dfs = gmd.array_dfs
print(generated_dfs)

{'customer_complaints':                name               street         city  age  \
0    Alicia Johnson        123 Maple St.  Springfield   34   
1    Robert Johnson     124 Maple Street  Springfield   36   
2       Emily Clark       456 Oak Avenue    Riverside   28   
3     Michael Brown        789 Pine Road   Greenfield   42   
4     Jessica Smith       321 Cedar Lane     Lakeside   30   
..              ...                  ...          ...  ...   
95       Liam Davis      789 Pine Avenue     Hilltown   41   
96    Sophia Wilson         321 Oak Lane   Greenfield   27   
97   James Martinez       654 Maple Road    Riverside   50   
98  Isabella Garcia  987 Birch Boulevard    Sunnydale   39   
99   Ethan Martinez     123 Cedar Street    Riverbend   33   

                         email  price_issue  quality_issue  service_issue  \
0   alicia.johnson@example.com         True          False          False   
1   robert.johnson@example.com        False          False           True   


In [6]:
# Use the customer_complaints dataframe to generate mock text data (first 10 records only)
df = gmd.array_dfs["customer_complaints"][:10]
await gmd.generate_text_data(df)
print("Generated text data")

100%|██████████| 10/10 [00:28<00:00,  2.89s/it]

Generated text data





In [7]:
# Show the generated texts as a dataframe
generated_texts = gmd.text_df
print(generated_texts)

                                           mock_text
0  **Customer Feedback Report**\n\n**Customer Inf...
1  ---\n\n**Customer Service Report**\n\n**Custom...
2  **Customer Feedback Report**\n\n**Customer Inf...
3  **Customer Service Report**\n\n**Customer Info...
4  **Customer Feedback Report**\n\n**Customer Inf...
5  ---\n\n**Customer Feedback Report**\n\n**Custo...
6  **Customer Feedback Report**\n\n**Customer Inf...
7  **Customer Feedback Report**\n\n**Customer Inf...
8  **Customer Service Report**\n\n**Customer Info...
9  **Customer Complaint Report**\n\n**Customer De...
