# Extract Record Data

This notebook demonstrates how to use the Intelligence Toolkit library to extract schema-aligned data records from unstructured texts.

See the [README](https://github.com/microsoft/intelligence-toolkit/blob/main/app/workflows/extract_data_records/README.md) for more details.

In [1]:
import sys
sys.path.append("..")
import os
from toolkit.extract_record_data import ExtractRecordData
from toolkit.AI.openai_configuration import OpenAIConfiguration
import pandas as pd
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create the workflow object
erd = ExtractRecordData()

# Set the AI configuration. The API key is read from the OPENAI_API_KEY
# environment variable so that no credential is stored in the notebook;
# a missing variable raises KeyError here.
ai_configuration = OpenAIConfiguration(
    {
        "api_type": "OpenAI",
        "api_key": os.environ["OPENAI_API_KEY"],
        "model": "gpt-4o",
    }
)
erd.set_ai_configuration(ai_configuration)

# Load the data schema. The context manager closes the file handle as soon as
# parsing finishes, and the explicit encoding keeps the result independent of
# the platform's default text encoding.
schema_path = "../example_outputs/extract_record_data/customer_complaints/customer_complaints_schema.json"
with open(schema_path, "r", encoding="utf-8") as schema_file:
    json_schema = json.load(schema_file)

# Set the schema
erd.set_schema(json_schema)
print("Loaded data schema")
print(json_schema)

Loaded data schema
{'$schema': 'http://json-schema.org/draft/2020-12/schema', 'title': 'Customer complaints', 'description': 'An example schema storing an array of customer complaints', 'type': 'object', 'properties': {'customer_complaints': {'type': 'array', 'description': 'The list of customers and their complaints', 'items': {'type': 'object', 'description': 'An object list item', 'properties': {'name': {'type': 'string', 'description': 'The name of the customer'}, 'street': {'type': 'string', 'description': 'The street of the customer, including property name/number'}, 'city': {'type': 'string', 'description': 'The city of the customer'}, 'age': {'type': 'number', 'description': 'The age of the customer'}, 'email': {'type': 'string', 'description': 'The email address of the customer'}, 'price_issue': {'type': 'boolean', 'description': 'The complaint is a price issue'}, 'quality_issue': {'type': 'boolean', 'description': 'The complaint is a quality issue'}, 'service_issue': {'type':

In [3]:
# Read the example texts from CSV and keep only the first 10 rows
text_data_path = (
    "../example_outputs/extract_record_data/customer_complaints/"
    "customer_complaints_texts.csv"
)
text_data = pd.read_csv(text_data_path).head(10)
print("Loaded text data")
print(text_data)

Loaded text data
                                           mock_text
0  **Customer Service Representative:** Good afte...
1  **Customer Service Representative:** Good afte...
2  **Customer Service Representative:** Good afte...
3  **Customer Service Representative:** Good afte...
4  **Customer Service Representative:** Good afte...
5  **Customer Service Representative:** Good afte...
6  **Customer Service Representative:** Good afte...
7  **Customer Service Representative:** Good afte...
8  **Customer Service Representative:** Good afte...
9  **Customer Service Representative:** Good afte...


In [4]:
# Extract data records
await erd.extract_record_data(
    input_texts=text_data['mock_text'].tolist()
)
print("Extracted data records")

100%|██████████| 10/10 [00:04<00:00,  2.34it/s]

Extracted data records





In [5]:
# Show the extracted records as a single JSON-style object
extracted_json = erd.json_object
print(extracted_json)

{'customer_complaints': [{'name': 'Bob Johnson', 'street': '123 Maple Street', 'city': 'Springfield', 'age': 36, 'email': 'bob.johnson@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': False, 'product_code': 'A', 'quarter': '2023-Q2'}, {'name': 'Alice Johnson', 'street': '456 Oak Avenue', 'city': 'Springfield', 'age': 0, 'email': 'alice.j@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': True, 'description_issue': False, 'product_code': 'A', 'quarter': '2023-Q3'}, {'name': 'Alice Smith', 'street': '789 Pine Road', 'city': 'Springfield', 'age': 0, 'email': 'alice.smith@example.com', 'price_issue': False, 'quality_issue': True, 'service_issue': False, 'delivery_issue': False, 'description_issue': True, 'product_code': 'A', 'quarter': '2023-Q2'}, {'name': 'Alice Johnson', 'street': '123 Maple Street', 'city': 'Shelbyville', 'age': 0, 'email': 'alice.johnson@ano

In [6]:
# Show the extracted records as dataframes (one dataframe per array field)
extracted_dfs = erd.array_dfs
print(extracted_dfs)

{'customer_complaints':             name               street          city  age  \
0    Bob Johnson     123 Maple Street   Springfield   36   
1  Alice Johnson       456 Oak Avenue   Springfield    0   
2    Alice Smith        789 Pine Road   Springfield    0   
3  Alice Johnson     123 Maple Street   Shelbyville    0   
4  Alice Johnson     123 Maple Street   Springfield    0   
5  Charlie Brown       321 Elm Street   Shelbyville    0   
6   Diana Prince       987 Cedar Lane    Metropolis    0   
7    Evan Wright  654 Birch Boulevard        Gotham   30   
8    Fiona Apple     111 Apple Street     Star City   29   
9  George Martin     222 Music Avenue  Central City    0   

                           email  price_issue  quality_issue  service_issue  \
0        bob.johnson@example.com        False           True          False   
1            alice.j@example.com        False           True          False   
2        alice.smith@example.com        False           True          False   