In [1]:
import os
import json 
import pandas as pd 
from openai import OpenAI
# Show full cell contents so long ticket strings are not truncated when displayed.
pd.set_option('display.max_colwidth', None)

try:
    from dotenv import load_dotenv
except ImportError:
    # Imported locally because they are only needed for this install fallback.
    import subprocess
    import sys

    print("Installing python-dotenv package...")
    # Invoke pip through the running interpreter so the package is installed into
    # the same environment as this kernel; check_call raises if the install fails
    # instead of silently continuing to a second ImportError.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "python-dotenv"])
    from dotenv import load_dotenv
# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# Raises KeyError if OPENAI_API_KEY is not set in the environment.
client = OpenAI(api_key = os.environ['OPENAI_API_KEY'])

## Cleaning unformatted data using function calling 

User input data is often not formatted consistently. Data analysts usually clean up the data with rule-based parsing, using string manipulation and regex. This process can get quite tedious, as one has to account for all potential formats.

Using an LLM to parse the data can significantly streamline data cleaning. 

In this notebook, we clean up some artificial user input data that includes order items from an imaginary ecommerce company and their respective dimensions.  

### The data 

The data does not adhere to any one format. To a human it's parseable, but creating a set of rules that would be able to parse even 5 distinct formats is going to be difficult. 

In [2]:
# Read the raw item tickets from disk and display them.
tickets_csv = 'data/item_dimension_tickets.csv'
df = pd.read_csv(tickets_csv)
df

Unnamed: 0,ticket_content
0,"OI-842342, length = 73cm, width = 45cm, height = 55cm"
1,"#Item-325364, 46x34x56 cm"
2,"#OI-43253252, l-45cm,w-34cm,h-67cm"
3,#452453 34inx56cmx2ft
4,"OrderItem#373578 96,56,23"


### The function call

However, LLMs are smart enough to semantically parse messy text data. 

Function calling is a feature of the Chat Completions endpoint. It lets you describe a function with a JSON schema so that the model returns its output as a JSON object matching that schema. 

Let's define a Python function that accepts a ticket and returns a dictionary of the contents from it that we want extracted. 

In [3]:
def call_and_clean(text: str, model: str = "gpt-4-turbo-preview") -> dict:
    """
    Use OpenAI function tool to extract order item id and dimensions with their units from user tickets.

    The request forces the model to call ``get_item_id_and_dimensions`` so the
    reply carries structured arguments rather than free-form text.

    Args:
    text (str): The text that you want to parse and clean.
    model (str): The OpenAI model alias.

    Returns:
    dict: The cleaned and parsed text output from the model.

    Raises:
    ValueError: If the response contains no tool call to parse.
    """

    function_name = "get_item_id_and_dimensions"

    # View this as a structured output format you want
    tools = [{"type": "function",
            "function": {
                "name": function_name,
                "description": "Gets the order item number and dimensions from a user ticket about an item.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "order_item_id": {
                            "type": "number",
                            "description": "The order item identifier, its only the number that usually follows a hashtag, or oi tag."
                        },
                        "length": { "type": "number" },
                        "width": { "type": "number" },
                        "height": { "type": "number" },
                        "length_units": { "type": "string", "enum": ["cm", "in", "ft"] },
                        "width_units": { "type": "string", "enum": ["cm", "in", "ft"] },
                        "height_units": { "type": "string", "enum": ["cm", "in", "ft"] }
                    },
                    "required": ["order_item_id", "length", "width", "height", "length_units", "width_units", "height_units"]
                }
            }
        }
    ]

    # The fallback rule is about missing *units*, not missing dimensions.
    system_prompt = """
    Given an item ticket, parse the ticket such that the get_item_id_and_dimensions function can be called for the contents of the ticket.

    If no units are provided assume they are in cm. Sometimes the units might be spelled incorrectly, infer the unit in those cases.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role":"system", "content": system_prompt},
            {"role": "user", "content": f"Item Ticket: {text}"}
        ],
        tools=tools,
        # Without this the model may answer in plain text, leaving tool_calls empty.
        tool_choice={"type": "function", "function": {"name": function_name}}
    )

    tool_calls = response.choices[0].message.tool_calls
    if not tool_calls:
        # Fail with a clear message instead of a TypeError from indexing None.
        raise ValueError(f"Model returned no tool call for ticket: {text!r}")

    # The function arguments arrive as a JSON-encoded string.
    return json.loads(tool_calls[0].function.arguments)

### Clean the data 

In [4]:
# Run every ticket through call_and_clean and store the resulting
# dictionaries in a new column next to the original text.
raw_tickets = df['ticket_content']
df['parsed_ticket_content'] = raw_tickets.apply(call_and_clean)

In [5]:
# Display the tickets together with the new parsed_ticket_content column.
df

Unnamed: 0,ticket_content,parsed_ticket_content
0,"OI-842342, length = 73cm, width = 45cm, height = 55cm","{'height': 55, 'height_units': 'cm', 'length': 73, 'length_units': 'cm', 'order_item_id': 842342, 'width': 45, 'width_units': 'cm'}"
1,"#Item-325364, 46x34x56 cm","{'height': 56, 'height_units': 'cm', 'length': 46, 'length_units': 'cm', 'order_item_id': 325364, 'width': 34, 'width_units': 'cm'}"
2,"#OI-43253252, l-45cm,w-34cm,h-67cm","{'height': 67, 'height_units': 'cm', 'length': 45, 'length_units': 'cm', 'order_item_id': 43253252, 'width': 34, 'width_units': 'cm'}"
3,#452453 34inx56cmx2ft,"{'height': 2, 'height_units': 'ft', 'length': 34, 'length_units': 'in', 'order_item_id': 452453, 'width': 56, 'width_units': 'cm'}"
4,"OrderItem#373578 96,56,23","{'height': 23, 'height_units': 'cm', 'length': 96, 'length_units': 'cm', 'order_item_id': 373578, 'width': 56, 'width_units': 'cm'}"
