In [None]:
%pip install langchain
%pip install openai
%pip install pandas

In [None]:
import json
import os
from getpass import getpass

import pandas as pd
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate

In [None]:
# https://platform.openai.com/
# Never paste the key into the notebook. Use the value already present in the
# environment, and only prompt for it (input is not echoed) when it is missing,
# so the secret is never stored in the saved file.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Report availability only; printing the key itself would leak it into the
# cell output that gets saved and shared with the notebook.
print("OPENAI_API_KEY is set:", bool(os.environ["OPENAI_API_KEY"]))

In [None]:
# Chat model client used for the extraction call further down.
# temperature=0 asks for the least random sampling; the API key is taken from
# the environment variable configured earlier in the notebook.
chat_model = ChatOpenAI(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    temperature=0,
)

In [None]:
# Output field name -> description shown to the model.
# NOTE(review): "th guide" in the second description looks like a typo for
# "the guide"; it is kept verbatim here because the text is part of the prompt.
schema_descriptions = {
    "raw_data": "This is the expense raw data as a string",
    "parsed_data": "This is the parsed data as a list of dictionaries and th guide you'll use to parse the raw data",
}

# One ResponseSchema per field, in the order declared above.
response_schemas = [
    ResponseSchema(name=field_name, description=field_description)
    for field_name, field_description in schema_descriptions.items()
]

# Parser built from the schemas; it supplies the format instructions that are
# embedded in the prompt.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [None]:
# Ask the parser for the formatting instructions derived from the response
# schemas; this text is injected into the prompt as `format_instructions`.
format_instructions = output_parser.get_format_instructions()
# Printed so the exact instructions sent to the model can be inspected.
print(format_instructions)

In [None]:
# Prompt text for the single human message sent to the chat model.
# It contains three placeholders: `{format_instructions}` is bound when the
# prompt object is constructed (partial variable), while `{raw_data}` and
# `{parsed_data}` are supplied when the prompt is formatted.
template = """
You will be given a series of records with expenses raw data.
Omit rows with no amount data.
Find the best corresponding match for the amount, description and date.
The closest match will be the one with the closest semantic meaning. Not just string similarity.

{format_instructions}

Wrap your final output with closed and open brackets (a list of json objects)

raw_data INPUT:
{raw_data}

PARSED DATA INPUT:
{parsed_data}

YOUR RESPONSE:
"""

In [None]:
# Wrap the template text as the one human message of the chat prompt.
human_message = HumanMessagePromptTemplate.from_template(template)

# `format_instructions` is fixed here as a partial variable; `raw_data` and
# `parsed_data` remain open and must be passed when the prompt is formatted.
prompt = ChatPromptTemplate(
    input_variables=["raw_data", "parsed_data"],
    partial_variables={"format_instructions": format_instructions},
    messages=[human_message],
)

In [None]:
# Load the already-structured CSV and convert it to one dict per row; this
# list is what gets passed into the prompt as `parsed_data`.
# NOTE(review): presumably these rows act as the target-format example for the
# model — confirm against the contents of structured_data.csv.
df = pd.read_csv("./structured_data.csv")
parsed_data = df.to_dict(orient="records")

# Bare last expression so the notebook displays the records.
parsed_data

In [None]:
# Read the raw expense file as one text blob for the prompt.
# The encoding is stated explicitly so decoding does not depend on the
# platform's locale default (assumes the file is UTF-8 — adjust if the export
# uses a different encoding).
# No empty-string placeholder is assigned first, so a failed read cannot leave
# an empty `raw_data` behind for later cells to send to the model.
with open('input.csv', 'r', encoding='utf-8') as f:
    raw_data = f.read()

# Bare last expression so the notebook displays the text that was read.
raw_data

In [None]:
# Fill the two open prompt variables and keep the formatted prompt for the
# model call in the next cell.
_input = prompt.format_prompt(parsed_data=parsed_data, raw_data=raw_data)

# Inspect what will be sent: message count, message type, then the full text.
rendered_messages = _input.messages
first_message = rendered_messages[0]
print(f"There are {len(rendered_messages)} message(s)")
print(f"Type: {type(first_message)}")
print("---------------------------")
print(first_message.content)

In [None]:
# Convert the formatted prompt to chat messages and send them to the model;
# the reply is kept in `output` for the parsing cells below.
chat_messages = _input.to_messages()
output = chat_model(chat_messages)

In [None]:
# Show the reply's Python type on one line and its printed form on the next.
print(type(output), output, sep="\n")


In [None]:
# Extract the JSON payload from the model reply.
# When the reply wraps the payload in a ```json fenced block, keep only the
# text between the opening fence and the next closing ``` fence. Taking just
# what follows the opening fence would leave the closing fence in the string
# and make json.loads fail in the next cell.
reply_text = output.content
if "```json" in reply_text:
    after_opening_fence = reply_text.split("```json", 1)[1]
    json_string = after_opening_fence.split("```", 1)[0].strip()
else:
    # No fence present: treat the whole reply as the JSON text.
    json_string = reply_text.strip()

In [None]:
# Decode the extracted text into Python objects; json.loads raises
# json.JSONDecodeError if the text is not valid JSON.
structured_data = json.loads(json_string)
# Bare last expression so the notebook displays the decoded result.
structured_data

In [None]:
# Display the decoded result as a table.
# NOTE(review): presumably `structured_data` is a list of row dicts at this
# point — confirm against an actual model reply.
pd.DataFrame(structured_data)