In [55]:
import json
import os
import pandas as pd
from langchain_core.prompts import ChatPromptTemplate
from langchain_aws import BedrockLLM, BedrockChat
from langchain_core.output_parsers import StrOutputParser


In [56]:
# Load the question from config.json file
with open('config.json', 'r') as config_file:
    config = json.load(config_file)
question = config['question']

In [57]:
# Function to prepare batch input with corresponding file names
def prepare_batch_input(input_folder, question):
    batch_inputs = []
    filenames = []
    
    # Loop through all markdown files in the input folder
    for filename in os.listdir(input_folder):
        if filename.endswith(".md"):
            # Read the content of the markdown file
            file_path = os.path.join(input_folder, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                markdown_text = file.read()
            
            # Prepare the input dictionary for each file
            batch_input = {
                'question': question,
                'markdown_text': markdown_text
            }
            batch_inputs.append(batch_input)
            filenames.append(filename)
    
    return batch_inputs, filenames


In [58]:
# Model Import
model = BedrockChat(
    credentials_profile_name='default',
    model_id='meta.llama3-70b-instruct-v1:0',
    model_kwargs={
        "prompt": "string",
        "temperature": 0,
        "top_p": 0.9,
        "max_gen_len": 2048
    }
)

In [None]:
prompt = 

In [None]:
messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "<markdown text>"
                },
                {
                    "type": "text",
                    "text": "</markdown text>"
                },
                {
                    "type": "text",
                    "text": prompt
                }
            ]
        }
    ]

In [59]:
# Prompt
prompt = ChatPromptTemplate.from_template(
    "Using the markdown text provided below, answer the user's question clearly and concisely:\n\n"
    "User's Question:\n{question}\n\n"
    "Markdown Text:\n{markdown_text}\n\n"
    "Keep all the extracted information from the responses in separate columns in the output excel file for each extracted thing and the out should be as csv file "
    "Your response should be  in the csv format ,accurate, specific, and directly address the user's inquiry."
)

In [60]:
# Chain
chain = prompt | model | StrOutputParser()

In [61]:
# Inputs
input_folder = 'input_files'
question = question

In [62]:
# Prepare batch inputs
batch_inputs, filenames = prepare_batch_input(input_folder, question)
print(batch_inputs)
print(filenames)

[{'question': 'Please extract the following things such as name of the item,price,total bill and the card used for payment from the given markdown file.', 'markdown_text': '# Welcome to Chick-fil-A Loyola In-Line (#02841) Chicago, IL\n\nPhone: 312-266-8888\nOperator: Lauren Sillich\n\n## CUSTOMER COPY\n9/28/2022 8:41:23 PM\n\nOrder Number: 11127591\n\n| Item | Price |\n| --- | --- |\n| 1 Meal-SpcdyIx+PJ | $10.59 |\n| Spcy Dix +PJ | - |\n| Fries LG | $0.40 |\n| Coke MD | - |\n\n| Description | Amount |\n| --- | --- |\n| Sub. Total: | $10.99 |\n| Tax: | $1.29 |\n| Total: | $12.28 |\n\nChange: $0.00\nMastercard: $12.28\nRegister: 4\nTran Seq No: 11127591\n\nWe are hiring GREAT people like you!\nText "CFA" to (312) 561-4919\nwww.chick-fil-a.com/forgottoscan\nRestroom Code: 8407\n\nSep 28 2022 8:41 pm\n\nCard Number: ************4526\nCard Type: MASTER CARD\nApproval: NDRQAN\nAID: A0000000041010'}, {'question': 'Please extract the following things such as name of the item,price,total bill a

In [63]:
# Response
responses = chain.batch(batch_inputs)



In [64]:
# Function to process responses and generate output data
def process_responses(responses, filenames):
    output_data = []

    # Ensure responses is always a list
    if not isinstance(responses, list):
        responses = [responses]

    # Ensure that the number of responses and filenames match
    if len(responses) != len(filenames):
        raise ValueError("The number of responses does not match the number of filenames")

    # Loop through responses and filenames to create output data
    # for response, filename in zip(responses, filenames):
    #     output_data.append({
    #         'Filename': filename,
    #         'Extracted Information': response 
    #     })

    return responses,filenames

In [65]:
# Process the responses
final_response = process_responses(responses, filenames)

In [68]:
final_response[0][0]

'Here is the extracted information in CSV format:\n\n"Name of the Item","Price","Total Bill","Card Used for Payment"\n"Meal-SpcdyIx+PJ","$10.59","$12.28","Mastercard"\n"Spicy Dix +PJ","-","$12.28","Mastercard"\n"Fries LG","$0.40","$12.28","Mastercard"\n"Coke MD","-","$12.28","Mastercard"\n\nLet me know if this meets your requirements!'

In [48]:
for i in range(4):
    print(final_response[i]['Extracted Information'])
    

Here are the items with their respective prices extracted from the markdown text:

1. Meal-SpcdyIx+PJ - $10.59
2. Fries LG - $0.40
3. Coke MD - (no price listed)

Note that the prices for "Spicy Dix +PJ" are not listed, so I couldn't include them in the extracted list.
Here are the items and their respective prices extracted from the markdown file:

1. Coffee - $5.75
2. Cappuccino - $7.00
3. Danish - $5.00
4. Apple Turnover - $6.00
Here are the items with their respective prices extracted from the markdown text:

1. 4 Bud Aluminum Bottle - $32.00
2. 2 Topgolf Gameplay - $74.00
3. 2 Membership Charge - $10.00
4. Topgolf Gameplay - $37.00

Let me know if you need any further assistance!
Here are the items with their respective prices extracted from the markdown text:

1. BVO BAG WM - $3.99
2. KETIL HABANERO LIME CHIPS - $3.99
3. MANGO BERRY SMALL - $5.00
4. OATMEAL RAISIN COOKIE GCT - $10.99
5. BAG REFUND - -$0.10


In [69]:
# Convert the response to a DataFrame
df = pd.DataFrame(final_response)
print(df.iloc[0,1])


Here is the extracted information in CSV format:

"Item","Price","Total Bill","Card Used"
"Coffee","5.75","31.28","Mastercard EMV"
"Cappuccino","7.00","31.28","Mastercard EMV"
"Danish","5.00","31.28","Mastercard EMV"
"Apple Turnover","6.00","31.28","Mastercard EMV"

Let me know if this meets your requirements!


In [70]:
# Output file path
output_file = 'extracted_output.xlsx'

In [71]:
# Write the DataFrame to an Excel file
df.to_excel(output_file, index=False)

print(f"Output written to {output_file}")

Output written to extracted_output.xlsx
