# This notebook is to be used to test new functionalities of the rag system

In [2]:
import os
import logging
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
from PyPDF2.errors import PdfStreamError
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm

import os
import sys
from load_dotenv import load_dotenv

load_dotenv()
%load_ext autoreload
%autoreload 2
sys.path.append("../rag/")

In [4]:
document_path = "../../data"

def prepare_source(documents):
    for doc in tqdm(documents):
        source_data_split = doc.metadata["source"].split("\\")

        source_data_split = list(dict.fromkeys(source_data_split))
        k = [source_data_split.remove(l) for l in ["..", "data", "documents"] if l in source_data_split]
            
        source = '/'.join([str(elem) for elem in source_data_split])
        doc.metadata["source"] = source

def load_documents(DATA_PATH):
    documents = []
    
    def process_directory(dir_path):
        nonlocal documents
        
        items = os.listdir(dir_path)
        subdirs = [d for d in items if os.path.isdir(os.path.join(dir_path, d))]
        pdf_files = [f for f in items if f.lower().endswith('.pdf')]
        txt_files = [f for f in items if f.lower().endswith('.txt')]
        
        if not items or (not pdf_files and not txt_files and not subdirs):
            logging.info(f"No relevant files or subdirectories in {dir_path}, skipping...")
            return
        
        if pdf_files:
            logging.info(f"PDF files found in {dir_path}: {pdf_files}")
            document_loader = PyPDFDirectoryLoader(dir_path)
            try:
                # Try to load the documents and append to the documents list
                loaded_docs = document_loader.load()
                documents.extend(loaded_docs)
                logging.info(f"Successfully loaded {len(loaded_docs)} documents from PDFs in {dir_path}.")
            except PdfStreamError as e:
                logging.error(f"Error loading PDF in {dir_path}: {str(e)} - File might be corrupted.")
            except Exception as e:
                # Catch any other exceptions that may occur during PDF loading
                logging.error(f"Unexpected error loading PDFs in {dir_path}: {str(e)}")

        for subdir in subdirs:
            process_directory(os.path.join(dir_path, subdir))
    
    process_directory(DATA_PATH)
    logging.info(f"Total documents loaded: {len(documents)}")
    prepare_source(documents)
    logging.info(f"Formatted source correctly")

    return documents

In [5]:
document_loader = PyPDFDirectoryLoader(document_path).load()

In [6]:
document_loader

[Document(metadata={'source': '..\\..\\data\\Bodeguilla.pdf', 'page': 0}, page_content=''),
 Document(metadata={'source': '..\\..\\data\\Bodeguilla.pdf', 'page': 1}, page_content=''),
 Document(metadata={'source': '..\\..\\data\\Bodeguilla.pdf', 'page': 2}, page_content=''),
 Document(metadata={'source': '..\\..\\data\\Bodeguilla.pdf', 'page': 3}, page_content='MALLORQUÍ\nTapes\n—Pernil ibèric de gla Juan Manuel D.O. Guijuelo (80g)\nCecina de bou, esparrecs, formatge Roncal i oli d’amontillat\nOstra Regal Special nº2 al natural (u.)\nOstra amb gaspatxo blanc de meló (u.)\nOstra a sa brasa amb gamba vermella a l’all\nCarpaccio de gamba vermella, fonoll marí i emulsió de ses caps\nTàrtar de tonyina, tomàtiga, alvocat a sa brasa i algues\nSteak tartar de vedella vella de muntanya\nEnsaladilla tradicional de patata amb gamba y mahonesa\nBurrata, albergínia rostida, tomàtigues escalivades i llimona  \nAmanida de tomàtigues de temporada, all verd de pinyons i bacallà\nEspàrrecs verds a sa br

In [67]:
print(document_loader[3].page_content)

MALLORQUÍ
Tapes
—Pernil ibèric de gla Juan Manuel D.O. Guijuelo (80g)
Cecina de bou, esparrecs, formatge Roncal i oli d’amontillat
Ostra Regal Special nº2 al natural (u.)
Ostra amb gaspatxo blanc de meló (u.)
Ostra a sa brasa amb gamba vermella a l’all
Carpaccio de gamba vermella, fonoll marí i emulsió de ses caps
Tàrtar de tonyina, tomàtiga, alvocat a sa brasa i algues
Steak tartar de vedella vella de muntanya
Ensaladilla tradicional de patata amb gamba y mahonesa
Burrata, albergínia rostida, tomàtigues escalivades i llimona  
Amanida de tomàtigues de temporada, all verd de pinyons i bacallà
Espàrrecs verds a sa brasa amb romesco casolà i salsa tàrtara
Les Braves de La Bodeguilla
Ou a baixa temperatura amb salsa de foie i trufa fresca
Croquetes meloses de pernil de Guijuelo (6u.)
Zamburinyes amb picadillo i vinagre de Chardonnay (6u.)
Calamars fregits amb allioli de prebes de Padró
Fideuà de carrabiner i allioli dels seus corals
Pop rostit amb ou, papada ibèrica i patata a s’all
Canel

In [12]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("../../data/The Merchants.pdf")
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [13]:
pages

[Document(metadata={'source': '../../data/The Merchants.pdf', 'page': 0}, page_content="STARTERS & SHARING\nMAIN COURSE\nTo be enjoyed with a minimum of two courses per person beginning from when your food order is taken. \nIt is available for the alloted time of your reservation.\nWe are happy to provide information pertaining to allergens and intolerance on request. V - vegetarian, VG - vegan-8,&<\x03/8&<\x03%85*(5\n6WXIIHG\x03FKHGGDU\x03FKHHVH\x03SDWW\\\x0f\x03EDFRQ\x03\nWRPDWR\x03\t\x03SLFNOHV\n6·025(6\n$33/(\x03\t\x035$63%(55<\x03&580%/(\n6(/(&7,21\x032)\x03,&(\x03&5($06\x03\t\x03625%(76\n2 Courses €\x16\x1c / 3 courses €\x17\x18\nAdd frƺƺ\x03ˢowing of house wine or cava for €17.50 per personCHAMP MASH POT ATOES - €\x1a\x11\x13\x13\nGARDEN SALAD - €\x1a\x11\x13\x13\nFRIES WITH TRUFFLE AND PARMESAN - €\x1a\x11\x13\x13\nSAUCES - €3.\x18\x13 ( Peppecorn, Chimichurri, Bernaise \x03, Blue cheese)\nDESSERTS340g Ribeye steak, half Canadian lobster, f ries, salad & Garlic\x03\n(Supplement

In [16]:
print(pages[0].page_content)

STARTERS & SHARING
MAIN COURSE
To be enjoyed with a minimum of two courses per person beginning from when your food order is taken. 
It is available for the alloted time of your reservation.
We are happy to provide information pertaining to allergens and intolerance on request. V - vegetarian, VG - vegan-8,&</8&<%85*(5
6WXIIHGFKHGGDUFKHHVHSDWW\EDFRQ
WRPDWR	SLFNOHV
6·025(6
$33/(	5$63%(55<&580%/(
6(/(&7,212),&(&5($06	625%(76
2 Courses € / 3 courses €
Add frƺƺˢowing of house wine or cava for €17.50 per personCHAMP MASH POT ATOES - €
GARDEN SALAD - €
FRIES WITH TRUFFLE AND PARMESAN - €
SAUCES - €3. ( Peppecorn, Chimichurri, Bernaise , Blue cheese)
DESSERTS340g Ribeye steak, half Canadian lobster, f ries, salad & Garlic
(Supplement of €)325.%(//<  52$67('6,5/2,1
5(()	%(()681'$<52$67
(Served with potatoes, vegetables, Yorkshire pudding & gravy)
SIDES & SAUCES
(Supplement charges apply on brunch menu)3($5	*2$7 &+((6(
:DWHUFUHVVURFNHWJDUOLF

# Using unstructured to read documents

In [4]:
import os
from load_dotenv import load_dotenv

load_dotenv()
unstructured_api = "https://api.unstructuredapp.io/general/v0/general"

In [86]:
import os, json

import unstructured_client
from unstructured_client.models import operations, shared

client = unstructured_client.UnstructuredClient(
    api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"),
    server_url=os.getenv("UNSTRUCTURED_API_URL"),
)

filename = "../../data/Cucum.pdf"
with open(filename, "rb") as f:
    data = f.read()

req = operations.PartitionRequest(
    partition_parameters=shared.PartitionParameters(
        files=shared.Files(
            content=data,
            file_name=filename,
        ),
        strategy=shared.Strategy.HI_RES,
        languages=['eng'],
        split_pdf_page=True,            # If True, splits the PDF file into smaller chunks of pages.
        split_pdf_allow_failed=True,    # If True, the partitioning continues even if some pages fail.
        split_pdf_concurrency_level=15  # Set the number of concurrent request to the maximum value: 15.
    ),
)


In [87]:

res = client.general.partition(request=req)
element_dicts = [element for element in res.elements]

# Print the processed data's first element only.
print(element_dicts)





INFO: Preparing to split document for partition.
INFO: Starting page number set to 1
INFO: Allow failed set to 1
INFO: Concurrency level set to 15
INFO: Splitting pages 1 to 12 (12 total)
INFO: Determined optimal split size of 2 pages.
INFO: Partitioning 6 files with 2 page(s) each.
INFO: Partitioning set #1 (pages 1-2).
INFO: Partitioning set #2 (pages 3-4).
INFO: Partitioning set #3 (pages 5-6).
INFO: Partitioning set #4 (pages 7-8).
INFO: Partitioning set #5 (pages 9-10).
INFO: Partitioning set #6 (pages 11-12).
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Req

[{'type': 'Image', 'element_id': '11c2284ebaf5b459865f030ab5db5daf', 'text': 'CUCUM BEACH HOUSE', 'metadata': {'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'filename': 'Cucum.pdf'}}, {'type': 'Image', 'element_id': '41ebc3fa5c81ba06b24297ca36e688c5', 'text': 'ZUMOS MATURALES HATHRLICHE SAFTE ... MATURAL IUICES HASTA 18:00H / BIS 18:00 UHR / TILL 6 P.M. DE 390 frisch gepresster Orangensaft / fresh orange juice TROPICAL mango, fresa y nara 590 Mango, Erdbeere, Orange / mango, strawberry, orange EL SOL zanahoria, jengibre, naranja y miel Karotte, Ingwer, Orange, Honig / carrot, ginger, orange, honey *‘l‘g‘ R !‘l\\ A\\ EMPEZANDO BIEN EL DIA .. DESAYUNDS .. FRUHSTUCK .. BREAKFAST HASTA 12:00H / BIS 12:00 UHR / TILL 12 NOON DESAYUNO CUCUM 1690 fresCo con ceboliing, variado de. cbrsseizo e e Aucheilel Lachs, Frlschklse ‘mit Schnittiauch, tulielsaloml. Kr&uretsalamt Iogmm mlr Frichten, verschiedene Backwaren, frisch Oranqensa n, gerducherter Kochschinken, K: herb sala

In [88]:
element_dicts

[{'type': 'Image',
  'element_id': '11c2284ebaf5b459865f030ab5db5daf',
  'text': 'CUCUM BEACH HOUSE',
  'metadata': {'filetype': 'application/pdf',
   'languages': ['eng'],
   'page_number': 1,
   'filename': 'Cucum.pdf'}},
 {'type': 'Image',
  'element_id': '41ebc3fa5c81ba06b24297ca36e688c5',
  'text': 'ZUMOS MATURALES HATHRLICHE SAFTE ... MATURAL IUICES HASTA 18:00H / BIS 18:00 UHR / TILL 6 P.M. DE 390 frisch gepresster Orangensaft / fresh orange juice TROPICAL mango, fresa y nara 590 Mango, Erdbeere, Orange / mango, strawberry, orange EL SOL zanahoria, jengibre, naranja y miel Karotte, Ingwer, Orange, Honig / carrot, ginger, orange, honey *‘l‘g‘ R !‘l\\ A\\ EMPEZANDO BIEN EL DIA .. DESAYUNDS .. FRUHSTUCK .. BREAKFAST HASTA 12:00H / BIS 12:00 UHR / TILL 12 NOON DESAYUNO CUCUM 1690 fresCo con ceboliing, variado de. cbrsseizo e e Aucheilel Lachs, Frlschklse ‘mit Schnittiauch, tulielsaloml. Kr&uretsalamt Iogmm mlr Frichten, verschiedene Backwaren, frisch Oranqensa n, gerducherter Kochsc

In [89]:
json_elements = json.dumps(element_dicts, indent=2)

with open("../../data/output/Cucum_data.json", "w") as file:
    file.write(json_elements)

In [83]:
import pandas as pd
pd.DataFrame(element_dicts)

Unnamed: 0,type,element_id,text,metadata
0,Title,3089f82c45e8e26409ed9cabd911ab84,SUNDAY ROAST SET MENU,"{'filetype': 'application/pdf', 'languages': [..."
1,Title,0511964e7e5d58b160d7845764d029a5,STARTERS & SHARING,"{'filetype': 'application/pdf', 'languages': [..."
2,NarrativeText,8b3750007f8f1b00be82b97c235c04c6,LOBSTER & PRAWNS CROQUETAS,"{'filetype': 'application/pdf', 'languages': [..."
3,NarrativeText,265a88458ec173a70ddb2a584cd575d4,"Avocado puree, TNT sauce","{'filetype': 'application/pdf', 'languages': [..."
4,NarrativeText,dae0ffb9e88d00c4d6db21ca19253d68,GRILLED LAMB CUTLETS,"{'filetype': 'application/pdf', 'languages': [..."
5,NarrativeText,bc13362081559cce2df5f73dd5c48c7d,"Sesame oil, sriracha, soy, pistachio, blood or...","{'filetype': 'application/pdf', 'languages': [..."
6,Title,27dfa2b71567c1abeaaf6176b8f74d16,EGGS BENEDICT,"{'filetype': 'application/pdf', 'languages': [..."
7,NarrativeText,90bd57aa41b9d6860cc8af7cad3d94fe,Holandaise sauce with spinach or ham,"{'filetype': 'application/pdf', 'languages': [..."
8,Title,3bd236451dd7c03c87588a30bd01d115,PEAR & GOAT CHEESE,"{'filetype': 'application/pdf', 'languages': [..."
9,NarrativeText,0d38724c47c3a9df423f2973d36a37c8,"Watercress, rocket, garlic, balsamic, honey, a...","{'filetype': 'application/pdf', 'languages': [..."


In [79]:
print(element_dicts[1]["text"])

ZUMOS MATURALES HATHRLICHE SAFTE ... MATURAL IUICES HASTA 18:00H / BIS 18:00 UHR / TILL 6 P.M. DE 390 frisch gepresster Orangensaft / fresh orange juice TROPICAL mango, fresa y nara 590 Mango, Erdbeere, Orange / mango, strawberry, orange EL SOL zanahoria, jengibre, naranja y miel Karotte, Ingwer, Orange, Honig / carrot, ginger, orange, honey *‘l‘g‘ R !‘l\ A\ EMPEZANDO BIEN EL DIA .. DESAYUNDS .. FRUHSTUCK .. BREAKFAST HASTA 12:00H / BIS 12:00 UHR / TILL 12 NOON DESAYUNO CUCUM 1690 fresCo con ceboliing, variado de. cbrsseizo e e Aucheilel Lachs, Frlschklse ‘mit Schnittiauch, tulielsaloml. Kr&uretsalamt Iogmm mlr Frichten, verschiedene Backwaren, frisch Oranqensa n, gerducherter Kochschinken, K: herb salami cookedham smoked cool ream cheese with d\lves. cheese vanefy l!ulﬂe salami, e S e LR T 990 D%“v’am croissant, frisches Obst, Croissant, Marmelade, Nutella, Butter fresh fruits, croissant, jam, nutella, butter TOSTADA CON 9.90 = Jamon York y queso Mahan roas( mit Tomalen Olivendl, geko

In [None]:
element_dicts[1]["text"]

In [74]:
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf("../../data/Cucum.pdf")

elements_fast = partition_pdf("../../data/Cucum.pdf", strategy="fast")

PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?

In [29]:
from collections import Counter
display(Counter(type(element) for element in elements))
print("")
# The composition of elements can be different for elements derived with the "fast" strategy
display(Counter(type(element) for element in elements_fast))

Counter({unstructured.documents.elements.Text: 21,
         unstructured.documents.elements.NarrativeText: 4,
         unstructured.documents.elements.Title: 3,
         unstructured.documents.elements.Header: 1,
         unstructured.documents.elements.Footer: 1})




Counter({unstructured.documents.elements.Text: 21,
         unstructured.documents.elements.NarrativeText: 4,
         unstructured.documents.elements.Title: 3,
         unstructured.documents.elements.Header: 1,
         unstructured.documents.elements.Footer: 1})

In [30]:
display(*[(type(element), element.text) for element in elements[10:13]])

(unstructured.documents.elements.Text,
 '(cid:55)(cid:49)(cid:55)(cid:3)(cid:51)(cid:53)(cid:36)(cid:58)(cid:49)(cid:54)(cid:3) (cid:55)(cid:76)(cid:74)(cid:72)(cid:85)(cid:3)(cid:83)(cid:85)(cid:68)(cid:90)(cid:81)(cid:86)(cid:3)(cid:76)(cid:81)(cid:3)(cid:87)(cid:72)(cid:80)(cid:83)(cid:88)(cid:85)(cid:68)(cid:3)(cid:69)(cid:68)(cid:87)(cid:87)(cid:72)(cid:85)(cid:3)(cid:90)(cid:76)(cid:87)(cid:75)(cid:3)(cid:55)(cid:49)(cid:55)(cid:3)(cid:86)(cid:68)(cid:88)(cid:70)(cid:72) (Supplement of €(cid:27))')

(unstructured.documents.elements.Title, 'MAIN COURSE')

(unstructured.documents.elements.Text,
 '(cid:45)(cid:56)(cid:44)(cid:38)(cid:60)(cid:3)(cid:47)(cid:56)(cid:38)(cid:60)(cid:3)(cid:37)(cid:56)(cid:53)(cid:42)(cid:40)(cid:53) (cid:54)(cid:87)(cid:88)(cid:73)(cid:73)(cid:72)(cid:71)(cid:3)(cid:70)(cid:75)(cid:72)(cid:71)(cid:71)(cid:68)(cid:85)(cid:3)(cid:70)(cid:75)(cid:72)(cid:72)(cid:86)(cid:72)(cid:3)(cid:83)(cid:68)(cid:87)(cid:87)(cid:92)(cid:15)(cid:3)(cid:69)(cid:68)(cid:70)(cid:82)(cid:81)(cid:3) (cid:87)(cid:82)(cid:80)(cid:68)(cid:87)(cid:82)(cid:3)(cid:9)(cid:3)(cid:83)(cid:76)(cid:70)(cid:78)(cid:79)(cid:72)(cid:86)')

In [31]:
print("\n\n".join([str(el) for el in elements]))

SUNDAY ROAST SET MENU

STARTERS & SHARING

(cid:47)(cid:50)(cid:37)(cid:54)(cid:55)(cid:40)(cid:53)(cid:3)(cid:9)(cid:3)(cid:51)(cid:53)(cid:36)(cid:58)(cid:49)(cid:54)(cid:3)(cid:38)(cid:53)(cid:50)(cid:52)(cid:56)(cid:40)(cid:55)(cid:36)(cid:54)

(cid:36)(cid:89)(cid:82)(cid:70)(cid:68)(cid:71)(cid:82)(cid:3)(cid:83)(cid:88)(cid:85)(cid:72)(cid:72)(cid:15)(cid:3)(cid:55)(cid:49)(cid:55)(cid:3)(cid:86)(cid:68)(cid:88)(cid:70)(cid:72)

(cid:51)(cid:40)(cid:36)(cid:53)(cid:3)(cid:9)(cid:3)(cid:42)(cid:50)(cid:36)(cid:55)(cid:3)(cid:38)(cid:43)(cid:40)(cid:40)(cid:54)(cid:40) (cid:58)(cid:68)(cid:87)(cid:72)(cid:85)(cid:70)(cid:85)(cid:72)(cid:86)(cid:86)(cid:15)(cid:3)(cid:85)(cid:82)(cid:70)(cid:78)(cid:72)(cid:87)(cid:15)(cid:3)(cid:74)(cid:68)(cid:85)(cid:79)(cid:76)(cid:70)(cid:15)(cid:3)(cid:69)(cid:68)(cid:79)(cid:86)(cid:68)(cid:80)(cid:76)(cid:70)(cid:15)(cid:3)(cid:75)(cid:82)(cid:81)(cid:72)(cid:92)(cid:15)(cid:3)(cid:68)(cid:79)(cid:80)(cid:82)(cid:81)(cid:71)(cid:86)(cid:15)

In [3]:
import os

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

client = UnstructuredClient(
    api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"),
    server_url=os.getenv("UNSTRUCTURED_API_URL"),
)

filename = "../../data/The Merchants.pdf"
file = open(filename, "rb")
req = shared.PartitionParameters(
    # Note that this currently only supports a single file
    files=shared.Files(
        content=file.read(),
        file_name=filename,
    ),
    strategy=shared.Strategy.OCR_ONLY,
    languages=["eng"],
    split_pdf_page=True,
    split_pdf_allow_failed=True,
    split_pdf_concurrency_level=15
)

try:
    res = client.general.partition(req)
    print(res.elements[0])
except SDKError as e:
    print(e)


INFO: Preparing to split document for partition.
INFO: Starting page number set to 1
INFO: Allow failed set to 1
INFO: Concurrency level set to 15
INFO: Splitting pages 1 to 1 (1 total)
INFO: Determined optimal split size of 2 pages.
INFO: Document has too few pages (1) to be split efficiently. Partitioning without split.


{'type': 'Title', 'element_id': '3089f82c45e8e26409ed9cabd911ab84', 'text': 'SUNDAY ROAST SET MENU', 'metadata': {'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'filename': 'The Merchants.pdf'}}


In [37]:
res.elements

[{'type': 'Title',
  'element_id': '3089f82c45e8e26409ed9cabd911ab84',
  'text': 'SUNDAY ROAST SET MENU',
  'metadata': {'filetype': 'application/pdf',
   'languages': ['eng'],
   'page_number': 1,
   'filename': 'The Merchants.pdf'}},
 {'type': 'Title',
  'element_id': '0511964e7e5d58b160d7845764d029a5',
  'text': 'STARTERS & SHARING',
  'metadata': {'filetype': 'application/pdf',
   'languages': ['eng'],
   'page_number': 1,
   'filename': 'The Merchants.pdf'}},
 {'type': 'Title',
  'element_id': '8b3750007f8f1b00be82b97c235c04c6',
  'text': 'LOBSTER & PRAWNS CROQUETAS',
  'metadata': {'filetype': 'application/pdf',
   'languages': ['eng'],
   'page_number': 1,
   'filename': 'The Merchants.pdf'}},
 {'type': 'Title',
  'element_id': '2d47a585cd30f9d644a2bd5aa4c5560d',
  'text': 'PEAR & GOAT CHEESE',
  'metadata': {'filetype': 'application/pdf',
   'languages': ['eng'],
   'page_number': 1,
   'filename': 'The Merchants.pdf'}},
 {'type': 'NarrativeText',
  'element_id': 'c5ac95f35563c

In [38]:
import PyPDF2

# creating a pdf reader object
reader = PyPDF2.PdfReader('../../data/The Merchants.pdf')

# print the number of pages in pdf file
print(len(reader.pages))

# print the text of the first page
print(reader.pages[0].extract_text())

1
STARTERS & SHARING
MAIN COURSE
To be enjoyed with a minimum of two courses per person beginning from when your food order is taken. 
It is available for the alloted time of your reservation.
We are happy to provide information pertaining to allergens and intolerance on request. V - vegetarian, VG - vegan-8,&</8&<%85*(5
6WXIIHGFKHGGDUFKHHVHSDWW\EDFRQ
WRPDWR	SLFNOHV
6·025(6
$33/(	5$63%(55<&580%/(
6(/(&7,212),&(&5($06	625%(76
2 Courses € / 3 courses €
Add frƺƺˢowing of house wine or cava for €17.50 per personCHAMP MASH POT ATOES - €
GARDEN SALAD - €
FRIES WITH TRUFFLE AND PARMESAN - €
SAUCES - €3. ( Peppecorn, Chimichurri, Bernaise , Blue cheese)
DESSERTS340g Ribeye steak, half Canadian lobster, f ries, salad & Garlic
(Supplement of €)325.%(//<  52$67('6,5/2,1
5(()	%(()681'$<52$67
(Served with potatoes, vegetables, Yorkshire pudding & gravy)
SIDES & SAUCES
(Supplement charges apply on brunch menu)3($5	*2$7 &+((6(
:DWHUFUHVVURFNHWJDUOL

In [6]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader
local_path = "../../data/Bodeguilla.pdf"

# Local PDF file uploads
if local_path:
  loader = UnstructuredPDFLoader(file_path=local_path)
  data = loader.load()
else:
  print("Upload a PDF file")

In [7]:
print(data[0].page_content)

M A L L O R Q U Í

Tapes —

Pernil ibèric de gla Juan Manuel D.O. Guijuelo (80g)

Cecina de bou, esparrecs, formatge Roncal i oli d’amontillat

Ostra Regal Special nº2 al natural (u.)

Ostra amb gaspatxo blanc de meló (u.)

Ostra a sa brasa amb gamba vermella a l’all

Carpaccio de gamba vermella, fonoll marí i emulsió de ses caps

Tàrtar de tonyina, tomàtiga, alvocat a sa brasa i algues

Steak tartar de vedella vella de muntanya

Ensaladilla tradicional de patata amb gamba y mahonesa

Burrata, albergínia rostida, tomàtigues escalivades i llimona

Amanida de tomàtigues de temporada, all verd de pinyons i bacallà

Espàrrecs verds a sa brasa amb romesco casolà i salsa tàrtara

Les Braves de La Bodeguilla

Ou a baixa temperatura amb salsa de foie i trufa fresca

Croquetes meloses de pernil de Guijuelo (6u.)

Zamburinyes amb picadillo i vinagre de Chardonnay (6u.)

Calamars fregits amb allioli de prebes de Padró

Fideuà de carrabiner i allioli dels seus corals

Pop rostit amb ou, papada ibè

In [8]:
import re
def cidToChar(cidx):
    return chr(int(re.findall(r'\(cid\:(\d+)\)',cidx)[0]) + 29)


In [9]:
txt = ""
for x in data[0].page_content.split('\n'):
  if x != '' and x != '(cid:3)':         # merely to compact the output
    abc = re.findall(r'\(cid\:\d+\)',x)
    if len(abc) > 0:
        for cid in abc: x=x.replace(cid, cidToChar(cid))
    txt = txt+"\n"+repr(x).strip("'")

In [12]:
print(txt)


M A L L O R Q U Í
Tapes —
Pernil ibèric de gla Juan Manuel D.O. Guijuelo (80g)
Cecina de bou, esparrecs, formatge Roncal i oli d’amontillat
Ostra Regal Special nº2 al natural (u.)
Ostra amb gaspatxo blanc de meló (u.)
Ostra a sa brasa amb gamba vermella a l’all
Carpaccio de gamba vermella, fonoll marí i emulsió de ses caps
Tàrtar de tonyina, tomàtiga, alvocat a sa brasa i algues
Steak tartar de vedella vella de muntanya
Ensaladilla tradicional de patata amb gamba y mahonesa
Burrata, albergínia rostida, tomàtigues escalivades i llimona
Amanida de tomàtigues de temporada, all verd de pinyons i bacallà
Espàrrecs verds a sa brasa amb romesco casolà i salsa tàrtara
Les Braves de La Bodeguilla
Ou a baixa temperatura amb salsa de foie i trufa fresca
Croquetes meloses de pernil de Guijuelo (6u.)
Zamburinyes amb picadillo i vinagre de Chardonnay (6u.)
Calamars fregits amb allioli de prebes de Padró
Fideuà de carrabiner i allioli dels seus corals
Pop rostit amb ou, papada ibèrica i patata a s’a

In [45]:
data[0].page_content

'SUNDAY ROAST SET MENU\n\nSTARTERS & SHARING\n\n(cid:47)(cid:50)(cid:37)(cid:54)(cid:55)(cid:40)(cid:53)(cid:3)(cid:9)(cid:3)(cid:51)(cid:53)(cid:36)(cid:58)(cid:49)(cid:54)(cid:3)(cid:38)(cid:53)(cid:50)(cid:52)(cid:56)(cid:40)(cid:55)(cid:36)(cid:54)\n\n(cid:36)(cid:89)(cid:82)(cid:70)(cid:68)(cid:71)(cid:82)(cid:3)(cid:83)(cid:88)(cid:85)(cid:72)(cid:72)(cid:15)(cid:3)(cid:55)(cid:49)(cid:55)(cid:3)(cid:86)(cid:68)(cid:88)(cid:70)(cid:72)\n\n(cid:51)(cid:40)(cid:36)(cid:53)(cid:3)(cid:9)(cid:3)(cid:42)(cid:50)(cid:36)(cid:55)(cid:3)(cid:38)(cid:43)(cid:40)(cid:40)(cid:54)(cid:40) (cid:58)(cid:68)(cid:87)(cid:72)(cid:85)(cid:70)(cid:85)(cid:72)(cid:86)(cid:86)(cid:15)(cid:3)(cid:85)(cid:82)(cid:70)(cid:78)(cid:72)(cid:87)(cid:15)(cid:3)(cid:74)(cid:68)(cid:85)(cid:79)(cid:76)(cid:70)(cid:15)(cid:3)(cid:69)(cid:68)(cid:79)(cid:86)(cid:68)(cid:80)(cid:76)(cid:70)(cid:15)(cid:3)(cid:75)(cid:82)(cid:81)(cid:72)(cid:92)(cid:15)(cid:3)(cid:68)(cid:79)(cid:80)(cid:82)(cid:81)(cid:71)(cid:86

In [58]:
len(["Pernil ibèric de gla Juan Manuel D.O. Guijuelo (80g)",
"Cecina de bou, esparrecs, formatge Roncal i oli d’amontillat",
"Ostra Regal Special nº2 al natural (u.)",
"Ostra amb gaspatxo blanc de meló (u.)"
"Ostra a sa brasa amb gamba vermella a l’all",
"Carpaccio de gamba vermella, fonoll marí i emulsió de ses caps",
"Tàrtar de tonyina, tomàtiga, alvocat a sa brasa i algues",
"Steak tartar de vedella vella de muntanya",
"Ensaladilla tradicional de patata amb gamba y mahonesa",
"Burrata, albergínia rostida, tomàtigues escalivades i llimona",
"Amanida de tomàtigues de temporada, all verd de pinyons i bacallà",
"Espàrrecs verds a sa brasa amb romesco casolà i salsa tàrtara",
"Les Braves de La Bodeguilla",
"Ou a baixa temperatura amb salsa de foie i trufa fresca",
"Croquetes meloses de pernil de Guijuelo (6u.)",
"Zamburinyes amb picadillo i vinagre de Chardonnay (6u.)",
"Calamars fregits amb allioli de prebes de Padró",
"Fideuà de carrabiner i allioli dels seus corals",
"Pop rostit amb ou, papada ibèrica i patata a s’all",
"Caneló de pollastre camperol, salsa de trompetes i poma",
"Ploma ibèrica a sa brasa, porro fumat, mostassa i festucs"])

20

# Enhacement of menu

In [None]:
system_message_content = """
    You are a helpful assistant, expert of cutomer service for restaurants.

"""

In [None]:
completion = client.chat.completions.create(
    model="gpt-4o-2024-08-06",  # model supporting structured outputs
    messages=[{
        "role": "system", 
        "content": system_message_content
    },
    {
    "role": "user",
    "content": user_message_content
    }],
    response_format={
        "type": "json_schema",
        "json_schema": intermediate_step_schema
    }
)

NameError: name 'txt' is not defined

In [11]:
# load packages
from openai import OpenAI

import json
import os
from jsonschema import validate
from jsonschema.exceptions import ValidationError

sys.path.append("../../")
# load restaurant information inputs
from inputs.restaurant_info.name import name
from inputs.restaurant_info.location import location
from inputs.restaurant_info.cuisine import cuisine
from inputs.restaurant_info.nationalities import nationalities


# load schemas
from enhance.schemas.given_menu_schema import given_menu_schema
from enhance.schemas.intermediate_step_schema import intermediate_step_schema
from enhance.schemas.enhanced_menu_schema import enhanced_menu_schema

# load functions
from enhance.validate import dish_exists, dish_exists_in_menu

client = OpenAI()

system_message_content = """
You are a helpful assistant designed to enhance restaurant menus.

# Information
You will receive prompts with, and only with, the following input information:
- ** Restaurant's Name **: a String of the name of the restaurant you are helping.
- ** Restaurant's Location **: a JSON object including the neighborhood, city, state, and country of the restaurant you are helping. 
- ** Restaurant's Cuisine **: a String of the chosen cuisine of the restaurant you are helping.
- ** Restaurant's Top Three Guest Nationalities **: an array of Strings of the top three nationalities which visit the restaurant you are helping.
- ** Restaurant's Menu(s) **: a JSON object including the category, name, ingredients, and price of all items offered at the restaurant.

# Instructions
Your response, an enhanced version of every single menu item in the given menu, should strictly be in the format of a valid JSON object.

Generate an enhanced version of every single menu item in the given menu with the following information:
- ** Name **: The name of the menu item you are enriching.
- ** Recommended Upsells **: Enrich our restaurant's menu by creating an Array of Strings for Upselling Recommendations: each string must be only the name of another menu item to upsell and **must not contain any descriptions or sentences**.
- ** Narrative **: Enrich our restaurant's menu by creating a narrative String: an intricate and engaging narrative that connects each menu item with the cultural backgrounds of the three most frequent nationalities among our guests.
- ** Appeal **: Enrich our restaurant's menu by creating an appeal String: Highlight the textures and sensory experience of each menu item, emphasizing what makes them appealing on a tactile and sensory level to encourage customers to try them.
"""
given_menu = txt
user_message_content = f"""
<name> {name} </name>
<location> {location} </location>
<cuisine> {cuisine} </cuisine>
<nationalities> {nationalities} </nationalities>
<menu> {given_menu} </menu>
"""

try:
     validate(instance=given_menu, schema=given_menu_schema)
     print("JSON Schema validated. Input JSON data fits requirements.")
     while True:
          print("Running enhancements...\n")
          completion = client.chat.completions.create(
               model="gpt-4o-2024-08-06",  # model supporting structured outputs
               messages=[{
                    "role": "system", 
                    "content": system_message_content
               },
               {
               "role": "user",
               "content": user_message_content
               }],
               response_format={
                    "type": "json_schema",
                    "json_schema": intermediate_step_schema
               }
          )
          intermediate_step = completion.choices[0].message.content
          intermediate_step_data = json.loads(intermediate_step)  # turn the returned String into a JSON object
          valid = True  # checker variable as we ensure generated content is valid
          file_path = os.path.join('outputs', 'enhanced_menus', 'intermediate_step_output.json')
          os.makedirs(os.path.dirname(file_path), exist_ok=True)  # ensure the file path exists
          try:
               with open(file_path, 'w') as json_file:
                    json.dump(intermediate_step_data, json_file, indent=4)  # write the output to JSON file in the outputs/enhanced_menus folder
                    print(f"Intermediate step generated. Recommended Upsells, Narratives, and Appeals created.\nGenerated content stored in {file_path}.\nEnsuring accuracy of information...\n")
          except FileNotFoundError:
               print(f"The directory for {file_path} does not exist.")
               valid = False
               break

          for menu_item in intermediate_step_data['menu_items']:
               if not dish_exists_in_menu(given_menu, menu_item['name']):  # if generated dish does not exist, regenerate content
                    valid = False
                    print(f"Generated menu item {menu_item['name']} does not exist.")
                    break
               for upsell in menu_item['recommended_upsells']:
                    if not dish_exists_in_menu(given_menu, upsell):  # if generated upsell does not exist, regenerate content
                         valid = False
                         print(f"Recomended upsell {upsell} for generated menu item {menu_item['name']} does not exist.")
                         break
               #print(f'Validating {menu_item['narrative']['nationality']} is in nationalities {nationalities} list...')
               if menu_item['narrative']['nationality'] not in nationalities:
                    valid = False
                    print(f"Mentioned nationality {menu_item['narrative']['nationality']} is not mentioned in guest nationalities.")
                    break
               print(f"\t{menu_item['narrative']['nationality']} confirmed in nationalities list.")
          if valid:
               break
          else:
               print("Generated enhancements are invalid. Trying again...")
    
except ValidationError as e:  # if inputted JSON menu does not follow the schema correctly
     print("Inputted menu JSON data is invalid.")
     print(f"Error message: {e.message}")
     print(f"Invalid data path: {'/'.join(map(str, e.path))}")
     print(f"Schema path: {'/'.join(map(str, e.schema_path))}")

if valid:
     print("\nGenerated enhancements validated. Consolidating enhanced menu...")

JSON Schema validated. Input JSON data fits requirements.
Running enhancements...



INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Intermediate step generated. Recommended Upsells, Narratives, and Appeals created.
Generated content stored in outputs\enhanced_menus\intermediate_step_output.json.
Ensuring accuracy of information...

	American confirmed in nationalities list.
Recomended upsell Grilled green asparagus with romescu and tartar sauce for generated menu item Cecina de bou, esparrecs, formatge Roncal i oli d’amontillat does not exist.
	Italian confirmed in nationalities list.
	French confirmed in nationalities list.
	American confirmed in nationalities list.
Recomended upsell Red prawn and aioli Fideuà for generated menu item Carpaccio de gamba vermella, fonoll marí i emulsió de ses caps does not exist.
	Italian confirmed in nationalities list.
	French confirmed in nationalities list.
	American confirmed in nationalities list.
	Italian confirmed in nationalities list.
	French confirmed in nationalities list.
	American confirmed in nationalities list.
Recomended upsell Deep-fried calamari with "Padron" pepp

INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Intermediate step generated. Recommended Upsells, Narratives, and Appeals created.
Generated content stored in outputs\enhanced_menus\intermediate_step_output.json.
Ensuring accuracy of information...

Recomended upsell Traditional potato salad with shrimps and mayonnaise for generated menu item Pernil ibèric de gla Juan Manuel D.O. Guijuelo (80g) does not exist.
	American confirmed in nationalities list.
Recomended upsell Burrata with roasted eggplant for generated menu item Cecina de bou, esparrecs, formatge Roncal i oli d’amontillat does not exist.
	Italian confirmed in nationalities list.
	French confirmed in nationalities list.
	American confirmed in nationalities list.
Recomended upsell Zamburiñas with picadillo for generated menu item Ostra a sa brasa amb gamba vermella a l’all does not exist.
	Italian confirmed in nationalities list.
Recomended upsell Burata and roasted eggplant for generated menu item Carpaccio de gamba vermella, fonoll marí i emulsió de ses caps does not exis

INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Intermediate step generated. Recommended Upsells, Narratives, and Appeals created.
Generated content stored in outputs\enhanced_menus\intermediate_step_output.json.
Ensuring accuracy of information...

	American confirmed in nationalities list.
	Italian confirmed in nationalities list.
Recomended upsell Arroz del ‘Senyoret’ for generated menu item Ostra Regal Special nº2 al natural does not exist.
	French confirmed in nationalities list.
	American confirmed in nationalities list.
Recomended upsell Mix of Calamari, red shrimp, and Croquetas meloses de pernil de Guijuelo for generated menu item Ostra a sa brasa amb gamba vermella a l’all does not exist.
	Italian confirmed in nationalities list.
	French confirmed in nationalities list.
	Italian confirmed in nationalities list.
	American confirmed in nationalities list.
	Italian confirmed in nationalities list.
	French confirmed in nationalities list.
	French confirmed in nationalities list.
Generated enhancements are invalid. Trying again

INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Intermediate step generated. Recommended Upsells, Narratives, and Appeals created.
Generated content stored in outputs\enhanced_menus\intermediate_step_output.json.
Ensuring accuracy of information...

	American confirmed in nationalities list.
	Italian confirmed in nationalities list.
	French confirmed in nationalities list.
	American confirmed in nationalities list.
	Italian confirmed in nationalities list.
	French confirmed in nationalities list.
	American confirmed in nationalities list.
	Italian confirmed in nationalities list.
	French confirmed in nationalities list.
	American confirmed in nationalities list.

Generated enhancements validated. Consolidating enhanced menu...
