In [3]:
import google.generativeai as genai
import os

genai.configure(api_key=os.getenv("API_KEY"))

In [None]:
model = genai.GenerativeModel("gemini-1.5-flash")
response = model.generate_content("Write a story about a magic backpack.")
print(response.text)

In [44]:
import json

def extract_venue_meta(text):
    model = genai.GenerativeModel("gemini-1.5-pro-latest",
                                generation_config={"response_mime_type": "application/json"})
        
    prompt = """ Extract the meta data (venue_name and ingredients) from the above provided Text, return in JSON format.
    Notes: 
    1. The ingredients should be small ingredients that usually available in the Australian brochure stores, do not list dish names here.
    2. Venue type should be either restaurant or deli. 
    Use this JSON schema:
    Meta = {'venue_name': str, 'venue_type': str, 'ingredients': list[str]}
    Return: list[Meta]
    """
    result = model.generate_content(text+prompt)
    return json.loads(result.text)

    

In [42]:
import os

raw_folder_path = './venue_raw_data'
menu_folder_path = './venue_menu_data'

for filename in os.listdir(raw_folder_path):
    # Check if the file is a .txt file
    if filename.endswith(".txt"):
        file_path = os.path.join(raw_folder_path, filename)
        
        # Open and read the content of the file
        with open(file_path, 'r') as file:
            file_content = file.read()
        
        venue_menu_json = extract_venue_meta(file_content)
        json_filename = filename.replace('.txt', '.json')
        json_file_path = os.path.join(menu_folder_path, json_filename)
        
        # Save the content as a JSON file
        with open(json_file_path, 'w') as json_file:
            json.dump(venue_menu_json, json_file, indent=4)

In [46]:
import os
import json
import pysolr

# Initialize Solr connection
solr_url = 'http://localhost:8983/solr/venue_menu'  # Replace with your Solr URL
solr = pysolr.Solr(solr_url, always_commit=True, timeout=10)

# Folder containing your JSON files
venue_menu_data_folder = 'venue_menu_data'

# Function to process and index the data
def index_venue_menu_data(folder):
    # Loop over all files in the directory
    for filename in os.listdir(folder):
        if filename.endswith('.json'):
            file_path = os.path.join(folder, filename)

            # Read and parse the JSON file
            with open(file_path, 'r') as file:
                try:
                    data = json.load(file)

                    # Build Solr documents, mapping fields to dynamic fields
                    solr_doc = {
                        'venue_name_t': data.get('venue_name', ''),
                        'venue_type_s': data.get('venue_type', ''),
                        'ingredients_txt': data.get('ingredients', ''),
                    }

                    # Send document to Solr for indexing
                    solr.add([solr_doc])

                    print(f'Successfully indexed {filename}')

                except Exception as e:
                    print(f'Error indexing {filename}: {e}')

# Call the function to index data
index_venue_menu_data(venue_menu_data_folder)

Successfully indexed Restaurant Hubert.json
Successfully indexed Luna Lu.json
Successfully indexed Deli Ziosa.json
Successfully indexed Wintergarden Deli.json
Successfully indexed Joe's Sandwich Bar.json


In [61]:
import pdfplumber
import json

def extract_products_from_pdf(pdf_file_path, json_file_path):
    try:
        pdf_data = []

        # Open the PDF file using pdfplumber
        with pdfplumber.open(pdf_file_path) as pdf:
            # Extract text from each page
            for page_number in range(len(pdf.pages)):
                page = pdf.pages[page_number]
                text = page.extract_text()  # Use extract_text() from pdfplumber

                # Create a dictionary for each page
                page_data = {
                    'original_file': pdf_file_path,
                    'page_number': page_number + 1,  # Page numbers start at 1
                    'page_text': text.replace('\n', ' ') if text else ''  # Handle None case
                }
                pdf_data.append(page_data)  # Append to the list
        
        # Save to JSON file
        with open(json_file_path, 'w') as json_file:
            json.dump(pdf_data, json_file, indent=4)
        
        print(f'Text extracted and saved to {json_file_path}')
    
    except Exception as e:
        print(f'An error occurred: {e}')

# Example usage
pdf_file_path = './catalogue_data/PremierQualityFoodsBrochure2021.pdf'  # Replace with your PDF file path
json_file_path = './catalogue_data/catalogue.json'            # Specify the desired JSON output file name
extract_products_from_pdf(pdf_file_path, json_file_path)


Text extracted and saved to ./catalogue_data/catalogue.json


In [62]:
# Initialize Solr connection
solr_url = 'http://localhost:8983/solr/catalogues'  # Replace with your Solr URL
solr = pysolr.Solr(solr_url, always_commit=True, timeout=10)

# Folder containing your JSON files
catalogue_data_folder = 'catalogue_data'

# Function to process and index the data
def index_catalogue_data(folder):
    # Loop over all files in the directory
    for filename in os.listdir(folder):
        if filename.endswith('.json'):
            file_path = os.path.join(folder, filename)

            # Read and parse the JSON file
            with open(file_path, 'r') as file:
                try:
                    data = json.load(file)
                    for page_data in data:
                        # Build Solr documents, mapping fields to dynamic fields
                        solr_doc = {
                            'original_file_s': page_data.get('original_file', ''),
                            'page_number_i': page_data.get('page_number', ''),
                            'page_text_txt': page_data.get('page_text', ''),
                        }

                        # Send document to Solr for indexing
                        solr.add([solr_doc])

                    print(f'Successfully indexed {filename}')

                except Exception as e:
                    print(f'Error indexing {filename}: {e}')

# Call the function to index data
index_catalogue_data(catalogue_data_folder)

Successfully indexed catalogue.json


In [69]:
import requests

def get_venue_meta(query):
    # Construct the Solr query URL
    query_url = "http://localhost:8983/solr/venue_menu/select"

    # Set up the parameters for the query
    params = {
        'q': query,                 # The query string
        'rows': 1,                  # Limit the number of rows returned to 1
        'wt': 'json',              # Specify the response format
    }

    try:
        # Send the GET request to Solr
        response = requests.get(query_url, params=params)
        response.raise_for_status()  # Raise an error for bad responses

        # Parse the JSON response
        data = response.json()

        return data['response']['docs'][0]

    except requests.exceptions.RequestException as e:
        print(f"Error querying Solr: {e}")
        return None


query1 = "*:*"                             # Example query to get all documents

venue_meta = get_venue_meta(query1)


In [82]:
def get_catalogues(query):
    # Construct the Solr query URL
    query_url = "http://localhost:8983/solr/catalogues/select"

    # Set up the parameters for the query
    params = {
        'q': query,                 # The query string
        'rows': 3,                  # Limit the number of rows returned to 1
        'wt': 'json',
        'defType': 'edismax',         # Use the edismax query parser
        'qf': 'page_text_txt',        # Specify the field(s) to search
        'mm': '1',                             # Specify the response format
    }

    try:
        # Send the GET request to Solr
        response = requests.get(query_url, params=params)
        response.raise_for_status()  # Raise an error for bad responses

        # Parse the JSON response
        data = response.json()

        return data['response']['docs']

    except requests.exceptions.RequestException as e:
        print(f"Error querying Solr: {e}")
        return None

query2 = ' '.join(venue_meta['ingredients_txt'])
related_catalogues = get_catalogues(query2)  

In [83]:
related_catalogues

[{'original_file_s': './catalogue_data/PremierQualityFoodsBrochure2021.pdf',
  'page_number_i': 25,
  'page_text_txt': ['Salads & Dips Small Coleslaw  V                               1kg fla Cabbage, carrot and onion in a thick and creamy mayonnaise v o Large Coleslaw  V                               2kg u Cabbage, carrot and onion in a thick and creamy mayonnaise rs Rustic Coleslaw  V                               1kg Sliced red cabbage and grated white cabbage with carrot and red onion in extra thick mayonnaise Minted Coleslaw VG                              1kg Red cabbage, white cabbage, carrot and onion in a minty paprika sauce  with vegan mayonnaise VG Kaleslaw                                                1kg Sliced Curly Kale, red cabbage, carrot & red onion in a seasoned vegan mayonnaise Cous Cous & Mediterranean Vegetables  S   H V                  1kg Cous Cous and Mediterranean vegetables with coriander in a traditional French dressing Moroccan Cous Cous  S   H  V         

In [98]:
def extract_venue_meta(venue_meta, related_catalogues):
    model = genai.GenerativeModel("gemini-1.5-pro-latest")
                    
        
    prompt = """ Your task is to follow the instructions below and suggest suitable products from the catalogue for the venue.

    1. List 15 products from the catalogue.
    2. Based on the venue metadata provided to you, and the venue type and the ingredients listed on their menu, pick 5 products from the catalogue to make the suggestion.
    3. Your answer should be based on the template below; please do not add other information.
    4. Your answer should be based on the Venue metadata and Product Catalogue; please do not create items or names from your own knowledge.
    5. The [original_file] should only show the file name, not the entire file path.

    --------- Response template ---------
    [venue_name] is a [venue_type] venue.
    My suggested products from the [original_file] are:
    1. [product name 1], [product 1 page_number]
    2. [product name 2], [product 2 page_number]
    3. [product name 3], [product 3 page_number]
    4. [product name 4], [product 4 page_number]
    5. [product name 5], [product 5 page_number]
    """
    result = model.generate_content(prompt + f'Venue meta data: {venue_meta}, Product Catalogue: {related_catalogues}')
    return result.text

In [99]:
res = extract_venue_meta(venue_meta, related_catalogues)

In [101]:
print(res)

Restaurant Hubert is a restaurant venue.
My suggested products from the ./catalogue_data/PremierQualityFoodsBrochure2021.pdf are:
1. Moroccan Cous Cous, 25
2. Potato Salad, 25
3. Rustic Coleslaw, 25
4. Marinated Chicken Tikka, 29
5. Hoi Sin Duck, 29

