In [132]:
from typing import Tuple
import pandas as pd
import re
import fitz

def extract_page_info(page, page_number):

    text_data = []

    soup = page.get_text("dict")

    blocks = soup['blocks']

    for block in blocks:
        if "lines" not in block:
            continue

        for line in block["lines"]:
            for span in line["spans"]:
                text = span["text"].strip()

                if not text:
                    continue

                flags = span.get("flags", 0)

                match_ = re.search(r'[A-Za-z]+[-_]?(\d+)(?![pt])', span['font'])

                text_data.append({'text': text,
                                'page_number': page_number + 1,
                                'only_number': bool(re.search(r'^\d+$', text.strip())),
                                'font_family': span['font'],
                                'font_size': round(span["size"], 1),
                                'is_bold': bool(re.search(r'bold|heavy', span["font"].lower())),
                                'font_intensity': match_.group(1) if match_ else None,
                                'is_italic': bool(re.search(r'italic|oblique', span["font"].lower())),
                                'is_capitalized': text.isupper(),
                                'bbox': span['bbox'],
                                'color': span['color']})
    
    return pd.DataFrame(text_data)


pages = fitz.open('MENU.pdf')
page_info = []
for page_nr, page in enumerate(pages):
    df = extract_page_info(page, page_nr)
    page_info.append(df)


all_pages = pd.concat(page_info)



class MenuClassifier:
    def __init__(self):
        self.patterns = {
            'dish_id': dict(
                font_family='MuseoSans-500',
                font_size=15.0,
                only_number=True,
                font_intensity='500',
                color=-703431
            ),
            'dish_name': dict(
                font_family='MuseoSans-500',
                only_number=False,
                font_size=13.0,
                font_intensity='500',
                color=-14475488
            ),
            'dish_description': dict(
                font_family='MuseoSans-300',
                only_number=False,
                font_size=10.0,
                font_intensity='300',
                color=-14475488
            ),
            'allergens': dict(
                font_family='MuseoSans-300Italic',
                only_number=False,
                font_size=9.0,
                font_intensity='300',
                color=-14475488
            )
        }

    def classify(self, row):
        text_props = {
            'font_family': row['font_family'],
            'only_number': row['only_number'],
            'font_size': row['font_size'],
            'font_intensity': row['font_intensity'],
            'color': row['color'],
            'is_italic': row['is_italic']
        }
        
        for text_type, pattern in self.patterns.items():
            if all(text_props.get(key) == val for key, val in pattern.items()):
                return text_type
        return 'Unclassified'
    

In [133]:
classifier = MenuClassifier()
all_pages['category'] = all_pages.apply(lambda row: classifier.classify(row), axis=1)

import numpy as np
all_pages = all_pages[all_pages['category'] != 'Unclassified']
all_pages['group'] = np.where(all_pages['category'] == 'dish_id', all_pages['text'], np.nan)
all_pages['group'] = all_pages['group'].fillna(method='bfill')
all_pages = all_pages[all_pages['page_number'] > 3]


# Pivot the data to get one row per dish
dish_data = pd.pivot_table(
   all_pages,
   index='group',
   columns='category',
   values='text',
   aggfunc='first'
).reset_index()

# Reorder columns
columns = ['dish_id', 'dish_name', 'dish_description', 'allergens']
dish_data = dish_data[columns]

  all_pages['group'] = all_pages['group'].fillna(method='bfill')


In [136]:
dish_data['dish_id'] = dish_data['dish_id'].astype(int)

dish_data.sort_values('dish_id', ascending=True).to_excel('menu_v2.xlsx', index=False)


# LEARN

In [43]:
import fitz
import re
import pandas as pd
doc = fitz.open('MENU.pdf')

# This is in points
# 1 inch = 2.54 cm
# 72 points = 1 inch

# x0, y0, x1, y1

text_data = []

def points_to_cm(p: int):
    size = p * (2.54 / 72)
    print(size)
    return size

page_number = 3

def extract_page_info(page):
    page = doc[page_number]

    soup = page.get_text("dict")

    blocks = soup['blocks']

    for block in blocks:
        if "lines" not in block:
            print(block)
            continue

        for line in block["lines"]:
            for span in line["spans"]:
                text = span["text"].strip()

                if not text:
                    continue

                flags = span.get("flags", 0)

                match_ = re.search(r'[A-Za-z]+[-_]?(\d+)(?![pt])', span['font'])

                text_data.append({'text': text,
                                'page_number': page_number + 1,
                                'font_family': span['font'],
                                'font_size': round(span["size"], 1),
                                'is_bold': bool(re.search(r'bold|heavy', span["font"].lower())),
                                'font_intensity': match_.group(1) if match_ else None,
                                'is_italic': bool(re.search(r'italic|oblique', span["font"].lower())),
                                'is_capitalized': text.isupper(),
                                'bbox': span['bbox'],
                                'color': span['color']})
    
    return pd.DataFrame(text_data)



page = doc[page_number]

soup = page.get_text("dict")

#points_to_cm(soup['width'])
#points_to_cm(soup['height'])

blocks = soup['blocks']

for block in blocks:
    if "lines" not in block:
        print(block)
        continue

    for line in block["lines"]:
        for span in line["spans"]:
            text = span["text"].strip()

            if not text:
                continue

            flags = span.get("flags", 0)

            match_ = re.search(r'[A-Za-z]+[-_]?(\d+)(?![pt])', span['font'])

            text_data.append({'text': text,
                              'page_number': page_number + 1,
                              'font_family': span['font'],
                              'font_size': round(span["size"], 1),
                              'is_bold': bool(re.search(r'bold|heavy', span["font"].lower())),
                              'font_intensity': match_.group(1) if match_ else None,
                              'is_italic': bool(re.search(r'italic|oblique', span["font"].lower())),
                              'is_capitalized': text.isupper(),
                              'bbox': span['bbox'],
                              'color': span['color']})
            
            





            
   


    

{'number': 6, 'type': 1, 'bbox': (51.428466796875, 434.91448974609375, 261.2954406738281, 583.1021728515625), 'width': 420, 'height': 297, 'ext': 'png', 'colorspace': 3, 'xres': 96, 'yres': 96, 'bpc': 8, 'transform': (209.86697387695312, 0.0, -0.0, 148.1876983642578, 51.428466796875, 434.91448974609375), 'size': 193853, 'image': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x01\xa4\x00\x00\x01)\x08\x02\x00\x00\x00S\x00\xc1d\x00\x00\x00\tpHYs\x00\x00\x0e\xc4\x00\x00\x0e\xc4\x01\x95+\x0e\x1b\x00\x02\xf4\xefIDATx\x9c\xec\x9d\x07\x9c\x14\xe5\xf9\xc7\x05\xee\xb6\xce\xccM\xbd\xad\xec\xee5\xafp\xe0\xc1\xf5\xb6{mo\xaf\xf7\xca\xd1A\x90\x0e"\xa2`GEQ@\xb0 *\x16\xec\rk\x8c\x891\x891\x7fS\xd4\xc4$&1&\x16,t8\xe0\x8e+[\xff\xcf\xfb\xce\xec\xde^\xc3\x03\x0f\xc4\x84\xf7\xf3c\x98m3s\xbb\xef~\xf7\xf7\xbc\xef\xf3\xbe\xefE\xbe\x0b\xe5\xfb\x15o\x7f\xb9\xb1\xbc\xc3\xe8{\x1c\xd9\xeb\x97g\xb0|#\xd5p\xd75r\x8d\xf8\xa2\xbf\xf3\xea\xbe\xf3T\xc3=y\xc4\xef\x1dz\xbf<H\x1eQno@n\xbfF\xf4\xde\x9fmy\x83uV>\x9f\xef_\x1b\x

In [44]:
text_data[2]

# Title of the dish: font-size, 13.0, 'MuseoSans-500', font-intensity 500, is_capit

{'text': 'Edamame',
 'page_number': 4,
 'font_family': 'MuseoSans-500',
 'font_size': 13.0,
 'is_bold': False,
 'font_intensity': '500',
 'is_italic': False,
 'is_capitalized': False,
 'bbox': (80.64360046386719,
  174.60699462890625,
  140.6125946044922,
  188.93299865722656),
 'color': -14475488}

In [45]:
text_data[3]

{'text': 'Vainas de soja.',
 'page_number': 4,
 'font_family': 'MuseoSans-300',
 'font_size': 10.0,
 'is_bold': False,
 'font_intensity': '300',
 'is_italic': False,
 'is_capitalized': False,
 'bbox': (74.13310241699219,
  188.79029846191406,
  140.61309814453125,
  199.72030639648438),
 'color': -14475488}

In [46]:
text_data[4]

{'text': 'Alérgenos: 6',
 'page_number': 4,
 'font_family': 'MuseoSans-300Italic',
 'font_size': 9.0,
 'is_bold': False,
 'font_intensity': '300',
 'is_italic': True,
 'is_capitalized': False,
 'bbox': (89.06099700927734,
  201.8905792236328,
  143.05197143554688,
  211.7275848388672),
 'color': -14475488}

In [47]:
import pandas as pd
pd.DataFrame(text_data)

Unnamed: 0,text,page_number,font_family,font_size,is_bold,font_intensity,is_italic,is_capitalized,bbox,color
0,6,4,NunitoSans-12ptExtraLigh,10.0,False,1,False,False,"(30.000099182128906, 806.2068481445312, 36.000...",-14475488
1,ENTRANTES,4,MuseoSans-100,48.0,False,100,False,True,"(56.692901611328125, 70.58364868164062, 312.29...",-4416392
2,Edamame,4,MuseoSans-500,13.0,False,500,False,False,"(80.64360046386719, 174.60699462890625, 140.61...",-14475488
3,Vainas de soja.,4,MuseoSans-300,10.0,False,300,False,False,"(74.13310241699219, 188.79029846191406, 140.61...",-14475488
4,Alérgenos: 6,4,MuseoSans-300Italic,9.0,False,300,True,False,"(89.06099700927734, 201.8905792236328, 143.051...",-14475488
5,1,4,MuseoSans-500,15.0,False,500,False,False,"(133.27720642089844, 156.884521484375, 140.612...",-703431
6,Takoyaki / 2 Pzs.,4,MuseoSans-500,13.0,False,500,False,False,"(84.44609832763672, 582.1968994140625, 182.830...",-14475488
7,"Albóndigas de pulpo con mayonesa, salsa",4,MuseoSans-300,10.0,False,300,False,False,"(84.44609832763672, 596.3801879882812, 277.156...",-14475488
8,teriyaki y tiras de bonito seco.,4,MuseoSans-300,10.0,False,300,False,False,"(84.44609832763672, 606.3801879882812, 219.526...",-14475488
9,"Alérgenos: 1,3,4,6,14",4,MuseoSans-300Italic,9.0,False,300,True,False,"(84.44609832763672, 619.48046875, 170.22415161...",-14475488


In [33]:
blocks[0]['lines'][0]["spans"]

[{'size': 10.0,
  'flags': 4,
  'font': 'NunitoSans-12ptExtraLigh',
  'color': -14475488,
  'ascender': 1.0360000133514404,
  'descender': -0.27000001072883606,
  'text': '6',
  'origin': (30.000099182128906, 816.5668334960938),
  'bbox': (30.000099182128906,
   806.2068481445312,
   36.000099182128906,
   819.266845703125)}]

# METHOD 2: GET SIZE

In [24]:
from pypdf import PdfReader

# source: https://stackoverflow.com/questions/46232984/how-to-get-pdf-file-metadata-page-size-using-python

pdf = PdfReader("MENU.pdf")
page = pdf.pages[1]

cm_per_inch = 2.54
points = 72

width_in_user_space_units = page.mediabox.width
height_in_user_space_units = page.mediabox.height

width_in_cm = float(width_in_user_space_units) / points * cm_per_inch
height_in_cm = float(height_in_user_space_units) / points * cm_per_inch

21.000014444444446

In [10]:
blocks['height']

841.8900146484375

In [None]:
blocks = page.get_text("dict")["blocks"]
        
        for block in blocks:
            if "lines" not in block:
                continue
            
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text:
                        continue
                    
                    flags = span.get("flags", 0)
                    is_underlined = bool(flags & 2**4)
                    
                    text_data.append({
                        'text': text,
                        'page': page_num + 1,  # Convert back to 1-indexed for output
                        'font_size': round(span["size"], 1),
                        'is_bold': bool(re.search(r'bold|heavy', span["font"].lower())),
                        'is_italic': bool(re.search(r'italic|oblique', span['font'].lower())),
                        'is_underlined': is_underlined,
                        'is_number': bool(re.search('\d{1,}', text.strip())),
                        'is_capitalized': text.isupper(),
                        'y_position': span['bbox'][1],
                        'x_position': span['bbox'][0],
                        'bbox': span['bbox'],
                        'color': span['color']
                    })
    

In [None]:

# This is to start getting to know your PDF



def extract_pdf_text_features(pdf_file: str, pages: Optional[List[int]] = None) -> pd.DataFrame:
    """
    Extract text features from specified pages of a PDF file.
    
    Args:
        pdf_file (str): Path to the PDF file
        pages (List[int], optional): List of specific pages to analyze (1-indexed). 
                                   If None, analyzes all pages.
    
    Returns:
        pd.DataFrame: DataFrame containing text features from the specified pages
    """
    doc = fitz.open(pdf_file)
    text_data = []
    
    # Determine pages to process
    if pages is None:
        pages = range(len(doc))
    else:
        # Convert 1-indexed to 0-indexed
        pages = [p - 1 for p in pages]
        # Validate page numbers
        max_page = len(doc) - 1
        invalid_pages = [p for p in pages if p > max_page or p < 0]
        if invalid_pages:
            raise ValueError(f"Page numbers {invalid_pages} are out of range. PDF has {len(doc)} pages.")

    # Rest of your implementation remains the same
    for page_num in pages:
        page = doc[page_num]
        blocks = page.get_text("dict")["blocks"]
        
        for block in blocks:
            if "lines" not in block:
                continue
            
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text:
                        continue
                    
                    flags = span.get("flags", 0)
                    is_underlined = bool(flags & 2**4)
                    
                    text_data.append({
                        'text': text,
                        'page': page_num + 1,  # Convert back to 1-indexed for output
                        'font_size': round(span["size"], 1),
                        'is_bold': bool(re.search(r'bold|heavy', span["font"].lower())),
                        'is_italic': bool(re.search(r'italic|oblique', span['font'].lower())),
                        'is_underlined': is_underlined,
                        'is_number': bool(re.search('\d{1,}', text.strip())),
                        'is_capitalized': text.isupper(),
                        'y_position': span['bbox'][1],
                        'x_position': span['bbox'][0],
                        'bbox': span['bbox'],
                        'color': span['color']
                    })
    
    df = pd.DataFrame(text_data)
    return df.sort_values(['page', 'y_position'])

In [None]:
import pandas as pd
import fitz
from typing import List, Dict, Optional
import re
from pathlib import Path

In [139]:
import os
def extract_menu_images(pdf_path: str, output_dir: str = 'menu_images') -> dict:
    """
    Extracts all images from a menu PDF and saves them with meaningful names.
    
    Parameters:
        pdf_path: Path to the PDF file
        output_dir: Directory where images will be saved
        
    Returns:
        Dictionary mapping page numbers to lists of image information
        {page_number: [{'bbox': (x0,y0,x1,y1), 'path': 'saved/image/path.jpg'}, ...]}
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Open the PDF
    pdf_document = fitz.open(pdf_path)
    image_mapping = {}
    
    # Process each page
    for page_number, page in enumerate(pdf_document):
        # Get list of images on this page
        image_list = page.get_images()
        page_images = []
        
        # Process each image on the page
        for img_index, img_info in enumerate(image_list):
            # Extract the image
            xref = img_info[0]
            base_image = pdf_document.extract_image(xref)
            
            if base_image:
                # Get image data and extension
                image_data = base_image["image"]
                extension = base_image["ext"]
                
                # Generate a unique filename
                filename = f'page_{page_number + 1}_img_{img_index + 1}.{extension}'
                save_path = os.path.join(output_dir, filename)
                
                # Save the image
                with open(save_path, 'wb') as img_file:
                    img_file.write(image_data)
                
                # Store image location information
                for image_rect in page.get_image_rects(xref):
                    page_images.append({
                        'bbox': tuple(image_rect),  # (x0, y0, x1, y1)
                        'path': save_path
                    })
        
        # Store all images for this page
        if page_images:
            image_mapping[page_number + 1] = page_images
    
    return image_mapping

In [141]:
image_mapping

NameError: name 'image_mapping' is not defined

In [140]:
extract_menu_images('MENU.pdf', './menu_images')

{4: [{'bbox': (99.81705474853516,
    105.039794921875,
    596.3167724609375,
    448.65716552734375),
   'path': './menu_images/page_4_img_1.jpx'},
  {'bbox': (51.428466796875,
    434.91448974609375,
    261.2954406738281,
    583.1021728515625),
   'path': './menu_images/page_4_img_2.jpx'},
  {'bbox': (307.67523193359375,
    416.05963134765625,
    538.2969970703125,
    601.9512329101562),
   'path': './menu_images/page_4_img_3.jpx'},
  {'bbox': (48.86100387573242,
    610.8865966796875,
    314.9002990722656,
    798.6934814453125),
   'path': './menu_images/page_4_img_4.jpx'},
  {'bbox': (299.4376525878906,
    604.2426147460938,
    546.3826904296875,
    809.724609375),
   'path': './menu_images/page_4_img_5.jpx'}],
 5: [{'bbox': (43.46629333496094,
    303.30560302734375,
    271.02777099609375,
    488.0880126953125),
   'path': './menu_images/page_5_img_1.jpx'},
  {'bbox': (-1.0989317893981934,
    105.039794921875,
    85.03948211669922,
    448.65716552734375),
   'path'