In [1]:
import fitz
import pandas as pd
import re

In [2]:
# random_page = doc[3].get_text("dict")

# def points_to_cm(p: int):
#     size = p * (2.54 / 72)
#     return size

# points_to_cm(random_page['height'])

In [3]:
def extract_page_info(page, page_number):

    doc = page.get_text("dict")
    blocks = doc['blocks']
    text_data = []

    for block in blocks:
        if 'lines' not in block:
            continue

        for line in block['lines']:
            for span in line['spans']:
                text = span['text'].strip()

                if not text:
                    continue

                flags = span.get('flags', 0)

                match_ = re.search(r'[A-Za-z]+[-_]?(\d+)(?![pt])', span['font'])

                text_data.append({'text': text,
                                'page_number': page_number + 1,
                                'only_number': bool(re.search(r'^\d{1,}$', text.strip())),
                                'font_family': span['font'],
                                'font_size': span['size'],
                                'is_bold': bool(re.search(r'bold|heavy', span['font'], flags=re.I)),
                                'font_intensity': match_.group(1) if match_ else None,
                                'is_italic': bool(re.search(r'italic|oblique', span['font'].lower())),
                                'is_capitalized': text.isupper(),
                                'bbox': span['bbox'],
                                'color': span['color']})
    

    return pd.DataFrame(text_data)
                






    

In [6]:
doc = fitz.open('MENU.pdf')

pages = []

for pnr, page in enumerate(doc):
    page_df = extract_page_info(page, pnr)
    pages.append(page_df)
    



In [7]:
full_menu = pd.concat(pages)

In [15]:



class MenuClassifier:
    def __init__(self):
        self.patterns = {
            'dish_id': dict(
                font_family="MuseoSans-500",
                only_number=True,
                font_size=15.0,
                font_intensity='500',
                color=-703431),
            'dish_name': dict(
                font_family="MuseoSans-500",
                only_number=False,
                font_size=13.0,
                font_intensity='500',
                color=-14475488
            ),
            'dish_description': dict(
                font_family="MuseoSans-300",
                only_number=False,
                font_size=10.0,
                font_intensity='300',
                color=-14475488
            ),
            'allergens': dict(
                font_family="MuseoSans-300Italic",
                only_number=False,
                font_size=9.0,
                font_intensity='300',
                color=-14475488
            )
        }
    
    def classify(self, row):
        text_props = {
            'font_family': row['font_family'],
            'only_number': row['only_number'],
            'font_size': row['font_size'],
            'font_intensity': row['font_intensity'],
            'color': row['color']
        }

        for text_type, pattern in self.patterns.items():
            if all(text_props.get(key) == val for key, val in pattern.items()):
                return text_type
        return 'Unclassified'
            


In [18]:
classifier = MenuClassifier()

full_menu['category'] = full_menu.apply(lambda row: classifier.classify(row), axis=1)

In [19]:
full_menu.query("page_number == 4")[['text', 'category']]

Unnamed: 0,text,category
0,6,Unclassified
1,ENTRANTES,Unclassified
2,Edamame,dish_name
3,Vainas de soja.,dish_description
4,Alérgenos: 6,allergens
5,1,dish_id
6,Takoyaki / 2 Pzs.,dish_name
7,"Albóndigas de pulpo con mayonesa, salsa",dish_description
8,teriyaki y tiras de bonito seco.,dish_description
9,"Alérgenos: 1,3,4,6,14",allergens


In [20]:
import numpy as np

full_menu_filtered = full_menu[full_menu['category'] != 'Unclassified']

full_menu_filtered['group'] = np.where(full_menu_filtered['category'] == 'dish_id', full_menu_filtered['text'], np.nan) 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_menu_filtered['group'] = np.where(full_menu_filtered['category'] == 'dish_id', full_menu_filtered['text'], np.nan)


In [23]:
full_menu_filtered = full_menu_filtered[full_menu_filtered['page_number'] > 3]

In [25]:
full_menu_filtered['group'] = full_menu_filtered['group'].fillna(method='bfill')

  full_menu_filtered['group'] = full_menu_filtered['group'].fillna(method='bfill')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_menu_filtered['group'] = full_menu_filtered['group'].fillna(method='bfill')


In [38]:
def concat_with_space(series):
    # Filter out any None or NaN values
    valid_strings = series.dropna()
    
    # If we have no valid strings, return None
    if len(valid_strings) == 0:
        return None
        
    # If we only have one string, return it as is
    if len(valid_strings) == 1:
        return valid_strings.iloc[0]
        
    # Otherwise, join all strings with spaces
    return ' '.join(valid_strings)

# Now use this function in the pivot table
pivotted_table = pd.pivot_table(
    full_menu_filtered,
    index='group',
    columns='category',
    values='text',
    aggfunc=concat_with_space  # Our custom concatenation function
).reset_index()

In [34]:
full_menu_filtered[full_menu_filtered['text'].str.contains('bonito seco')]

Unnamed: 0,text,page_number,only_number,font_family,font_size,is_bold,font_intensity,is_italic,is_capitalized,bbox,color,category,group
8,teriyaki y tiras de bonito seco.,4,False,MuseoSans-300,10.0,False,300,False,False,"(84.44609832763672, 606.3801879882812, 219.526...",-14475488,dish_description,2


In [39]:
columns = ['dish_id', 'dish_name', 'dish_description', 'allergens']

dish_data = pivotted_table[columns]

In [40]:
dish_data['dish_id'] = dish_data['dish_id'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dish_data['dish_id'] = dish_data['dish_id'].astype(int)


In [43]:
dish_data.sort_values(by='dish_id').query('dish_id == 1')['dish_description'].iloc[0]

'Vainas de soja.'