Create a recipe format that:  
* has author-friendly input
* has user-friendly output
* can itself be parsed into a data structure

In [326]:
def expand(command_line, pattern, captures=None, last=True):
    """Replace named placeholders in a regex template with their sub-patterns.

    Every occurrence of a key of ``pattern`` inside ``command_line`` is
    substituted by the corresponding value. Templates are written with spaces
    between tokens for readability; the spaces are stripped on the final pass.

    Args:
        command_line: Template string containing placeholder names.
        pattern: Mapping of placeholder name to the regex fragment (or
            further template text) that replaces it.
        captures: Placeholder names whose fragment is wrapped in a named
            group ``(?P<name>...)``. ``None`` (the default) captures nothing.
        last: If true, this is the final pass: non-captured fragments are
            inserted bare and every space character is removed from the
            result. If false, non-captured fragments are wrapped in a plain
            group and spaces are kept, so the result can be fed to another
            ``expand`` call.

    Returns:
        The expanded template as a string.
    """
    captures = () if captures is None else captures

    def substitute(m):
        key = m.group(0)
        val = pattern[key]
        if key in captures:
            return f'(?P<{key}>{val})'
        # Intermediate passes keep the fragment grouped so that a following
        # quantifier in the template still applies to the whole fragment.
        return val if last else f'({val})'

    if pattern:
        # Regex alternation picks the first alternative that matches, so the
        # longest names go first; otherwise "args" would match inside
        # "argsenv". Names are escaped because they are literals, not regexes.
        names = sorted(pattern, key=len, reverse=True)
        finder = "|".join(re.escape(name) for name in names)
        script = re.sub(finder, substitute, command_line)
    else:
        # Nothing to substitute; an empty alternation would match everywhere.
        script = command_line

    return script.replace(' ', '') if last else script

# Demo: build the ingredient-line regex in two passes.
# Pass 1 expands the structural placeholders and keeps spaces (last=False)
# so the result can be expanded again.
s1 = expand(r'\*\s (quantity \s)? name (\s argsenv)?',
            pattern=dict(quantity=r'amount (\s? units)?',
                         argsenv=r'\[ args \]'),
            captures=[], last=False)

# Pass 2 fills in the leaf patterns and turns them into named groups.
# Units are escaped so the "." in "vnt." matches a literal dot instead of
# any character.
s2 = expand(s1,
            pattern=dict(
                amount=r'\d+\.?\d*',
                units='|'.join(map(re.escape, ['ml', 'l', 'g', 'kg', 'vnt', 'vnt.'])),
                name=r'([a-zA-Ząčęėįšųūž]+\s)*[a-zA-Ząčęėįšųūž]+',
                args='.+'),
            captures=['amount', 'units', 'name', 'args'])

# Stand-alone quantity pattern: an amount optionally followed by a space,
# then a unit.
s3 = expand(r'amount \s? units',
            pattern=dict(
                amount=r'\d+\.?\d*',
                units='|'.join(map(re.escape, ['ml', 'l', 'g', 'kg', 'vnt', 'vnt.']))),
            captures=['amount', 'units'])

print(re.fullmatch(s2, '* 30.7 g vf agurkų [item: agurkai, weight: 200g]').groupdict())
print(re.fullmatch(s3, '30.7g').groupdict())

{'amount': '30.7', 'units': 'g', 'name': 'vf agurkų', 'args': 'item: agurkai, weight: 200g'}
{'amount': '30.7', 'units': 'g'}


In [333]:
def match_args(args):
    """Parse the bracketed argument list of a recipe line into a dict.

    Example:
        "item: arbūzas, weight: 200g" ->
        {'name': 'arbūzas', 'type': 'item', 'amount': '200', 'units': 'g'}

    Args:
        args: Comma-separated ``key: value`` entries (separator ``', '``).
            ``item`` and ``recipe`` set ``name`` and ``type``; ``weight``
            sets ``amount`` and ``units`` (one of ml, l, g, kg). Entries
            with any other key are ignored.

    Returns:
        A dict with whichever of ``name``, ``type``, ``amount`` and
        ``units`` were supplied. All values are strings.

    Raises:
        ValueError: If an entry is not of the form ``key: value`` or a
            weight value is not an amount followed by a known unit.
    """
    weight_pattern = expand(r'amount \s? units',
                            pattern=dict(amount=r'\d+\.?\d*',
                                         units='|'.join(['ml', 'l', 'g', 'kg'])),
                            captures=['amount', 'units'])
    parsed = {}
    for entry in args.split(', '):
        # Split on the first ': ' only, so a value may itself contain ': '.
        key, sep, val = entry.partition(': ')
        if not sep:
            raise ValueError(f'argument {entry!r} is not of the form "key: value"')
        if key in ('item', 'recipe'):
            parsed['name'] = val
            parsed['type'] = key
        elif key == 'weight':
            weight = re.fullmatch(weight_pattern, val)
            if weight is None:
                raise ValueError(
                    f'weight {val!r} is not an amount followed by one of ml, l, g, kg')
            parsed.update(weight.groupdict())
    return parsed

# Demo: a space between the amount and the unit is accepted (the weight
# pattern uses `\s?`).
match_args('item: arbūzas, weight: 200 ml')

{'name': 'arbūzas', 'type': 'item', 'amount': '200', 'units': 'ml'}

In [344]:
def match_recipe_line(line, default_type='item'):
    """Parse one ingredient line of a recipe into a dict.

    Accepted lines look like ``* 200g ridikėlių [item: ridikėliai]`` or
    ``* 2 paprikos [weight: 150g]``: a ``*`` bullet, an optional quantity
    (amount with optional unit), a name, and an optional bracketed
    argument list.

    Args:
        line: The full line, starting with ``*``.
        default_type: Value stored under ``type`` when the argument list
            does not provide one.

    Returns:
        A dict with the keys ``header`` (quantity and name as written),
        ``amount``, ``units``, ``name`` and ``type``. Groups that did not
        take part in the match are ``None``. Values coming from the
        argument list override those parsed from the header.

    Raises:
        ValueError: If ``line`` does not match the expected format.
    """
    # Pass 1: overall line structure; the header is captured as a whole.
    s1 = expand(r'\*\s header (\s argsenv)?',
                pattern=dict(header=r'(quantity \s)? name',
                             argsenv=r'\[ args \]'),
                captures=['header'], last=False)

    # Pass 2: split the header into quantity and name, capture the raw args.
    s2 = expand(s1,
                pattern=dict(quantity=r'amount (\s? units)?',
                             name=r'([a-zA-Ząčęėįšųūž]+\s)*[a-zA-Ząčęėįšųūž]+',
                             args='.+'),
                captures=['name', 'args'], last=False)

    # Pass 3: leaf patterns. Units are escaped so the "." in "vnt." matches
    # a literal dot instead of any character.
    units = '|'.join(map(re.escape, ['ml', 'l', 'g', 'kg', 'vnt', 'vnt.']))
    recipe_pattern = expand(s2,
                            pattern=dict(amount=r'\d+\.?\d*', units=units),
                            captures=['amount', 'units'])

    match = re.fullmatch(recipe_pattern, line)
    if match is None:
        raise ValueError(f'line = {line} does not match')

    d = match.groupdict()
    if d['args']:
        # NOTE(review): match_line_args is not defined in the visible part of
        # this file (only match_args is) — confirm which helper is intended.
        d = {**d, **match_line_args(d['args'])}
    if 'type' not in d:
        d['type'] = default_type
    # The raw argument string has been merged in above; drop it from the result.
    del d['args']
    return d

# Kept for reference: "gr" is not in the unit list, so this line does not
# match the pattern and the call would raise ValueError.
#match_recipe_line('* 200gr ridikėlių [item: ridikėliai]')
# In the first example the recorded output shows the `weight` argument
# (200 g) replacing the amount and units written in the header (30.7 g).
print(match_recipe_line('* 30.7 g raugintų agurkėlių [item: rauginti agurkai, weight: 200g]'))
print(match_recipe_line('* 200g ridikėlių [item: ridikėliai]'))

{'header': '30.7 g raugintų agurkėlių', 'amount': '200', 'units': 'g', 'name': 'rauginti agurkai', 'type': 'item'}
{'header': '200g ridikėlių', 'amount': '200', 'units': 'g', 'name': 'ridikėliai', 'type': 'item'}


In [346]:
def parse_recipe(name):
    """Read a recipe file and return its ingredients as a list of dicts.

    Lines starting with ``*`` are ingredients. Lines starting with a space
    or a tab are details of the preceding ingredient; they are parsed the
    same way after stripping the indentation and collected under that
    ingredient's ``details`` key. Blank lines are skipped.

    Args:
        name: Path of the UTF-8 encoded recipe file.

    Returns:
        A list with one dict per ingredient, as produced by
        ``match_recipe_line``.

    Raises:
        ValueError: If a line starts with an unexpected character, if a
            detail line appears before any ingredient, or if a line does
            not match the ingredient format.
    """
    recipe_ingredients = []
    with open(name, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.rstrip()
            if not line:
                # Empty or whitespace-only lines carry no information.
                continue
            if line.startswith('*'):
                recipe_ingredients.append(match_recipe_line(line, default_type='item'))
            elif line.startswith((' ', '\t')):
                if not recipe_ingredients:
                    raise ValueError(
                        f'Detail line without a preceding ingredient: {line!r}')
                details = recipe_ingredients[-1].setdefault('details', [])
                details.append(match_recipe_line(line.lstrip()))
            else:
                raise ValueError(f'Unexpected start character in line: {line}')
    return recipe_ingredients


    
# Demo: parse a sample recipe file and display the result as a table.
# NOTE(review): the backslash in the path is a directory separator only on
# Windows — confirm the notebook is not meant to run elsewhere.
recipe_data = parse_recipe(r'recipes\Šaltibarščiai.txt')
import pandas as pd
pd.DataFrame(recipe_data)

Unnamed: 0,header,amount,units,name,type,details
0,1kg paruoštų bulvių,1.0,kg,bulvės,item,"[{'header': 'keptų', 'amount': None, 'units': ..."
1,1kg kefyro,1.0,kg,kefyras,item,"[{'header': 'paprasto', 'amount': None, 'units..."
2,400ml pieno,400.0,ml,pienas UAT,item,"[{'header': 'paprasto pieno', 'amount': None, ..."
3,2 burokėliai,200.0,g,burokėliai,item,
4,2 trumpavaisiai agurkai,200.0,g,trumpavaisiai agurkai,item,
5,3 kiaušiniai,3.0,,kiaušiniai,item,
6,70g svogūnų laiškų,70.0,g,svogūnų laiškai,item,
7,džiovinti krapai,,,džiovinti krapai,item,
8,druska,,,druska,item,


In [1]:
# Demo: the same recipe loaded through the recipe_formatter module;
# normalise_recipe comes from that module and is not defined in this notebook.
from recipe_formatter import normalise_recipe, parse_recipe
normalise_recipe(parse_recipe(r'recipes\Šaltibarščiai.txt'))

Unnamed: 0,amount,units,name
0,1.0,kg,Bulvės
1,1.0,kg,Kefyras
2,0.4,l,Pienas UAT
3,0.2,kg,Burokėliai
4,0.2,kg,Trumpavaisiai agurkai
5,3.0,vnt.,Kiaušiniai
6,0.07,kg,Svogūnų laiškai
7,,,Džiovinti krapai
8,,,Druska
