# Data

- https://vk.com/u_samovaraa?w=wall-81871567_61842
- https://vk.com/vkys_nos?w=wall-41960737_13333
- https://vk.com/receptik_kulinar?w=wall-59496329_52708
- https://vk.com/lisimnik_cake?w=wall-82240292_25648
- https://vk.com/kingcook?w=wall-59442940_11047
- https://vk.com/u_samovaraa?w=wall-81871567_61917
- https://vk.com/quickrecipes?w=wall-61337543_5892
- https://vk.com/namenuru?w=wall-36303114_56579
- https://vk.com/vegan_cookbook?w=wall-43818640_25903
- https://vk.com/multivarka_cook?w=wall-51300483_11948

In [None]:
from glob import iglob as list_paths


def load_text(path, encoding='utf-8'):
    """Read the whole file at `path` and return its contents as a string.

    The encoding is passed explicitly so that decoding does not depend on
    the platform's default locale.
    NOTE(review): assumes the corpus files are stored as UTF-8 -- confirm.

    Args:
        path: path of the text file to read.
        encoding: text encoding used to decode the file (default 'utf-8').

    Returns:
        The decoded file contents.
    """
    with open(path, encoding=encoding) as file:
        return file.read()


# glob yields paths in a filesystem-dependent order; sorting them keeps the
# order of `texts` (and therefore the seeded random samples drawn later in
# the notebook) identical from run to run and machine to machine.
texts = [
    load_text(path)
    for path in sorted(list_paths('texts/*.txt'))
]

In [None]:
def make_translation(source, target):
    """Build a table for `str.translate` mapping source[i] to target[i].

    Both arguments must have the same length; keys and values of the
    returned dict are Unicode code points.
    """
    assert len(source) == len(target)
    source_codes = map(ord, source)
    target_codes = map(ord, target)
    return dict(zip(source_codes, target_codes))


# Translation table that replaces each listed dash-like character with '-'.
DASHES_TRANSLATION = make_translation(source='‑–—−', target='----')


def preprocess(text):
    """Normalize special characters in `text`.

    '\\xa0' is replaced with a plain space, '\\xad' is removed, and the
    characters listed in DASHES_TRANSLATION are translated.
    """
    return (
        text
        .replace('\xa0', ' ')
        .replace('\xad', '')
        .translate(DASHES_TRANSLATION)
    )


# Apply the character normalization to every loaded text.
texts = list(map(preprocess, texts))

In [None]:
# Print the first preprocessed text to eyeball its structure.
print(texts[0])

In [None]:
import re


def find_ingredient_sections(text):
    """Return the fragments between 'ингредиенты:' and 'приготовление'.

    Matching is case-insensitive and spans newlines; each fragment is the
    shortest non-empty run of text between the two markers.
    """
    pattern = r'ингредиенты:(.+?)приготовление'
    flags = re.IGNORECASE | re.DOTALL
    return [found.group(1) for found in re.finditer(pattern, text, flags)]



def maybe_ingredient(line, max_length=50):
    """Tell whether `line` looks like an ingredient entry.

    A line qualifies when it is at most `max_length` characters long and
    contains at least one digit.

    Args:
        line: a single line of text.
        max_length: longest line still treated as a candidate (default 50).

    Returns:
        True or False (always a real bool, never None or a match object).
    """
    # Cheap length check first; only then run the regex.
    if len(line) > max_length:
        return False
    return re.search(r'\d', line) is not None


# Collect every candidate ingredient line from every ingredient section
# of every text, preserving the order in which they appear.
lines = [
    line
    for text in texts
    for section in find_ingredient_sections(text)
    for line in section.splitlines()
    if maybe_ingredient(line)
]

In [None]:
from random import seed, sample

# Fix the RNG seed so the displayed sample is the same on every run.
seed(1)
sample(lines, 20)

In [None]:
# Number of candidate ingredient lines collected above.
len(lines)

# Grammar

In [None]:
from yargy.tokenizer import MorphTokenizer


TOKENIZER = MorphTokenizer()


# Show the tokens produced for a typical ingredient line.
list(TOKENIZER('Соль - 2 ст.ложки'))

In [None]:
from ipymarkup import show_markup

from yargy import rule, Parser
from yargy.predicates import eq


# Minimal grammar: the literal token '100' followed by the literal token 'г'.
MEASURE = rule(eq('100'), eq('г'))
parser = Parser(MEASURE)

# Highlight the matches in a reproducible random sample of candidate lines.
seed(1)
for line in sample(lines, 100):
    spans = [found.span for found in parser.findall(line)]
    show_markup(line, spans)

In [None]:
from yargy import or_
from yargy.predicates import type, normalized
from yargy.pipelines import morph_pipeline

# NOTE: `type` here is the yargy predicate imported in this cell; it
# shadows the builtin of the same name.
INT = type('INT')

# Unit names, matched through the morphological pipeline.
NAME = morph_pipeline([
    # volume
    'мл', 'литр',
    # weight
    'г', 'гр', 'грамм',
    # pieces and packs
    'шт', 'штука', 'пачка',
    # glasses, cups, handfuls
    'ст', 'чаш', 'стакан', 'горсть',
    # cloves
    'зубчик', 'зуб',
    # tablespoons
    'ст.л', 'ст.ложка', 'столовая ложка',
    # teaspoons
    'ч.л', 'ч.ложка', 'чайная ложка',
])

# A unit name optionally followed by a period.
UNIT = rule(NAME, eq('.').optional())

# An integer followed by a unit.
MEASURE = rule(INT, UNIT)

parser = Parser(MEASURE)
seed(1)
for line in sample(lines, 100):
    spans = [found.span for found in parser.findall(line)]
    show_markup(line, spans)

In [None]:
from yargy.predicates import in_


# Number with a '.' or ',' between two integers.
FLOAT = rule(INT, in_('.,'), INT)

# Two integers separated by '/'.
FRACTION = rule(INT, '/', INT)

# Two integers separated by '-'.
RANGE = rule(INT, '-', INT)

# An amount is any of the numeric forms above or a bare integer.
AMOUNT = or_(rule(INT), FLOAT, FRACTION, RANGE)

MEASURE = rule(AMOUNT, UNIT)

parser = Parser(MEASURE)
seed(1)
for line in sample(lines, 100):
    spans = [found.span for found in parser.findall(line)]
    show_markup(line, spans)

In [None]:
# Toy rule; the last line of the cell displays the BNF form of its
# normalized version.
R = or_(
    rule('a', 'b'),
    rule('c').optional().repeatable()
)

R.normalized.as_bnf

In [None]:
from yargy.predicates import gram


NOUN = gram('NOUN')

# Modifier words: tokens tagged ADJF or PRTF, one or more in a row.
ADJF = or_(gram('ADJF'), gram('PRTF'))
MODIFIER = ADJF.repeatable()

# A noun with optional modifiers before and/or after it.
PRODUCT = rule(MODIFIER.optional(), NOUN, MODIFIER.optional())

parser = Parser(or_(PRODUCT, MEASURE))
seed(1)
for line in sample(lines, 100):
    spans = [found.span for found in parser.findall(line)]
    show_markup(line, spans)

In [None]:
SEP = in_('-:')

# The unit is optional now, so a bare amount also counts as a measure.
MEASURE = rule(AMOUNT, UNIT.optional())

# Measure and product may come in either order, optionally separated
# by a SEP token.
INGREDIENT = or_(
    rule(MEASURE, SEP.optional(), PRODUCT),
    rule(PRODUCT, SEP.optional(), MEASURE),
)

parser = Parser(INGREDIENT)
seed(1)
for line in sample(lines, 100):
    spans = [found.span for found in parser.findall(line)]
    show_markup(line, spans)

# Interpretation

In [None]:
from yargy.interpretation import fact


# Fact type that will hold the parsed result.
Measure = fact('Measure', ['amount', 'unit'])

# Literal '100', stored in Measure.amount after conversion with int().
AMOUNT = eq('100').interpretation(Measure.amount.custom(int))

# Any token whose normal form is 'грамм', stored normalized in Measure.unit.
UNIT = normalized('грамм').interpretation(Measure.unit.normalized())

MEASURE = rule(AMOUNT, UNIT).interpretation(Measure)

parser = Parser(MEASURE)
match = parser.match('100 граммов')
match.tree.as_dot

In [None]:
# Display the fact built from the match obtained in the previous cell.
match.fact

In [None]:
from fractions import Fraction

# Fact types filled in by the interpreted grammar below.
Measure = fact('Measure', ['amount', 'unit'])
Product = fact('Product', ['name', 'modifiers'])
Ingredient = fact('Ingredient', ['measure', 'product'])


# Bare integer amount, converted with int().
DIGIT = INT.interpretation(
    Measure.amount.custom(int)
)


def parse_float(value):
    """Convert matched decimal text such as '1,5' or '1.5' to a float.

    The comma is accepted as a decimal separator. Whitespace is dropped
    first because the rule can match tokens that are not adjacent in the
    source text.
    NOTE(review): presumably yargy passes the matched text as a single
    string here, as parse_fraction and parse_range also assume -- confirm.
    """
    compact = ''.join(value.split())
    return float(compact.replace(',', '.'))


# Decimal amount. It is stored as a float so that it is numeric like the
# int / Fraction / tuple-of-int values produced by the other amount rules,
# instead of being left as a string.
FLOAT = rule(
    INT,
    in_('.,'),
    INT
).interpretation(
    Measure.amount.custom(parse_float)
)

def parse_fraction(value):
    """Turn text of the form 'N/D' into a `Fraction(N, D)`."""
    numerator, denominator = (int(part) for part in value.split('/'))
    return Fraction(numerator, denominator)

# Two integers separated by '/'; the matched text is converted by
# parse_fraction and stored in Measure.amount.
FRACTION = rule(
    INT,
    '/',
    INT
).interpretation(
    Measure.amount.custom(parse_fraction)
)

def parse_range(value):
    """Turn text of the form 'A-B' into the tuple `(int(A), int(B))`."""
    low, high = (int(bound) for bound in value.split('-'))
    return low, high

# Two integers separated by '-'; stored in Measure.amount as the pair
# returned by parse_range.
RANGE = rule(INT, '-', INT).interpretation(
    Measure.amount.custom(parse_range)
)

# Any of the interpreted numeric forms.
AMOUNT = or_(DIGIT, FLOAT, FRACTION, RANGE)

# Unit names; the matched name is stored normalized in Measure.unit.
NAME = morph_pipeline([
    # volume
    'мл', 'литр',
    # weight
    'г', 'гр', 'грамм',
    # pieces and packs
    'шт', 'штука', 'пачка',
    # glasses, cups, handfuls
    'ст', 'чаш', 'стакан', 'горсть',
    # cloves
    'зубчик', 'зуб',
    # tablespoons
    'ст.л', 'ст.ложка', 'столовая ложка',
    # teaspoons
    'ч.л', 'ч.ложка', 'чайная ложка',
]).interpretation(
    Measure.unit.normalized()
)

# A unit name optionally followed by a period.
UNIT = rule(NAME, eq('.').optional())

# An amount with an optional unit, interpreted as a Measure fact.
MEASURE = rule(AMOUNT, UNIT.optional()).interpretation(Measure)

NOUN = gram('NOUN')
ADJF = or_(gram('ADJF'), gram('PRTF'))

# One or more modifier tokens, stored normalized in Product.modifiers.
# NOTE(review): PRODUCT uses MODIFIER both before and after the noun, and
# both slots write to Product.modifiers -- confirm how yargy combines them.
MODIFIER = ADJF.repeatable().interpretation(
    Product.modifiers.normalized()
)

# The noun becomes Product.name (normalized); modifiers are optional on
# either side of it.
PRODUCT = rule(
    MODIFIER.optional(),
    NOUN.interpretation(Product.name.normalized()),
    MODIFIER.optional(),
).interpretation(Product)

SEP = in_('-:')

# The nested facts are attached to the corresponding Ingredient fields.
INGREDIENT_MEASURE = MEASURE.interpretation(Ingredient.measure)
INGREDIENT_PRODUCT = PRODUCT.interpretation(Ingredient.product)

# Measure and product may come in either order, optionally separated
# by a SEP token.
INGREDIENT = or_(
    rule(INGREDIENT_MEASURE, SEP.optional(), INGREDIENT_PRODUCT),
    rule(INGREDIENT_PRODUCT, SEP.optional(), INGREDIENT_MEASURE),
).interpretation(Ingredient)

parser = Parser(INGREDIENT)

# For each sampled line: highlight all matches, then show the parse tree
# and the extracted fact of the first match, if there is one.
seed(1)
for line in sample(lines, 100):
    matches = list(parser.findall(line))
    show_markup(line, [found.span for found in matches])
    if not matches:
        continue
    match = matches[0]
    display(match.tree.as_dot)
    display(match.fact)