In [20]:
# Lark grammar (EBNF) describing the subset of mmCIF handled by this notebook:
# one `data_` header followed by single key/value entries and `loop_` tables.
# The string is raw so the regex backslashes reach Lark unchanged.
#
# NOTE(review): TEXT (/\S+/) matches any run of non-blank characters, so it can
# also match text that looks like a key, `loop_` or `#`; the rule priorities
# are what steer the Earley parser between those readings -- confirm on real files.
mmcif_grammar = r"""
// MMCIF parser grammar

// Whole file: a data_ header, then any mix of single entries and loop_ tables.
start : data (entry | loop)+

// Item name: "_" + category + one or more "."-separated attribute parts.
// The attribute parts use a dedicated terminal instead of CNAME because mmCIF
// attribute names may contain "[", "]", "%" and "-", for example
// _atom_sites.fract_transf_matrix[1][1] and _symmetry.space_group_name_H-M.
key : "_" CNAME ("." ITEM_NAME)+ -> key
ITEM_NAME : /[\]\[_A-Za-z0-9%\-]+/

// A value is a quoted / semicolon-delimited string or a bare token.
// STRING carries a higher priority than TEXT.
value.0 :  STRING | TEXT -> value
STRING.1 : SQ | DQ | SC
TEXT.0 : /\S+/
SQ : "'" _STRING_ESC_INNER "'"
DQ : "\"" _STRING_ESC_INNER "\""
// Multi-line text field: starts at a newline followed by ";" and runs (lazily,
// across lines) to the next newline followed by ";".
SC : /\r?\n\;.*?\r?\n\;/s

// Block header, e.g. "data_4JPP"; the block name is optional.
data : "data_" value? NEWLINE -> data

// Single "key value" pair terminated by a newline.
entry.2 : key value NEWLINE -> entry

// Table: "loop_" line, one key per line, then rows of one or more values.
loop : loop_header loop_key+ loop_value+
loop_header : "loop_" _NL
loop_key.1 : key _NL
loop_value.0 : value+ _NL

// "#" up to and including the end of the line.
_COMMENT : "#" /[^\n]*/ _NL
_NL : NEWLINE

%import common.NEWLINE
%import common.WS_INLINE
%import common.CNAME
%import common._STRING_ESC_INNER

// Inline whitespace and comments are skipped between tokens.
%ignore (WS_INLINE | _COMMENT)
"""

In [21]:
from pathlib import Path
from lark import Lark

In [22]:
# mmCIF file to parse. The path is relative, so it is resolved against the
# kernel's current working directory.
cif_path = Path("data/mmcif/4JPP.cif")

In [23]:
# Compile the mmCIF grammar into a parser.
parser = Lark(
    mmcif_grammar,
    parser="earley",      # Lark's default algorithm, written out for the reader
    lexer="dynamic",      # terminals are matched on demand by the Earley parser
    ordered_sets=False,   # Earley option: use plain sets instead of ordered sets
)

In [None]:
%%time
tree = parser.parse(cif_path.read_text())
with open("4jpp.txt", "w") as pretty:
    pretty.write(tree.pretty())