In [1]:
 %load_ext autoreload

In [2]:
from parsimonious.examples.grammar_syntax_extension import AttrsTokenGrammar

In [3]:
x12 = """ISA*00*          *00*          *ZZ*EMEDNYBAT      *ZZ*ETIN           *100101*1000*^*00501*006000600*0*T*:~
GS*HP*EMEDNYBAT*ETIN*20100101*1050*6000600*X*005010X221A1~
ST*835*1740~
BPR*I*45.75*C*ACH*CCP*01*111*DA*33*1234567890**01*111*DA*22*20100101~
TRN*1*10100000000*1000000000~
REF*EV*ETIN~
DTM*405*20100101~
N1*PR*NYSDOH~
N3*OFFICE OF HEALTH INSURANCE PROGRAMS*CORNING TOWER, EMPIRE STATE PLAZA~
N4*ALBANY*NY*122370080~
PER*BL*PROVIDER SERVICES*TE*8003439000*UR*www.emedny.org~
N1*PE*MAJOR MEDICAL PROVIDER*XX*9999999995~
REF*TJ*000000000~
LX*1~
CLP*PATIENT ACCOUNT NUMBER*1*34.25*34.25**MC*1000210000000030*11~
NM1*QC*1*SUBMITTED LAST*SUBMITTED FIRST****MI*LL99999L~
NM1*74*1*CORRECTED LAST*CORRECTED FIRST~
REF*EA*PATIENT ACCOUNT NUMBER~
DTM*232*20100101~
DTM*233*20100101~
AMT*AU*34.25~
SVC*HC:V2020:RB*6*6**1~
DTM*472*20100101~
AMT*B6*6~
SVC*HC:V2700:RB*2.75*2.75**1~
DTM*472*20100101~
AMT*B6*2.75~
SVC*HC:V2103:RB*5.5*5.5**1~
DTM*472*20100101~
AMT*B6*5.5~
SVC*HC:S0580*20*20**2~
DTM*472*20100101~
AMT*B6*20~
CLP*PATIENT ACCOUNT NUMBER*2*34*0**MC*1000220000000020*11~
NM1*QC*1*SUBMITTED LAST*SUBMITTED FIRST****MI*LL88888L~
NM1*74*1*CORRECTED LAST*CORRECTED FIRST~
REF*EA*PATIENT ACCOUNT NUMBER~
DTM*232*20100101~
DTM*233*20100101~
SVC*HC:V2020*12*0**0~
DTM*472*20100101~
CAS*CO*29*12~
SVC*HC:V2103*22*0**0~
DTM*472*20100101~
CAS*CO*29*22~
CLP*PATIENT ACCOUNT NUMBER*2*34.25*11.5**MC*1000230000000020*11~
NM1*QC*1*SUBMITTED LAST*SUBMITTED FIRST****MI*LL77777L~
NM1*74*1*CORRECTED LAST*CORRECTED FIRST~
REF*EA*PATIENT ACCOUNT NUMBER~
DTM*232*20100101~
DTM*233*20100101~
AMT*AU*11.5~
SVC*HC:V2020:RB*6*6**1~
DTM*472*20100101~
AMT*B6*6~
SVC*HC:V2103:RB*5.5*5.5**1~
DTM*472*20130917~
AMT*B6*5.5~
SVC*HC:V2700:RB*2.75*0**0~
DTM*472*20100101~
CAS*CO*251*2.75~
LQ*HE*N206~
SVC*HC:S0580*20*0**0~
DTM*472*20100101~
CAS*CO*251*20~
LQ*HE*N206~
SE*65*1740~
GE*1*6000600~
IEA*1*006000600~
"""


In [6]:
import regex as re
from enum import Enum
from dataclasses import dataclass
from typing import List


class Token(str):
    def __new__(cls, token_type, s):
        obj = super().__new__(cls, s)
        obj.type = token_type
        return obj

    @property
    def value(self):
        return self

    
def lex(data: str) -> list:
    data_elem_sep, repetition_sep, component_sep = (
        data[3], data[82], data[104]
    )
    # Allow multi-character segment terminators so the more
    # readable ~\n is valid. "GS" is always the second segment
    # of all x12 documents.
    seg_terminator = data[105: data.index(f'GS{data_elem_sep}', 105)]
    sep_to_token = {
        data_elem_sep: Token("DATA_SEP", data_elem_sep),
        repetition_sep: Token("REPEAT_SEP", repetition_sep),
        seg_terminator: Token("SEG_TERM", seg_terminator),
        component_sep: Token("COMPONENT_SEP", component_sep),
    }
    separator_re = re.compile(
        r"(\L<separators>)",
        separators=list(sep_to_token),
    )
    prev_endpoint = 0
    tokens = []
    for match in separator_re.finditer(data):
        start, endpoint = match.span()
        # Everything between this endpoint and the last is a
        # value that contains data. Even zero-length elements
        # are meaningful.
        tokens.append(Token("TEXT", data[prev_endpoint:start]))
        s = match.group()
        tokens.append(sep_to_token[s])
        prev_endpoint = start + len(s)
    return tokens


tokens = lex(x12)
print(tokens)


['ISA', '*', '00', '*', '          ', '*', '00', '*', '          ', '*', 'ZZ', '*', 'EMEDNYBAT      ', '*', 'ZZ', '*', 'ETIN           ', '*', '100101', '*', '1000', '*', '', '^', '', '*', '00501', '*', '006000600', '*', '0', '*', 'T', '*', '', ':', '', '~\n', 'GS', '*', 'HP', '*', 'EMEDNYBAT', '*', 'ETIN', '*', '20100101', '*', '1050', '*', '6000600', '*', 'X', '*', '005010X221A1', '~\n', 'ST', '*', '835', '*', '1740', '~\n', 'BPR', '*', 'I', '*', '45.75', '*', 'C', '*', 'ACH', '*', 'CCP', '*', '01', '*', '111', '*', 'DA', '*', '33', '*', '1234567890', '*', '', '*', '01', '*', '111', '*', 'DA', '*', '22', '*', '20100101', '~\n', 'TRN', '*', '1', '*', '10100000000', '*', '1000000000', '~\n', 'REF', '*', 'EV', '*', 'ETIN', '~\n', 'DTM', '*', '405', '*', '20100101', '~\n', 'N1', '*', 'PR', '*', 'NYSDOH', '~\n', 'N3', '*', 'OFFICE OF HEALTH INSURANCE PROGRAMS', '*', 'CORNING TOWER, EMPIRE STATE PLAZA', '~\n', 'N4', '*', 'ALBANY', '*', 'NY', '*', '122370080', '~\n', 'PER', '*', 'BL', '*', 

In [7]:
from parsimonious.examples.grammar_syntax_extension import AttrsTokenGrammar

SEGMENT_GRAMMAR = AttrsTokenGrammar(r"""
    x12 = isa segment*
    isa = TEXT[@value="ISA"] (DATA_SEP other)* SEG_TERM
    other = (TEXT / COMPONENT_SEP / REPEAT_SEP)*
    segment = elem (DATA_SEP elem)* SEG_TERM
    elem = value (REPEAT_SEP value)*
    value = TEXT (COMPONENT_SEP TEXT)*
""")

foo = SEGMENT_GRAMMAR.parse(tokens)

In [8]:
print(foo)

<Node called "x12" matching "['ISA', '*', '00', '*', '          ', '*', '00', '*', '          ', '*', 'ZZ', '*', 'EMEDNYBAT      ', '*', 'ZZ', '*', 'ETIN           ', '*', '100101', '*', '1000', '*', '', '^', '', '*', '00501', '*', '006000600', '*', '0', '*', 'T', '*', '', ':', '', '~\n', 'GS', '*', 'HP', '*', 'EMEDNYBAT', '*', 'ETIN', '*', '20100101', '*', '1050', '*', '6000600', '*', 'X', '*', '005010X221A1', '~\n', 'ST', '*', '835', '*', '1740', '~\n', 'BPR', '*', 'I', '*', '45.75', '*', 'C', '*', 'ACH', '*', 'CCP', '*', '01', '*', '111', '*', 'DA', '*', '33', '*', '1234567890', '*', '', '*', '01', '*', '111', '*', 'DA', '*', '22', '*', '20100101', '~\n', 'TRN', '*', '1', '*', '10100000000', '*', '1000000000', '~\n', 'REF', '*', 'EV', '*', 'ETIN', '~\n', 'DTM', '*', '405', '*', '20100101', '~\n', 'N1', '*', 'PR', '*', 'NYSDOH', '~\n', 'N3', '*', 'OFFICE OF HEALTH INSURANCE PROGRAMS', '*', 'CORNING TOWER, EMPIRE STATE PLAZA', '~\n', 'N4', '*', 'ALBANY', '*', 'NY', '*', '122370080', '

In [9]:
from parsimonious.grammar import NodeVisitor
from typing import Any
from dataclasses import dataclass

import pdb, sys; sys.breakpointhook = pdb.set_trace


@dataclass
class Segment:
    seg_id: str
    children: Any


class SegmentVisitor(NodeVisitor):
    def generic_visit(self, node, children):
        values = [c for c in children if c is not None]
        if len(values) == 1:
            return values[0]
        return values

    def visit_value(self, node, children):
        values = []
        for value in children:
            if value != []:
                values.append(value)
        if len(values) == 1:
            return values[0]
        return values

    def visit_TEXT(self, node, children):
        [tok] = node.text
        return tok

    def visit_elem(self, node, children):
        breakpoint()
        1 + 1
    
    def visit_DATA_SEP(self, node, children):
        return None

    def visit_SEG_TERM(self, node, children):
        return None

    def visit_COMPONENT_SEP(self, node, children):
        return None
    
    def visit_elem(self, node, children):
        values = []
        for value in children:
            if value != []:
                values.append(value)
        if len(values) == 1:
            return values[0]
        return values

    def visit_isa(self, node, children):
        breakpoint()
        raise NotImplementedError

    def visit_segment(self, node, children):
        seg_id, elems, _ = children
        return Segment(seg_id, elems)

    def visit_x12(self, node, children):
        breakpoint()
        raise NotImplementedError
        1 + 1

SegmentVisitor().visit(SEGMENT_GRAMMAR.parse(tokens))

> [0;32m/var/folders/0x/v2lbxd814bv46ngy5zhtsf700000gn/T/ipykernel_76242/2422239883.py[0m(58)[0;36mvisit_isa[0;34m()[0m
[0;32m     56 [0;31m    [0;32mdef[0m [0mvisit_isa[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mnode[0m[0;34m,[0m [0mchildren[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     57 [0;31m        [0mbreakpoint[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 58 [0;31m        [0;32mraise[0m [0mNotImplementedError[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     59 [0;31m[0;34m[0m[0m
[0m[0;32m     60 [0;31m    [0;32mdef[0m [0mvisit_segment[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mnode[0m[0;34m,[0m [0mchildren[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> children
['ISA', ['00', '          ', '00', '          ', 'ZZ', 'EMEDNYBAT      ', 'ZZ', 'ETIN           ', '100101', '1000', ['', [], ''], '00501', '006000600', '0', 'T', ['', [], '']], None]
ipdb> q


BdbQuit: 