In [15]:
from tokenize import tokenize, COMMENT, STRING, TokenInfo
from pathlib import Path
from black import lib2to3_parse
from dataclasses import dataclass
from typing import Tuple, Optional
import re

In [11]:
# black parser

file_tokens = {}
for _file in Path("example/example").rglob("*.py"):
    print(_file)
    with open(_file, "r") as fp:
        file_tokens[str(_file)] = lib2to3_parse(fp.read())

file_tokens

example/example/main.py
example/example/__init__.py


{'example/example/main.py': Node(file_input, [Node(simple_stmt, [Leaf(STRING, '"""Example Repo\n\nTODO: module docstring\n"""'), Leaf(NEWLINE, '\n')]), Node(funcdef, [Leaf(NAME, 'def'), Leaf(NAME, 'foo'), Node(parameters, [Leaf(LPAR, '('), Leaf(RPAR, ')')]), Leaf(COLON, ':'), Node(suite, [Leaf(NEWLINE, '\n'), Leaf(INDENT, ''), Node(simple_stmt, [Leaf(STRING, '"""Foo example.\n\n    TODO: function docstring\n    """'), Leaf(NEWLINE, '\n')]), Node(simple_stmt, [Leaf(NAME, 'pass'), Leaf(NEWLINE, '\n')]), Leaf(DEDENT, '')])]), Node(funcdef, [Leaf(NAME, 'def'), Leaf(NAME, 'bar'), Node(parameters, [Leaf(LPAR, '('), Leaf(RPAR, ')')]), Leaf(COLON, ':'), Node(suite, [Leaf(NEWLINE, '\n'), Leaf(INDENT, ''), Node(simple_stmt, [Leaf(STRING, '"""Bar.\n    """'), Leaf(NEWLINE, '\n')]), Node(simple_stmt, [Leaf(NAME, 'pass'), Leaf(NEWLINE, '\n')]), Leaf(DEDENT, '')])]), Node(simple_stmt, [Node(expr_stmt, [Leaf(NAME, 'BAZ'), Leaf(EQUAL, '='), Leaf(NUMBER, '1')]), Leaf(NEWLINE, '\n')]), Node(simple_stmt,

In [12]:
# tokenize stream

file_tokens = {}

for _file in Path("example/example").rglob("*.py"):
    with open(_file, "rb") as fp:
        file_tokens[str(_file)] = list(tokenize(fp.readline))

file_tokens

{'example/example/main.py': [TokenInfo(type=63 (ENCODING), string='utf-8', start=(0, 0), end=(0, 0), line=''),
  TokenInfo(type=3 (STRING), string='"""Example Repo\n\nTODO: module docstring\n"""', start=(1, 0), end=(4, 3), line='"""Example Repo\n\nTODO: module docstring\n"""\n'),
  TokenInfo(type=4 (NEWLINE), string='\n', start=(4, 3), end=(4, 4), line='"""\n'),
  TokenInfo(type=62 (NL), string='\n', start=(5, 0), end=(5, 1), line='\n'),
  TokenInfo(type=61 (COMMENT), string='# TODO: comment', start=(6, 0), end=(6, 15), line='# TODO: comment\n'),
  TokenInfo(type=62 (NL), string='\n', start=(6, 15), end=(6, 16), line='# TODO: comment\n'),
  TokenInfo(type=62 (NL), string='\n', start=(7, 0), end=(7, 1), line='\n'),
  TokenInfo(type=61 (COMMENT), string='# TODO: multiline', start=(8, 0), end=(8, 17), line='# TODO: multiline\n'),
  TokenInfo(type=62 (NL), string='\n', start=(8, 17), end=(8, 18), line='# TODO: multiline\n'),
  TokenInfo(type=61 (COMMENT), string='#       comment', start=(9

In [18]:
for _file, tokens in file_tokens.items():
    for i, token in enumerate(tokens):
        if token.type in [COMMENT, STRING] and "TODO:" in token.string:
            print(_file, i, token)

example/example/main.py 1 TokenInfo(type=3 (STRING), string='"""Example Repo\n\nTODO: module docstring\n"""', start=(1, 0), end=(4, 3), line='"""Example Repo\n\nTODO: module docstring\n"""\n')
example/example/main.py 4 TokenInfo(type=61 (COMMENT), string='# TODO: comment', start=(6, 0), end=(6, 15), line='# TODO: comment\n')
example/example/main.py 7 TokenInfo(type=61 (COMMENT), string='# TODO: multiline', start=(8, 0), end=(8, 17), line='# TODO: multiline\n')
example/example/main.py 20 TokenInfo(type=3 (STRING), string='"""Foo example.\n\n    TODO: function docstring\n    """', start=(13, 4), end=(16, 7), line='    """Foo example.\n\n    TODO: function docstring\n    """\n')


In [83]:
@dataclass(frozen=True)
class Todo:
    scope: Optional[str]
    content: str
    file: Path
    start: Tuple[int,int]
    end: Tuple[int,int]

TODO_REGEX = re.compile(r'.*TODO(:?\((?P<scope>.*?)\)){0,1}:(?P<content>.*)', re.M|re.DOTALL)
# TODO_REGEX = re.compile(r'.*TODO:(?P<content>.*)', re.M|re.DOTALL)

def parse_todo_string(file: Path, token: TokenInfo) -> Optional[Todo]:
    if token.type != STRING:
        raise ValueError("token must be a string!")
    if match := TODO_REGEX.match(token.string):
        return Todo(match.groupdict()["scope"], match.groupdict()["content"], file, token.start, token.end)


In [84]:
todo = parse_todo_string(Path("example/example/main.py"), file_tokens["example/example/main.py"][1])

In [85]:
todo

Todo(scope=None, content=' module docstring\n"""', file=PosixPath('example/example/main.py'), start=(1, 0), end=(4, 3))

In [86]:
TODO_REGEX.match(file_tokens["example/example/main.py"][1].string)

<re.Match object; span=(0, 43), match='"""Example Repo\n\nTODO: module docstring\n"""'>

In [87]:
file_tokens["example/example/main.py"][1].string

'"""Example Repo\n\nTODO: module docstring\n"""'