In [None]:
#default_exp core

# Inspect Metadata of Markdown Files
> Inspect metadata such as front matter, word count, etc. of markdown files.

In [None]:
#export
from fastcore.all import globtastic, Path, merge, L, AttrDict, str_enum
from fastcore.script import call_parse
import re
import yaml
import json
import sys
from functools import partial
from collections import Counter

In [None]:
#hide
from fastcore.test import test_eq, test_fail

In [None]:
#export
# Front matter block: a leading `---`, then (lazily) everything up to the next `---`.
# `DOTALL` lets `.` span newlines; without `MULTILINE`, `^` only anchors at the very start of the text.
_re_fm = re.compile(r'^---\s*(.*?)---\s*', flags=re.DOTALL)

def _load_yml(yml): 
    if not yml: return {}
    else: return yaml.load(yml, Loader=yaml.FullLoader)

def get_meta(fname:str):
    "Front matter of `fname` merged with its `fname` and `n_words`; `None` when the file opts out of all checks."
    raw = Path(fname).read_text()
    matches = _re_fm.findall(raw)
    meta = _load_yml(matches[0] if matches else {})
    # Always expose `mdseo-ignore` as a plain list so later checks can use `in` on it.
    meta['mdseo-ignore'] = list(L(meta.get('mdseo-ignore', [])))
    # Opt-out: `all` in the ignore list, or the marker `mdseo-ignore-all` anywhere in the file.
    if 'all' in meta['mdseo-ignore'] or 'mdseo-ignore-all' in raw: return None
    # Word count is taken on the body only: front matter removed, then `<...>` tags stripped.
    body = re.sub('<[^<]+?>', '', _re_fm.sub('', raw))
    stats = dict(fname=fname, n_words=len(body.split()))
    return merge(stats, meta)

In [None]:
# The fixture's `mdseo-ignore` front matter must come back as a plain list.
assert get_meta('test_files/front_matter2.md')['mdseo-ignore'] == ['title', 'body', 'desc']

In [None]:
# Full result for a fixture without `mdseo-ignore`: front matter keys plus `fname`, `n_words`
# and an empty `mdseo-ignore` list.
test_eq(get_meta('test_files/front_matter_test_docs.md'),
        {'fname': 'test_files/front_matter_test_docs.md',
         'n_words': 2,
         'key2': 'value2',
         'slug': 'custom/pathfor/site/this-is-waytoolong/wedontwantthis',
         'key': 'value',
         'mdseo-ignore': []}
       )

In [None]:
#export
def meta_list(srcdir:str):
    "Metadata (see `get_meta`) for every markdown file found under `srcdir`."
    # Folders starting with `.` and files starting with `_` or `.` are skipped.
    md_files = globtastic(srcdir, file_glob='*.md', skip_folder_re='^[.]', skip_file_re='^[_.]')
    metas = md_files.map(get_meta)
    # `get_meta` gives `None` for files that opt out of everything; the bare `filter()` drops them.
    return metas.filter()

In [None]:
# Scanning the current directory is expected to find more than one markdown file.
assert len(meta_list('.')) > 1

In [None]:
#export
def find_dupe(srcdir:str, key):
    "Values of front matter field `key` that appear in two or more files under `srcdir`."
    counts = Counter()
    for meta in meta_list(srcdir):
        val = meta.get(key)
        # Files listing `dupe_title` in `mdseo-ignore` are left out; empty or missing values never count.
        if 'dupe_title' not in meta['mdseo-ignore'] and val:
            counts[f'{val}'] += 1
    return [seen_val for seen_val, n_seen in counts.items() if n_seen >= 2]

In [None]:
# The files under `.` are expected to contain at least one duplicated `title` ...
assert find_dupe('.', 'title')
# ... and no duplicated value for `foo`.
assert not find_dupe('.', 'foo')

In [None]:
#export
@call_parse
def chk_dupe_title(srcdir:str='.', # directory of files to check
                  ):
    "Check for duplicate titles. Ignore with front matter `mdseo-ignore: [dupe_title]`"
    repeated = find_dupe(srcdir, 'title')
    # Nothing to report: fall through and return `None`.
    if not repeated: return
    listing = '\n\t'.join(repeated)
    raise Exception(f"The following titles were found in multiple posts:\n\t{listing}")

In [None]:
# With the default `srcdir='.'` duplicate titles exist (see the `find_dupe('.', 'title')` check), so this must raise.
test_fail(chk_dupe_title)

In [None]:
#export
# Front matter fields that `chk_fm` has rules for, each mapped to the alternative names
# that are also accepted for it in `mdseo-ignore`.
alias_map = {
    'description': ['desc'],
    'slug': [],
    'image': ['img'],
    'authors': ['author'],
}
# Enum with one member per supported field; `chk_fm` uses it to validate its `key` argument.
_en = str_enum('_en', *alias_map)

In [None]:
#export
def _intersect(d, key):
    "Entries of `d['mdseo-ignore']` that name `key` or one of its aliases in `alias_map`."
    names = {key, *alias_map.get(key, [])}
    return set(d['mdseo-ignore']) & names

def _missing_fm(d, key):
    "True when `key` is absent from `d` and has not been ignored via `mdseo-ignore`."
    return not _intersect(d, key) and key not in d

In [None]:
#hide
# Fixture metadata reused by the assertions in later cells; the bare name displays it.
_test_fm = get_meta('test_files/front_matter2.md')
_test_fm

{'fname': 'test_files/front_matter2.md',
 'n_words': 0,
 'key2': 'value2',
 'slug': 'custom/pathfor/site',
 'key': 'value',
 'title': 'a title is here',
 'mdseo-ignore': ['title', 'body', 'desc']}

In [None]:
#hide
assert _intersect(_test_fm, 'description') # 'desc', an alias of 'description', is listed in mdseo-ignore
assert not _missing_fm(_test_fm, 'description') # this is in mdseo-ignore so its not considered missing
assert _missing_fm(_test_fm, 'authors') # no 'authors' field and nothing in mdseo-ignore covers it

In [None]:
#export
def _fm_len(d, key):
    "Length of front matter field `key` in `d`, or `None` when length rules must be skipped for it."
    # Skipped when `key` (or an alias) is in `mdseo-ignore`, when the field is absent, or when
    # the file opts out with front matter `mdseo-ignore: [chk_fm <key>]` (e.g. `chk_fm slug`).
    if _intersect(d, key) or key not in d or f"chk_fm {key}" in d["mdseo-ignore"]: return None
    val = d[key]
    # A field written without a value (`slug:`) is loaded as `None`; count it as empty instead
    # of letting `len(None)` raise `TypeError` and abort the whole check.
    return 0 if val is None else len(val)

def _min_len_err(d, key, n):
    "True when field `key` of `d` is subject to the rule and shorter than `n`."
    size = _fm_len(d, key)
    return size is not None and size < n

def _max_len_err(d, key, n):
    "True when field `key` of `d` is subject to the rule and longer than `n`."
    size = _fm_len(d, key)
    return size is not None and size > n

def _checker(func, msg:str, srcdir:str):
    "Raise an `Exception` naming every file under `srcdir` whose metadata makes `func` return a truthy value."
    offenders = meta_list(srcdir).filter(func).attrgot('fname')
    # No offending files: return `None` without raising.
    if not offenders: return
    listing = '\n\t'.join(offenders)
    raise Exception(f"The following files {msg}:\n\t{listing}")

In [None]:
# Exercise `_checker` on the bundled `test_files` fixtures rather than on an absolute path that
# only exists on one machine, so the notebook still runs from a fresh kernel anywhere.
# `test_files/front_matter2.md` has a 19-character slug, so a 5-character limit must be reported.
test_fail(
    lambda: _checker(
        partial(_max_len_err, key="slug", n=5),
        "have the field `slug` in their front matter that is greater than 5 characters", "test_files"
    ),
    contains="greater than 5 characters"
)

In [None]:
assert not _min_len_err(_test_fm, 'description', 10000) # 'desc' is in mdseo-ignore so the rule is ignored
assert not _min_len_err(_test_fm, 'image', 10) # this key doesn't exist

assert len(_test_fm['slug']) == 19

assert _min_len_err(_test_fm, 'slug', 500) # 19 < 500
assert _max_len_err(_test_fm, 'slug', 5) # 19 > 5


assert not _min_len_err(_test_fm, 'slug', 5) # 19 is not below 5
assert not _max_len_err(_test_fm, 'slug', 39) # 19 is not above 39

# Fixtures whose slug breaks a length rule but whose `mdseo-ignore` suppresses the report
# (presumably via `chk_fm slug` - confirm against the fixture files).
_test_ignore_short_slug_fm = get_meta('test_files/front_matter_ignore_short_slug.md')
_test_ignore_long_slug_fm = get_meta('test_files/front_matter_ignore_long_slug.md')

n = 10
assert len(_test_ignore_short_slug_fm['slug']) < n
assert not _min_len_err(_test_ignore_short_slug_fm, 'slug', n)
# Clearing the ignore list in place makes the same slug a violation.
_test_ignore_short_slug_fm['mdseo-ignore'] = []
assert _min_len_err(_test_ignore_short_slug_fm, 'slug', n)

n = 60
assert len(_test_ignore_long_slug_fm['slug']) > n
assert not _max_len_err(_test_ignore_long_slug_fm, 'slug', n)
# Same for the upper bound once the ignore list is cleared.
_test_ignore_long_slug_fm['mdseo-ignore'] = []
assert _max_len_err(_test_ignore_long_slug_fm, 'slug', n)

In [None]:
# Each `alias_map` key is reachable as an attribute of `_en`; `chk_fm` validates `key` with this same `hasattr` test.
hasattr(_en, 'authors')

True

In [None]:
#export
@call_parse
def chk_fm(key:_en, # front matter field to check
           srcdir:str='.', # directory of files to check
           minlen:int=None, #the minimum character length allowed for the field
           maxlen:int=None  #the maximum character length allowed for the field
          ):
    '''
    Check front matter for various rules.
    Ignore with front matter `mdseo-ignore: [chk_fm <key>]` - e.g. `mdseo-ignore: [chk_fm slug]`.
        Filtering happens in `_max_len_err` and `_min_len_err`. 
    When both `minlen` and `maxlen` are given, both limits are enforced.
    '''
    if not hasattr(_en, key): raise Exception(f'No rule exists for {key}')
    # Two independent `if`s rather than `if`/`elif`, so a `maxlen` passed together with `minlen`
    # is no longer silently skipped. `_checker` raises on the first rule that has offending
    # files, so at most one report is produced per call.
    if minlen:
        _checker(partial(_min_len_err, key=key, n=minlen), 
                 f"have the field `{key}` in their front matter that is less than {minlen} characters", srcdir)
    if maxlen:
        _checker(partial(_max_len_err, key=key, n=maxlen), 
                 f"have the field `{key}` in their front matter that is greater than {maxlen} characters", srcdir)
    # A requested length rule replaces the presence check; a falsy limit (`None` or 0) counts as "not requested".
    if minlen or maxlen: return
    
    _checker(partial(_missing_fm, key=key), f"do not have the field `{key}` in their front matter", srcdir)

In [None]:
# Both presence checks are expected to raise for the default `srcdir='.'`, i.e. at least one
# file there lacks the field without ignoring it.
test_fail(partial(chk_fm, key='description'))
test_fail(partial(chk_fm, key='authors'))

In [None]:
#export
def _lt_n(d, n):
    if 'len' in d['mdseo-ignore'] or 'length' in d['mdseo-ignore']: return False
    return d['n_words'] < n

@call_parse
def chk_len(n:int=50, # minimum number of words a document should contain
            srcdir:str='.', # directory of files to check 
           ):
    "Check if docs contain less than `n` words. Ignore with front matter `mdseo-ignore: [length]`"
    # Report the threshold that was actually used; the message used to say "50" for every `n`.
    return _checker(partial(_lt_n, n=n), f"contain less than {n} words", srcdir)

In [None]:
# Expected to raise with the defaults: at least one markdown file under `.` has fewer than 50 words
# (e.g. `test_files/front_matter_test_docs.md` has `n_words` 2 and an empty `mdseo-ignore`).
test_fail(chk_len)