In [57]:
import re

In [103]:
def find_url_in_line(line):
    """
    Return the stripped URL if `line` starts with one, else return None.

    The match is anchored at the start of the line, so `line` must begin
    with ``http://`` or ``https://``; a URL embedded in other text (or
    preceded by whitespace) is not recognised.  Trailing spaces, tabs and
    backslashes are removed from the returned URL.
    """
    match = re.match(r'^(?P<url>https?://.*?)$', line)
    if match is None:
        return None
    # The backslash is escaped explicitly: the previous '\ ' spelling is an
    # invalid escape sequence, which newer Python versions warn about.
    # NOTE(review): the trailing backslashes are presumably wiki forced
    # line breaks -- confirm against the source pages.
    return match.group('url').rstrip('\\ \t')


def is_url(line):
    """
    Return True when `line` is a URL line according to `find_url_in_line`.

    A missing line (None) is never a URL.
    """
    return line is not None and find_url_in_line(line) is not None


def is_blank(line):
    """
    Return True when `line` is None, empty, or made of whitespace only.
    """
    return line is None or not line.strip()

def strip_title_line(line):
    """
    Return `line` without the decoration around a link title.

    Leading ``[`` characters and spaces are removed, as are trailing ``:``,
    ``]``, backslashes and spaces.
    """
    # The backslash is escaped explicitly: the previous ':\ ]' spelling
    # contains an invalid escape sequence, which newer Python versions
    # warn about.
    return line.lstrip('[ ').rstrip(':\\ ]')


def scripted_title(filename):
    """
    Build the automatically-inserted heading for a page name, e.g.
    some_topic
    -->
    ===== Some topic =====
    """
    # Underscores become spaces; only the very first character is upper-cased.
    words = filename.split("_")
    heading = " ".join(words).capitalize()
    return "".join(["===== ", heading, " ====="])


In [104]:
# States of the line-by-line parser in process_source.
STATE_TEXT = "s:text"    # ordinary page text
STATE_LINK = "s:link"    # the current line is the URL line of a link
STATE_EXTRA = "s:extra"  # the current line is a note line following a URL
 
def process_source(source, filename):
    """
    Split the links (title, url, notes) off `source` and return the rest.

    A link is a title line directly followed by a URL line, optionally
    followed by note lines; it ends at the next blank line or at the end of
    `source`.  Lines equal (ignoring case) to the heading built by
    `scripted_title(filename)` are skipped as well.

    Returns a tuple ``(links_data, rest_of_source)``:
      - links_data: list of dicts with the keys 'title' and 'url', plus
        'notes' when note lines followed the URL;
      - rest_of_source: list of the remaining lines, in order.

    Raises ValueError if the parser reaches an unknown state.
    """
    links_data = []
    rest_of_source = []

    # Loop invariant, so it is built once instead of once per line.
    auto_title = scripted_title(filename).lower()

    link = None
    state = STATE_TEXT
    lines = source.splitlines()
    # Pair every line with its successor; the last line is paired with None.
    for thisline, nextline in zip(lines, lines[1:] + [None]):
        if state == STATE_TEXT:
            if is_url(nextline):
                # A URL on the next line makes this line the link's title.
                state = STATE_LINK
                link = {'title': strip_title_line(thisline)}
            elif thisline.lower() != auto_title:
                rest_of_source.append(thisline)
            # otherwise: an auto-scripted title, which is dropped

        elif state == STATE_LINK:
            link['url'] = find_url_in_line(thisline)
            if is_blank(nextline):
                # No notes follow: the link is complete.
                links_data.append(link)
                link = None
                state = STATE_TEXT
            else:
                state = STATE_EXTRA

        elif state == STATE_EXTRA:
            # The backslash is escaped explicitly: the previous '\ ' spelling
            # is an invalid escape sequence that newer Pythons warn about.
            link['notes'] = link.get('notes', '') + thisline.rstrip('\\ ')
            if is_blank(nextline):
                links_data.append(link)
                link = None
                state = STATE_TEXT
            else:
                # More note lines follow; keep them on separate lines.
                link['notes'] += '\n'

        else:
            raise ValueError('Should never be here')

    return links_data, rest_of_source

In [105]:
import os
# Root of the tree that is walked for '.txt' pages to convert.
PROJECT_SOURCES_DIR = "/Users/ivan/Projects/Minireference/website/miniref/data/pages/comp"
# Root of the tree that receives the converted '.md' pages.
PROJECT_DEST_DIR = "/Users/ivan/Projects/RC/hackers-manual/pages"

from utils.yaml_export import links_to_yaml_string

# Convert every '.txt' page below PROJECT_SOURCES_DIR into a '.md' page at
# the same relative location below PROJECT_DEST_DIR.  Each output file starts
# with a '---' delimited header holding the extracted links, followed by the
# remaining page text.
for raw_path, subfolders, filenames in os.walk(PROJECT_SOURCES_DIR):
    # Derive the destination from the relative path: a plain str.replace
    # would also rewrite any later occurrence of the source prefix.
    rel_path = os.path.relpath(raw_path, PROJECT_SOURCES_DIR)
    dest_dir = os.path.normpath(os.path.join(PROJECT_DEST_DIR, rel_path))

    # Create the destination folders, including the root itself, which a
    # bare os.mkdir of the subfolders would fail on when it is missing.
    os.makedirs(dest_dir, exist_ok=True)
    for subfolder in subfolders:
        os.makedirs(os.path.join(dest_dir, subfolder), exist_ok=True)

    for filename in filenames:
        split_filename, ext = os.path.splitext(filename)
        if ext != '.txt':
            print('Unrecognized file:', filename, ' in ', raw_path)
            continue

        # NOTE(review): the pages are assumed to be UTF-8 -- confirm; without
        # an explicit encoding the result depends on the platform default.
        with open(os.path.join(raw_path, filename), encoding='utf-8') as source_file:
            source = source_file.read()

        links, rest_of_source = process_source(source, split_filename)
        links_str = links_to_yaml_string({'links': links})

        # Remove excessive blank lines: any run of three or more newlines
        # becomes exactly one blank line.  The former r'(\n\n)+' pattern only
        # consumed newlines in pairs and left odd-length runs uncollapsed.
        sparse_rest_of_source = '\n'.join(rest_of_source)
        rest_of_source_str = re.sub(r'\n{3,}', '\n\n', sparse_rest_of_source)

        new_filename = split_filename + '.md'
        with open(os.path.join(dest_dir, new_filename), 'w', encoding='utf-8') as dest_file:
            dest_file.write('---\n')
            dest_file.write(links_str)
            dest_file.write('---\n\n')
            dest_file.write(rest_of_source_str)
            dest_file.write('\n')
