In [8]:
import re

In [45]:
# Load the whole input file into memory. The context manager guarantees the
# file handle is closed instead of leaving it to the garbage collector.
with open('allsource.txt') as source_file:
    source = source_file.read()

In [46]:
# Sanity check: number of '\n'-separated segments in the input.
# Note: split('\n') yields one extra empty segment when the text ends with a
# newline, so this can differ by one from the splitlines() count that
# process_source iterates over.
len(source.split('\n'))

4987

In [47]:
def find_url_in_line(line):
    """
    Return the URL if `line` is a URL line, else return None.

    The line must *start* with ``http://`` or ``https://`` (leading
    whitespace or any other prefix means "not a URL") and the URL extends
    to the end of the line. Trailing backslashes, spaces and tabs are
    stripped from the returned value.
    """
    # re.match caches compiled patterns, so no explicit compile per call.
    match = re.match(r'^(?P<url>https?://.*?)$', line)
    if match is None:
        return None
    # The backslash is spelled '\\' explicitly: the previous '\ ' relied on an
    # unrecognised escape sequence, which newer Pythons warn about.
    return match.group('url').rstrip('\\ \t')


def is_url(line):
    """
    Tell whether `line` is a URL line.

    None (used by callers for "no such line") is never a URL; otherwise the
    answer is whatever find_url_in_line() recognises.
    """
    return line is not None and find_url_in_line(line) is not None


def is_blank(line):
    """
    Tell whether `line` is blank.

    A missing line (None) counts as blank, as does any string that is empty
    or consists solely of whitespace.
    """
    return line is None or not line.strip()

def strip_title_line(line):
    """
    Strip decoration from a link-title line.

    Removes every leading '[' and space character and every trailing
    backslash, space and ']' character; the interior of the line is left
    untouched.
    """
    # The backslash is spelled '\\' explicitly: the previous '\ ' relied on an
    # unrecognised escape sequence, which newer Pythons warn about.
    return line.lstrip('[ ').rstrip('\\ ]')


def scripted_title(filename):
    """
    Build the heading line that is automatically inserted for `filename`.

    Underscores become spaces, the text is passed through str.capitalize()
    (first character upper-cased, everything else lower-cased) and the
    result is wrapped in five '=' signs on each side, e.g.

        some_topic  -->  ===== Some topic =====
    """
    spaced_name = filename.replace("_", " ")
    return "===== {} =====".format(spaced_name.capitalize())


In [52]:
# States of the line-by-line parser in process_source:
#   STATE_TEXT  -- ordinary text; watching for a title line followed by a URL
#   STATE_LINK  -- the current line is the URL of the link being collected
#   STATE_EXTRA -- the current line is a note line following the link's URL
STATE_TEXT, STATE_LINK, STATE_EXTRA = 's:text', 's:link', 's:extra'
 
def process_source(source, filename):
    """
    Split link entries (title, url, extra notes) off from `source`.

    A link entry is recognised as a title line immediately followed by a
    URL line, optionally followed by further non-blank "extra" lines. The
    entry ends at the next blank line or at the end of the input.

    Lines outside link entries are kept, except for blank lines and the
    auto-inserted heading for `filename` (a line equal, ignoring case, to
    ``scripted_title(filename)``), which are skipped.

    Returns a tuple ``(links_data, rest_of_source)``:
      * links_data -- list of dicts with keys 'title' and 'url', plus
        'extra' (note lines joined with newlines) only when the entry had
        extra lines.
      * rest_of_source -- list of the remaining lines, in original order.
    """
    links_data = []
    rest_of_source = []

    # Loop-invariant, so computed once instead of once per input line.
    skipped_title = scripted_title(filename).lower()

    state = STATE_TEXT
    link = None
    lines = source.splitlines()
    # Pair every line with its successor; the last line is paired with None,
    # which is_url() treats as "not a URL" and is_blank() treats as blank.
    for thisline, nextline in zip(lines, lines[1:] + [None]):
        if state == STATE_TEXT:
            if is_url(nextline):
                # The line right before a URL is that link's title.
                state = STATE_LINK
                link = {'title': strip_title_line(thisline)}
            elif is_blank(thisline) or thisline.lower() == skipped_title:
                pass  # skip blank lines and auto-scripted titles
            else:
                rest_of_source.append(thisline)

        elif state == STATE_LINK:
            link['url'] = find_url_in_line(thisline)
            if is_blank(nextline):
                # No notes follow: the entry is complete.
                links_data.append(link)
                state = STATE_TEXT
                link = None
            else:
                state = STATE_EXTRA

        elif state == STATE_EXTRA:
            # Drop trailing backslashes/spaces from the note line. The
            # backslash is spelled '\\' explicitly: the previous '\ ' relied
            # on an unrecognised escape sequence.
            note = thisline.rstrip('\\ ')
            link['extra'] = link.get('extra', '') + note
            if is_blank(nextline):
                links_data.append(link)
                state = STATE_TEXT
                link = None
            else:
                # More note lines follow; keep them newline-separated.
                link['extra'] += '\n'

        else:
            raise ValueError('Should never be here')

    return links_data, rest_of_source

In [48]:
# Split the loaded text into link entries and the remaining lines.
# NOTE(review): the URL listing recorded below this cell is not produced by the
# process_source definition shown above (it contains no print) -- presumably
# output from an earlier revision of the function; confirm with
# Restart Kernel -> Run All.
links_data, rest_of_source = process_source(source, 'javascript_backbone')

http://www.comp.nus.edu.sg/~stevenha/visualization/index.html
http://stackoverflow.com/questions/487258/plain-english-explanation-of-big-o
http://www.algorist.com/
https://www.siam.org/pdf/news/637.pdf
http://cstheory.stackexchange.com/questions/19759/core-algorithms-deployed/19773#19773
http://en.docsity.com/news/interesting-facts/great-algorithms-revolutionized-computing/
https://github.com/nzakas/computer-science-in-javascript/
http://bigocheatsheet.com/
http://www.caseyrule.com/projects/sounds-of-sorting/
https://queue.acm.org/detail.cfm?id=2855183
http://www3.cs.stonybrook.edu/~algorith/video-lectures/
https://news.ycombinator.com/item?id=11712198
http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-046j-design-and-analysis-of-algorithms-spring-2015/lecture-videos/
http://algo-visualizer.jasonpark.me/
http://www.ilikebigbits.com/blog/2014/4/21/the-myth-of-ram-part-i
https://rosettacode.org/wiki/Sorting_algorithms/Quicksort#Python
https://news.ycombinator.com/it

In [49]:
# Number of lines kept as ordinary text (not part of a link entry, not blank,
# not the auto-inserted heading).
len(rest_of_source)

3368

In [50]:
# Number of link entries (title + URL, optionally extra notes) extracted.
len(links_data)

763

In [51]:
# Persist the non-link lines. The context manager flushes and closes the file
# deterministically; the character count is kept so the cell still displays it.
with open('rest_of_source.txt', 'w') as out_file:
    chars_written = out_file.write('\n'.join(rest_of_source))
chars_written

121167

In [2]:
# Scratch dict for probing dict.get() defaults. It is defined here so the cell
# runs on a fresh kernel; the value is taken from the cell's recorded output.
d = {'as': 'asa', 'bs': 'bsb'}
d

{'as': 'asa', 'bs': 'bsb'}

In [4]:
# Scratch probe: dict.get() with a default returns the default for a missing
# key -- the same pattern process_source uses as link.get('extra', '').
d.get('22as', '')

''

In [37]:
"alksj3k3mM".lower()

'alksj3k3mm'