In [None]:
def partition(covers):
    """Placeholder: not implemented yet; always returns None.

    Working notes on the data shapes involved:
        covers:    {w: {r, ...}}
        invcovers: {r: {w, ...}}
    """
    return None

def connected(w, covers, invcovers, result):
    """Add to `result` every key of `covers` connected to `w`; return `result`.

    Two keys are directly connected when some element of `covers[key]` lists
    both of them in `invcovers`; connections are followed transitively.

    w:         the key to start from.
    covers:    {key: {element, ...}}
    invcovers: {element: [key, ...]}
    result:    set of keys already visited; it is updated in place and also
               returned. If `w` is already in it, nothing changes.

    The traversal uses an explicit stack rather than recursion, so a large
    connected component cannot exceed the interpreter's recursion limit.
    """
    pending = [w]
    while pending:
        current = pending.pop()
        if current in result:
            continue
        result.add(current)
        for r in covers[current]:
            # Queue only unvisited neighbors to keep the stack small.
            pending.extend(w2 for w2 in invcovers[r] if w2 not in result)
    return result

# For each data set in ALL, print the legend, the size of the connected
# component that contains one arbitrarily chosen entry of covers, the total
# number of entries, and how many entries lie outside that component.
for (W, L, legend) in ALL:
    covers = eliminate_dominated(regex_covers(W, L))
    invcovers = invert_multimap(covers)
    start = list(covers)[2]  # Arbitrary starting key; needs at least 3 entries
    P = connected(start, covers, invcovers, set())
    print('%s %s %s %s' % (legend, len(P), len(covers), len(covers) - len(P)))

Finding Shorter Regexes: Trying Multiple Times
----
    
Why run just two versions of `findregex`?  Why not run 1000 variations, and then pick the best solution?  Of course, I don't want to write 1000 different functions by hand; I want an automated way of varying each run.  I can think of three easy things to vary:
    
* The number '4' in the `score` function.  That is, vary the tradeoff between number of winners matched and number of characters.
* The tie-breaker.  In case of a tie, Python's `max` function always picks the first one.  Let's make it choose a different 'best' regex from among all those that tie.
* The greediness. Don't be so greedy (picking the best) every time.  Occasionally pick a not-quite-best component, and see if that works out better.
    
The first of these is easy; we just use the `random.choice` function to choose an integer, `K`, to serve as the tradeoff factor.  

The second is easy too.  We could write an alternative to the `max` function, say `max_random_tiebreaker`.  That would work, but an easier approach is to build the tiebreaker into the `score` function.  In addition to awarding points for matching winners and the number of characters, we will add in a tiebreaker: a random number between 0 and 1.  Since all the scores are otherwise integers, this will not change the order of the scores, but it will break ties.

The third we can accomplish by allowing the random factor to be larger than 1 (allowing us to pick a component that is not the shortest) or even larger than `K` (allowing us to pick a component that does not cover the most winners). 
    
I will factor out the function `greedy_search` to do a single computation of a covering regex, while keeping the name `findregex` for the top level function that now calls `greedy_search` 1000 times and chooses the best (shortest length) result.

def findregex(winners, losers, tries=1000):
    "Find a regex that matches all winners but no losers (sets of strings)."
    # Build the covers once, run 'greedy_search' on them `tries` times, and
    # keep the shortest regex that any of the runs produced.
    covers = regex_covers(winners, losers)
    candidates = (greedy_search(winners, covers) for _ in range(tries))
    return min(candidates, key=len)

def greedy_search(winners, covers):
    """Build one covering regex greedily, with randomized scoring.

    winners: the strings that must be matched (not modified).
    covers:  {component: {winner, ...}}
    Returns the chosen components combined by OR.

    Each round picks the highest-scoring component left in the pool, drops
    the winners it covers, and discards components that no longer match any
    remaining winner.
    """
    winners = set(winners)  # Copy input so as not to modify it.
    pool = set(covers)
    chosen = []

    # K trades off winners matched against component length; T is the width
    # of the random bonus added to every score. Both are drawn once per call.
    K = random.choice((2, 3, 4, 4, 5, 6))
    T = random.choice((1., 1.5, 2., K+1., K+2.))

    def score(c):
        still_matched = covers[c] & winners
        return K * len(still_matched) - len(c) + random.uniform(0., T)

    while winners:
        best = max(pool, key=score)
        chosen.append(best)
        # Both sets are updated in place on purpose: rebuilding them could
        # change iteration order and therefore which random bonus each
        # component receives.
        winners -= covers[best]
        pool -= {c for c in pool if covers[c].isdisjoint(winners)}
    return OR(chosen)

In [None]:
def factorial1(n):
    "Return n! as n * (n-1)!, multiplying on the way back up; n <= 1 gives 1."
    return 1 if n <= 1 else n * factorial1(n - 1)

def factorial2(n, partial_solution=1):
    """Return n! * partial_solution, accumulating the product on the way down.

    The running product is passed along in `partial_solution`, so the answer
    is already complete when the base case (n <= 1) is reached.
    """
    return (partial_solution if n <= 1
            else factorial2(n - 1, n * partial_solution))
    
# Both formulations must agree that 6! is 720.
assert factorial1(6) == 720
assert factorial2(6) == 720

In [None]:
def findregex(winners, losers, calls=100000):
    "Find the shortest disjunction of regex components that covers winners but not losers."
    covers = regex_covers(winners, losers)
    # Seed the search with the trivial answer (an anchored OR of all the
    # winners); bb_search replaces state.best whenever it finds a shorter one.
    trivial = '^(' + OR(winners) + ')$'
    state = Struct(best=trivial, calls=calls)
    final_state = bb_search('', covers, state)
    return final_state.best

def bb_search(regex, covers, state):
    """Recursively build a shortest regex from the components in covers.

    regex:  the partial solution built so far.
    covers: {component: {winner, ...}} for the winners still to be covered.
    state:  mutable object with `best` (shortest regex found so far) and
            `calls` (remaining call budget); both are updated in place.
    Returns `state`.
    """
    if not state.calls > 0:  # Call budget used up
        return state
    state.calls -= 1
    regex, covers = simplify_covers(regex, covers)
    if not covers:
        # Nothing left to cover: regex is a complete solution.
        state.best = min(regex, state.best, key=len)
        return state
    shortest = min(covers, key=len)
    if len(OR2(regex, shortest)) < len(state.best):
        # Try with and without the greedy-best component
        def score(c): return 4 * len(covers[c]) - len(c)
        best = max(covers, key=score)
        covered = covers.pop(best)
        extended = OR2(regex, best)
        remaining = {c: covers[c] - covered for c in covers}
        bb_search(extended, remaining, state)
        bb_search(regex, covers, state)
    return state

class Struct(object):
    "A mutable structure with specified fields and values."

    def __init__(self, **kwds):
        # Every keyword argument becomes an instance attribute.
        self.__dict__.update(kwds)

    def __repr__(self):
        return '<' + str(vars(self)) + '>'

In [None]:
def findregex(winners, losers, calls=100000):
    "Find the shortest disjunction of regex components that covers winners but not losers."
    covers = regex_covers(winners, losers)
    # The anchored OR of all winners is the fallback if nothing shorter turns up.
    trivial = '^(' + OR(winners) + ')$'
    shortest, _unused_calls = bb_search('', covers, trivial, calls)
    return shortest

def bb_search(regex, covers, solution, calls):
    """Recursively build a shortest regex from the components in covers.

    regex:    the partial solution built so far.
    covers:   {component: {winner, ...}} for the winners still to be covered.
    solution: the shortest complete regex found so far.
    calls:    the remaining call budget.
    Returns the pair (solution, calls), both possibly updated.
    """
    if not calls > 0:  # Call budget used up
        return solution, calls
    calls -= 1
    regex, covers = simplify_covers(regex, covers)
    if not covers:  # Solution is complete
        return min(regex, solution, key=len), calls
    if len(OR2(regex, min(covers, key=len))) < len(solution):
        # Try with and without the greedy-best component
        def score(c): return 4 * len(covers[c]) - len(c)
        r = max(covers, key=score)  # Best component
        covered = covers.pop(r)     # Set of winners covered by r
        with_r = OR2(regex, r)
        uncovered = {c: covers[c] - covered for c in covers}
        solution, calls = bb_search(with_r, uncovered, solution, calls)
        solution, calls = bb_search(regex, covers, solution, calls)
    return solution, calls

In [None]:
def findregex(winners, losers, calls=100000):
    "Find the shortest disjunction of regex components that covers winners but not losers."
    global SOLUTION, CALLS
    # Reset the module-level search state: the trivial anchored OR of all
    # winners as the solution to beat, and the call budget.
    SOLUTION, CALLS = '^(' + OR(winners) + ')$', calls
    covers = regex_covers(winners, losers)
    return bb_search(None, covers)

def bb_search(regex, covers):
    """Recursively build a shortest regex from the components in covers.

    Search state lives in two globals: SOLUTION (the shortest complete regex
    found so far) and CALLS (the remaining call budget, decremented on every
    call). Returns SOLUTION.
    """
    global SOLUTION, CALLS
    CALLS -= 1
    regex, covers = simplify_covers(regex, covers)
    if not covers:  # Solution is complete
        SOLUTION = min(regex, SOLUTION, key=len)
        return SOLUTION
    within_budget = CALLS >= 0
    if within_budget and len(OR(regex, min(covers, key=len))) < len(SOLUTION):
        # Try with and without the greedy-best component
        def score(c): return 4 * len(covers[c]) - len(c)
        r = max(covers, key=score)  # Best component
        covered = covers.pop(r)     # Set of winners covered by r
        with_r = OR(regex, r)
        uncovered = {c: covers[c] - covered for c in covers}
        bb_search(with_r, uncovered)
        bb_search(regex, covers)
    return SOLUTION
    
def OR(*regexes):
    """OR together regexes. Ignore 'None' components.

    Accepts either separate arguments, OR(a, b, c), or one collection of
    regexes, OR([a, b, c]); callers use both forms. A single string argument
    is treated as one regex, not as a collection of characters.
    """
    if len(regexes) == 1 and isinstance(regexes[0], (list, tuple, set, frozenset)):
        regexes = regexes[0]
    return '|'.join(r for r in regexes if r is not None)


def invert_multimap(multimap):
    """Invert {key: {val, ...}} into {val: [key, ...]}.

    The result is a defaultdict(list), so looking up a value that never
    occurred yields (and stores) an empty list.
    """
    inverse = collections.defaultdict(list)
    for (key, vals) in multimap.items():
        for val in vals:
            inverse[val].append(key)
    return inverse

In [None]:
## For debugging

def findregex(winners, losers, calls=100000):
    """Find the shortest disjunction of regex components that covers winners but not losers.

    Debugging variant: after the search it also prints the remaining call
    budget and the length of the solution found.
    """
    solution = '^(' + OR(winners) + ')$'
    covers = regex_covers(winners, losers)
    b = BranchBound(solution, calls)
    # BranchBound.search is declared as search(covers, partial=None), so the
    # covers go first and the (empty) partial solution is left at its default.
    b.search(covers)
    print('%s calls %s len' % (b.calls, len(b.solution)))
    return b.solution


def triage_covers(partial, covers):
    """Simplify covers by eliminating dominated regexes, and picking ones that uniquely cover a winner.

    partial: the partial solution so far.
    covers:  {regex: {winner, ...}}
    Returns (partial, covers) after repeating both steps until a pass leaves
    covers unchanged.
    """
    before = None
    while covers != before:
        before = covers
        # Eliminate regexes that are dominated by another regex
        covers = eliminate_dominated(covers)  # covers =   {regex: {winner,...}}
        coverers = invert_multimap(covers)    # coverers = {winner: [regex,...]}
        # A winner with exactly one coverer forces that regex into the solution
        singletons = {regexes[0] for regexes in coverers.values()
                      if len(regexes) == 1}
        if singletons:
            partial = OR(partial, OR(singletons))
            covered = set().union(*(covers[c] for c in singletons))
            covers = {c: ws - covered for (c, ws) in covers.items()
                      if c not in singletons}
    return partial, covers


, and to , who suggested looking at [WFSTs](http://www.openfst.org/twiki/bin/view/FST/WebHome)

In [None]:
def regex_covers(winners, losers):
    """Generate regex components and return a dict of {regex: {winner...}}.

    A candidate is kept only if it matches no loser; the surviving candidates
    are then passed through eliminate_dominated. Character-class components
    are not added here; add_character_class_components does that as a
    separate step.
    """
    losers_str = '\n'.join(losers)
    wholes = {'^'+winner+'$' for winner in winners}
    parts = {d for w in wholes for p in subparts(w) for d in dotify(p)}
    chars = set(cat(winners))
    pairs = {A+'.'+rep_char+B for A in chars for B in chars for rep_char in '+*?'}
    reps = {r for p in parts for r in repetitions(p)}
    pool = wholes | parts | pairs | reps
    covers = {}
    for c in pool:
        # MULTILINE lets '^' and '$' anchor at each loser inside losers_str.
        search = re.compile(c, re.MULTILINE).search
        if not search(losers_str):
            covers[c] = set(filter(search, winners))
    return eliminate_dominated(covers)

def add_character_class_components(covers):
    """Add character-class components to covers (in place) and return it.

    For each group (B, Ms, E) from combine_splits, a component
    B + make_char_class(Ms) + E is added, covering every winner of the
    group's members, but only when the class form is estimated to be
    shorter than OR-ing the members together.
    """
    for (B, Ms, E) in combine_splits(covers):
        N = len(Ms)
        affix = len(B) + len(E)
        or_size = N * (affix + 1) + N - 1  # N=3 => 'B1E|B2E|B3E'
        class_size = affix + 2 + N         # N=3 => 'B[123]E'
        if class_size < or_size:
            covers[B + make_char_class(Ms) + E] = {w for ws in Ms.values()
                                                   for w in ws}
    return covers

def split3(word):
    """Splits a word into 3 parts, all ways, with middle part having 0 or 1 character.

    Returns a list of (before, middle, after) tuples. Splits whose middle
    character is a regex metacharacter are skipped, because the middle part
    is destined for a character class: '.', '+', '*' and '?' would lose their
    meaning there, and so would the anchors ('^' negates the class when it
    comes first, and '$' becomes a literal dollar sign).
    """
    special = ('.', '+', '*', '?', '^', '$')
    return [(word[:i], word[i:i+n], word[i+n:])
            for i in range(len(word)+1) for n in (0, 1)
            if not word[i:i+n].startswith(special)]

def combine_splits(covers):
    """Group the 3-way splits of every regex in covers by (B, E).

    covers = {BME: {w...}} becomes a list of (B, {M: {w...}}, E) triples;
    only groups with more than one distinct middle part M are returned.
    """
    table = collections.defaultdict(dict)  # table = {(B, E): {M: {w...}}}
    for (r, ws) in covers.items():
        for (B, M, E) in split3(r):
            table[B, E][M] = ws
    combined = []
    for ((B, E), Ms) in table.items():
        if len(Ms) > 1:
            combined.append((B, Ms, E))
    return combined

def make_char_class(chars):
    """Return a regex character class, '[...]', over the distinct items of chars.

    If the empty string is one of the items, a '?' is appended after the
    closing bracket.
    """
    members = set(chars)
    optional = '?' if '' in members else ''
    return '[{}]{}'.format(cat(members), optional)

# Compare the number of components before and after adding character
# classes, then list the components that were added.
covers = regex_covers(boys, girls)
old = set(covers)
print(len(covers))
covers = add_character_class_components(covers)
print(len(covers))
print(set(covers) - old)

# combine_splits returns (B, Ms, E) triples, which dict() cannot consume
# (it needs 2-item pairs), so show the list itself.
print(combine_splits({'..a': {1,2,3}, '..b': {4,5,6}, '..c':{7}}))

Consider the two components `'..a'` and `'..b'`.  If we wanted to cover all the winners that both of these match, we could use `'..a|..b'`, or we could share the common prefix and introduce a *character class* to get `'..[ab]'`.  Since the former is 7 characters and the latter is only 6, the latter would be preferred.  It would be an even bigger win to replace `'..az|..bz|..cz'` with `'..[abc]z'`; that reduces the count from 14 to 8. Similarly, replacing `'..az|..bz|..z'` with `'..[ab]?z'` saves 5 characters.

There seems to be potential savings with character classes.  But how do we know which characters from which components to combine into classes? To keep things from getting out of control, I'm going to only look at components that are left after we eliminate dominated.  That is not an ideal approach&mdash;there may well be some components that are dominated on their own, but could be part of an optimal solution when combined with other components into a character class.  But I'm going to keep it simple.


Searching: Better Bounds
----

Branch and bound prunes the search tree whenever it is on a branch that is guaranteed to result in a solution that is no better than the best solution found so far.  Currently we estimate the best possible solution along the current branch by taking the length of the partial solution and adding the length of the shortest component in `covers`.  We do that because we know for sure that we need at least one component, but we don't know for sure how many components we'll need (nor how long each of them will be).  So our estimate often severely underestimates the true answer, which means we don't cut off search in some places where we could, if only we had a better estimate.
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
Here's one way to get a better bound. We'll define the following quantities:

+ *P* = the length of the partial solution, plus the "|", if needed.  So if the partial solution is `None`, then *P* will be zero, otherwise *P* is the length plus 1.
+ *S* = the length of the shortest regex component in `covers`.
+ *W* = the number of winners still in `covers`.
+ *C* = the largest number of winners covered by any regex in `covers`.

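The current estimate is *P* + *S*.  Since no component covers more than *C* winners, we need at least ceil(*W* / *C*) more components, and each of them is at least *S* characters long.  So a better estimate is *P* + *S* &times; ceil(*W* / *C*).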

In [None]:
import math

class BranchBound(object):
    """Hold state information for a branch and bound search.

    solution: the shortest complete regex found so far.
    calls:    the remaining budget of calls to `search`.
    """
    def __init__(self, solution, calls):
        self.solution, self.calls = solution, calls

    def search(self, covers, partial=None):
        """Recursively extend partial regex until it matches all winners in covers.

        covers:  {regex: {winner, ...}} for the winners still to be covered.
        partial: the partial solution so far, or None if there is none yet.
        Returns the shortest solution found so far (self.solution).
        """
        if self.calls <= 0:
            return self.solution
        self.calls -= 1
        covers, partial = simplify_covers(covers, partial)
        if not covers: # Nothing left to cover; solution is complete
            self.solution = min(partial, self.solution, key=len)
        else:
            P = 0 if not partial else len(partial) + 1         # partial plus '|'
            S = len(min(covers, key=len))                      # shortest component
            C = max(len(covers[r]) for r in covers)            # most winners per component
            W = len(set(w for r in covers for w in covers[r])) # winners left
            # At least ceil(W / C) more components are needed. Computed with
            # integer arithmetic because W / C is floor division on ints
            # under Python 2, which would make math.ceil a no-op.
            min_components = -(-W // C)
            if P + S * min_components < len(self.solution):
                # Try with and without the greedy-best component
                def score(r): return 4 * len(covers[r]) - len(r)
                r = max(covers, key=score) # Best component
                covered = covers[r] # Set of winners covered by r
                covers.pop(r)
                self.search({c:covers[c]-covered for c in covers}, OR(partial, r))
                self.search(covers, partial)
        return self.solution