So far, we've been keeping track of match locations using character offsets. But this won't work once we convert the TEI to HTML. 
So we have to add markers in the TEI which we can use later to highlight matches. 

Let's use [the anchor tag](https://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-anchor.html). 





In [3]:
import pandas as pd
import re
from bs4 import BeautifulSoup

In [4]:
df = pd.read_csv('../../../criticism-analysis/fw-matches.csv')

Split up FW into chunks according to these locations. 

If a location is used more than once, we'll have to find a way to handle that. 

In [5]:
df['Text A'][0]

'../texts/corpus-joyce-finnegans-wake-tei/finnegans-wake-corrected.xml'

In [6]:
with open("./../finnegans-wake-corrected.xml") as f: 
    fw = f.read()

In [7]:
len(fw)

1465021

In [8]:
# Melt the data frame such that each match is its own row
melted = {}
for i, row in df.iterrows(): 
    locs = eval(row['Locations in A']) # Is a string
    for loc in locs: 
        data = row.to_dict()
        thisLoc = {"start": loc[0], "end": loc[1]}
        # Make one for each of start, end
        for locType in thisLoc: 
            data = row.to_dict() # Don't really know why I have to do this
            data['loc'] = thisLoc[locType]
            data['locType'] = locType
            index = str(i) + str(loc) + locType
            melted[index] = data


In [9]:
dfMelted = pd.DataFrame(melted).T

In [10]:
tagLocations = [match.span() for match in list(re.finditer('<.*?>', fw))]

In [11]:
def avoidTags(loc, tagLocations=tagLocations): 
    """ If a given location is inside a tag, move it until it's outside. """
    for tagStart, tagEnd in tagLocations: 
        if loc >= tagStart and loc <= tagEnd: 
            # This location is inside a tag
            print(loc, ' is inside a tag.')
            return tagEnd
    return loc

In [12]:
avoidTags(585861)

585861  is inside a tag.


585891

In [13]:
dfMelted['loc'].apply(avoidTags)

1208681  is inside a tag.
940508  is inside a tag.
425458  is inside a tag.
1246846  is inside a tag.
657372  is inside a tag.
279507  is inside a tag.
343592  is inside a tag.
350877  is inside a tag.
380573  is inside a tag.
1458924  is inside a tag.
1460047  is inside a tag.
1153319  is inside a tag.
962339  is inside a tag.
776232  is inside a tag.
1146398  is inside a tag.
445923  is inside a tag.
452075  is inside a tag.
693604  is inside a tag.
1108743  is inside a tag.
1460034  is inside a tag.
530677  is inside a tag.
783383  is inside a tag.
830205  is inside a tag.
830326  is inside a tag.
280329  is inside a tag.
281386  is inside a tag.
385037  is inside a tag.
405233  is inside a tag.
429706  is inside a tag.
961731  is inside a tag.
962339  is inside a tag.
1346423  is inside a tag.
431766  is inside a tag.
431766  is inside a tag.
274032  is inside a tag.
1125421  is inside a tag.
431766  is inside a tag.
331088  is inside a tag.
330878  is inside a tag.
646366  is insi

0(334922, 335011)start       334922
0(334922, 335011)end         335011
1(478048, 478132)start       478048
1(478048, 478132)end         478132
2(5475, 5947)start             5475
                              ...  
1138(408888, 408993)end      408993
1138(788319, 788414)start    788319
1138(788319, 788414)end      788414
1139(366179, 366282)start    366179
1139(366179, 366282)end      366282
Name: loc, Length: 5540, dtype: int64

In [14]:
(dfMelted['loc'] == dfMelted['loc'].apply(avoidTags)).value_counts()

1208681  is inside a tag.
940508  is inside a tag.
425458  is inside a tag.
1246846  is inside a tag.
657372  is inside a tag.
279507  is inside a tag.
343592  is inside a tag.
350877  is inside a tag.
380573  is inside a tag.
1458924  is inside a tag.
1460047  is inside a tag.
1153319  is inside a tag.
962339  is inside a tag.
776232  is inside a tag.
1146398  is inside a tag.
445923  is inside a tag.
452075  is inside a tag.
693604  is inside a tag.
1108743  is inside a tag.
1460034  is inside a tag.
530677  is inside a tag.
783383  is inside a tag.
830205  is inside a tag.
830326  is inside a tag.
280329  is inside a tag.
281386  is inside a tag.
385037  is inside a tag.
405233  is inside a tag.
429706  is inside a tag.
961731  is inside a tag.
962339  is inside a tag.
1346423  is inside a tag.
431766  is inside a tag.
431766  is inside a tag.
274032  is inside a tag.
1125421  is inside a tag.
431766  is inside a tag.
331088  is inside a tag.
330878  is inside a tag.
646366  is insi

True     5321
False     219
Name: loc, dtype: int64

In [15]:
dfMelted['locCorrected'] = dfMelted['loc'].apply(avoidTags)

1208681  is inside a tag.
940508  is inside a tag.
425458  is inside a tag.
1246846  is inside a tag.
657372  is inside a tag.
279507  is inside a tag.
343592  is inside a tag.
350877  is inside a tag.
380573  is inside a tag.
1458924  is inside a tag.
1460047  is inside a tag.
1153319  is inside a tag.
962339  is inside a tag.
776232  is inside a tag.
1146398  is inside a tag.
445923  is inside a tag.
452075  is inside a tag.
693604  is inside a tag.
1108743  is inside a tag.
1460034  is inside a tag.
530677  is inside a tag.
783383  is inside a tag.
830205  is inside a tag.
830326  is inside a tag.
280329  is inside a tag.
281386  is inside a tag.
385037  is inside a tag.
405233  is inside a tag.
429706  is inside a tag.
961731  is inside a tag.
962339  is inside a tag.
1346423  is inside a tag.
431766  is inside a tag.
431766  is inside a tag.
274032  is inside a tag.
1125421  is inside a tag.
431766  is inside a tag.
331088  is inside a tag.
330878  is inside a tag.
646366  is insi

In [16]:
len(fw)

1465021

In [17]:
dfMelted['loc'].max()

1460174

In [18]:
dfMelted.sort_values(by='locCorrected')

Unnamed: 0,Text A,Text B,Threshold,Cutoff,N-Grams,Num Matches,Text A Length,Text B Length,Locations in A,Locations in B,loc,locType,locCorrected
"480(643, 800)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,6,1460281,63300,"[(643, 800), (873885, 873940), (877420, 877517...","[(115, 264), (4913, 4968), (5455, 5552), (5774...",643,start,643
"916(643, 800)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,26389,"[(643, 800)]","[(22222, 22371)]",643,start,643
"677(643, 1000)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,2,1460281,109733,"[(643, 1000), (453652, 453874)]","[(35538, 35867), (58200, 58384)]",643,start,643
"586(643, 713)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,46120,"[(643, 713)]","[(7240, 7304)]",643,start,643
"665(643, 787)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,4724,"[(643, 787)]","[(909, 1044)]",643,start,643
...,...,...,...,...,...,...,...,...,...,...,...,...,...
"1101(1459974, 1460174)end",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,21855,"[(1459974, 1460174)]","[(10224, 10412)]",1460174,end,1460174
"464(1459735, 1460174)end",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,8,1460281,93927,"[(1692, 2184), (5121, 5348), (48971, 49265), (...","[(45419, 45885), (57581, 57792), (59472, 59751...",1460174,end,1460174
"359(1460111, 1460174)end",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,2386,"[(1460111, 1460174)]","[(1811, 1871)]",1460174,end,1460174
"589(1459918, 1460174)end",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,4,1460281,46419,"[(1458992, 1459379), (1459411, 1459794), (1459...","[(42822, 43180), (43213, 43574), (43425, 43675...",1460174,end,1460174


In [19]:
def showContext(loc, n=100): 
    print(fw[loc-n:loc]+'-XXX-'+fw[loc+n])

In [20]:
showContext(585870)

 heavy Humph with airy Nan,
    Ricqueracqbrimbillyjicqueyjocqjolicass? How sowesthow,
    <hi rendi-XXX-s


- Go through each character in FW
- And Go through each locCorrected
- Is the character the same as our locCorrected?
  - Is it a start?
    - Start a `<span>`. Give it all the metadata. 
  - Is it an end? 
    - Do a `</span>`. Give it metadata. 
  - If so, start a span. Give it all the metadata. 
- Is the character the beginning of a tag? 
  - Raise a flag to indicate that we're inside a tag
  - Keep going until we see the end of the tag
  

In [21]:
def buildAnchor(df): 
    fileNames = " ".join([re.sub('/home/jon/Code/corpus-joyce-finnegans-wake-tei/criticism-analysis/\d/ocr/', '', fn) 
                 for fn in df['Text B'].values])
    indices = " ".join(list(df.index))
    locTypes = df['locType'][0] # We can get away with using [0] since these are only of one type 
    anchor = f'<anchor xml:id="{indices}" corresp="{fileNames}" type="{locTypes}"/>'
    return anchor

def buildSpan(df): 
    fileNames = " ".join([re.sub('/home/jon/Code/corpus-joyce-finnegans-wake-tei/criticism-analysis/\d/ocr/', '', fn) 
                 for fn in df['Text B'].values])
    indices = " ".join(list(df.index))
    nQuotes = len(df.index)
    locType = df['locType'][0] # We can get away with using [0] since these are only of one type 
    if locTypes == 'start': 
        return f'<span data-location="{indices}" data-filenames="{fileNames}" data-n-quotes="{nQuotes}">'
    if locTypes == 'end': 
        return f'</span>'

In [23]:
df['Locations in A']

0                                      [(334922, 335011)]
1                                      [(478048, 478132)]
2                                          [(5475, 5947)]
3       [(296676, 296759), (1122825, 1123526), (120332...
4                                      [(177213, 177359)]
                              ...                        
1135    [(465107, 465196), (496432, 496604), (496874, ...
1136                                   [(437465, 437691)]
1137                                   [(147036, 147196)]
1138    [(244562, 245048), (408888, 408993), (788319, ...
1139                                   [(366179, 366282)]
Name: Locations in A, Length: 1140, dtype: object

In [24]:
dfMelted

Unnamed: 0,Text A,Text B,Threshold,Cutoff,N-Grams,Num Matches,Text A Length,Text B Length,Locations in A,Locations in B,loc,locType,locCorrected
"0(334922, 335011)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,48541,"[(334922, 335011)]","[(2215, 2297)]",334922,start,334922
"0(334922, 335011)end",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,48541,"[(334922, 335011)]","[(2215, 2297)]",335011,end,335011
"1(478048, 478132)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,61396,"[(478048, 478132)]","[(47617, 47697)]",478048,start,478048
"1(478048, 478132)end",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,61396,"[(478048, 478132)]","[(47617, 47697)]",478132,end,478132
"2(5475, 5947)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,42828,"[(5475, 5947)]","[(28220, 28665)]",5475,start,5475
...,...,...,...,...,...,...,...,...,...,...,...,...,...
"1138(408888, 408993)end",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,3,1460281,56651,"[(244562, 245048), (408888, 408993), (788319, ...","[(2307, 2765), (27890, 27992), (28206, 28297)]",408993,end,408993
"1138(788319, 788414)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,3,1460281,56651,"[(244562, 245048), (408888, 408993), (788319, ...","[(2307, 2765), (27890, 27992), (28206, 28297)]",788319,start,788319
"1138(788319, 788414)end",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,3,1460281,56651,"[(244562, 245048), (408888, 408993), (788319, ...","[(2307, 2765), (27890, 27992), (28206, 28297)]",788414,end,788414
"1139(366179, 366282)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,5082,"[(366179, 366282)]","[(3206, 3305)]",366179,start,366179


In [25]:
5475 in dfMelted['locCorrected'].values

True

In [26]:
dfMelted['locCorrected'].values

array([334922, 335011, 478048, ..., 788414, 366179, 366282])

In [54]:
ranges = []
for i, row in df['Locations in A'].apply(eval).iteritems(): 
#    print(i, row)
    for loc in row: 
        ranges.append((i, loc[0], loc[1]))
#ranges

In [31]:
starts = dfMelted[dfMelted['locType'] == 'start']['locCorrected'].values
ends = dfMelted[dfMelted['locType'] == 'end']['locCorrected'].values

In [43]:
788414 in ends

True

In [42]:
dfMelted

Unnamed: 0,Text A,Text B,Threshold,Cutoff,N-Grams,Num Matches,Text A Length,Text B Length,Locations in A,Locations in B,loc,locType,locCorrected,matchID
"0(334922, 335011)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,48541,"[(334922, 335011)]","[(2215, 2297)]",334922,start,334922,0
"0(334922, 335011)end",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,48541,"[(334922, 335011)]","[(2215, 2297)]",335011,end,335011,0
"1(478048, 478132)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,61396,"[(478048, 478132)]","[(47617, 47697)]",478048,start,478048,1
"1(478048, 478132)end",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,61396,"[(478048, 478132)]","[(47617, 47697)]",478132,end,478132,1
"2(5475, 5947)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,42828,"[(5475, 5947)]","[(28220, 28665)]",5475,start,5475,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"1138(408888, 408993)end",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,3,1460281,56651,"[(244562, 245048), (408888, 408993), (788319, ...","[(2307, 2765), (27890, 27992), (28206, 28297)]",408993,end,408993,1138
"1138(788319, 788414)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,3,1460281,56651,"[(244562, 245048), (408888, 408993), (788319, ...","[(2307, 2765), (27890, 27992), (28206, 28297)]",788319,start,788319,1138
"1138(788319, 788414)end",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,3,1460281,56651,"[(244562, 245048), (408888, 408993), (788319, ...","[(2307, 2765), (27890, 27992), (28206, 28297)]",788414,end,788414,1138
"1139(366179, 366282)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,5082,"[(366179, 366282)]","[(3206, 3305)]",366179,start,366179,1139


In [90]:
# Extract match ID again
dfMelted['matchID'] = [re.sub('(start|end)', '', item) for item in list(dfMelted.index)]

'0(334922, 335011)'

In [91]:
dfMelted

Unnamed: 0,Text A,Text B,Threshold,Cutoff,N-Grams,Num Matches,Text A Length,Text B Length,Locations in A,Locations in B,loc,locType,locCorrected,matchID
"0(334922, 335011)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,48541,"[(334922, 335011)]","[(2215, 2297)]",334922,start,334922,"0(334922, 335011)"
"0(334922, 335011)end",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,48541,"[(334922, 335011)]","[(2215, 2297)]",335011,end,335011,"0(334922, 335011)"
"1(478048, 478132)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,61396,"[(478048, 478132)]","[(47617, 47697)]",478048,start,478048,"1(478048, 478132)"
"1(478048, 478132)end",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,61396,"[(478048, 478132)]","[(47617, 47697)]",478132,end,478132,"1(478048, 478132)"
"2(5475, 5947)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,42828,"[(5475, 5947)]","[(28220, 28665)]",5475,start,5475,"2(5475, 5947)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"1138(408888, 408993)end",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,3,1460281,56651,"[(244562, 245048), (408888, 408993), (788319, ...","[(2307, 2765), (27890, 27992), (28206, 28297)]",408993,end,408993,"1138(408888, 408993)"
"1138(788319, 788414)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,3,1460281,56651,"[(244562, 245048), (408888, 408993), (788319, ...","[(2307, 2765), (27890, 27992), (28206, 28297)]",788319,start,788319,"1138(788319, 788414)"
"1138(788319, 788414)end",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,3,1460281,56651,"[(244562, 245048), (408888, 408993), (788319, ...","[(2307, 2765), (27890, 27992), (28206, 28297)]",788414,end,788414,"1138(788319, 788414)"
"1139(366179, 366282)start",../texts/corpus-joyce-finnegans-wake-tei/finne...,/home/jon/Code/corpus-joyce-finnegans-wake-tei...,3,5,3,1,1460281,5082,"[(366179, 366282)]","[(3206, 3305)]",366179,start,366179,"1139(366179, 366282)"


In [65]:
# Get matches with start 
def getMatchesWithLoc(loc, locType): 
    return list(dfMelted[(dfMelted['locCorrected'] == loc) & (dfMelted['locType'] == locType)]['matchID'])

In [49]:
fw.index('bababadalgharaghtakammina')

1574

In [125]:
newFW = ""
inRanges = set()
n = 0
fwSubset = fw[:fw.index('bababadalgharaghtakammina')]

def span(matches, kind='start'): 
    if len(matches) == 0: 
        return ""
    matchesStr = " ".join(set([str(match) for match in matches]))
    n = len(matches)
    if len(matchesStr.strip()) == 0: 
        return ""
    if kind == 'start': 
        return f'<!--Start--><span data-matches="{matchesStr}" data-n-quotes="{n}">'
    return f"</span><!--End of {matchesStr}-->"

# Go through each character in the text. 
for i, char in enumerate(fwSubset): 
    # Is this the start of a quote? 
    if i in starts: 
        # End anything already started, first
        if len(inRanges) > 0: 
            newFW += span(inRanges, kind='end')
        matches = getMatchesWithLoc(i, 'start')
        for match in matches: 
            # Keep track of what quotations we're in 
            inRanges.add(match)
            #print(f"Now in range: {inRanges}")
        if len(inRanges) > 0: 
            newFW += f"<!-- Starting {i} in matches {matches} -->"
            newFW += span(inRanges, kind='start')
            
        newFW += char
    # Is this the end of a quote? 
    elif i in ends and len(inRanges) > 0: 
        newFW += char
        # What ends here? 
        matches = getMatchesWithLoc(i, 'end')
        # End everything that ends here
        #print(f"{i} is an end location.")
        #print(f"Removing all of {matches}")
        if len(matches) == 0: 
            continue
        for match in matches: 
            print(f"--Removing {match} from {inRanges}")
            try: 
                inRanges.remove(match)
            except: 
                print(f"Tried to remove {match} but it's not in {inRanges}.")
                print(f"Inranges is: {inRanges} with len {len(inRanges)}")
        #print(f"Inranges is now: {inRanges}")
        newFW += span(inRanges, kind='end')
        #print(f"Inranges is now: {inRanges}")
        # Continue everything else remaining
        #print(inRanges)
        #print(f"Inranges is now: {inRanges}")
        if len(inRanges) > 0: 
            print(f"I is now {i} and Inranges is now: {inRanges}")
            newFW += f"<!-- Now at {i}. Continue everything else: {inRanges} -->"
            newFW += span(inRanges, kind='start')
    # Is this the start of some other tag? 
    elif char == '<' and len(inRanges) > 0: 
        newFW += span(inRanges, kind='end')
        newFW += "<!-- Suspending  -->"
        newFW += char
    # Is this the end of some other tag? 
    elif char == '>' and len(inRanges) > 0: 
        newFW += char
        # Resume if there's anything to resume
        newFW += "<!-- Resuming -->"
        newFW += span(inRanges, kind='start')
    else: 
        newFW += char


--Removing 498(643, 713) from {'586(643, 713)', '253(679, 800)', '954(679, 787)', '665(643, 787)', '297(679, 800)', '480(643, 800)', '234(679, 787)', '1001(679, 800)', '157(679, 787)', '1026(679, 800)', '990(679, 787)', '677(643, 1000)', '573(643, 800)', '83(679, 800)', '65(679, 800)', '357(679, 767)', '1046(666, 724)', '810(679, 800)', '1117(679, 800)', '66(666, 724)', '350(679, 787)', '916(643, 800)', '498(643, 713)', '544(679, 800)', '1119(679, 800)', '370(666, 724)', '680(679, 800)', '381(679, 800)', '305(666, 724)', '870(679, 800)', '606(679, 787)', '1125(679, 800)', '886(679, 833)', '872(679, 800)', '428(679, 800)', '289(679, 800)', '1063(679, 800)'}
--Removing 586(643, 713) from {'586(643, 713)', '253(679, 800)', '954(679, 787)', '665(643, 787)', '297(679, 800)', '480(643, 800)', '234(679, 787)', '1001(679, 800)', '157(679, 787)', '1026(679, 800)', '990(679, 787)', '677(643, 1000)', '573(643, 800)', '83(679, 800)', '65(679, 800)', '357(679, 767)', '1046(666, 724)', '810(679, 800

In [126]:
with open('../finnegans-wake-annotated.xml', 'w') as f: 
    f.write(newFW)

In [111]:
inRanges.remove('1071(1568, 1799)')

In [112]:
inRanges

set()