In [None]:
%load_ext autoreload
%autoreload 2
# default_exp indexers.notelist.parser

In [None]:
# export
import bs4
import random
from integrators.indexers.notelist.util import *
from integrators.indexers.notelist.notelist import *
from integrators.data.schema import *
from integrators.data.basic import *
from integrators.imports import *

In [None]:
#hide
from nbdev.showdoc import *

# Parsing lists

In [None]:
# export

# Verbs recognised as the title of a single-line list, e.g. "Buy: Toothpaste".
LISTTYPE_VERBS = [
    "do",
    "read",
    "watch",
    "buy",
    "listen",
]

# Spellings that may precede a verb ("todo", "to-do", "to do") or nothing at all.
LIST_PREFIXES = [
    "to",
    "to-",
    "to ",
    "",
]

class HTMLListParser():
    """Extracts lists from HTML data, generated by an HTML text editor like evernote.

    Two kinds of lists are recognised:

    * formatted lists: ``<ul>`` / ``<ol>`` elements that are direct children of
      the parsed document (see `get_html_lists`);
    * unformatted lists: top-level ``<p>`` paragraphs that either act as a title
      for the paragraphs following them, or hold a single "verb: item" line
      such as ``Buy: Toothpaste`` (see `get_unformatted_lists`).
    """

    def __init__(self):
        # Every prefix/verb combination ("todo", "to-read", "to watch", "buy", ...).
        self.single_item_list_patterns = [p+v for v in LISTTYPE_VERBS for p in LIST_PREFIXES]

    def get_html_lists(self, note, parsed):
        """Return one `ULNoteList` for every top-level <ul>/<ol> element of `parsed`.

        Only direct children of `parsed` are considered (``recursive=False``), so
        nested lists stay part of their parent list. All <ul> elements are returned
        first, followed by all <ol> elements. The lists are created without a title.
        """
        html_lists = parsed.find_all("ul", recursive=False) + parsed.find_all("ol", recursive=False)
        return [ULNoteList.from_data(title=None, content=str(x), textContent=x.get_text(),
                                     note=note, span=get_span(x, parsed)) for x in html_lists]

    def get_lists(self, note):
        """Extracts lists from a note

        The note's HTML is parsed, `note.content` is overwritten with the
        re-serialised document, and every list that is found is attached to the
        note through a "noteList" edge.

        Returns the formatted (<ul>/<ol>) lists followed by the unformatted ones.
        """
        parsed = bs4.BeautifulSoup(note.content, 'html.parser')
        # NOTE(review): presumably the content is normalised so that the spans
        # computed against `parsed` index into note.content -- confirm in get_span.
        note.content = str(parsed)

        all_lists = self.get_html_lists(note, parsed) + \
                    self.get_unformatted_lists(note, parsed)
        for note_list in all_lists:
            note.add_edge("noteList", note_list)

        return all_lists

    def parse(self, x, tag=None):
        """Normalise `x` to a bs4 object.

        * `bs4.BeautifulSoup`: returned as is, or its first `tag` element if `tag` is given.
        * `bs4.element.Tag`: returned unchanged (`tag` is ignored).
        * anything else: parsed as HTML first, then treated like a `BeautifulSoup`.

        Returns None when `tag` is given but no such element exists.
        """
        # BeautifulSoup is itself a Tag subclass, so it has to be tested first.
        if isinstance(x, bs4.BeautifulSoup): return x.find(tag) if tag is not None else x
        elif isinstance(x, bs4.element.Tag): return x
        else:
            res = bs4.BeautifulSoup(x, 'html.parser')
            return res.find(tag) if tag is not None else res

    def get_single_line_list(self, par):
        """Get single line lists. An example could be: '<strong>read</strong>: great book title'

        `par` is a paragraph, either as HTML string or as bs4 object. The first
        occurrence of one of `LISTTYPE_VERBS` (case-insensitive, optionally wrapped
        in <strong>/<em>/<u> and followed by ':' and/or a space) is taken as title.

        Returns `(title_html, content_html)`, or `(None, None)` when the input holds
        no <p> element, no verb is found, or the paragraph text is not more than two
        characters longer than the title text.
        """
        par = self.parse(par, "p")
        if par is None:
            # Input without a <p> element: nothing to extract.
            return None, None
        par_html = "".join(mapped(str, par.contents))

        # NOTE(review): the pattern is not anchored, so a verb in the middle of a
        # sentence also matches -- confirm whether that is intended.
        pat = "|".join([f"(<strong>|<em>|<u>)?{v}:? ?(</strong>|</em>|</u>)?:? ?"
                        for v in LISTTYPE_VERBS])
        match = re.search(pat, par_html, re.IGNORECASE)
        if match is None: return None, None

        title = match.group()
        # A title with (almost) nothing behind it is not a single-line list.
        if len(par.get_text()) <= len(remove_html(title)) + 2:
            return None, None

        # Everything after the matched title is the list content.
        return title, par_html[match.end():]

    def get_unformatted_lists(self, note, parsed):
        """retrieve lists without <ul></ul> tags. We have two options: 
                1) multiline lists prefixed with a title keyword (e.g. "Buy:" "Read:") 
                2) single element single line lists

        Only <p> elements that are direct children of `parsed` are inspected. When
        `parsed` is None, `note.content` is parsed instead.

        Returns a list of `INoteList` objects in paragraph order.
        """
        parsed = parsed if parsed is not None else self.parse(note.content)
        toplevel_paragraphs = parsed.find_all("p", recursive=False)
        res = []

        for i, par in enumerate(toplevel_paragraphs):
            if is_title(par):
                # this extracts the lists that have a title and are not on a single line
                items = trim_till_newline(toplevel_paragraphs[i+1:])
                if len(items) == 0: continue

                # The list spans from the start of the title paragraph to the end of
                # its last item. The start must come from `par`: no `title` variable
                # exists in this branch.
                list_span = Span.from_data(startIdx=get_span(par, parsed).startIdx,
                                           endIdx=get_span(items[-1], parsed).endIdx)

                note_list = INoteList.from_data(note=note, span=list_span,
                                                title=str(par.contents[0]),
                                                content="".join(mapped(str, items)),
                                                itemSpan=[get_span(x, parsed) for x in items])
                res.append(note_list)

            else:
                title, html_content = self.get_single_line_list(par)
                if title is None: continue

                span = get_span(str(par), parsed)

                # The single item is what remains of the paragraph after the title.
                # NOTE(review): the offset assumes the span starts at the title text;
                # if get_span also covers the opening <p> tag this is shifted -- confirm.
                itemSpans = [Span.from_data(startIdx=span.startIdx + len(str(title)),
                                            endIdx=span.endIdx)]
                note_list = INoteList.from_data(note=note, title=title, content=str(html_content),
                                                itemSpan=itemSpans, span=get_span(par, parsed))
                res.append(note_list)
        return res

# Usage

Let's see how this works on an example note. We start with a note that was imported from Evernote and show its content.

In [None]:
note = INote.from_data(content=read_file(PYI_TESTDATA / "notes" / "evernote" / "evernote-test-note-1.html"))

In [None]:
print(note.content[:400])

<div>
    <div><br clear="none" /></div>
    <div><br clear="none" /></div>
    <ul>
        <li>Buy groceries</li>
        <li>Call john<br clear="none" /></li>
        <li>Do the taxes</li>
        <li>Take out the trash</li>
        <li>Reply to carls mail</li>
    </ul>
    <div><br clear="none" /></div>
    <ul>
        <li>Buy groceries</li>
        <li>Call john<ul>
                <li>He r


In [None]:
#hide
# `display` and `HTML` are public in IPython.display; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

When rendered, this corresponds to the following:

In [None]:
# display(HTML(note.content))

We can parse these lists using the `HTMLListParser`.

# Test

## Memri lists

### Single line lists

In [None]:
parser = HTMLListParser()
note = INote.from_data(content=read_file(PYI_TESTDATA / "notes" / "memri" / "memri-test-note-3.html"))

In [None]:
# Plain-text title: the verb plus its trailing ": " is the title, the rest is the item.
title, content = parser.get_single_line_list("<p>Buy: Toothpaste</p>")
test_eq(title, "Buy: ")
test_eq(content, "Toothpaste")

In [None]:
# Title wrapped in <strong>: the markup stays part of the returned title.
title, content = parser.get_single_line_list("<p><strong>Read: </strong>The age of surveillance capitalism</p>")
test_eq(title, "<strong>Read: </strong>")
test_eq(content, "The age of surveillance capitalism")

In [None]:
# Another plain-text title, using a different verb.
title, content = parser.get_single_line_list("<p>Watch: Parasite</p>")
test_eq(title, "Watch: ")
test_eq(content, "Parasite")

In [None]:
# Underlined verb with the colon outside the markup: both belong to the title.
title, content = parser.get_single_line_list("<p><u>Do</u>: The dishes</p>")
test_eq(title, "<u>Do</u>: ")
test_eq(content, "The dishes")

In [None]:
# Parse the note ourselves and check only the lists that are not wrapped in <ul></ul> tags.
txt = note.content
parsed = bs4.BeautifulSoup(txt, 'html.parser')
assert len(parser.get_unformatted_lists(note, parsed)) == 7

In [None]:
# get_lists returns the <ul>/<ol> lists followed by the unformatted lists.
lists = parser.get_lists(note)
assert len(lists) == 10
# list0,list1,list2,list3,list4,list5,list6,list7,list8,list9 = lists

In [None]:
lists

[ULNoteList # Untitled 
 
 
 Buy groceries
 
 
 
 
 Call john
 
 
 
 
 Do the taxes
 
 
 
 
 Take out the trash
 
 
 
 
 Reply to carls mail
 
 
 ,
 ULNoteList # Untitled 
 Buy groceries
 Do the taxes
 Take out the trash
 Reply to carls mail
 ,
 ULNoteList # Untitled 
 Twenty one lessons for the 21st century
 Dreams from my Father
 ,
 (INoteList) # Buy:  
 Toothpaste
 ,
 (INoteList) # Read:  
 The age of surveillance capitalism
 ,
 (INoteList) # Watch:  
 Parasite
 ,
 (INoteList) # Do:  
 The dishes
 ,
 (INoteList) # Read 
 <p>The Great Gastby</p><p>Alice’s Adventures in Wonderland</p>
 ,
 (INoteList) # Buy 
 <p>Groceries</p><p>Shoes</p>
 ,
 (INoteList) # Read 
 <p>The Great Gatsby</p><p>The Odyssey</p>
 ]

## Evernote lists

In [None]:
parser = HTMLListParser()
note = INote.from_data(content=read_file(PYI_TESTDATA / "notes" / "evernote" / "evernote-test-note-1.html"))

In [None]:
# lists = parser.get_lists(note)
# assert len(lists) == 10
# list0,list1,list2,list3,list4,list5,list6,list7,list8,list9 = lists

In [None]:
# test_eq(list0.get_items(), ['<li>Buy groceries</li>',
#                              '<li>Call john<br clear="none"/></li>',
#                              '<li>Do the taxes</li>',
#                              '<li>Take out the trash</li>',
#                              '<li>Reply to carls mail</li>'])

In [None]:
# test_eq(list1.get_items(), ['<li>Buy groceries</li>',
#                              '<li>Do the taxes</li>',
#                              '<li>Take out the trash</li>',
#                              '<li><br clear="none"/></li>',
#                              '<li>Reply to carls mail</li>'])

In [None]:
# test_eq(list2.get_items(), ['<li>Twenty one lessons for the 21st century</li>',
#                             '<li>Dreams from my Father</li>'])

In [None]:
# test_eq(list3.title, '<strong>Buy</strong>: ')
# test_eq(list3.get_items(), ['Toothpaste'])

In [None]:
# test_eq(list4.title, '<em>Read</em>: ')
# test_eq(list4.get_items(), ['The age of surveillance capitalism'])

In [None]:
# test_eq(list5.title, 'Watch: ')
# test_eq(list5.get_items(), ['Parasite'])

In [None]:
# test_eq(list6.title, '<u>Do</u>: ')
# test_eq(list6.get_items(), ['The dishes'])

In [None]:
# test_eq(list7.title,'<strong>Read</strong><br clear="none"/>')
# test_eq(list7.get_items(), ['The Great Gatsby', "Alice's Adventures in Wonderland"])

In [None]:
# test_eq(list8.title,'<strong>Buy</strong><br clear="none"/>')
# test_eq(list8.get_items(), ['groceries', 'Shoes'])

In [None]:
# test_eq(list9.title,'Read')
# test_eq(list9.get_items(), ['The Great Gatsby', 'The odyssey'])

In [None]:
show_doc(HTMLListParser.get_lists)

<h4 id="HTMLListParser.get_lists" class="doc_header"><code>HTMLListParser.get_lists</code><a href="__main__.py#L17" class="source_link" style="float:right">[source]</a></h4>

> <code>HTMLListParser.get_lists</code>(**`note`**)

Extracts lists from a note

In [None]:
show_doc(HTMLListParser.get_unformatted_lists)

<h4 id="HTMLListParser.get_unformatted_lists" class="doc_header"><code>HTMLListParser.get_unformatted_lists</code><a href="__main__.py#L54" class="source_link" style="float:right">[source]</a></h4>

> <code>HTMLListParser.get_unformatted_lists</code>(**`note`**, **`parsed`**)

retrieve lists without <ul></ul> tags. We have two options: 
1) multiline lists prefixed with a title keyword (e.g. "Buy:" "Read:") 
2) single element single line lists

In [None]:
show_doc(HTMLListParser.get_single_line_list)

<h4 id="HTMLListParser.get_single_line_list" class="doc_header"><code>HTMLListParser.get_single_line_list</code><a href="__main__.py#L35" class="source_link" style="float:right">[source]</a></h4>

> <code>HTMLListParser.get_single_line_list</code>(**`par`**)

Get single list lists. An example could be: '<strong>read</strong>: great book title'

# Export -

In [None]:
# hide
from nbdev.export import *
notebook2script()

Converted basic.ipynb.
Converted importers.EmailImporter.ipynb.
Converted importers.Importer.ipynb.
Converted importers.util.ipynb.
Converted index.ipynb.
Converted indexers.FaceRecognitionIndexer.ipynb.
Converted indexers.FacerecognitionIndexer.Photo.ipynb.
Converted indexers.GeoIndexer.ipynb.
Converted indexers.NoteListIndexer.NoteList.ipynb.
Converted indexers.NoteListIndexer.Parser.ipynb.
Converted indexers.NoteListIndexer.ipynb.
Converted indexers.NoteListIndexer.util.ipynb.
Converted indexers.indexer.ipynb.
Converted itembase.ipynb.
Converted pod.client.ipynb.
