In [None]:
%load_ext autoreload
%autoreload 2
# default_exp indexers.notelist.parser

In [None]:
# export
import bs4
import random
from integrators.indexers.notelist.util import *
from integrators.indexers.notelist.notelist import *
from integrators.data.schema import *
from integrators.data.basic import *
from integrators.imports import *

In [None]:
#hide
from nbdev.showdoc import *

# HTMLListParser
This parser takes Note objects as input, whose `.content` field contains the HTML content of the note, and parses the lists out of that HTML content.

In [None]:
# export
class HTMLListParser():
    '''Extracts lists from HTML data, generated by an HTML text editor like evernote'''

    def __init__(self):
        # NOTE(review): neither attribute is read inside this class; both are kept
        # unchanged because code outside this notebook may rely on them.
        self.one_line_list_pa = ["buy", "read", "watch"]

        words    = ["do", "read", "watch", "buy", "listen"]
        prefixes = ["to", "to-", "to ", ""]

        # every prefix/word combination, e.g. "todo", "to-do", "to do", "do"
        self.single_item_list_patterns = [prefix+word for word in words for prefix in prefixes]

    def get_lists(self, note):
        """Extracts lists from a note

        Collects the top-level <ul>/<ol> elements of `note.content` together with
        the lists found by `get_unformatted_lists`, links every list to `note`
        through a "noteList" edge and returns them (html lists first).

        Side effect: `note.content` is overwritten with the string form of the
        parsed html.
        """

        text = note.content
        parsed = bs4.BeautifulSoup(text, 'html.parser')
        note.content=str(parsed)

        uls = get_toplevel_elements(text, "ul", parsed=parsed)
        ols = get_toplevel_elements(text, "ol", parsed=parsed)

        html_lists = [ULNoteList.from_data(title=None, content=str(x),
                                           textContent=remove_html(str(x)), note=note,
                                           span=get_span(note, x, parsed))
                      for x in uls + ols]

        unformatted_lists = self.get_unformatted_lists(note, text, parsed)
        all_lists = html_lists + unformatted_lists

        for note_list in all_lists:
            note.add_edge("noteList", note_list)

        return all_lists

    def get_single_line_list(self, elem):
        """Get single line lists. An example could be: '<strong>read</strong>: great book title'

        The string form of `elem` has to *start* with one of the keywords
        (read / buy / watch / do, case-insensitive), optionally wrapped in
        <strong>, <em> or <u> and optionally followed by a colon and a space.

        Returns `(title, content)` where `title` is the matched prefix (markup
        included) and `content` is everything after it, or `(None, None)` when
        there is no keyword prefix or hardly any text follows the title.
        """
        keywords = ["read", "buy", "watch", "do"]
        # \b keeps "do" from matching "donuts"; re.match anchors at the start so a
        # keyword somewhere in the middle of a sentence does not create a list.
        pat = (r"(?:<strong>|<em>|<u>)?(?:" + "|".join(keywords) + r")\b"
               r"(?:</strong>|</em>|</u>)?:? ?")
        elem_str = str(elem)
        match = re.match(pat, elem_str, re.IGNORECASE)
        if match is None: return None, None

        title = match.group()

        # require more than two visible characters after the title
        if len(remove_html(elem_str)) > len(remove_html(title)) + 2:
            return title, elem_str[match.end():]
        return None, None

    def get_unformatted_lists(self, note, txt, parsed):
        """retrieve lists without <ul></ul> tags. We have two options: 
                1) multiline lists prefixed with a title keyword (e.g. "Buy:" "Read:") 
                2) single element single line lists

        Only the first top-level div of `txt` is inspected. Returns a list of
        INoteList objects; the list is empty when `txt` has no top-level div.
        """

        toplevel_divs = get_toplevel_elements(txt, "div")
        if len(toplevel_divs) == 0:
            # nothing to scan; previously this raised IndexError
            return []
        toplevel_div = toplevel_divs[0]
        ls = []

        for elem in toplevel_div.children:
            if elem.name == "div" and not is_newline_div(elem):

                # materialise once so the successors can be sliced for every child
                children = list(get_children(elem))
                for i, child in enumerate(children):
                    # this extracts the lists that have a title and are not on a single line
                    if div_is_unstructured_list_title(child):
                        title = child
                        successors = children[i+1:]
                        if len(successors) == 0:
                            continue
                        items = [x for x in find_till_double_br(successors) if not is_newline(str(x))]
                        if len(items) == 0:
                            # a title followed only by newlines is not a list
                            # (items[-1] below would raise IndexError)
                            continue
                        items_str = [str(x) for x in items]
                        # NOTE(review): item spans are looked up from the string form while
                        # the list span uses the elements themselves - confirm get_span
                        # accepts both
                        items_span = [get_span(note, x, parsed) for x in items_str]

                        # the list spans from the start of the title to the end of the last item
                        span1 = get_span(note, title, parsed)
                        span2 = get_span(note, items[-1], parsed)
                        span = Span.from_data(startIdx=span1.startIdx, endIdx=span2.endIdx)
                        html_content = "".join(items_str)
                        l = INoteList.from_data(note=note, title=title, content=str(html_content), itemSpan=items_span, span=span)
                        ls.append(l)

                    else:
                        title, html_content = self.get_single_line_list(child)
                        if title is not None:
                            span = get_span(note, child, parsed)
                            # the title sits at the start of the child, so the single item
                            # starts right after it
                            itemSpan = [Span.from_data(startIdx=span.startIdx + len(str(title)), endIdx=span.endIdx)]
                            l = INoteList.from_data(note=note, title=title, content=str(html_content), itemSpan=itemSpan, span=span)
                            ls.append(l)
        return ls

In [None]:
show_doc(HTMLListParser.get_lists)

<h4 id="HTMLListParser.get_lists" class="doc_header"><code>HTMLListParser.get_lists</code><a href="__main__.py#L14" class="source_link" style="float:right">[source]</a></h4>

> <code>HTMLListParser.get_lists</code>(**`note`**)

Extracts lists from a note

In [None]:
show_doc(HTMLListParser.get_unformatted_lists)

<h4 id="HTMLListParser.get_unformatted_lists" class="doc_header"><code>HTMLListParser.get_unformatted_lists</code><a href="__main__.py#L54" class="source_link" style="float:right">[source]</a></h4>

> <code>HTMLListParser.get_unformatted_lists</code>(**`note`**, **`txt`**, **`parsed`**)

retrieve lists without <ul></ul> tags. We have two options: 
1) multiline lists prefixed with a title keyword (e.g. "Buy:" "Read:") 
2) single element single line lists

In [None]:
show_doc(HTMLListParser.get_single_line_list)

<h4 id="HTMLListParser.get_single_line_list" class="doc_header"><code>HTMLListParser.get_single_line_list</code><a href="__main__.py#L36" class="source_link" style="float:right">[source]</a></h4>

> <code>HTMLListParser.get_single_line_list</code>(**`elem`**)

Get single list lists. An example could be: '<strong>read</strong>: great book title'

# Usage

Let's see how this works for an example note. We start with a note that was imported from Evernote and show its content.

In [None]:
note = INote.from_data(content=read_file(PYI_TESTDATA / "notes" / "evernote" / "evernote-test-note-1.html"))

In [None]:
print(note.content[:400])

<div>
    <div><br clear="none" /></div>
    <div><br clear="none" /></div>
    <ul>
        <li>Buy groceries</li>
        <li>Call john<br clear="none" /></li>
        <li>Do the taxes</li>
        <li>Take out the trash</li>
        <li>Reply to carls mail</li>
    </ul>
    <div><br clear="none" /></div>
    <ul>
        <li>Buy groceries</li>
        <li>Call john<ul>
                <li>He r


In [None]:
#hide
# `display` is part of the public IPython.display API; importing it from
# IPython.core.display is deprecated
from IPython.display import display, HTML

When rendered, this content looks as follows:

In [None]:
display(HTML(note.content))

We can parse these using the `HTMLListParser`

# Test

## Memri lists

In [None]:
parser = HTMLListParser()
note = INote.from_data(content=read_file(PYI_TESTDATA / "notes" / "memri" / "memri-test-note-2.html"))

In [None]:
lists = parser.get_lists(note)
# report the actual count so a failure is diagnosable from the cell output
assert len(lists) == 10, f"expected 10 lists, got {len(lists)}"
list0,list1,list2,list3,list4,list5,list6,list7,list8,list9 = lists

AssertionError: 

## Evernote lists

In [None]:
parser = HTMLListParser()
note = INote.from_data(content=read_file(PYI_TESTDATA / "notes" / "evernote" / "evernote-test-note-1.html"))

In [None]:
lists = parser.get_lists(note)
# report the actual count so a failure is diagnosable from the cell output
assert len(lists) == 10, f"expected 10 lists, got {len(lists)}"
list0,list1,list2,list3,list4,list5,list6,list7,list8,list9 = lists

In [None]:
test_eq(list0.get_items(), ['<li>Buy groceries</li>',
                             '<li>Call john<br clear="none"/></li>',
                             '<li>Do the taxes</li>',
                             '<li>Take out the trash</li>',
                             '<li>Reply to carls mail</li>'])

In [None]:
test_eq(list1.get_items(), ['<li>Buy groceries</li>',
                             '<li>Do the taxes</li>',
                             '<li>Take out the trash</li>',
                             '<li><br clear="none"/></li>',
                             '<li>Reply to carls mail</li>'])

In [None]:
test_eq(list2.get_items(), ['<li>Twenty one lessons for the 21st century</li>',
                            '<li>Dreams from my Father</li>'])

In [None]:
test_eq(list3.title, '<strong>Buy</strong>: ')
test_eq(list3.get_items(), ['Toothpaste'])

In [None]:
test_eq(list4.title, '<em>Read</em>: ')
test_eq(list4.get_items(), ['The age of surveillance capitalism'])

In [None]:
test_eq(list5.title, 'Watch: ')
test_eq(list5.get_items(), ['Parasite'])

In [None]:
test_eq(list6.title, '<u>Do</u>: ')
test_eq(list6.get_items(), ['The dishes'])

In [None]:
test_eq(list7.title,'<strong>Read</strong><br clear="none"/>')
test_eq(list7.get_items(), ['The Great Gatsby', "Alice's Adventures in Wonderland"])

In [None]:
test_eq(list8.title,'<strong>Buy</strong><br clear="none"/>')
test_eq(list8.get_items(), ['groceries', 'Shoes'])

In [None]:
test_eq(list9.title,'Read')
test_eq(list9.get_items(), ['The Great Gatsby', 'The odyssey'])

# Export -

In [None]:
# hide
from nbdev.export import *
notebook2script()

Converted basic.ipynb.
Converted index.ipynb.
Converted indexers.GeoIndexer.ipynb.
Converted indexers.NoteListIndexer.NoteList.ipynb.
Converted indexers.NoteListIndexer.Parser.ipynb.
Converted indexers.NoteListIndexer.util.ipynb.
Converted indexers.indexer.ipynb.
Converted itembase.ipynb.
Converted pod.client.ipynb.
