In [6]:
import re
import pandas as pd
import streamlit as st
import base64
from collections import namedtuple
from io import BytesIO

In [7]:
# Book line of a clipping, written as "Title (Author)".
# The title group is greedy on purpose: when the title itself contains
# parentheses (e.g. a series name), only the LAST parenthesised group on the
# line is captured as the author.
book_pattern = re.compile(r"^(.*)\((.*?)\)$")

# Metadata line of a highlight. Captures, in order: the page number, the
# location (either a "start-end" range or a single position) and the raw
# "Added on" timestamp text. The trailing "\n" means the line must be passed
# in unstripped.
highlight_pattern = re.compile(
    r"- Your Highlight on page (\d+) \| Location (\d+(?:-\d+)?) \| Added on (.*?)\n"
)

# Record type holding the information for a single parsed highlight
Clipping = namedtuple(
    "Clipping", ["title", "author", "page", "location", "added_on", "highlight"]
)


def parse_book_info(line):
    """Parse the book title and author from the first line of a clipping.

    Args:
        line: Raw text of the form "Title (Author)", optionally ending in a
            newline.

    Returns:
        A ``(title, author)`` tuple with surrounding whitespace removed, or
        ``None`` when the line does not match ``book_pattern``.
    """
    # A clippings file may carry a byte-order mark (U+FEFF) in front of the
    # title. str.strip() does not treat it as whitespace, so it would stay
    # glued to the title and make the same book compare unequal to itself.
    match = book_pattern.match(line.lstrip("\ufeff"))
    if match is None:
        return None
    return match.group(1).strip(), match.group(2).strip()


def parse_highlight_info(line):
    """Extract page, location and timestamp from a highlight's metadata line.

    Returns a ``(page, location, added_on)`` tuple, where ``page`` is an int,
    ``location`` is the captured location text and ``added_on`` is whatever
    ``pd.to_datetime`` produces for the captured date text. Returns ``None``
    when the line does not match ``highlight_pattern``.
    """
    found = highlight_pattern.match(line)
    if not found:
        return None

    page_text, location, added_text = found.group(1, 2, 3)
    return int(page_text), location, pd.to_datetime(added_text)


def parse_clipping(file):
    """Read and parse the next highlight from an iterator of lines.

    A record is expected to be: a book line, a metadata line, the clipping
    text (possibly preceded by a blank line and possibly spanning several
    lines), and a closing "==========" separator line.

    Records whose book line or metadata line cannot be parsed (for example
    entries that are not highlights) are skipped, and reading continues with
    the following record. ``None`` is therefore returned only when the input
    is exhausted, which lets callers use it as an end-of-file signal.

    Args:
        file: An iterator yielding lines, such as an open text file.

    Returns:
        A ``Clipping`` for the next parseable highlight, or ``None`` once no
        further record can be read.
    """
    separator = "=========="

    while True:
        try:
            book_line = next(file)
            meta_line = next(file)
        except StopIteration:
            # Nothing (or only a truncated header) is left to read.
            return None

        # Consume everything up to the separator so that a multi-line or
        # empty body cannot shift the position of the records that follow.
        body_lines = []
        for line in file:
            if line.strip() == separator:
                break
            body_lines.append(line.strip())

        book_info = parse_book_info(book_line)
        highlight_info = parse_highlight_info(meta_line)

        if book_info and highlight_info:
            # The outer strip() drops the blank line that precedes the text.
            highlight_text = "\n".join(body_lines).strip()
            return Clipping(*book_info, *highlight_info, highlight_text)
        # Unparseable record: fall through and try the next one.


def main(path="./My Clippings.txt"):
    """Parse a clippings file into a DataFrame with one row per clipping.

    Args:
        path: Location of the clippings file. Defaults to
            "./My Clippings.txt", the path that was previously hard-coded.

    Returns:
        A ``pandas.DataFrame`` whose columns are the ``Clipping`` fields. The
        columns are present even when no clipping could be parsed, so
        column-based operations such as ``groupby("title")`` still work on an
        empty result.
    """
    # "utf-8-sig" decodes like UTF-8 but drops a byte-order mark at the very
    # start of the file, so it does not end up inside the first title.
    with open(path, "r", encoding="utf-8-sig") as file:
        # parse_clipping returns None when there is nothing more to read;
        # iter(callable, sentinel) keeps calling it until that happens.
        clippings = list(iter(lambda: parse_clipping(file), None))

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(clippings, columns=list(Clipping._fields))


# Build the clippings table when this code is executed directly.
if __name__ == "__main__":
    df = main()

In [11]:
# Group the parsed clippings by book title and preview the first rows of each group.
gb_df = df.groupby(["title"])
gb_df.head()

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000181292C13A0>