# Basic PDF functionalities

This notebook contains a set of functionalities related to PDF processing.

In [None]:
%load_ext autoreload
%autoreload 2

import fitz
from obsidianizer.pdf_tools.annotations import extract_annotation, AnnotationExtractionMode
from obsidianizer.pdf_tools.pages import extract_page_annotations, get_blocks_summary, get_words_data_frame
from obsidianizer.pdf_tools.plots import get_rectangles_from_data_frame
from IPython.display import display

import plotly.graph_objects as go

from obsidianizer.pdf_tools.page_plots import get_page_figure_widget, get_book_figure_widget
from obsidianizer import EXAMPLE_ECCE_HOMMO_PDF_PATH

import fitz
from obsidianizer.obsidian.vault import load_vault, save_vault
from obsidianizer.obsidian.transformations import join_vaults, create_common_words_vault
from obsidianizer.obsidian.journal_tools import get_vault_df_from_journal, get_journal_entries_from_vault
from obsidianizer.latex_tools.utils import load_drafts_entries
from obsidianizer.latex_tools.journal_processing import get_sentences
from obsidianizer.pdf_tools.ecce_homo import is_ecce_hommo_chapter, is_ecce_hommo_subsection
from obsidianizer.pdf_tools.documents import get_book_filtered_blocks, extract_book_annotations
from obsidianizer.obsidian.pdf_tools import get_vault_df_from_pdf, get_vault_df_from_pdf_by_page
from obsidianizer.latex_tools.utils import load_drafts_entries, save_cleaned_sentences_to_latex, print_differences_in_journals
from obsidianizer.obsidian.utils import get_duplicated_vault_keywords, get_vault_df_unique_backlinks, get_backlinks, get_subbacklinks_from_other_backlinks, add_sublinks_to_vault_df

from obsidianizer.nlp.text_cleanup import get_most_used_words
from obsidianizer.nlp.text_cleanup import remove_stop_words_en
import pandas as pd
import glob


## 1. Loading the PDF document

In [None]:
# Open the example PDF whose path is exported by the obsidianizer package;
# later cells index `doc` to obtain individual pages.
doc = fitz.open(EXAMPLE_ECCE_HOMMO_PDF_PATH) 

## 2. Page functionalities

A set of functionalities related to a single page. First, we select a page by its index.

In [None]:
# Page used by the single-page examples below; index 231 also falls inside
# the 230-234 range used later for the multi-page example.
page = doc[231]

### Get individual words in a dataframe

For each word we also get its rectangle coordinates and the block, line and word numbers it belongs to.

In [None]:
# Word-level data frame for the selected page; the bare name on the last
# line renders it as the cell output.
df_words = get_words_data_frame(page)
df_words

### Get summary statistics of the block codes

The statistics, computed per block, are:
- The words it contains.
- The rectangle that contains the entire block (x0, x1, y0, y1).
- The number of lines it contains.
- The height and width of the block.

In [None]:
# Per-block summary of the selected page, displayed as the cell output.
# NOTE(review): "sumary" is a misspelling of "summary"; consider renaming.
block_sumary = get_blocks_summary(page)
block_sumary

### Get annotations in a page

Get the annotations within a page and the rectangle that surrounds them.
- highlighted_text: The original text in the pdf document that was highlighted.
- annotation_text: The associated text to the annotation.

In [None]:
# Show the available extraction modes; the next cell reads items [0] and [1]
# of each entry returned here.
AnnotationExtractionMode.list()

In [None]:
# One "<item 1>: <item 0>" line per entry of AnnotationExtractionMode.list(),
# joined into a single newline-separated string.
"\n".join(f"{entry[1]}: {entry[0]}" for entry in AnnotationExtractionMode.list())

In [None]:
# Build a mode from the value 1 - presumably an Enum-style lookup by value;
# confirm against the definition of AnnotationExtractionMode.
AnnotationExtractionMode(1)

In [None]:
# Extract the annotations of the selected page using the ENTIRE_LINES mode,
# then display the result.
annotations_df = extract_page_annotations(
    page,
    mode=AnnotationExtractionMode.ENTIRE_LINES,
)
annotations_df

### Plot Page figure

The following plots the blocks, words and annotations of the page.

In [None]:
# Build the figure widget for the selected page; it is rendered in the next cell.
fig = get_page_figure_widget(page, width=600)

In [None]:
# Render the page figure built in the previous cell.
fig.show()

## 3. Document functions

A collection of functions that operate on a document.
A document is just a list of pages, but there is more to work with when information has to be inferred across several pages.

In [None]:
# Five consecutive pages (indices 230 through 234) used as a small
# multi-page sample.
book = [doc[page_index] for page_index in range(230, 235)]

### Plot the pages of the document

In [None]:
# Build the figure widget for the pages in `book`; it is shown in the next cell.
book_tabs = get_book_figure_widget(book, width=600)

In [None]:
# Explicitly render the widget built in the previous cell.
display(book_tabs)

# TODO: Add the extraction of sections and subsections, and possibly their transformation into a vault as well.

# 4. Batch processing of PDFs

Ideally, we would like to automatically process all the PDFs in a given folder.

In [None]:
# Root folder searched for PDFs below. The path is relative, so it resolves
# against the kernel's working directory.
# NOTE(review): it climbs three directories up, likely outside this
# repository - adjust it for your own machine.
path = "../../../knowledge/Books/Improvement/"

In [None]:
# Recursively collect every PDF below `path`. glob returns matches in an
# arbitrary, filesystem-dependent order, so sort them to keep positional
# access (e.g. `filepaths[0]`) reproducible between runs and machines.
filepaths = sorted(glob.glob(path + "**/*.pdf", recursive=True))

In [None]:
# Inspect the PDF paths found by the glob above.
filepaths

In [None]:
# Fail with an explicit message when the folder search found nothing,
# instead of an opaque IndexError from `filepaths[0]`.
if not filepaths:
    raise FileNotFoundError(f"No PDF files found under {path!r}")

# Open the first PDF found. From here on `doc` and `book` refer to this whole
# document rather than to the example opened and sliced earlier in the notebook.
doc = fitz.open(filepaths[0])
book = doc

In [None]:
# Keep only the blocks whose "words" entry contains the tilde marker.
chapter_blocks = get_book_filtered_blocks(
    book,
    lambda block: "~~~~~~~" in block["words"],
)

In [None]:
# Plot a single page of the newly opened book and display it.
# NOTE(review): index 50 is hard-coded and assumes the document has a page
# at that index.
fig = get_page_figure_widget(book[50], width=600)
fig

In [None]:
# Collect the blocks accepted by the `is_ecce_hommo_subsection` predicate.
# NOTE(review): that predicate comes from the Ecce Homo helpers, while `book`
# is now the first PDF of the batch folder - confirm it is meant to apply here.
# `subsection_blocks` is not used by any later cell shown in this notebook.
subsection_blocks = get_book_filtered_blocks(book, is_ecce_hommo_subsection)

In [None]:
# Extract the annotations of the whole book using the SENTENCE mode.
annotations_blocks = extract_book_annotations(
    book,
    mode=AnnotationExtractionMode.SENTENCE,
)

In [None]:
# Display the extracted annotations.
annotations_blocks

In [None]:
# Build the vault data from the annotations; the second argument is the
# "how_to_win/" folder under `path` (presumably the vault location - confirm).
book_vault = get_vault_df_from_pdf_by_page(annotations_blocks, path + "how_to_win/")
book_vault

## Highlight the most common words

In [None]:
# Most-used words of the highlighted text, as computed by get_most_used_words.
words = get_most_used_words(annotations_blocks["highlighted_text"])
# Keep the index labels of the first 23 entries.
# NOTE(review): this only yields words if `words` is indexed by word (e.g. a
# pandas Series of counts) - confirm; 23 is an unexplained cut-off.
important_words = list(pd.Series(words[0:23]).index)

# Build a vault for those words, using the same "how_to_win/" folder that is
# passed when building the book vault.
common_words_vault = create_common_words_vault(important_words, path + "how_to_win/")
common_words_vault

## Merge the vaults and create backlinks from the highlighted words in the original book

In [None]:
# Combine the book vault with the common-words vault.
# NOTE(review): the meaning of the positional `True` is not visible here;
# passing it by keyword would make the call self-explanatory.
final_vault = join_vaults(book_vault, common_words_vault, True)

In [None]:
# Save the merged vault.
# NOTE(review): no destination is passed here, so the target location is
# decided by save_vault or by the vault itself - confirm before running.
save_vault(final_vault)