<a href="https://colab.research.google.com/github/null-buster/PDF_Highlights_Extractor/blob/main/PDF_Highlights_Extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pymupdf

Collecting pymupdf
[?25l  Downloading https://files.pythonhosted.org/packages/31/22/d59001c1d7df4a1839924c0ca67a3313cbcdadb7a14300f7079440f66c9f/PyMuPDF-1.18.5-cp36-cp36m-manylinux2010_x86_64.whl (6.3MB)
[K     |████████████████████████████████| 6.3MB 3.7MB/s 
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.18.5


In [None]:
from typing import List, Tuple

import fitz  # installed with 'pip install pymupdf'
import os


In [None]:
def _parse_highlight(annot: fitz.Annot, wordlist: List[Tuple[float, float, float, float, str, int, int, int]]) -> str:
    points = annot.vertices
    quad_count = int(len(points) / 4)
    sentences = []
    for i in range(quad_count):
        # where the highlighted part is
        r = fitz.Quad(points[i * 4 : i * 4 + 4]).rect

        words = [w for w in wordlist if fitz.Rect(w[:4]).intersects(r)]
        sentences.append(" ".join(w[4] for w in words))
    sentence = " ".join(sentences)
    return sentence


In [None]:
def handle_page(page):
    """Collect the text of every highlight annotation on one page.

    Args:
        page: A PyMuPDF page object.

    Returns:
        A list with one string per highlight annotation, in the order the
        page yields its annotations. Empty when the page has no highlights.
    """
    # PyMuPDF renamed getText() to get_text() and deprecated the camelCase
    # names. Use whichever one this installation provides.
    get_text = getattr(page, "get_text", None) or page.getText
    wordlist = get_text("words")  # list of words on page
    wordlist = sorted(wordlist, key=lambda w: (w[3], w[0]))  # ascending y, then x

    highlight_type = 8  # numeric annotation type of a highlight (fitz.PDF_ANNOT_HIGHLIGHT)

    # Page.annots() iterates the page's annotations and replaces the manual
    # firstAnnot / annot.next walk, which used a deprecated name.
    return [
        _parse_highlight(annot, wordlist)
        for annot in page.annots()
        if annot.type[0] == highlight_type
    ]

In [None]:
def extract_highlight(filepath: str) -> List:
    """Extract the text of all highlight annotations in a PDF file.

    Args:
        filepath: Path of the PDF document to open.

    Returns:
        A flat list of highlighted strings, page by page in document order.
    """
    highlights = []
    # Open as a context manager so the document is closed even if processing
    # a page raises; the original left the file handle open.
    with fitz.open(filepath) as doc:
        for page in doc:
            highlights.extend(handle_page(page))

    return highlights

In [None]:
def read_pdf_paths(folder_name, output_folder):
    """Extract highlights from every PDF in a folder and save them as text.

    Both folders are resolved relative to the current working directory. For
    each ``<name>.pdf`` found in ``folder_name``, a file
    ``Highlights_<name>.txt`` is written to ``output_folder`` and the
    highlights are printed.

    Args:
        folder_name: Name of the folder (under the working directory) that
            contains the PDF files.
        output_folder: Name of the folder (under the working directory) that
            receives the text files. It is created if it does not exist.

    Returns:
        None.
    """
    base_dir = os.path.abspath('.')
    # Leading separators are stripped so a name such as '/pdf_folder' still
    # resolves under the working directory, as the old string concatenation
    # did, instead of being treated as an absolute path by os.path.join.
    strip_chars = '/' + os.sep
    pdf_dir = os.path.join(base_dir, folder_name.lstrip(strip_chars))
    out_dir = os.path.join(base_dir, output_folder.lstrip(strip_chars))
    os.makedirs(out_dir, exist_ok=True)

    # Sorted so files are processed in a reproducible order.
    for entry in sorted(os.listdir(pdf_dir)):
        pdf_path = os.path.join(pdf_dir, entry)
        # splitext keeps dots inside the name ("report.v2.pdf" -> "report.v2")
        # and does not mistake a file literally called "pdf" for a PDF.
        stem, ext = os.path.splitext(entry)
        if ext.lower() != '.pdf' or not os.path.isfile(pdf_path):
            continue

        final_highlight = extract_highlight(pdf_path)
        out_path = os.path.join(out_dir, 'Highlights_' + stem + '.txt')
        # Highlights routinely contain typographic quotes and other non-ASCII
        # characters, so do not depend on the platform's default encoding.
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write("Highlighted text of " + stem + ":\n\n")
            f.write("\n\n===========================================\n\n")
            for element in final_highlight:
                f.write('\n\n' + element)

        print(entry + '\n')
        print("Highlights of " + stem + " is :\n")
        print(final_highlight)
        print('\n\n')

In [None]:
# Working folders, created inside the current working directory if missing.
# The leading '/' is part of each value because it is appended directly to
# the absolute path of the working directory.
pdf_folder = '/pdf_folder'
summary_folder = '/highlights_folder'

for _folder in (pdf_folder, summary_folder):
    _target = os.path.abspath('.') + _folder
    if not os.path.exists(_target):
        os.mkdir(_target)

In [None]:
# Process every PDF in 'pdf_folder' and write one text file per PDF into
# 'highlights_folder'.
read_pdf_paths(folder_name='pdf_folder', output_folder='highlights_folder')

COVID_19.pdf

Highlights of COVID_19 is :

['The COVID-19 recovery will be digital: A plan The COVID-19 recovery will be digital: A plan for the first 90 days will be digital: A plan for the first 90 days', 'The rapid migration to digital technologies driven by the pandemic will continue into the recovery. Here’s how to accelerate your The rapid migration to digital technologies will continue into the recovery. organization’s digital capabilities', 'As some regions begin reopening, businesses are considering how to return to some semblance of full speed in an unstable environment in which lockdowns will ease (and potentially be reinstated) in waves. In doing so, they will need to confront three structural changes that are playing out.', 'Selectively modernize technology Selectively modernize technology capabilities capabilities', '“Safe-o-meter” to plan “Safe-o-meter” to plan options options', 'As CIOs consider upgrading their tech stacks, two features of a modern technology environmen

In [None]:
# for elements in extract_highlight("pdf_folder/COVID_19.pdf"):
#     print("\n" + elements)

In [None]:
# print(extract_highlight("pdf_folder/COVID_19.pdf"))