<a href="https://colab.research.google.com/github/louispaulet/pdf_watermark_pipeline/blob/main/add_watermark_to_pdfs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Add watermark to PDFs (and compress them)

This script adds a watermark (to be supplied as a transparent text pdf) to all the pdfs located in the input folder using PyPDF2.  
Then, as the resulting PDFs are too heavy, they get compressed thanks to Ghostscript.

In [22]:
from tqdm.notebook import tqdm

# Add Watermark

In [23]:
!pip install PyPDF2



In [24]:
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter

def add_watermark_to_all_pdf_pages(pdf_file, watermark, output_file):
  with open(pdf_file, "rb") as input_file, open(watermark, "rb") as watermark_file:
      input_pdf = PdfFileReader(input_file)
      watermark_pdf = PdfFileReader(watermark_file, strict=False)
      watermark_page = watermark_pdf.getPage(0)

      output = PdfFileWriter()

      for i in range(input_pdf.getNumPages()):
          pdf_page = input_pdf.getPage(i)
          pdf_page.mergePage(watermark_page)
          output.addPage(pdf_page)

      with open(output_file, "wb") as merged_file:
          output.write(merged_file)

# pdf_file = "/content/drive/MyDrive/mereo-scripts/add_watermarks_script/input_pdfs/20210701 - Louis PAULET - CDD (1).pdf"
# watermark = "/content/drive/MyDrive/mereo-scripts/add_watermarks_script/watermark pdf.pdf"
# output_file = "/content/drive/MyDrive/mereo-scripts/add_watermarks_script/output_pdfs/merged.pdf"

# add_watermark_to_all_pdf_pages(pdf_file, watermark, output_file)

In [25]:
import os
import pandas as pd
file_list = []

input_folder_path = "/content/drive/MyDrive/mereo-scripts/add_watermarks_script/input_pdfs"
output_folder_path = "/content/drive/MyDrive/mereo-scripts/add_watermarks_script/watermarked_pdfs"
watermark_file_path = "/content/drive/MyDrive/mereo-scripts/add_watermarks_script/watermark pdf.pdf"

for filename in os.listdir(input_folder_path):
    if filename.endswith("pdf"): 
      file_list.append(filename)

print("The following files will be converted : ") 
pd.DataFrame(file_list, columns=["Filename"])

The following files will be converted : 


Unnamed: 0,Filename
0,Louis Paulet - Bulletins 11-2021.pdf
1,Louis Paulet - Bulletins 10-2021.pdf
2,Louis Paulet - Bulletins 09-2021.pdf
3,Louis Paulet - Bulletins 05-2021.pdf
4,Louis Paulet - Bulletins 07-2021.pdf
5,Louis Paulet - Bulletins 03-2021.pdf
6,Louis Paulet - Bulletins 08-2021.pdf
7,Louis Paulet - Bulletins 06-2021.pdf
8,Louis Paulet - Bulletins 04-2021.pdf
9,Louis Paulet - Bulletins 12-2021.pdf


In [26]:
for filename in tqdm(file_list):
  input_file_path = input_folder_path +'/'+ filename
  output_file_path = output_folder_path +'/'+ filename
  watermark_file_path
  # print(input_file_path)
  add_watermark_to_all_pdf_pages(input_file_path, watermark_file_path, output_file_path)

  0%|          | 0/19 [00:00<?, ?it/s]



# Compress pdfs

Thanks to ghostscript

In [27]:
!apt install ghostscript

Reading package lists... Done
Building dependency tree       
Reading state information... Done
ghostscript is already the newest version (9.26~dfsg+0-0ubuntu0.18.04.15).
The following packages were automatically installed and are no longer required:
  cuda-command-line-tools-10-0 cuda-command-line-tools-10-1
  cuda-command-line-tools-11-0 cuda-compiler-10-0 cuda-compiler-10-1
  cuda-compiler-11-0 cuda-cuobjdump-10-0 cuda-cuobjdump-10-1
  cuda-cuobjdump-11-0 cuda-cupti-10-0 cuda-cupti-10-1 cuda-cupti-11-0
  cuda-cupti-dev-11-0 cuda-documentation-10-0 cuda-documentation-10-1
  cuda-documentation-11-0 cuda-documentation-11-1 cuda-gdb-10-0 cuda-gdb-10-1
  cuda-gdb-11-0 cuda-gpu-library-advisor-10-0 cuda-gpu-library-advisor-10-1
  cuda-libraries-10-0 cuda-libraries-10-1 cuda-libraries-11-0
  cuda-memcheck-10-0 cuda-memcheck-10-1 cuda-memcheck-11-0 cuda-nsight-10-0
  cuda-nsight-10-1 cuda-nsight-11-0 cuda-nsight-11-1 cuda-nsight-compute-10-0
  cuda-nsight-compute-10-1 cuda-nsight-compute-11

In [28]:
input_folder_path = "/content/drive/MyDrive/mereo-scripts/add_watermarks_script/watermarked_pdfs"
output_folder_path = "/content/drive/MyDrive/mereo-scripts/add_watermarks_script/compressed_pdfs"

input_file = "/content/drive/MyDrive/mereo-scripts/add_watermarks_script/watermarked_pdfs/Louis Paulet - Bulletins 02-2021.pdf"

In [30]:
for filename in tqdm(os.listdir(input_folder_path)):
    if filename.endswith("pdf"): 
      input_file_path = input_folder_path + "/" + filename
      output_file_path = output_folder_path + "/" + filename

      !gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook \
      -dNOPAUSE -dQUIET -dBATCH -dAutoRotatePages=/None -sOutputFile="{output_file_path}" "{input_file_path}"

  0%|          | 0/19 [00:00<?, ?it/s]