Skip to content

Commit

Permalink
1. generate previews in s3_worker and upload them to s3 storage (#362)
Browse files Browse the repository at this point in the history
S3 + CF integration
  • Loading branch information
ciur authored May 23, 2024
1 parent 8054b7a commit 21ef136
Show file tree
Hide file tree
Showing 16 changed files with 447 additions and 156 deletions.
2 changes: 1 addition & 1 deletion docker/cloud/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ COPY ./papermerge ${CORE_APP}/papermerge/
COPY ./docker/cloud/config ${CORE_APP}/config
COPY ./docker/cloud/manage.py ${CORE_APP}/manage.py

RUN poetry install -E pgsql -vv
RUN poetry install -E pgsql -E cloud -vv

COPY docker/cloud/etc/ /etc/papermerge/
COPY docker/cloud/run.bash /run.bash
Expand Down
7 changes: 6 additions & 1 deletion docker/cloud/config/celery.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,10 @@
}

# Maps a route name to the queue its tasks are delivered to.
# Keys are matched against the `route_name` given to
# `celery_app.send_task(...)` by the senders.
app.conf.task_routes = {
    # `s3_worker`: uploads/downloads of document version files
    # via s3 queue
    's3': {'queue': 's3'},
    # `s3_worker`: generates previews and uploads them to s3 storage
    # via s3preview queue. The key has to be 'preview' because that is
    # the route name the sender uses; with any other key the task would
    # not match this route.
    'preview': {'queue': 's3preview'},
}
6 changes: 4 additions & 2 deletions docker/cloud/etc/logging.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,20 @@ disable_existing_loggers: false

formatters:
verbose:
format: '%(levelname)s %(asctime)s %(module)s %(message)s'
format: '%(levelname)s:%(pathname)s:%(lineno)d:%(funcName)s: %(message)s'

handlers:
console:
level: INFO
class: logging.StreamHandler
formatter: verbose

loggers:
oidc_app:
level: INFO
handlers: [console]
s3worker:
level: DEBUG
handlers: [console]
papermerge.search.tasks:
level: INFO
handlers: [console]
Expand Down
9 changes: 9 additions & 0 deletions docker/dev/config/celery.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,12 @@ def config_loggers(*args, **kwags):
'interval_step': 0.2,
'interval_max': 0.2,
}

# Route name -> destination queue. Both routes are served by `s3_worker`.
app.conf.task_routes = {
    # document version files: upload to / download from s3 storage
    's3': {
        'queue': 's3',
    },
    # previews: generated by the worker, then uploaded to s3 storage
    'preview': {
        'queue': 's3preview',
    },
}
37 changes: 37 additions & 0 deletions papermerge/conf/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,43 @@
'deu'
)

# Where image previews (pages, document thumbnails) are served from:
#   'local' - from local storage (convenient for simple setups)
#   otherwise - from S3 storage (convenient for cloud setups)
PREVIEW_MODE = os.getenv('PAPERMERGE__MAIN__IMAGE_PREVIEW_SOURCE', 'local')

# Absolute path to the private key used to sign
# cloudfront URLs for accessing private S3 content
CF_SIGN_URL_PRIVATE_KEY = os.getenv(
    'PAPERMERGE__MAIN__CF_SIGN_URL_PRIVATE_KEY'
)

# Cloudfront public key ID (CF -> public keys -> ID),
# e.g. "K2FRE1IUML0Y0N".
# Provide this value only if:
# - S3 is used as content storage
# - there is a cloudfront private key used for signing urls
CF_SIGN_URL_KEY_ID = os.getenv('PAPERMERGE__MAIN__CF_SIGN_URL_KEY_ID')

# Cloudfront domain used to access S3 content,
# e.g. 'd3j1f4sy1s01dy.cloudfront.net'
CF_DOMAIN = os.getenv('PAPERMERGE__MAIN__CF_DOMAIN')

# Optional prefix of the objects in storage; when set it becomes the
# first path segment of the generated cloudfront URLs
OBJECT_PREFIX = os.getenv('PAPERMERGE__MAIN__OBJECT_PREFIX')

LOGGING_CFG_FILENAME = os.environ.get(
'PAPERMERGE__MAIN__LOGGING_CFG',
'/etc/papermerge/logging.yaml'
Expand Down
19 changes: 19 additions & 0 deletions papermerge/core/cli/cf_sign_url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import typer
from rich import print
from typing_extensions import Annotated

from papermerge.core import cloudfront

# Typer application exposing the URL signing command. The help text
# describes what this CLI actually does (it signs URLs, it does not
# list entities).
app = typer.Typer(help="Sign URLs using AWS CloudFront signer")

# Positional CLI argument: lifetime of the signed URL, in seconds
ValidFor = Annotated[
    int,
    typer.Argument(help='Number of seconds the url will be valid for')
]


@app.command()
def cf_sign_url(url: str, valid_for: ValidFor = 600):
    """Sign URL using AWS CloudFront signer"""
    # NOTE: the docstring above doubles as the command's help text.
    # `print` is the one imported from `rich` at the top of the file.
    print(f"Signed URL: {cloudfront.sign_url(url, valid_for)}")
53 changes: 53 additions & 0 deletions papermerge/core/cloudfront.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from datetime import datetime, timedelta
from pathlib import Path

from botocore.signers import CloudFrontSigner
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes, serialization
from cryptography.hazmat.primitives.asymmetric import padding
from django.conf import settings


def rsa_signer(message):
    """
    Sign ``message`` with the private key configured for CloudFront.

    The key is a PEM file (without password) whose path is taken from
    the ``CF_SIGN_URL_PRIVATE_KEY`` setting; it is loaded on every call.

    :param message: the bytes to sign
    :return: the signature bytes
    :raises ValueError: if the setting is missing or the file it points
        to does not exist
    """
    configured_path = getattr(settings, 'CF_SIGN_URL_PRIVATE_KEY', None)
    if configured_path is None:
        raise ValueError("Missing CF_SIGN_URL_PRIVATE_KEY setting")

    pem_file = Path(configured_path)
    if not pem_file.exists():
        raise ValueError(f"{pem_file} does not exist")

    signing_key = serialization.load_pem_private_key(
        pem_file.read_bytes(),
        password=None,
        backend=default_backend(),
    )
    # NOTE(review): PKCS#1 v1.5 padding with SHA-1 is presumably what
    # CloudFront expects for signed URLs - confirm before changing.
    return signing_key.sign(message, padding.PKCS1v15(), hashes.SHA1())


def sign_url(url: str, valid_for: int = 600) -> str:
    """
    Return a CloudFront signed version of ``url``.

    The signer is built from the ``CF_SIGN_URL_KEY_ID`` setting and
    ``rsa_signer``; the resulting URL expires ``valid_for`` seconds
    from now.

    :type url: str
    :param url: The URL of the protected object
    :type valid_for: int
    :param valid_for: number of seconds the url will be valid for, defaults
        to 600 (i.e. 10 minutes)
    :rtype: str
    :return: the signed URL
    :raises ValueError: if ``CF_SIGN_URL_KEY_ID`` is missing or empty
    """
    # imported locally so that this function does not depend on a change
    # to the module level imports
    from datetime import timezone

    key_id = getattr(settings, 'CF_SIGN_URL_KEY_ID', None)
    # an empty string is as unusable as a missing value
    if not key_id:
        raise ValueError(
            "CF_SIGN_URL_KEY_ID is empty"
        )
    cf_signer = CloudFrontSigner(key_id, rsa_signer)
    # The expiry must be an aware UTC datetime: botocore treats naive
    # datetimes as UTC, so a naive local `datetime.now()` would shift the
    # expiry by the host's UTC offset (on hosts behind UTC the URL would
    # be expired from the start).
    date_less_than = datetime.now(timezone.utc) + timedelta(seconds=valid_for)
    return cf_signer.generate_presigned_url(
        url,
        date_less_than=date_less_than
    )
3 changes: 3 additions & 0 deletions papermerge/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
CTYPE_FOLDER = 'folder'
CTYPE_DOCUMENT = 'document'
DEFAULT_THUMBNAIL_SIZE = 100 # 100 pixels wide
# size requested for page previews when their URL points to s3 storage
DEFAULT_PAGE_SIZE = 900 # 900 pixels wide
JPG = 'jpg'
PAGES = 'pages'
THUMBNAILS = 'thumbnails'
Expand All @@ -17,3 +18,5 @@
INDEX_UPDATE = 'index_update'
# Names of the tasks sent to `s3_worker` via `celery_app.send_task`
S3_WORKER_ADD_DOC_VER = 's3_worker_add_doc_vers'
S3_WORKER_REMOVE_DOC_VER = 's3_worker_remove_doc_vers'
# sent (with the document ID) when a document is deleted
S3_WORKER_REMOVE_DOC_THUMBNAIL = 's3_worker_remove_doc_thumbnail'
# sent (with the document ID) after a file upload when previews are not
# generated locally
S3_WORKER_GENERATE_PREVIEW = 's3_worker_generate_preview'
16 changes: 15 additions & 1 deletion papermerge/core/routers/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
import uuid
from typing import Annotated

from celery.app import default_app as celery_app
from fastapi import APIRouter, Depends, Security, UploadFile

from papermerge.conf import settings
from papermerge.core import constants as const
from papermerge.core import db, schemas, utils
from papermerge.core.auth import get_current_user, scopes
from papermerge.core.models import Document
Expand Down Expand Up @@ -80,6 +83,17 @@ def upload_file(
file_name=file.filename,
content_type=file.headers.get('content-type')
)
doc.generate_thumbnail()

if settings.PREVIEW_MODE == 'local':
# generate preview and store it in local storage
doc.generate_thumbnail()
else:
# generate preview using `s3_worker`
# it will, as well, upload previews to s3 storage
celery_app.send_task(
const.S3_WORKER_GENERATE_PREVIEW,
kwargs={'doc_id': str(doc.id)},
route_name='preview',
)

return schemas.Document.model_validate(doc)
51 changes: 49 additions & 2 deletions papermerge/core/schemas/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
from typing import List, Literal, Optional, Tuple
from uuid import UUID

from django.conf import settings
from django.db.models.manager import BaseManager
from pydantic import (BaseModel, ConfigDict, Field, ValidationInfo,
field_validator)
from typing_extensions import Annotated

from papermerge.core import constants as const
from papermerge.core import pathlib as plib
from papermerge.core.types import OCRStatusEnum


Expand Down Expand Up @@ -37,7 +40,15 @@ def svg_url_value(cls, value, info: ValidationInfo) -> str:
@field_validator("jpg_url", mode='before')
@classmethod
def jpg_url_value(cls, value, info: ValidationInfo) -> str:
    """
    Compute the URL of the page's jpg preview.

    The incoming ``value`` is ignored; the URL is derived from the
    already validated ``id`` field (UUID of the page).

    :return: the local API endpoint when ``settings.PREVIEW_MODE`` is
        'local', otherwise the URL built by ``_s3_page_thumbnail_url``
    """
    if settings.PREVIEW_MODE == 'local':
        return f"/api/pages/{info.data['id']}/jpg"

    # not local: the preview lives in s3 storage. There must be no
    # unconditional return before this point, otherwise this branch
    # can never run.
    return _s3_page_thumbnail_url(
        info.data['id'],  # UUID of the page here
        size=const.DEFAULT_PAGE_SIZE
    )

# Config
model_config = ConfigDict(from_attributes=True)
Expand Down Expand Up @@ -112,7 +123,11 @@ def get_all_from_manager(cls, v: object) -> object:

@field_validator('thumbnail_url', mode='before')
def thumbnail_url_validator(cls, value, info):
    """
    Compute the URL of the document's thumbnail.

    The incoming ``value`` is ignored; the URL is derived from the
    already validated ``id`` field.

    :return: the local API endpoint when ``settings.PREVIEW_MODE`` is
        'local', otherwise the URL built by ``_s3_doc_thumbnail_url``
    """
    if settings.PREVIEW_MODE == 'local':
        return f"/api/thumbnails/{info.data['id']}"

    # if it is not local, then it is s3 + cloudfront. There must be no
    # unconditional return before this point, otherwise this branch
    # can never run.
    return _s3_doc_thumbnail_url(info.data['id'])

@field_validator('tags', mode='before')
def tags_validator(cls, value):
Expand Down Expand Up @@ -155,3 +170,35 @@ class CreateDocument(BaseModel):
class Thumbnail(BaseModel):
    """Describes a thumbnail by its URL and its size."""
    url: str
    # presumably the width in pixels - TODO confirm
    size: int


def _s3_doc_thumbnail_url(uid: UUID) -> str:
    """
    Return the signed cloudfront URL of a document's thumbnail.

    The URL is ``https://<CF_DOMAIN>/[<OBJECT_PREFIX>/]<thumbnail path>``
    and is signed to stay valid for 600 seconds.
    """
    from papermerge.core.cloudfront import sign_url

    thumb_path = plib.thumbnail_path(uid)
    obj_prefix = getattr(settings, 'OBJECT_PREFIX', None)
    # the prefix segment is added only when a prefix is configured
    location = f"{obj_prefix}/{thumb_path}" if obj_prefix else f"{thumb_path}"
    unsigned_url = f"https://{settings.CF_DOMAIN}/{location}"

    return sign_url(unsigned_url, valid_for=600)


def _s3_page_thumbnail_url(uid: UUID, size: int) -> str:
    """
    Return the signed cloudfront URL of a page preview of given ``size``.

    The URL is ``https://<CF_DOMAIN>/[<OBJECT_PREFIX>/]<thumbnail path>``
    and is signed to stay valid for 600 seconds.
    """
    from papermerge.core.cloudfront import sign_url

    thumb_path = plib.thumbnail_path(uid, size=size)
    obj_prefix = getattr(settings, 'OBJECT_PREFIX', None)
    # the prefix segment is added only when a prefix is configured
    location = f"{obj_prefix}/{thumb_path}" if obj_prefix else f"{thumb_path}"
    unsigned_url = f"https://{settings.CF_DOMAIN}/{location}"

    return sign_url(unsigned_url, valid_for=600)
5 changes: 5 additions & 0 deletions papermerge/core/signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,11 @@ def s3_delete(sender, instance: Document, **kwargs):
kwargs={'doc_ver_ids': ids},
route_name='s3',
)
celery_app.send_task(
const.S3_WORKER_REMOVE_DOC_THUMBNAIL,
kwargs={'doc_id': str(instance.id)},
route_name='s3',
)


@receiver(post_delete, sender=User)
Expand Down
Loading

0 comments on commit 21ef136

Please sign in to comment.