Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1. generate previews in s3_worker and upload them to s3 storage #362

Merged
merged 9 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker/cloud/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ COPY ./papermerge ${CORE_APP}/papermerge/
COPY ./docker/cloud/config ${CORE_APP}/config
COPY ./docker/cloud/manage.py ${CORE_APP}/manage.py

RUN poetry install -E pgsql -vv
RUN poetry install -E pgsql -E cloud -vv

COPY docker/cloud/etc/ /etc/papermerge/
COPY docker/cloud/run.bash /run.bash
Expand Down
7 changes: 6 additions & 1 deletion docker/cloud/config/celery.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,10 @@
}

# Celery routing table: maps a route name to the queue its tasks go to.
app.conf.task_routes = {
    # `s3_worker`: uploads/downloads of document version files
    # via s3 queue
    's3': {'queue': 's3'},
    # `s3_worker`: generates previews and uploads them to s3 storage
    # via s3preview queue
    's3preview': {'queue': 's3preview'},
    # Preview generation tasks are sent with `route_name='preview'`
    # (same key the dev configuration uses); without this entry they
    # would not match any route defined here.
    'preview': {'queue': 's3preview'},
}
6 changes: 4 additions & 2 deletions docker/cloud/etc/logging.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,20 @@ disable_existing_loggers: false

formatters:
verbose:
format: '%(levelname)s %(asctime)s %(module)s %(message)s'
format: '%(levelname)s:%(pathname)s:%(lineno)d:%(funcName)s: %(message)s'

handlers:
console:
level: INFO
class: logging.StreamHandler
formatter: verbose

loggers:
oidc_app:
level: INFO
handlers: [console]
s3worker:
level: DEBUG
handlers: [console]
papermerge.search.tasks:
level: INFO
handlers: [console]
Expand Down
9 changes: 9 additions & 0 deletions docker/dev/config/celery.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,12 @@ def config_loggers(*args, **kwags):
'interval_step': 0.2,
'interval_max': 0.2,
}

# Celery routing table, keyed by route name:
#   s3      -> `s3` queue; `s3_worker` uploads/downloads document
#              version files
#   preview -> `s3preview` queue; `s3_worker` generates previews and
#              uploads them to s3 storage
app.conf.task_routes = dict(
    s3=dict(queue='s3'),
    preview=dict(queue='s3preview'),
)
37 changes: 37 additions & 0 deletions papermerge/conf/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,43 @@
'deu'
)

# Source of image previews (page images, document thumbnails):
#   'local' - served from local storage (convenient for simple setups)
#   otherwise - served from S3 storage (convenient for cloud setups)
PREVIEW_MODE = os.getenv(
    'PAPERMERGE__MAIN__IMAGE_PREVIEW_SOURCE', 'local'
)

# Absolute path to the private key used to sign CloudFront URLs
# that give access to private S3 content. Unset -> None.
CF_SIGN_URL_PRIVATE_KEY = os.getenv(
    'PAPERMERGE__MAIN__CF_SIGN_URL_PRIVATE_KEY'
)

# CloudFront public key ID (CF -> public keys -> ID),
# e.g. "K2FRE1IUML0Y0N".
# Provide this value only when:
#   - S3 is used as content storage
#   - a CloudFront private key is used for signing URLs
# Unset -> None.
CF_SIGN_URL_KEY_ID = os.getenv(
    'PAPERMERGE__MAIN__CF_SIGN_URL_KEY_ID'
)

# CloudFront domain used to access S3 content,
# e.g. 'd3j1f4sy1s01dy.cloudfront.net'. Unset -> None.
CF_DOMAIN = os.getenv('PAPERMERGE__MAIN__CF_DOMAIN')

# Optional prefix placed in front of object paths. Unset -> None.
OBJECT_PREFIX = os.getenv('PAPERMERGE__MAIN__OBJECT_PREFIX')

LOGGING_CFG_FILENAME = os.environ.get(
'PAPERMERGE__MAIN__LOGGING_CFG',
'/etc/papermerge/logging.yaml'
Expand Down
19 changes: 19 additions & 0 deletions papermerge/core/cli/cf_sign_url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import typer
from rich import print
from typing_extensions import Annotated

from papermerge.core import cloudfront

# The help text describes what this CLI actually does (it previously
# carried a description copied from a listing command).
app = typer.Typer(help="Sign URLs using AWS CloudFront signer")

# Positional CLI argument: lifetime of the signed URL, in seconds.
ValidFor = Annotated[
    int,
    typer.Argument(help='Number of seconds the url will be valid for')
]


@app.command()
def cf_sign_url(url: str, valid_for: ValidFor = 600):
    """Sign URL using AWS CloudFront signer"""
    # Delegates to `cloudfront.sign_url` and prints the result.
    result = cloudfront.sign_url(url, valid_for)
    print(f"Signed URL: {result}")
53 changes: 53 additions & 0 deletions papermerge/core/cloudfront.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from datetime import datetime, timedelta
from pathlib import Path

from botocore.signers import CloudFrontSigner
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes, serialization
from cryptography.hazmat.primitives.asymmetric import padding
from django.conf import settings


def rsa_signer(message):
    """Sign ``message`` with the private key configured for CloudFront.

    The key location is read from the ``CF_SIGN_URL_PRIVATE_KEY`` setting;
    the PEM file is loaded (without password) on every call.

    :param message: bytes to sign
    :returns: signature produced with PKCS1 v1.5 padding and SHA1
    :raises ValueError: if the setting is missing or the path it points
        to does not exist
    """
    configured_path = getattr(settings, 'CF_SIGN_URL_PRIVATE_KEY', None)
    if configured_path is None:
        raise ValueError("Missing CF_SIGN_URL_PRIVATE_KEY setting")

    pem_file = Path(configured_path)
    if not pem_file.exists():
        raise ValueError(f"{pem_file} does not exist")

    pem_bytes = pem_file.read_bytes()
    signing_key = serialization.load_pem_private_key(
        pem_bytes,
        password=None,
        backend=default_backend(),
    )

    return signing_key.sign(message, padding.PKCS1v15(), hashes.SHA1())


def sign_url(url: str, valid_for: int = 600):
    """
    Return a CloudFront signed version of ``url``.

    :type url: str
    :param url: The URL of the protected object

    :type valid_for: int
    :param valid_for: number of seconds the url will be valid for, defaults
    to 600 (i.e. 10 minutes)

    :returns: the signed URL
    :raises ValueError: if the ``CF_SIGN_URL_KEY_ID`` setting is missing
        or empty
    """
    # Local import: `timezone` is not part of the module-level imports.
    from datetime import timezone

    key_id = getattr(settings, 'CF_SIGN_URL_KEY_ID', None)
    # Reject both a missing and an empty key id, as the message states.
    if not key_id:
        raise ValueError(
            "CF_SIGN_URL_KEY_ID is empty"
        )

    cf_signer = CloudFrontSigner(key_id, rsa_signer)
    # Use an aware UTC timestamp: botocore treats naive datetimes as UTC,
    # so a naive local `datetime.now()` would shift the expiry by the
    # server's UTC offset.
    date_less_than = (
        datetime.now(timezone.utc) + timedelta(seconds=valid_for)
    )
    return cf_signer.generate_presigned_url(
        url,
        date_less_than=date_less_than
    )
3 changes: 3 additions & 0 deletions papermerge/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
CTYPE_FOLDER = 'folder'
CTYPE_DOCUMENT = 'document'
DEFAULT_THUMBNAIL_SIZE = 100 # 100 pixels wide
DEFAULT_PAGE_SIZE = 900 # 900 pixels wide
JPG = 'jpg'
PAGES = 'pages'
THUMBNAILS = 'thumbnails'
Expand All @@ -17,3 +18,5 @@
INDEX_UPDATE = 'index_update'
S3_WORKER_ADD_DOC_VER = 's3_worker_add_doc_vers'
S3_WORKER_REMOVE_DOC_VER = 's3_worker_remove_doc_vers'
S3_WORKER_REMOVE_DOC_THUMBNAIL = 's3_worker_remove_doc_thumbnail'
S3_WORKER_GENERATE_PREVIEW = 's3_worker_generate_preview'
16 changes: 15 additions & 1 deletion papermerge/core/routers/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
import uuid
from typing import Annotated

from celery.app import default_app as celery_app
from fastapi import APIRouter, Depends, Security, UploadFile

from papermerge.conf import settings
from papermerge.core import constants as const
from papermerge.core import db, schemas, utils
from papermerge.core.auth import get_current_user, scopes
from papermerge.core.models import Document
Expand Down Expand Up @@ -80,6 +83,17 @@ def upload_file(
file_name=file.filename,
content_type=file.headers.get('content-type')
)
doc.generate_thumbnail()

if settings.PREVIEW_MODE == 'local':
# generate preview and store it in local storage
doc.generate_thumbnail()
else:
# generate preview using `s3_worker`
# it will, as well, upload previews to s3 storage
celery_app.send_task(
const.S3_WORKER_GENERATE_PREVIEW,
kwargs={'doc_id': str(doc.id)},
route_name='preview',
)

return schemas.Document.model_validate(doc)
51 changes: 49 additions & 2 deletions papermerge/core/schemas/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
from typing import List, Literal, Optional, Tuple
from uuid import UUID

from django.conf import settings
from django.db.models.manager import BaseManager
from pydantic import (BaseModel, ConfigDict, Field, ValidationInfo,
field_validator)
from typing_extensions import Annotated

from papermerge.core import constants as const
from papermerge.core import pathlib as plib
from papermerge.core.types import OCRStatusEnum


Expand Down Expand Up @@ -37,7 +40,15 @@ def svg_url_value(cls, value, info: ValidationInfo) -> str:
@field_validator("jpg_url", mode='before')
@classmethod
def jpg_url_value(cls, value, info: ValidationInfo) -> str:
return f"/api/pages/{info.data['id']}/jpg"
if settings.PREVIEW_MODE == 'local':
return f"/api/pages/{info.data['id']}/jpg"

s3_url = _s3_page_thumbnail_url(
info.data['id'], # UUID of the page here
size=const.DEFAULT_PAGE_SIZE
)

return s3_url

# Config
model_config = ConfigDict(from_attributes=True)
Expand Down Expand Up @@ -112,7 +123,11 @@ def get_all_from_manager(cls, v: object) -> object:

@field_validator('thumbnail_url', mode='before')
def thumbnail_url_validator(cls, value, info):
return f"/api/thumbnails/{info.data['id']}"
if settings.PREVIEW_MODE == 'local':
return f"/api/thumbnails/{info.data['id']}"

# if it is not local, then it is s3 + cloudfront
return _s3_doc_thumbnail_url(info.data['id'])

@field_validator('tags', mode='before')
def tags_validator(cls, value):
Expand Down Expand Up @@ -155,3 +170,35 @@ class CreateDocument(BaseModel):
class Thumbnail(BaseModel):
url: str
size: int


def _s3_signed_url(resource_path, valid_for: int = 600) -> str:
    """Build the CloudFront URL of ``resource_path`` and sign it.

    The URL is ``https://<CF_DOMAIN>/[<OBJECT_PREFIX>/]<resource_path>``.

    :param resource_path: path of the object relative to the prefix
    :param valid_for: number of seconds the signed URL stays valid
    :raises ValueError: if the ``CF_DOMAIN`` setting is missing or empty
    """
    # Imported here, as in the original helpers, rather than at module level.
    from papermerge.core.cloudfront import sign_url

    domain = getattr(settings, 'CF_DOMAIN', None)
    if not domain:
        # Without a domain the URL would be "https://None/...".
        raise ValueError("CF_DOMAIN is empty")

    prefix = getattr(settings, 'OBJECT_PREFIX', None)
    if prefix:
        url = f"https://{domain}/{prefix}/{resource_path}"
    else:
        url = f"https://{domain}/{resource_path}"

    return sign_url(url, valid_for=valid_for)


def _s3_doc_thumbnail_url(uid: UUID) -> str:
    """Return the signed CloudFront URL of the thumbnail stored for ``uid``.

    The object path comes from ``plib.thumbnail_path(uid)``; the URL is
    valid for 600 seconds.
    """
    return _s3_signed_url(plib.thumbnail_path(uid))


def _s3_page_thumbnail_url(uid: UUID, size: int) -> str:
    """Return the signed CloudFront URL of the ``size`` preview of ``uid``.

    The object path comes from ``plib.thumbnail_path(uid, size=size)``; the
    URL is valid for 600 seconds.
    """
    return _s3_signed_url(plib.thumbnail_path(uid, size=size))
5 changes: 5 additions & 0 deletions papermerge/core/signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,11 @@ def s3_delete(sender, instance: Document, **kwargs):
kwargs={'doc_ver_ids': ids},
route_name='s3',
)
celery_app.send_task(
const.S3_WORKER_REMOVE_DOC_THUMBNAIL,
kwargs={'doc_id': str(instance.id)},
route_name='s3',
)


@receiver(post_delete, sender=User)
Expand Down