Merge pull request #69 from openzim/archives-metadata
Request Task to farm
rgaudin authored Jun 14, 2024
2 parents b34d1b4 + ebb41c8 commit 1e23ced
Showing 23 changed files with 1,226 additions and 213 deletions.
8 changes: 3 additions & 5 deletions .github/workflows/Tests.yaml
@@ -28,7 +28,7 @@ jobs:
     steps:
       - uses: actions/checkout@v3
 
-      - name: Set up Python ${{ matrix.python }}
+      - name: Set up Python
         uses: actions/setup-python@v4
         with:
           python-version-file: "backend/pyproject.toml"
@@ -51,11 +51,9 @@ jobs:
         run: inv coverage --args "-vvv"
 
       - name: Upload coverage to Codecov
-        if: matrix.python == '3.11'
-        uses: codecov/codecov-action@v3
+        uses: codecov/codecov-action@v4
         with:
-          root_dir: backend
-          working-directory: backend
+          directory: backend
           fail_ci_if_error: true
           token: ${{ secrets.CODECOV_TOKEN }}
 
48 changes: 22 additions & 26 deletions backend/README.md
@@ -1,31 +1,27 @@
-# Contribution
+# backend
 
-## Dependencies
-```bash
-# Install all the dependencies.
-pipenv sync
-# Update dependencies.
-pipenv install
-```
+Leverages great things to achieve great results
 
-## Development
+[![CodeFactor](https://www.codefactor.io/repository/github/openzim/nautilus-webui/badge)](https://www.codefactor.io/repository/github/openzim/nautilus-webui)
+[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
+[![codecov](https://codecov.io/gh/openzim/nautilus-webui/branch/main/graph/badge.svg)](https://codecov.io/gh/openzim/nautilus-webui)
+![Python Version from PEP 621 TOML](https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fgithub.com%2Fopenzim%2Fnautilus-webui%2Fraw%2Fmain%2Fbackend%2Fpyproject.toml)
 
-If you want to link to Postgresql, create the `.env` file and set the `POSTGRES_URI` environment variable in it, example:
+## Usage
 
-```env
-POSTGRES_URI=postgresql+psycopg://username:password@host/database
-```
+**CAUTION**: this is not a standalone, installable Python package.
 
-Dev commands:
-```bash
-# Init database
-pipenv run init
-# Start FastAPI
-pipenv run start
-# Run tests
-pipenv run tests
-# Format code
-pipenv run format
-# Check format.
-pipenv run format:check
-```
+- It's the backend of a web service that is intended to be deployed using OCI images.
+- See the sample Composefile in the dev folder of the repository.
+- It has external dependencies (including [S3 Storage](https://wasabi.com/), a [Mailgun](https://www.mailgun.com/) account and a full-fledged [Zimfarm](https://github.com/openzim/zimfarm)).
+- It **must be configured** via environment variables (see `constants.py` and Compose's Envfile).
+- There is no CHANGELOG nor release management. Production is tied to CD on the `main` branch.
+
+```sh
+❯ hatch run serve
+```
+
+nautilus-webui backend adheres to openZIM's [Contribution Guidelines](https://github.com/openzim/overview/wiki/Contributing).
+
+nautilus-webui backend has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/docs/Policy.md) **v1.0.1**.
23 changes: 11 additions & 12 deletions backend/alembic.ini
@@ -68,18 +68,17 @@ sqlalchemy.url = driver://user:pass@localhost/dbname
 # on newly generated revision scripts. See the documentation for further
 # detail and examples
 
-# format using "black" - use the console_scripts runner, against the "black" entrypoint
-# hooks = black
-# black.type = console_scripts
-# black.entrypoint = black
-# black.options = -l 79 REVISION_SCRIPT_FILENAME
-hooks = black isort
-black.type = console_scripts
-black.entrypoint = black
-black.options = REVISION_SCRIPT_FILENAME
-isort.type = console_scripts
-isort.entrypoint = isort
-isort.options = --profile black REVISION_SCRIPT_FILENAME
+hooks = ruff, ruff_format
+
+# lint with attempts to fix using ruff
+ruff.type = exec
+ruff.executable = ruff
+ruff.options = check --fix REVISION_SCRIPT_FILENAME
+
+# format using ruff
+ruff_format.type = exec
+ruff_format.executable = ruff
+ruff_format.options = format REVISION_SCRIPT_FILENAME
 
 
 # Logging configuration
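The two `exec` hooks simply shell out to ruff on the freshly written revision script. A minimal Python sketch of what alembic ends up running, with `revision_path` standing in for the expanded `REVISION_SCRIPT_FILENAME` placeholder:

```python
import subprocess

def run_post_write_hooks(revision_path: str) -> None:
    # "ruff" hook: lint the generated revision script, auto-fixing what it can
    subprocess.run(["ruff", "check", "--fix", revision_path], check=True)
    # "ruff_format" hook: then reformat the file in place
    subprocess.run(["ruff", "format", revision_path], check=True)
```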
104 changes: 83 additions & 21 deletions backend/api/constants.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import tempfile
+import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
 
@@ -21,45 +22,106 @@ class BackendConf:
     Backend configuration, read from environment variables and set default values.
     """
 
-    logger: logging.Logger = field(init=False)
+    # Configuration
+    project_expire_after: datetime.timedelta = datetime.timedelta(days=7)
+    project_quota: int = 0
+    chunk_size: int = 1024  # reading/writing received files
+    illustration_quota: int = 0
+    api_version_prefix: str = "/v1"  # our API
+
+    # Database
+    postgres_uri: str = os.getenv("POSTGRES_URI") or "nodb"
+
+    # Scheduler process
+    redis_uri: str = os.getenv("REDIS_URI") or "redis://localhost:6379/0"
+    channel_name: str = os.getenv("CHANNEL_NAME") or "s3_upload"
 
-    # Mandatory configurations
-    postgres_uri = os.getenv("POSTGRES_URI", "nodb")
-    s3_url_with_credentials = os.getenv("S3_URL_WITH_CREDENTIALS")
-    private_salt = os.getenv("PRIVATE_SALT")
+    # Transient (on host disk) Storage
+    transient_storage_path: Path = Path()
 
-    # Optional configuration.
-    s3_max_tries = int(os.getenv("S3_MAX_TRIES", "3"))
-    s3_retry_wait = humanfriendly.parse_timespan(os.getenv("S3_RETRY_TIMES", "10s"))
-    s3_deletion_delay = datetime.timedelta(
+    # S3 Storage
+    s3_url_with_credentials: str = os.getenv("S3_URL_WITH_CREDENTIALS") or ""
+    s3_max_tries: int = int(os.getenv("S3_MAX_TRIES", "3"))
+    s3_retry_wait: int = int(
+        humanfriendly.parse_timespan(os.getenv("S3_RETRY_TIMES") or "10s")
+    )
+    s3_deletion_delay: datetime.timedelta = datetime.timedelta(
         hours=int(os.getenv("S3_REMOVE_DELETEDUPLOADING_AFTER_HOURS", "12"))
     )
-    transient_storage_path = Path(
-        os.getenv("TRANSIENT_STORAGE_PATH", tempfile.gettempdir())
-    ).resolve()
-    redis_uri = os.getenv("REDIS_URI", "redis://localhost:6379/0")
-    channel_name = os.getenv("CHANNEL_NAME", "s3_upload")
+    private_salt = os.getenv(
+        "PRIVATE_SALT", uuid.uuid4().hex
+    )  # used to make S3 keys unguessable
 
+    # Cookies
     cookie_domain = os.getenv("COOKIE_DOMAIN", None)
     cookie_expiration_days = int(os.getenv("COOKIE_EXPIRATION_DAYS", "30"))
-    project_quota = humanfriendly.parse_size(os.getenv("PROJECT_QUOTA", "100MB"))
-    chunk_size = humanfriendly.parse_size(os.getenv("CHUNK_SIZE", "2MiB"))
-    illustration_quota = humanfriendly.parse_size(
-        os.getenv("ILLUSTRATION_QUOTA", "2MiB")
-    )
+    authentication_cookie_name: str = "user_id"
+
+    # Deployment
+    public_url: str = os.getenv("PUBLIC_URL") or "http://localhost"
+    download_url: str = (
+        os.getenv("DOWNLOAD_URL")
+        or "https://s3.us-west-1.wasabisys.com/org-kiwix-zimit/zim"
+    )
     allowed_origins = os.getenv(
         "ALLOWED_ORIGINS",
        "http://localhost",
     ).split("|")
 
-    authentication_cookie_name: str = "user_id"
-    api_version_prefix = "/v1"
-    project_expire_after = datetime.timedelta(days=7)
+    # Zimfarm (3rd party API creating ZIMs and calling back with feedback)
+    zimfarm_api_url: str = (
+        os.getenv("ZIMFARM_API_URL") or "https://api.farm.zimit.kiwix.org/v1"
+    )
+    zimfarm_username: str = os.getenv("ZIMFARM_API_USERNAME") or ""
+    zimfarm_password: str = os.getenv("ZIMFARM_API_PASSWORD") or ""
+    zimfarm_nautilus_image: str = (
+        os.getenv("ZIMFARM_NAUTILUS_IMAGE") or "ghcr.io/openzim/nautilus:latest"
+    )
+    zimfarm_task_cpu: int = int(os.getenv("ZIMFARM_TASK_CPU") or "3")
+    zimfarm_task_memory: int = 0
+    zimfarm_task_disk: int = 0
+    zimfarm_callback_base_url = os.getenv("ZIMFARM_CALLBACK_BASE_URL", "")
+    zimfarm_callback_token = os.getenv("ZIMFARM_CALLBACK_TOKEN", uuid.uuid4().hex)
+    zimfarm_task_worker: str = os.getenv("ZIMFARM_TASK_WORKDER") or "-"
+    zimfarm_request_timeout_sec: int = 10
+
+    # Mailgun (3rd party API to send emails)
+    mailgun_api_url: str = os.getenv("MAILGUN_API_URL") or ""
+    mailgun_api_key: str = os.getenv("MAILGUN_API_KEY") or ""
+    mailgun_from: str = os.getenv("MAILGUN_FROM") or "Nautilus ZIM"
+    mailgun_request_timeout_sec: int = 10
+
+    logger: logging.Logger = field(init=False)
 
     def __post_init__(self):
         self.logger = logging.getLogger(Path(__file__).parent.name)
         self.transient_storage_path.mkdir(exist_ok=True)
         self.job_retry = Retry(max=self.s3_max_tries, interval=int(self.s3_retry_wait))
 
+        self.transient_storage_path = Path(
+            os.getenv("TRANSIENT_STORAGE_PATH") or tempfile.gettempdir()
+        ).resolve()
+
+        self.project_quota = humanfriendly.parse_size(
+            os.getenv("PROJECT_QUOTA") or "100MB"
+        )
+
+        self.chunk_size = humanfriendly.parse_size(os.getenv("CHUNK_SIZE", "2MiB"))
+
+        self.illustration_quota = humanfriendly.parse_size(
+            os.getenv("ILLUSTRATION_QUOTA", "2MiB")
+        )
+
+        self.zimfarm_task_memory = humanfriendly.parse_size(
+            os.getenv("ZIMFARM_TASK_MEMORY") or "1000MiB"
+        )
+        self.zimfarm_task_disk = humanfriendly.parse_size(
+            os.getenv("ZIMFARM_TASK_DISK") or "200MiB"
+        )
+
+        if not self.zimfarm_callback_base_url:
+            self.zimfarm_callback_base_url = f"{self.zimfarm_api_url}/requests/hook"
 
 
 constants = BackendConf()
 logger = constants.logger
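The size and duration knobs (`PROJECT_QUOTA`, `CHUNK_SIZE`, `ILLUSTRATION_QUOTA`, `ZIMFARM_TASK_MEMORY`, `ZIMFARM_TASK_DISK`, `S3_RETRY_TIMES`) go through humanfriendly, so operators can set them as human-readable strings. A quick sketch of the parsing behaviour the defaults above rely on:

```python
import humanfriendly

# "MB" is decimal (10**6 bytes) while "MiB" is binary (2**20 bytes)
print(humanfriendly.parse_size("100MB"))    # 100000000
print(humanfriendly.parse_size("2MiB"))     # 2097152
# timespans come back as float seconds, hence the int() wrapping above
print(humanfriendly.parse_timespan("10s"))  # 10.0
```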
5 changes: 3 additions & 2 deletions backend/api/database/__init__.py
@@ -1,7 +1,8 @@
 from collections.abc import Generator
 from uuid import UUID
 
-from bson.json_util import DEFAULT_JSON_OPTIONS, dumps, loads
+import pydantic_core
+from bson.json_util import DEFAULT_JSON_OPTIONS, loads
 from sqlalchemy import create_engine
 from sqlalchemy.orm import Session as OrmSession
 from sqlalchemy.orm import sessionmaker
@@ -25,7 +26,7 @@ def my_loads(s, *args, **kwargs):
     bind=create_engine(
         constants.postgres_uri,
         echo=False,
-        json_serializer=dumps,  # use bson serializer to handle datetime naively
+        json_serializer=pydantic_core.to_json,
         json_deserializer=my_loads,  # use custom bson deserializer for same reason
     )
 )
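`pydantic_core.to_json` replaces bson's `dumps` as the engine-level JSON serializer; unlike the stdlib `json` module, it encodes `datetime` and `UUID` values out of the box. A minimal sketch of the behaviour relied on here (sample values are made up):

```python
from datetime import datetime, timezone
from uuid import UUID

import pydantic_core

# to_json returns compact JSON as bytes; datetimes become ISO-8601 strings
payload = pydantic_core.to_json({
    "requested_on": datetime(2024, 6, 14, tzinfo=timezone.utc),
    "id": UUID("1e23ced0-0000-4000-8000-000000000000"),
})
print(payload)
# b'{"requested_on":"2024-06-14T00:00:00Z","id":"1e23ced0-0000-4000-8000-000000000000"}'
```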
74 changes: 71 additions & 3 deletions backend/api/database/models.py
@@ -1,8 +1,9 @@
 from datetime import datetime
-from typing import Any, ClassVar
+from typing import Any, ClassVar, TypeVar
 from uuid import UUID
 
-from sqlalchemy import DateTime, ForeignKey, String, text
+from pydantic import BaseModel
+from sqlalchemy import DateTime, ForeignKey, String, text, types
 from sqlalchemy.dialects.postgresql import ARRAY, JSONB
 from sqlalchemy.orm import (
     DeclarativeBase,
@@ -12,16 +13,82 @@
     relationship,
 )
 from sqlalchemy.sql.schema import MetaData
+from zimscraperlib.zim.metadata import (
+    validate_description,
+    validate_language,
+    validate_required_values,
+    validate_tags,
+    validate_title,
+)
 
 from api.database import get_local_fpath_for
 
+T = TypeVar("T", bound="ArchiveConfig")
+
+
+class ArchiveConfig(BaseModel):
+    title: str
+    description: str
+    name: str
+    publisher: str
+    creator: str
+    languages: str
+    tags: list[str]
+    illustration: str
+    filename: str
+
+    @classmethod
+    def init_with(cls: type[T], filename: str, **kwargs) -> T:
+        default = {"tags": []}
+        data: dict = {key: default.get(key, "") for key in cls.model_fields.keys()}
+        data.update({"filename": filename})
+        if kwargs:
+            data.update(kwargs)
+        return cls.model_validate(data)
+
+    def is_ready(self) -> bool:
+        try:
+            for key in self.model_fields.keys():
+                validate_required_values(key.title(), getattr(self, key, ""))
+            validate_title("Title", self.title)
+            validate_description("Description", self.description)
+            validate_language("Language", self.languages)
+            validate_tags("Tags", self.tags)
+        except ValueError:
+            return False
+        return True
+
+
+class ArchiveConfigType(types.TypeDecorator):
+    cache_ok = True
+    impl = JSONB
+
+    def process_bind_param(self, value, dialect):  # noqa: ARG002
+        if isinstance(value, ArchiveConfig):
+            return value.model_dump()
+        if isinstance(value, dict):
+            return value
+        return dict(value) if value else {}
+
+    def process_result_value(self, value, dialect) -> ArchiveConfig:  # noqa: ARG002
+        if isinstance(value, ArchiveConfig):
+            return value
+        return ArchiveConfig.model_validate(dict(value) if value else {})
+
+    def coerce_compared_value(self, op, value):
+        return self.impl.coerce_compared_value(
+            op, value
+        )  # pyright: ignore [reportCallIssue]
+
+
 class Base(MappedAsDataclass, DeclarativeBase):
     # This map details the specific transformation of types between Python and
     # PostgreSQL. This is only needed for the case where a specific PostgreSQL
     # type has to be used or when we want to ensure a specific setting (like the
     # timezone below)
     type_annotation_map: ClassVar = {
+        ArchiveConfig: ArchiveConfigType,
         dict[str, Any]: JSONB,  # transform Python Dict[str, Any] into PostgreSQL JSONB
         list[dict[str, Any]]: JSONB,
         datetime: DateTime(
@@ -137,9 +204,10 @@ class Archive(Base):
     filesize: Mapped[int | None]
     created_on: Mapped[datetime]
     requested_on: Mapped[datetime | None]
+    completed_on: Mapped[datetime | None]
     download_url: Mapped[str | None]
     collection_json_path: Mapped[str | None]
     status: Mapped[str]
+    zimfarm_task_id: Mapped[UUID | None]
     email: Mapped[str | None]
-    config: Mapped[dict[str, Any]]
+    config: Mapped[ArchiveConfig]
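Together, `init_with` builds a config with every metadata field blank (tags excepted) and `is_ready` answers whether the archive can be requested, by running zimscraperlib's metadata validators. A hypothetical round trip (field values are made up):

```python
config = ArchiveConfig.init_with(filename="demo.zim")
assert not config.is_ready()  # all metadata fields are still empty strings

config.title = "My Collection"
config.description = "A demo collection"
config.name = "demo"
config.publisher = "openZIM"
config.creator = "someone"
config.languages = "eng"
config.tags = ["demo"]
config.illustration = "illustration.png"
assert config.is_ready()  # every validator now passes
```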
28 changes: 28 additions & 0 deletions backend/api/database/utils.py
@@ -0,0 +1,28 @@
+from uuid import UUID
+
+from sqlalchemy import select
+
+from api.database import Session as DBSession
+from api.database.models import File, Project
+
+
+def get_file_by_id(file_id: UUID) -> File:
+    """Get File instance by its id."""
+    with DBSession.begin() as session:
+        stmt = select(File).where(File.id == file_id)
+        file = session.execute(stmt).scalar()
+        if not file:
+            raise ValueError(f"File not found: {file_id}")
+        session.expunge(file)
+    return file
+
+
+def get_project_by_id(project_id: UUID) -> Project:
+    """Get Project instance by its id."""
+    with DBSession.begin() as session:
+        stmt = select(Project).where(Project.id == project_id)
+        project = session.execute(stmt).scalar()
+        if not project:
+            raise ValueError(f"Project not found: {project_id}")
+        session.expunge(project)
+    return project
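Both helpers expunge the instance before the `DBSession.begin()` block commits and closes, so callers receive a detached object whose already-loaded attributes stay readable. A hypothetical caller (the UUID is made up):

```python
from uuid import UUID

from api.database.utils import get_project_by_id

try:
    project = get_project_by_id(UUID("8c8d585c-66d8-4f3a-a7f4-51f2e2b4e6b9"))
    print(project.id)  # detached instance, loaded columns remain accessible
except ValueError as exc:
    print(exc)  # "Project not found: ..."
```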
