Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
190c5ff
Add openaleph-procrastinate. Bump versions to satisfy dependencies (p…
catileptic Jun 11, 2025
a46a028
🧑‍💻 Add pre-commit, use requirements.txt, upgrade to python3.13
simonwoerpel Jun 16, 2025
109b53a
🧑‍💻 Add dev requirements only for test build
simonwoerpel Jun 16, 2025
43a02d1
🔥 (github) Drop daily cache job
simonwoerpel Jun 16, 2025
f977f90
✅ (tests/test_pdf) Fix whitespace errors from test results
simonwoerpel Jun 16, 2025
d6ad514
🔨 (make) Build before test
simonwoerpel Jun 16, 2025
8ef339f
👷 Inline base build
simonwoerpel Jun 16, 2025
9e376e4
🚧 Tweak builds and tags
simonwoerpel Jun 16, 2025
39a2385
👷 (github) Skip intermediate arm64 build for tests
simonwoerpel Jun 16, 2025
03f86fd
👷 (github) Skip cache-from [tmp]
simonwoerpel Jun 16, 2025
ff1a8c4
Revert "👷 (github) Skip cache-from [tmp]"
simonwoerpel Jun 16, 2025
4adcfb2
👷 (github/docker) Try this
simonwoerpel Jun 16, 2025
099aef5
🚨 Apply black
simonwoerpel Jun 17, 2025
4071354
👷 (github/docker) Don't use registry cache
simonwoerpel Jun 17, 2025
dd8bb0c
🧪 (test_image) Skip gif test
simonwoerpel Jun 17, 2025
9adf413
👷 (github/docker) maybe this
simonwoerpel Jun 17, 2025
73a87b8
Downgrade TesserOCR to 2.6.2
catileptic Jun 18, 2025
28d1c53
Add MacOS flags in Makefile and LD_PRELOAD path in docker-compose.yml
catileptic Jun 18, 2025
a1a6395
Add Dockerfile.test which contains dev dependencies
catileptic Jun 18, 2025
4d40855
Add path to Dockerfile.base in .github/workflows/docker-base.yml
catileptic Jun 18, 2025
d23c41a
Revert skipping test_tesseract_ocr_regression for GIF images
catileptic Jun 18, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .bumpversion.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ tag_name = {new_version}
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)([-](?P<release>(pre|rc))(?P<build>\d+))?
serialize =
serialize =
{major}.{minor}.{patch}-{release}{build}
{major}.{minor}.{patch}

[bumpversion:part:release]
optional_value = prod
first_value = prod
values =
values =
rc
prod

Expand Down
16 changes: 8 additions & 8 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ jobs:
type=ref,event=branch
type=semver,pattern={{version}}
type=sha
type=raw,value=latest
type=raw,value=cache
type=raw,value=latest,enable=${{ startsWith(github.ref, 'refs/tags') }}

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
Expand All @@ -50,10 +51,10 @@ jobs:
uses: docker/build-push-action@v6
with:
context: .
platforms: linux/amd64
load: true
cache-from: type=registry,ref=ghcr.io/openaleph/ingest-file:cache
cache-to: type=registry,ref=ghcr.io/openaleph/ingest-file:cache,mode=max
platforms: linux/amd64
cache-from: type=gha
cache-to: type=gha,mode=max

- name: Start services
run: |
Expand All @@ -73,12 +74,11 @@ jobs:

- name: Push docker images
uses: docker/build-push-action@v6
if: (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags')) && github.actor != 'dependabot[bot]'
with:
context: .
platforms: linux/amd64, linux/arm64
platforms: linux/amd64,linux/arm64
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=registry,ref=ghcr.io/openaleph/ingest-file:cache
cache-to: type=registry,ref=ghcr.io/openaleph/ingest-file:cache,mode=max
cache-from: type=gha
cache-to: type=gha,mode=max
23 changes: 0 additions & 23 deletions .github/workflows/daily.yml

This file was deleted.

52 changes: 52 additions & 0 deletions .github/workflows/docker-base.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Builds the ingest-file-base image from Dockerfile.base for amd64 and arm64
# and pushes it to the GitHub Container Registry (ghcr.io).
name: Build ingest-file-base

on:
  # Allow manual runs from the Actions tab.
  workflow_dispatch: {}
  # Scheduled rebuild once a day at midnight.
  schedule:
    - cron: "0 0 * * *"
  # Rebuild whenever the base Dockerfile or this workflow itself changes.
  push:
    paths:
      - Dockerfile.base
      - .github/workflows/docker-base.yml

# Declaring `permissions` sets every scope that is not listed to `none`, so the
# read access used by actions/checkout is granted explicitly next to the
# package write access needed for pushing the image.
permissions:
  contents: read
  packages: write

jobs:
  docker:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      # Emulation for the non-native platform in the multi-arch build below.
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2

      # Derives the image tags and labels consumed by the build step.
      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ghcr.io/openaleph/ingest-file-base
          tags: |
            type=ref,event=branch
            type=semver,pattern={{version}}
            type=sha
            type=raw,value=latest

      # Same major version as the build.yml workflow.
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
        with:
          install: true

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # v6 matches build.yml; the `gha` cache backend depends on a recent
      # build-push-action to talk to the current GitHub cache service.
      - name: Build and push release
        uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile.base
          platforms: linux/amd64,linux/arm64
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
data/model_type_prediction.ftz
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
83 changes: 83 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# pre-commit configuration (https://pre-commit.com/).
#
# Setup:
#   1. Install pre-commit (https://pre-commit.com/#installation).
#   2. Keep this file in the repository as ".pre-commit-config.yaml".
#   3. Run "pre-commit install" to register the git hook.
repos:
  # Generic file hygiene checks.
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: "v5.0.0"
    hooks:
      - id: check-added-large-files
      - id: check-case-conflict
      - id: check-merge-conflict
      - id: check-symlinks
      - id: check-toml
      - id: check-yaml
      - id: debug-statements
      - id: end-of-file-fixer
      - id: mixed-line-ending
        args:
          - "--fix=lf"
      - id: trailing-whitespace

  # Currently disabled:
  # - repo: https://github.com/asottile/pyupgrade
  #   rev: v3.10.1
  #   hooks:
  #     - id: pyupgrade
  #       args: [ "--py310-plus" ]

  - repo: https://github.com/MarcoGorelli/absolufy-imports
    rev: "v0.3.1"
    hooks:
      - id: absolufy-imports

  # Import sorting, using the black-compatible profile.
  - repo: https://github.com/pycqa/isort
    rev: "6.0.1"
    hooks:
      - id: isort
        args:
          - "--profile"
          - "black"

  - repo: https://github.com/psf/black
    rev: "25.1.0"
    hooks:
      - id: black

  # flake8 with the bugbear plugin; skipped for test modules and data/lock files.
  - repo: https://github.com/csachs/pyproject-flake8
    rev: "v7.0.0"
    hooks:
      - id: pyproject-flake8
        additional_dependencies:
          - flake8-bugbear
        args:
          - "--extend-ignore"
          - "E203, E501, W503"
        exclude: '(test_[\w]+\.py|\.csv|\.json|\.lock)$'

  # Spell checking; skipped for the tests directory and lock files.
  - repo: https://github.com/codespell-project/codespell
    rev: "v2.4.1"
    hooks:
      - id: codespell
        exclude: '(tests/.*|\.lock)$'

  - repo: https://github.com/pre-commit/pygrep-hooks
    rev: "v1.10.0"
    hooks:
      - id: python-check-blanket-noqa
        exclude: '(test_[\w]+\.py)$'
      - id: python-check-blanket-type-ignore
      - id: python-no-eval
      - id: python-use-type-annotations
      - id: rst-backticks
      - id: rst-directive-colons
      - id: rst-inline-touching-normal

  - repo: https://github.com/python-poetry/poetry
    rev: "2.1.3"
    hooks:
      - id: poetry-check
      - id: poetry-lock

  # Export the dependency sets to requirements files: one run writes
  # requirements.txt, the other writes the dev-only set to requirements-dev.txt.
  - repo: https://github.com/python-poetry/poetry-plugin-export
    rev: "1.9.0"
    hooks:
      - id: poetry-export
        args:
          - "--without-hashes"
          - "-o"
          - "requirements.txt"
      - id: poetry-export
        args:
          - "--without-hashes"
          - "--only"
          - "dev"
          - "-o"
          - "requirements-dev.txt"
160 changes: 5 additions & 155 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,165 +1,15 @@
FROM python:3.11-slim

ENV DEBIAN_FRONTEND="noninteractive"

LABEL org.opencontainers.image.title="FollowTheMoney File Ingestors"
LABEL org.opencontainers.image.licenses="MIT"
LABEL org.opencontainers.image.source="https://github.com/alephdata/ingest-file"

# Enable non-free archive for `unrar`.
RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/sources.list.d/nonfree.list \
&& apt-get -qq -y update \
&& apt-get -qq -y install build-essential locales \
# python deps (mostly to install their dependencies)
git python3-dev \
pkg-config libicu-dev \
# tesseract
tesseract-ocr libtesseract-dev libleptonica-dev \
# libraries
libldap2-dev libsasl2-dev \
# package tools
unrar p7zip-full \
# audio & video metadata
libmediainfo-dev \
# image processing, djvu
mdbtools djvulibre-bin \
libtiff5-dev \
libtiff-tools ghostscript librsvg2-bin jbig2dec \
pst-utils libgif-dev \
# necessary for python-magic
libmagic1 \
### tesseract
tesseract-ocr-eng \
tesseract-ocr-swa \
tesseract-ocr-swe \
# tesseract-ocr-tam \
# tesseract-ocr-tel \
tesseract-ocr-fil \
# tesseract-ocr-tha \
tesseract-ocr-tur \
tesseract-ocr-ukr \
# tesseract-ocr-vie \
tesseract-ocr-nld \
tesseract-ocr-nor \
tesseract-ocr-pol \
tesseract-ocr-por \
tesseract-ocr-ron \
tesseract-ocr-rus \
tesseract-ocr-slk \
tesseract-ocr-slv \
tesseract-ocr-spa \
# tesseract-ocr-spa_old \
tesseract-ocr-sqi \
tesseract-ocr-srp \
tesseract-ocr-ind \
tesseract-ocr-isl \
tesseract-ocr-ita \
# tesseract-ocr-ita_old \
# tesseract-ocr-jpn \
tesseract-ocr-kan \
tesseract-ocr-kat \
# tesseract-ocr-kor \
tesseract-ocr-khm \
tesseract-ocr-lav \
tesseract-ocr-lit \
# tesseract-ocr-mal \
tesseract-ocr-mkd \
tesseract-ocr-mya \
tesseract-ocr-mlt \
tesseract-ocr-msa \
tesseract-ocr-est \
# tesseract-ocr-eus \
tesseract-ocr-fin \
tesseract-ocr-fra \
tesseract-ocr-frk \
# tesseract-ocr-frm \
# tesseract-ocr-glg \
# tesseract-ocr-grc \
tesseract-ocr-heb \
tesseract-ocr-hin \
tesseract-ocr-hrv \
tesseract-ocr-hye \
tesseract-ocr-hun \
# tesseract-ocr-ben \
tesseract-ocr-bul \
tesseract-ocr-cat \
tesseract-ocr-ces \
tesseract-ocr-nep \
# tesseract-ocr-chi_sim \
# tesseract-ocr-chi_tra \
# tesseract-ocr-chr \
tesseract-ocr-dan \
tesseract-ocr-deu \
tesseract-ocr-ell \
# tesseract-ocr-enm \
# tesseract-ocr-epo \
# tesseract-ocr-equ \
tesseract-ocr-afr \
tesseract-ocr-ara \
tesseract-ocr-aze \
tesseract-ocr-bel \
tesseract-ocr-uzb \
### pdf convert: libreoffice + a bunch of fonts
libreoffice fonts-opensymbol hyphen-fr hyphen-de \
hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-extra \
fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
fonts-tlwg-purisa \
###
&& apt-get -qq -y autoremove \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
&& localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

# Set up the locale and make sure the system uses unicode for the file system.
ENV LANG='en_US.UTF-8' \
TZ='UTC' \
OMP_THREAD_LIMIT='1' \
OPENBLAS_NUM_THREADS='1'

RUN groupadd -g 1000 -r app \
&& useradd -m -u 1000 -s /bin/false -g app app

# Download the ftm-typepredict model
RUN mkdir /models/ && \
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"

COPY requirements.txt /tmp/
RUN pip3 install --no-cache-dir -q -U pip setuptools
RUN pip3 install --no-binary=:pyicu: pyicu
RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt

# Install spaCy models
RUN python3 -m spacy download en_core_web_sm \
&& python3 -m spacy download de_core_news_sm \
&& python3 -m spacy download fr_core_news_sm \
&& python3 -m spacy download es_core_news_sm
RUN python3 -m spacy download ru_core_news_sm \
&& python3 -m spacy download pt_core_news_sm \
&& python3 -m spacy download ro_core_news_sm \
&& python3 -m spacy download mk_core_news_sm
RUN python3 -m spacy download el_core_news_sm \
&& python3 -m spacy download pl_core_news_sm \
&& python3 -m spacy download it_core_news_sm \
&& python3 -m spacy download lt_core_news_sm \
&& python3 -m spacy download nl_core_news_sm \
&& python3 -m spacy download nb_core_news_sm \
&& python3 -m spacy download da_core_news_sm
# RUN python3 -m spacy download zh_core_web_sm
FROM ghcr.io/openaleph/ingest-file-base:latest

COPY . /ingestors
WORKDIR /ingestors
RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
RUN chown -R app:app /ingestors

RUN pip3 install --no-cache-dir -r /ingestors/requirements.txt
RUN pip3 install --no-cache-dir /ingestors

ENV ARCHIVE_TYPE=file \
ARCHIVE_PATH=/data \
FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \
REDIS_URL=redis://redis:6379/0 \
TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata

ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata

# USER app
USER app
CMD ingestors process

Check warning on line 15 in Dockerfile

View workflow job for this annotation

GitHub Actions / build

JSON arguments recommended for ENTRYPOINT/CMD to prevent unintended behavior related to OS signals

JSONArgsRecommended: JSON arguments recommended for CMD to prevent unintended behavior related to OS signals More info: https://docs.docker.com/go/dockerfile/rule/json-args-recommended/

Check warning on line 15 in Dockerfile

View workflow job for this annotation

GitHub Actions / build

JSON arguments recommended for ENTRYPOINT/CMD to prevent unintended behavior related to OS signals

JSONArgsRecommended: JSON arguments recommended for CMD to prevent unintended behavior related to OS signals More info: https://docs.docker.com/go/dockerfile/rule/json-args-recommended/
Loading