Skip to content

Commit

Permalink
Bakes the NLTK data into the image (~60 MB)
Browse files Browse the repository at this point in the history
  • Loading branch information
stumpylog committed Dec 7, 2022
1 parent 3f6e3a2 commit 8da3ae2
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 31 deletions.
4 changes: 4 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,10 @@ RUN set -eux \
&& python3 -m pip install --no-cache-dir --upgrade wheel \
&& echo "Installing Python requirements" \
&& python3 -m pip install --default-timeout=1000 --no-cache-dir --requirement requirements.txt \
&& echo "Installing NLTK data" \
&& python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" snowball_data \
&& python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" stopwords \
&& python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" punkt \
&& echo "Cleaning up image" \
&& apt-get -y purge ${BUILD_PACKAGES} \
&& apt-get -y autoremove --purge \
Expand Down
30 changes: 0 additions & 30 deletions docker/docker-entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,30 +53,6 @@ map_folders() {
export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}"
}

nltk_data () {
	# Download (or refresh) the NLTK data sets used by paperless into
	# "${DATA_DIR}/nltk", so the data is stored outside the Docker container.
	#
	# Globals:
	#   DATA_DIR               (read) base directory; data goes in its "nltk" subfolder
	#   PAPERLESS_ENABLE_NLTK  (read) optional switch, see below
	# Outputs:
	#   Prints "Skipping NLTK data download" when the download is disabled.
	# Returns:
	#   Status of the last command run (the final downloader call or the echo).
	#
	# The download runs when PAPERLESS_ENABLE_NLTK is unset/empty or is exactly
	# one of: yes, y, 1, t, true (compared case-insensitively). Any other value
	# disables it.
	local -r nltk_data_dir="${DATA_DIR}/nltk"
	# Default to empty so an unset variable does not trip "set -u"
	local -r enable_nltk="${PAPERLESS_ENABLE_NLTK:-}"

	# Compare the whole, lower-cased value against each accepted word. A regex
	# or substring test would also accept fragments such as "e" or "ru" and
	# would interpret characters like "." as wildcards.
	case "${enable_nltk,,}" in
		"" | yes | y | 1 | t | true)
			# Download or update the snowball stemmer data
			python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" snowball_data

			# Download or update the stopwords corpus
			python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" stopwords

			# Download or update the punkt tokenizer data
			python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" punkt
			;;
		*)
			echo "Skipping NLTK data download"
			;;
	esac
}

custom_container_init() {
# Mostly borrowed from the LinuxServer.io base image
# https://github.com/linuxserver/docker-baseimage-ubuntu/tree/bionic/root/etc/cont-init.d
Expand Down Expand Up @@ -157,8 +133,6 @@ initialize() {
echo "Creating directory ${tmp_dir}"
mkdir -p "${tmp_dir}"

nltk_data

set +e
echo "Adjusting permissions of paperless files. This may take a while."
chown -R paperless:paperless ${tmp_dir}
Expand Down Expand Up @@ -191,10 +165,6 @@ install_languages() {

for lang in "${langs[@]}"; do
pkg="tesseract-ocr-$lang"
# English is installed by default
#if [[ "$lang" == "eng" ]]; then
# continue
#fi

if dpkg -s "$pkg" &>/dev/null; then
echo "Package $pkg already installed!"
Expand Down
2 changes: 1 addition & 1 deletion src/paperless/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def _parse_redis_url(env_redis: Optional[str]) -> Tuple[str]:

DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data"))

NLTK_DIR = os.path.join(DATA_DIR, "nltk")
NLTK_DIR = __get_path("PAPERLESS_NLTK_DIR", "/usr/local/share/nltk_data")

TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")

Expand Down

0 comments on commit 8da3ae2

Please sign in to comment.