From 5fe2db91d3da7e0b681b162fe947c568a818e5a0 Mon Sep 17 00:00:00 2001
From: Peter Boothe
Date: Thu, 25 Apr 2019 14:06:36 -0400
Subject: [PATCH] Adds tcp_keepalive options.

This is likely my last attempt to fix the scraper bug.
---
 Dockerfile | 30 +++++++++++++++++-------------
 scraper.py |  7 ++++---
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index c54bd5a..409517b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM alpine:3.6
+FROM alpine:3.7
 MAINTAINER Peter Boothe
 # Install all the standard packages we need
 RUN apk update && apk add python python-dev py2-pip gcc g++ libc-dev bash rsync tar
@@ -7,21 +7,25 @@ ADD requirements.txt /requirements.txt
 RUN pip install -r requirements.txt -U
 # Install scraper
 ADD scraper.py /scraper.py
-RUN chmod +x /scraper.py
 ADD run_scraper.py /run_scraper.py
 RUN chmod +x run_scraper.py
 # The monitoring port
 EXPOSE 9090
+# Set the default values for TCP keepalive before starting the scraper.
+
 # The :- syntax specifies a default value for the variable, so the deployment
 # need not set it unless you want to specify something other than that default.
-CMD /run_scraper.py \
-    --rsync_host=$RSYNC_HOST \
-    --rsync_port=${RSYNC_PORT:-7999} \
-    --rsync_module=$RSYNC_MODULE \
-    --bucket=$GCS_BUCKET \
-    --data_dir=scraper_data \
-    --datastore_namespace=$DATASTORE_NAMESPACE \
-    --metrics_port=${METRICS_PORT:-9090} \
-    --expected_wait_time=${EXPECTED_WAIT_TIME:-1800} \
-    --max_uncompressed_size=${MAX_UNCOMPRESSED_SIZE:-1000000000} \
-    --tarfile_directory=${TARFILE_DIRECTORY:-/tmp}
+CMD echo 60 > /proc/sys/net/ipv4/tcp_keepalive_time ; \
+    echo 30 > /proc/sys/net/ipv4/tcp_keepalive_intvl ; \
+    echo 20 > /proc/sys/net/ipv4/tcp_keepalive_probes ; \
+    /run_scraper.py \
+    --rsync_host=$RSYNC_HOST \
+    --rsync_port=${RSYNC_PORT:-7999} \
+    --rsync_module=$RSYNC_MODULE \
+    --bucket=$GCS_BUCKET \
+    --data_dir=scraper_data \
+    --datastore_namespace=$DATASTORE_NAMESPACE \
+    --metrics_port=${METRICS_PORT:-9090} \
+    --expected_wait_time=${EXPECTED_WAIT_TIME:-1800} \
+    --max_uncompressed_size=${MAX_UNCOMPRESSED_SIZE:-1000000000} \
+    --tarfile_directory=${TARFILE_DIRECTORY:-/tmp}
diff --git a/scraper.py b/scraper.py
index 9aa7d76..e5b51d7 100755
--- a/scraper.py
+++ b/scraper.py
@@ -135,10 +135,11 @@ def has_one_bit_set_or_is_zero(i):
 
 # Use IPv4, archive mode, compression, limit total bandwidth usage to 10 Mbps,
-# don't wait too long before bailing out, and make sure to chmod the files to
-# have sensible permissions.
+# don't wait too long before bailing out, make sure to chmod the files to
+# have sensible permissions, and set SO_KEEPALIVE to ensure that long-lived
+# connections don't disappear from any NAT boxes in the middle.
 RSYNC_ARGS = ['-4', '-az', '--bwlimit=10000', '--timeout=300',
-              '--contimeout=300', '--chmod=u=rwX']
+              '--contimeout=300', '--chmod=u=rwX', '--sockopts=SO_KEEPALIVE=1']
 
 RemoteFile = collections.namedtuple('RemoteFile', ['filename', 'mtime'])
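
Not part of the patch, but for context: --sockopts=SO_KEEPALIVE=1 asks rsync to set SO_KEEPALIVE
on its connection, and the kernel then times the keepalive probes using the
/proc/sys/net/ipv4/tcp_keepalive_* values written in the Dockerfile CMD (start probing after 60s
idle, probe every 30s, give up after 20 unanswered probes). A minimal Python sketch of the same
socket option; the host is a placeholder and 7999 is just the default RSYNC_PORT from above:

    import socket

    # Enable TCP keepalive on a client socket, much as --sockopts=SO_KEEPALIVE=1
    # does for rsync; probe timing comes from the tcp_keepalive_* sysctls.
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)
    sock.connect(('rsync.example.org', 7999))  # placeholder host and port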