Skip to content

Commit

Permalink
bzip2 to lbzip2 migration to use all CPU cores
Browse files Browse the repository at this point in the history
If lbzip2 is installed on the system, we use it; otherwise we fall back to the legacy bzip2.
Updated Dockerfile to install lbzip2
Additional dependency:  command-exists
  • Loading branch information
loadit1 authored and missinglink committed Feb 25, 2020
1 parent 31d55e8 commit 8bc781c
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 6 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Expand Up @@ -3,7 +3,7 @@ FROM pelias/baseimage

# downloader apt dependencies
# note: this is done in one command in order to keep down the size of intermediate containers
RUN apt-get update && apt-get install -y autoconf automake libtool pkg-config python bzip2 unzip && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y autoconf automake libtool pkg-config python bzip2 lbzip2 unzip && rm -rf /var/lib/apt/lists/*

# change working dir
ENV WORKDIR /code/pelias/whosonfirst
Expand Down
1 change: 1 addition & 0 deletions package.json
Expand Up @@ -29,6 +29,7 @@
"async": "^3.0.1",
"better-sqlite3": "^5.0.0",
"combined-stream": "^1.0.5",
"command-exists": "^1.2.8",
"csv-stream": "^0.2.0",
"download-file-sync": "^1.0.4",
"fs-extra": "^8.0.0",
Expand Down
13 changes: 11 additions & 2 deletions utils/download_data_all.js
Expand Up @@ -4,6 +4,7 @@ const fs = require('fs-extra');
const os = require('os');
const url = require('url');
const path = require('path');
const commandExistsSync = require('command-exists').sync;

const bundles = require('../src/bundleList');
const config = require( 'pelias-config' ).generate(require('../schema'));
Expand All @@ -21,18 +22,26 @@ function download(callback) {
const maxSimultaneousDownloads = config.get('imports.whosonfirst.maxDownloads') || 4;
const cpuCount = os.cpus().length;
const simultaneousDownloads = Math.max(maxSimultaneousDownloads, Math.min(1, cpuCount / 2));

// Generate the shell command that fetches and unpacks a single bundle.
// The command does the following:
// 1.) use curl to download the bundle, piping directly to tar (this avoids the
// need for intermediate storage of the archive file)
// 2.) extract the archive so that the data directory goes in the right place and
// the README file is ignored (it just would get overridden by subsequent bundles)
// 3.) move the meta file to the meta files directory
//
// bundle:    bundle file name; it is appended to `${wofDataHost}/bundles/` and is
//            expected to end in `.tar.bz2`
// directory: target directory passed to `tar -C`; the CSV meta file is moved into
//            its `meta` sub-directory
// returns:   the command line as a single string (nothing is executed here)
// NOTE(review): `bundle` and `directory` are interpolated into the command without
// shell quoting — presumably both come from trusted config; confirm.
function generateCommand(bundle, directory) {
let extract;
// Pick the extraction front-end: when an `lbzip2` executable is found, tell tar to
// use it as the (de)compression program; the commit message states the goal is to
// use all CPU cores.
if (commandExistsSync('lbzip2')) {
extract = `tar -x --use-compress-program=lbzip2`;
} else {
// lbzip2 not available: fall back to tar's built-in bzip2 filter (-j)
extract = `tar -xj`;
};

// The CSV meta file shipped inside the archive is named after the bundle, with any
// `-YYYYMMDDTHHMMSS-` timestamp replaced by `-latest-` and `.tar.bz2` by `.csv`.
const csvFilename = bundle.replace(/-\d{8}T\d{6}-/, '-latest-') // support timestamped downloads
.replace('.tar.bz2', '.csv');

// NOTE(review): the two `return` lines below appear to be the removed and the added
// side of the diff hunk (the first hard-codes `tar -xj`, the second uses `extract`);
// presumably only the second one exists in the committed file — confirm.
return `curl -s ${wofDataHost}/bundles/${bundle} | tar -xj --strip-components=1 --exclude=README.txt -C ` +
return `curl -s ${wofDataHost}/bundles/${bundle} | ${extract} --strip-components=1 --exclude=README.txt -C ` +
`${directory} && mv ${path.join(directory, csvFilename)} ${path.join(directory, 'meta')}`;
}

Expand Down
16 changes: 14 additions & 2 deletions utils/download_sqlite_all.js
Expand Up @@ -4,6 +4,7 @@ const fs = require('fs-extra');
const os = require('os');
const path = require('path');
const downloadFileSync = require('download-file-sync');
const commandExistsSync = require('command-exists').sync;

const config = require('pelias-config').generate(require('../schema'));

Expand Down Expand Up @@ -50,9 +51,20 @@ function download(callback) {
const generateCommand = (sqlite, directory) => {
let extract;
if (/\.db\.bz2$/.test(sqlite.name_compressed)) {
extract = `bunzip2`;
//Check if we have lbzip2 installed
if ( commandExistsSync('lbzip2') ) {
extract = `lbzip2`;
} else {
extract = `bunzip2`;
};
} else if(/\.db\.tar\.bz2$/.test(sqlite.name_compressed)) {
extract = `tar -xjO`;
//Check if we have lbzip2 installed
if ( commandExistsSync('lbzip2') ) {
//Aim tar to use lbzip2
extract = `tar -xO --use-compress-program=lbzip2`;
} else {
extract = `tar -xjO`;
};
} else {
throw new Error('What is this extension ?!?');
}
Expand Down
11 changes: 10 additions & 1 deletion utils/sqlite_download.sh
Expand Up @@ -25,9 +25,18 @@ REMOTE_PATH="${REMOTE}/${DB_FILENAME}.bz2"

info() { echo -e "\e[33m[$1]\t\e[0m $2" >&2; }
err() { echo -e "\e[31m[$1]\t\e[0m \e[91m$2\e[0m" >&2; }

# Decompress ${LOCAL_BZ2_PATH}, preferring lbzip2 (https://lbzip2.org/) when the
# shell can locate it and falling back to bunzip2 otherwise. In both cases the
# command's stdout is redirected to ${LOCAL_DB_PATH}.
# NOTE(review): neither invocation passes -c, so the decompressed data is presumably
# written next to the input file rather than to stdout — confirm that
# ${LOCAL_DB_PATH} is ${LOCAL_BZ2_PATH} without the .bz2 suffix.
decompress_utility() {
  # no lbzip2 available: use the legacy single-threaded tool and stop here
  if ! hash lbzip2 2>/dev/null; then
    bunzip2 -f "${LOCAL_BZ2_PATH}" > "${LOCAL_DB_PATH}"
    return
  fi

  lbzip2 -d -f "${LOCAL_BZ2_PATH}" > "${LOCAL_DB_PATH}"
}
# Log which archive is about to be decompressed (via info, which writes to stderr),
# then decompress ${LOCAL_BZ2_PATH}.
# NOTE(review): the bunzip2 line and the decompress_utility call below appear to be
# the removed and the added side of the diff hunk; presumably only the
# decompress_utility call exists in the committed script — confirm.
extract_file() {
info 'whosonfirst-sqlite-decompress' "${LOCAL_BZ2_PATH}"
bunzip2 -f "${LOCAL_BZ2_PATH}" > "${LOCAL_DB_PATH}"
decompress_utility
}
generate_timestamp() {
printf "@" > "${LOCAL_TS_PATH}" # date command requires @ prefix
Expand Down

0 comments on commit 8bc781c

Please sign in to comment.