Skip to content
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 17 lines (14 sloc) 598 Bytes
#!/usr/bin/env bash
#
# Flatten every file of a decompressed Gigaword corpus in parallel, then
# concatenate the per-file results into a single text file.
#
# Usage: <this script> GIGAWORDDIR OUTPUTDIR NUMJOBS
#   GIGAWORDDIR  Path to Gigaword corpus with all data files decompressed.
#   OUTPUTDIR    The directory to write output to (created if missing).
#   NUMJOBS      The number of jobs to run at once (passed to `parallel -j`).
#
# Environment:
#   FLATTEN_SCRIPT  Path of the Python script that is run once per corpus
#                   file as:
#                     python FLATTEN_SCRIPT --gigaword-path FILE --output-dir OUTPUTDIR
#
# NOTE(review): the previous command line had nothing between `python` and
# `--gigaword-path`, so the interpreter would treat `--gigaword-path` as one
# of its own options. The intended script path is not visible here, so it is
# taken from FLATTEN_SCRIPT; replace with the real path once confirmed.
set -euo pipefail

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if (( $# < 3 )); then
  printf 'Usage: %s GIGAWORDDIR OUTPUTDIR NUMJOBS\n' "${0##*/}" >&2
  exit 2
fi

# Path to Gigaword corpus with all data files decompressed.
export GIGAWORDDIR=$1
# The directory to write output to
export OUTPUTDIR=$2
# The number of jobs to run at once
export NUMJOBS=$3

[[ -n "${GIGAWORDDIR}" ]] || die "GIGAWORDDIR must not be empty"
[[ -n "${OUTPUTDIR}" ]] || die "OUTPUTDIR must not be empty"
[[ -n "${NUMJOBS}" ]] || die "NUMJOBS must not be empty"
[[ -d "${GIGAWORDDIR}/data" ]] || die "no data directory under ${GIGAWORDDIR}"
[[ -n "${FLATTEN_SCRIPT:-}" ]] \
  || die "FLATTEN_SCRIPT must be set to the per-file flattening script"
command -v parallel >/dev/null || die "GNU parallel is required but was not found"

echo "Flattening Gigaword with ${NUMJOBS} processes..."
mkdir -p -- "${OUTPUTDIR}"

# The glob is left outside the quotes on purpose: it selects the entries two
# levels below data/, and find prints each of them (descending into any that
# are directories), one path per line, for parallel to consume.
# With pipefail, a failure of find is no longer hidden by parallel's status.
find "${GIGAWORDDIR}"/data/*/* \
  | parallel --gnu --progress -j "${NUMJOBS}" \
      python "${FLATTEN_SCRIPT}" --gigaword-path '{}' --output-dir "${OUTPUTDIR}"

echo "Combining the flattened files into one..."
# Collect the inputs first so an empty result is reported explicitly instead
# of creating an empty combined file and then failing inside cat.
shopt -s nullglob
flat_files=("${OUTPUTDIR}"/*.flat)
shopt -u nullglob
(( ${#flat_files[@]} > 0 )) || die "no .flat files were produced in ${OUTPUTDIR}"

cat -- "${flat_files[@]}" > "${OUTPUTDIR}/flattened_gigaword.txt"
You can’t perform that action at this time.