Skip to content

Commit

Permalink
[ENH] Make data upload script backend responsive (#205)
Browse files Browse the repository at this point in the history
* add bool flag for whether to use GraphDB API endpoints

* capture HTTP status codes and output list of failed uploads

* more verbose clear db step and force exit if db clearing fails

* fix newlines and typos

* rearrange comments

* rename database name arg for clarity
  • Loading branch information
alyssadai committed Oct 22, 2023
1 parent 036fb3b commit 0b44bcd
Showing 1 changed file with 74 additions and 19 deletions.
93 changes: 74 additions & 19 deletions add_data_to_graph.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
# ARG_HELP([Upload JSONLD and Turtle data to a Neurobagel graph])
# ARG_POSITIONAL_SINGLE([dir],[Path to directory containing .jsonld and/or .ttl files. ALL .jsonld and .ttl files in this directory will be uploaded.])
# ARG_POSITIONAL_SINGLE([graph-url],[Host and port at which to access the graph database to add data to (e.g., localhost:7200)])
# ARG_POSITIONAL_SINGLE([graph-db],[Name of graph database to add data to])
# ARG_POSITIONAL_SINGLE([graph-database],[Name of graph database to add data to])
# ARG_POSITIONAL_SINGLE([user],[Username for graph database access])
# ARG_POSITIONAL_SINGLE([password],[Password for graph database user])
# ARG_OPTIONAL_BOOLEAN([clear-data],[],[Whether or not to first clear all existing data from the graph database],[off])
# ARG_OPTIONAL_BOOLEAN([use-graphdb-syntax],[],[Whether or not to use GraphDB API endpoints to update the specified graph database. If off, assumes the graph database is a Stardog database.],[off])
# ARGBASH_GO()
# needed because of Argbash --> m4_ignore([
### START OF CODE GENERATED BY Argbash v2.9.0 one line above ###
Expand Down Expand Up @@ -35,19 +36,21 @@ begins_with_short_option()
_positionals=()
# THE DEFAULTS INITIALIZATION - OPTIONALS
_arg_clear_data="off"
_arg_use_graphdb_syntax="off"


# Print the script description, the usage synopsis, and a one-line explanation of
# every positional and optional argument.
# Arguments: none
# Outputs:   help text on stdout; "$0" is substituted into the usage line(s) as the script name
print_help()
{
printf '%s\n' "Upload JSONLD and Turtle data to a Neurobagel graph"
printf 'Usage: %s [-h|--help] [--(no-)clear-data] <dir> <graph-url> <graph-db> <user> <password>\n' "$0"
printf 'Usage: %s [-h|--help] [--(no-)clear-data] [--(no-)use-graphdb-syntax] <dir> <graph-url> <graph-database> <user> <password>\n' "$0"
printf '\t%s\n' "<dir>: Path to directory containing .jsonld and/or .ttl files. ALL .jsonld and .ttl files in this directory will be uploaded."
printf '\t%s\n' "<graph-url>: Host and port at which to access the graph database to add data to (e.g., localhost:7200)"
printf '\t%s\n' "<graph-db>: Name of graph database to add data to"
printf '\t%s\n' "<graph-database>: Name of graph database to add data to"
printf '\t%s\n' "<user>: Username for graph database access"
printf '\t%s\n' "<password>: Password for graph database user"
printf '\t%s\n' "-h, --help: Prints help"
printf '\t%s\n' "--clear-data, --no-clear-data: Whether or not to first clear all existing data from the graph database (off by default)"
printf '\t%s\n' "--use-graphdb-syntax, --no-use-graphdb-syntax: Whether or not to use GraphDB API endpoints to update the specified graph database. If off, assumes the graph database is a Stardog database. (off by default)"
}


Expand All @@ -70,6 +73,10 @@ parse_commandline()
_arg_clear_data="on"
test "${1:0:5}" = "--no-" && _arg_clear_data="off"
;;
--no-use-graphdb-syntax|--use-graphdb-syntax)
_arg_use_graphdb_syntax="on"
test "${1:0:5}" = "--no-" && _arg_use_graphdb_syntax="off"
;;
*)
_last_positional="$1"
_positionals+=("$_last_positional")
Expand All @@ -83,7 +90,7 @@ parse_commandline()

# Verify that exactly 5 positional arguments were collected.
# Globals:   _positionals_count (read), _last_positional (read; reported when too many were given)
# Arguments: none
# On a mismatch, calls die with _PRINT_HELP=yes, a FATAL ERROR message and the value 1.
# NOTE(review): die is defined outside this excerpt; presumably it prints the help text and
# exits with the given status — confirm against its definition.
handle_passed_args_count()
{
local _required_args_string="'dir', 'graph-url', 'graph-db', 'user' and 'password'"
local _required_args_string="'dir', 'graph-url', 'graph-database', 'user' and 'password'"
# Too few positionals
test "${_positionals_count}" -ge 5 || _PRINT_HELP=yes die "FATAL ERROR: Not enough positional arguments - we require exactly 5 (namely: $_required_args_string), but got only ${_positionals_count}." 1
# Too many positionals
test "${_positionals_count}" -le 5 || _PRINT_HELP=yes die "FATAL ERROR: There were spurious positional arguments --- we expect exactly 5 (namely: $_required_args_string), but got ${_positionals_count} (the last one was: '${_last_positional}')." 1
}
Expand All @@ -92,7 +99,7 @@ handle_passed_args_count()
assign_positional_args()
{
local _positional_name _shift_for=$1
_positional_names="_arg_dir _arg_graph_url _arg_graph_db _arg_user _arg_password "
_positional_names="_arg_dir _arg_graph_url _arg_graph_database _arg_user _arg_password "

shift "$_shift_for"
for _positional_name in ${_positional_names}
Expand All @@ -116,9 +123,10 @@ assign_positional_args 1 "${_positionals[@]}"
# Copy the parsed Argbash values into the shorter names used by the rest of the script.
# Directory whose .jsonld and .ttl files are uploaded
jsonld_dir=$_arg_dir
user=$_arg_user
password=$_arg_password
graph_db=$_arg_graph_db
graph_db=$_arg_graph_database
# Host and port of the graph database (e.g., localhost:7200)
graph_url=$_arg_graph_url
clear_data=$_arg_clear_data # value is either on or off (https://argbash.readthedocs.io/en/stable/guide.html#optional-arguments)
# Boolean flag initialised to "off"; "on" selects the GraphDB API endpoints, otherwise a Stardog database is assumed
use_graphdb_syntax=$_arg_use_graphdb_syntax

DELETE_TRIPLES_QUERY="
DELETE {
Expand All @@ -127,34 +135,81 @@ DELETE {
?s ?p ?o .
}"

# Choose the endpoints used for uploading and for clearing data, based on the graph backend.
# With GraphDB syntax both operations go to <database>/statements; otherwise (Stardog assumed)
# uploads go to the database root and clearing goes to <database>/update.
base_url="http://${graph_url}/${graph_db}"
case "$use_graphdb_syntax" in
  on)
    upload_data_url="${base_url}/statements"
    clear_data_url="${upload_data_url}"
    ;;
  *)
    upload_data_url="${base_url}"
    clear_data_url="${base_url}/update"
    ;;
esac


# Clear existing data in the graph database if requested (--clear-data).
# The SPARQL update held in DELETE_TRIPLES_QUERY is POSTed with basic auth; curl's
# -w "\n%{http_code}\n" appends the HTTP status code as the final line of the captured response.
if [ "$clear_data" = "on" ]; then
echo -e "\nClearing existing data from ${graph_db}..."
echo -e "\nCLEARING EXISTING DATA FROM ${graph_db}..."

curl -u "${user}:${password}" -X POST http://${graph_url}/${graph_db}/update \
response=$(curl -u "${user}:${password}" --no-progress-meter -i -w "\n%{http_code}\n" \
-X POST $clear_data_url \
-H "Content-Type: application/sparql-update" \
--data-binary "${DELETE_TRIPLES_QUERY}"

echo -e "Done clearing existing data from ${graph_db}.\n"
--data-binary "${DELETE_TRIPLES_QUERY}")

# Extract the status code from the final line of the response and check it:
# any code outside 200-299 is treated as a failed clear and aborts the script with status 1.
httpcode=$(tail -n1 <<< "$response")
if (( $httpcode < 200 || $httpcode >= 300 )); then
echo -e "\nERROR: Failed to clear ${graph_db}:"
# Show the response without its trailing status-code line
echo "$(sed '$d' <<< "$response")"
echo -e "\nEXITING..."
exit 1
fi
fi


# Add data to the specified graph database: every .jsonld file, then every .ttl file, in jsonld_dir
# is POSTed to the upload endpoint with the matching Content-Type.
# NOTE(review): ${jsonld_dir}, ${db}, ${file} and $upload_data_url are expanded unquoted below,
# so values containing whitespace or glob characters would be word-split — consider quoting.
echo "Uploading data from ${jsonld_dir} to ${graph_db}..."
echo -e "\nUPLOADING DATA FROM ${jsonld_dir} TO ${graph_db}...\n"

# Paths of files whose upload returned a status code outside 200-299
upload_failed=()

for db in ${jsonld_dir}/*.jsonld; do
curl -u "${user}:${password}" -i -X POST http://${graph_url}/${graph_db} \
-H "Content-Type: application/ld+json" \
--data-binary @${db}
# When no file matches, the glob pattern is left unexpanded and the loop would run once with the
# literal pattern string; skip that non-existent path
[ -e "$db" ] || continue

echo "$(basename ${db}):"
response=$(curl -u "${user}:${password}" --no-progress-meter -i -w "\n%{http_code}\n" \
-X POST $upload_data_url \
-H "Content-Type: application/ld+json" \
--data-binary @${db})

# The status code is the final line of the response (appended by curl's -w format)
httpcode=$(tail -n1 <<< "$response")
if (( $httpcode < 200 || $httpcode >= 300 )); then
upload_failed+=("${db}")
fi
# Print the rest of the response (everything except the status-code line) to stdout
echo -e "$(sed '$d' <<< "$response")\n"
done

# Same procedure for Turtle files, sent with Content-Type: text/turtle
for file in ${jsonld_dir}/*.ttl; do
curl -u "${user}:${password}" -i -X POST http://${graph_url}/${graph_db} \
-H "Content-Type: text/turtle" \
--data-binary @${file}
# Skip the literal pattern when no .ttl file matches
[ -e "$file" ] || continue

echo "$(basename ${file}):"
response=$(curl -u "${user}:${password}" --no-progress-meter -i -w "\n%{http_code}\n" \
-X POST $upload_data_url \
-H "Content-Type: text/turtle" \
--data-binary @${file})

httpcode=$(tail -n1 <<< "$response")
if (( $httpcode < 200 || $httpcode >= 300 )); then
upload_failed+=("${file}")
fi
echo -e "$(sed '$d' <<< "$response")\n"
done

echo "Finished uploading data from ${jsonld_dir} to ${graph_db}"
echo "FINISHED UPLOADING DATA FROM ${jsonld_dir} TO ${graph_db}."

# Print the files whose upload did not succeed and report the failure through the return status.
# Arguments: paths of the files that failed to upload (zero or more)
# Outputs:   an ERROR header followed by one path per line on stdout; nothing when no paths are given
# Returns:   0 when no paths are given, 1 otherwise
report_failed_uploads() {
  if (( $# == 0 )); then
    return 0
  fi
  echo -e "\nERROR: Upload failed for these files:"
  printf '%s\n' "$@"
  return 1
}

# Exit non-zero when any upload failed, so that callers (CI jobs, wrapper scripts) can detect the
# failure from the exit status instead of having to parse the printed output.
report_failed_uploads "${upload_failed[@]}" || exit 1

# ] <-- needed because of Argbash

0 comments on commit 0b44bcd

Please sign in to comment.