From 69150b1627ec2b534bc16751fa21ab2c51250065 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20B=C3=BClte?= Date: Mon, 3 Jun 2024 13:46:40 +0200 Subject: [PATCH] Keep single records for every id Created objects that are kept in path: records with different ids but the same rvk elements, encoded them as json and reopened them with direction records as the record container. By that I am able to create a single record for each id. --- ...Rvk-Verbundbibliothek_concordance_csv.flux | 5 ++-- .../fix-cg-to-es.fix | 27 ++++++++++++------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux b/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux index ca4c6de..20d1cd2 100644 --- a/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux +++ b/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux @@ -11,7 +11,7 @@ // // curl -XPOST --header 'Content-Type: application/x-ndjson' -d @bulk.ndjson 'http://localhost:9200/_bulk' -default outfile = FLUX_DIR + "bulk.json"; +default outfile = FLUX_DIR + "bulk.csv"; default infile = FLUX_DIR + "aggregate_auslieferung_20191212.small.marcxml.gz"; default fixfile = FLUX_DIR + "fix-cg-to-es.fix"; @@ -21,7 +21,8 @@ infile | decode-xml | handle-marcxml | fix(fixfile) +| encode-json +| decode-json(recordPath="records") | encode-csv -//encode-json | write(outfile) ; \ No newline at end of file diff --git a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix index c21b900..dc881db 100644 --- a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix +++ b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix @@ -1,34 +1,41 @@ -set_array("id") -set_array("rvk[]") +set_array("records[]") +set_array("@id[]") +set_array("@rvk[]") do list(path: "084??", "var": "$i") if any_match("$i.2", 
"rvk") - copy_field("$i.a","rvk[].$append") + copy_field("$i.a","@rvk[].$append") end end +uniq("@rvk[]") + do list(path: "035??", "var": "$i") if any_match("$i.a", "^\\(DE-605\\)(.*)") - copy_field("$i.a","id.$append") + copy_field("$i.a","@id[].$append") end end -replace_all("id.*","^\\(DE-605\\)(.*)","$1") -join_field("id",", ") +replace_all("@id[].*","^\\(DE-605\\)(.*)","$1") + +do list(path: "@id[]", "var": "$i") + copy_field("$i","records[].$append.id") + copy_field("@rvk[]","records[].$last.rvk[]") +end +replace_all("records[].*.id","^\\(DE-605\\)(.*)","$1") -retain("rvk[]","id") vacuum() # Filter records without RVK -unless exists("rvk[]") +unless exists("@rvk[]") reject() end # Filter records without hbz ids -unless exists("id") +unless exists("@id[]") reject() end - +retain("records[]")