Skip to content

Commit 0eb0200

Browse files
authored
Merge branch 'main' into snyk-fix-e8264e89d591506fede024e9ef787bee
2 parents 2e8e0f5 + bd8a25c commit 0eb0200

File tree

65 files changed

+2178
-79
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

65 files changed

+2178
-79
lines changed

.github/workflows/maven.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ jobs:
1515
runs-on: ubuntu-latest
1616

1717
steps:
18-
- uses: actions/checkout@v4
18+
- name: Check out the repo
19+
uses: actions/checkout@v4
1920
with:
2021
fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis
2122

README.md

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ digital collections.
4040
* [`unique <boolean>`](#unique-boolean)
4141
* [`dependencies [id1, id2, ..., idN]`](#dependencies-id1-id2--idn)
4242
* [`dimension [criteria1, criteria2, ..., criteriaN]`](#dimension-criteria1-criteria2--criterian)
43+
* [`hasLanguageTag <anyOf|oneOf|allOf>`](#haslanguagetag-anyofoneofallof)
44+
* [`isMultilingual <boolean>`](#ismultilingual-boolean)
4345
- [General properties](#general-properties)
4446
* [`id <String>`](#id-string)
4547
* [`description <String>`](#description-string)
@@ -92,6 +94,9 @@ usage:
9294
[-f <format>] [-h <arg>] [-o <file>] [-r <path>] [-v <format>] [-w <format>] [-z]
9395
```
9496
* `-i,--input <file>` Input file.
97+
* `-n,--inputFormat <format>` (optional, String) The format of input file. Right now it supports two JSON variants:
98+
* `ndjson`: line delimited JSON in which every line is a new record (the default value)
99+
* `json-array`: JSON file that contains an array of objects
95100
* `-s,--schema <file>` Schema file describing the metadata structure to run assessment against.
96101
* `-v,--schemaFormat <format>` Format of schema file: json, yaml. Default: based on file extension, else json.
97102
* `-m,--measurements <file>` Configuration file for measurements.
@@ -505,6 +510,11 @@ a single data elements (a DataELement in the API). Its properties are:
505510
against
506511
* `indexField` (String): the name which can be used in a search engine connected
507512
to the application (at the time of writing Apache Solr is supported)
513+
* `inactive` (boolean): the data element is inactive, do not run checks on this
514+
* `identifierField` (boolean): the data element is the identifier of the record
515+
* `asLanguageTagged` (boolean): treat the data element as language tagged. It works
516+
for JSON where the content of the data element is encoded as an associative
517+
array, where the keys are the language tags.
508518

509519
Optionally you can set the "canonical list" of categories. It provides
510520
two additional functionalities
@@ -534,7 +544,7 @@ One can add constraints to the fields. There are content rules, which
534544
the tool will check. In this version the tool mimics SHACL constraints.
535545

536546
#### Cardinality
537-
One can specify with this properties how many occurrences of a data elemens
547+
One can specify with these constraints how many occurrences of a data element
538548
a record can have.
539549

540550
##### `minCount <number>`
@@ -877,6 +887,79 @@ fields:
877887
minHeight: 200
878888
```
879889

890+
##### `hasLanguageTag <anyOf|oneOf|allOf>`
891+
892+
(since v0.9.6)
893+
894+
It checks if the data element value has a language tag. In XML the language tag is
895+
found in `@xml:lang` attribute. In JSON it might be encoded differently. Right now
896+
MQAF supports the following encoding:
897+
898+
```json
899+
"description": {
900+
"de": ["Porträt"]
901+
}
902+
```
903+
904+
Since this kind of structure might be applied not only for the language annotation, at
905+
the field level we should set that the field is expected to have language annotation:
906+
907+
```yaml
908+
format: json
909+
fields:
910+
- name: description
911+
path: $.['description']
912+
asLanguageTagged: true
913+
```
914+
915+
The parameter defines whether any, one, or all instances should have a language annotation:
916+
917+
* `anyOf`: the test passes if at least one instance has language tag
918+
* `oneOf`: the test passes if one and only one instance has language tag
919+
* `allOf`: the test passes if all instances have language tag
920+
921+
A full example:
922+
923+
```yaml
924+
format: json
925+
fields:
926+
- name: description
927+
path: $.['description']
928+
asLanguageTagged: true
929+
rules:
930+
- hasLanguageTag: allOf
931+
```
932+
933+
##### `isMultilingual <boolean>`
934+
935+
(since v0.9.6)
936+
937+
It checks if the data element is multilingual, so it has at least two instances with
938+
different language annotations.
939+
940+
```json
941+
{
942+
"description":{
943+
"de":["Portr\u00e4t"],
944+
"zh":["\u8096\u50cf"]
945+
}
946+
}
947+
948+
```
949+
950+
An example schema:
951+
952+
```yaml
953+
format: json
954+
fields:
955+
- name: description
956+
path: $.['description']
957+
asLanguageTagged: true
958+
rules:
959+
- isMultilingual: true
960+
```
961+
962+
880963
#### General properties
881964

882965
##### `id <String>`

mqa

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
#-----------------------------------------------------------------------
33
# Command line interface for the Metadata Quality Assessment Framework -
44
#-----------------------------------------------------------------------
5-
export VERSION=0.9.4
5+
export VERSION=0.9.5-SNAPSHOT
66
java -jar $(dirname $(readlink -f $0))/metadata-qa-api-${VERSION}-shaded.jar $@

pom.xml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<groupId>de.gwdg.metadataqa</groupId>
66
<artifactId>metadata-qa-api</artifactId>
77
<packaging>jar</packaging>
8-
<version>0.9.4</version>
8+
<version>0.9.6-SNAPSHOT</version>
99
<name>Metadata Quality Assurance Framework API</name>
1010
<description>
1111
A metadata quality assurance framework. It checks some metrics of
@@ -86,12 +86,12 @@
8686
<dependency>
8787
<groupId>org.apache.commons</groupId>
8888
<artifactId>commons-lang3</artifactId>
89-
<version>3.14.0</version>
89+
<version>3.16.0</version>
9090
</dependency>
9191
<dependency>
9292
<groupId>commons-io</groupId>
9393
<artifactId>commons-io</artifactId>
94-
<version>2.16.0</version>
94+
<version>2.16.1</version>
9595
</dependency>
9696
<dependency>
9797
<groupId>commons-httpclient</groupId>
@@ -101,12 +101,12 @@
101101
<dependency>
102102
<groupId>org.slf4j</groupId>
103103
<artifactId>slf4j-simple</artifactId>
104-
<version>2.0.12</version>
104+
<version>2.0.14</version>
105105
</dependency>
106106
<dependency>
107107
<groupId>org.slf4j</groupId>
108108
<artifactId>slf4j-api</artifactId>
109-
<version>2.0.12</version>
109+
<version>2.0.14</version>
110110
</dependency>
111111
<!-- CSV reader. Docs: http://opencsv.sourceforge.net -->
112112
<dependency>
@@ -121,7 +121,7 @@
121121
<dependency>
122122
<groupId>org.apache.commons</groupId>
123123
<artifactId>commons-text</artifactId>
124-
<version>1.11.0</version>
124+
<version>1.12.0</version>
125125
</dependency>
126126
<!--
127127
Hierarchical context runner for jUnit
@@ -150,7 +150,7 @@
150150
<dependency>
151151
<groupId>commons-cli</groupId>
152152
<artifactId>commons-cli</artifactId>
153-
<version>1.6.0</version>
153+
<version>1.8.0</version>
154154
</dependency>
155155
<!-- language detection -->
156156
<dependency>
@@ -168,7 +168,7 @@
168168
<dependency>
169169
<groupId>com.fasterxml.jackson.core</groupId>
170170
<artifactId>jackson-databind</artifactId>
171-
<version>2.16.2</version>
171+
<version>2.17.2</version>
172172
</dependency>
173173
<dependency>
174174
<groupId>com.github.stefanbirkner</groupId>

src/main/java/de/gwdg/metadataqa/api/calculator/FieldExtractor.java

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,21 @@
44
import de.gwdg.metadataqa.api.counter.FieldCounter;
55
import de.gwdg.metadataqa.api.interfaces.Calculator;
66
import de.gwdg.metadataqa.api.interfaces.MetricResult;
7+
import de.gwdg.metadataqa.api.json.DataElement;
78
import de.gwdg.metadataqa.api.model.EdmFieldInstance;
89
import de.gwdg.metadataqa.api.model.selector.Selector;
910
import de.gwdg.metadataqa.api.model.XmlFieldInstance;
1011
import de.gwdg.metadataqa.api.problemcatalog.FieldCounterBasedResult;
1112
import de.gwdg.metadataqa.api.schema.Schema;
1213
import de.gwdg.metadataqa.api.util.FileUtils;
14+
import org.apache.commons.lang3.StringUtils;
1315

1416
import java.io.Serializable;
1517
import java.util.ArrayList;
18+
import java.util.LinkedHashSet;
1619
import java.util.List;
20+
import java.util.Set;
21+
import java.util.logging.Logger;
1722

1823
/**
1924
* Field extractor
@@ -22,6 +27,8 @@
2227
*/
2328
public class FieldExtractor implements Calculator, Serializable {
2429

30+
private static final Logger LOGGER = Logger.getLogger(FieldExtractor.class.getCanonicalName());
31+
2532
public static final String CALCULATOR_NAME = "fieldExtractor";
2633
public static final String FIELD_NAME = "recordId";
2734

@@ -56,30 +63,52 @@ public List<MetricResult> measure(Selector cache)
5663

5764
if (schema != null) {
5865
String path;
66+
DataElement dataELement;
5967
for (String fieldName : schema.getExtractableFields().keySet()) {
6068
if (idPath == null || !fieldName.equals(FIELD_NAME)) {
69+
dataELement = schema.getPathByLabel(fieldName);
6170
path = schema.getExtractableFields().get(fieldName);
62-
extractSingleField(cache, resultMap, path, fieldName);
71+
extractSingleField(cache, resultMap, path, fieldName, dataELement);
6372
}
6473
}
6574
}
6675
return List.of(new FieldCounterBasedResult<>(getCalculatorName(), resultMap).withNoCompression());
6776
}
6877

6978
private void extractSingleField(Selector cache, FieldCounter<String> resultMap, String path, String fieldName) {
70-
List<XmlFieldInstance> values = cache.get(path);
79+
extractSingleField(cache, resultMap, path, fieldName,null);
80+
}
81+
82+
private void extractSingleField(Selector cache,
83+
FieldCounter<String> resultMap,
84+
String path,
85+
String fieldName,
86+
DataElement dataELement) {
87+
List<XmlFieldInstance> fieldInstances;
88+
if (dataELement != null) {
89+
fieldInstances = cache.get(dataELement);
90+
} else {
91+
fieldInstances = cache.get(path);
92+
}
7193
String value = null;
72-
if (values == null || values.isEmpty() || values.get(0) == null) {
94+
if (fieldInstances == null || fieldInstances.isEmpty() || fieldInstances.get(0) == null) {
7395
value = nullValue;
7496
} else {
75-
XmlFieldInstance instance = values.get(0);
76-
boolean isEdm = instance instanceof EdmFieldInstance;
77-
if (isEdm && ((EdmFieldInstance)instance).getResource() != null) {
78-
value = ((EdmFieldInstance) instance).getResource();
79-
} else if (instance.getValue() != null) {
80-
value = instance.getValue();
97+
Set<String> values = new LinkedHashSet<>();
98+
for (XmlFieldInstance instance : fieldInstances) {
99+
boolean isEdm = instance instanceof EdmFieldInstance;
100+
if (isEdm && ((EdmFieldInstance)instance).getResource() != null) {
101+
value = ((EdmFieldInstance) instance).getResource();
102+
} else if (instance.getValue() != null) {
103+
value = instance.getValue();
104+
}
105+
// if (!values.contains(values))
106+
if (StringUtils.isNotBlank(value))
107+
values.add(value);
81108
}
109+
value = StringUtils.join(values, " --- ");
82110
}
111+
// LOGGER.info("value: " + value);
83112
resultMap.put(fieldName, value);
84113
}
85114

src/main/java/de/gwdg/metadataqa/api/calculator/Indexer.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,10 @@ public Indexer(SolrClient solrClient, Schema schema) {
3333
@Override
3434
public List<MetricResult> measure(Selector cache) {
3535
try {
36-
String recordId = extractValue(cache, schema.getRecordId().getPath()).get(0);
36+
List<String> extractedValues = extractValue(cache, schema.getRecordId().getPath());
37+
if (extractedValues.isEmpty())
38+
throw new RuntimeException(String.format("Missing record ID (path: %s)", schema.getRecordId().getPath()));
39+
String recordId = extractedValues.get(0);
3740

3841
Map<String, List<String>> resultMap = new HashMap<>();
3942
for (UniquenessField solrField : solrFields) {

src/main/java/de/gwdg/metadataqa/api/cli/App.java

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package de.gwdg.metadataqa.api.cli;
22

3+
import com.jayway.jsonpath.InvalidJsonException;
34
import com.opencsv.exceptions.CsvValidationException;
45
import de.gwdg.metadataqa.api.calculator.CalculatorFacade;
56
import de.gwdg.metadataqa.api.io.reader.XMLRecordReader;
@@ -9,6 +10,7 @@
910
import de.gwdg.metadataqa.api.interfaces.MetricResult;
1011
import de.gwdg.metadataqa.api.io.reader.RecordReader;
1112
import de.gwdg.metadataqa.api.schema.Schema;
13+
import net.minidev.json.parser.ParseException;
1214
import org.apache.commons.cli.CommandLine;
1315
import org.apache.commons.cli.CommandLineParser;
1416
import org.apache.commons.cli.DefaultParser;
@@ -42,6 +44,7 @@ public class App {
4244

4345
// Arguments
4446
private static final String INPUT_FILE = "input";
47+
private static final String INPUT_FORMAT = "inputFormat";
4548
private static final String OUTPUT_FILE = "output";
4649
private static final String OUTPUT_FORMAT = "outputFormat";
4750
private static final String SCHEMA_CONFIG = "schema";
@@ -101,7 +104,8 @@ public App(CommandLine cmd) throws IOException, CsvValidationException {
101104

102105
// initialize input
103106
String inputFile = cmd.getOptionValue(INPUT_FILE);
104-
this.inputReader = RecordFactory.getRecordReader(inputFile, calculator, cmd.hasOption(GZIP_FLAG));
107+
InputFormat inputFormat = InputFormat.byCode(cmd.getOptionValue(INPUT_FORMAT));
108+
this.inputReader = RecordFactory.getRecordReader(inputFile, calculator, cmd.hasOption(GZIP_FLAG), inputFormat);
105109

106110
// initialize output
107111
String outFormat = cmd.getOptionValue(OUTPUT_FORMAT, NDJSON);
@@ -162,6 +166,14 @@ private static Options buildOptions() {
162166
.desc("Input file.")
163167
.build();
164168

169+
Option inputFormatOption = Option.builder("n")
170+
.numberOfArgs(1)
171+
.argName("inputFormat")
172+
.required(false)
173+
.longOpt(INPUT_FORMAT)
174+
.desc("Format of the input: json, ndjson (new line delimited JSON), json-array (JSON file that contains an array of objects). Default: ndjson.")
175+
.build();
176+
165177
Option outputOption = Option.builder("o")
166178
.numberOfArgs(1)
167179
.argName("file")
@@ -233,6 +245,7 @@ private static Options buildOptions() {
233245
.build();
234246

235247
options.addOption(inputOption);
248+
options.addOption(inputFormatOption);
236249
options.addOption(outputOption);
237250
options.addOption(outputFormatOption);
238251
options.addOption(schemaConfigOption);
@@ -253,7 +266,6 @@ private void run() {
253266
outputWriter.writeHeader(header);
254267

255268
while (inputReader.hasNext()) {
256-
257269
Map<String, List<MetricResult>> measurement = inputReader.next();
258270
outputWriter.writeResult(measurement);
259271

@@ -265,9 +277,10 @@ private void run() {
265277
}
266278
logger.info(String.format("Assessment completed successfully with %s records. ", counter));
267279
outputWriter.close();
268-
} catch (IOException e) {
280+
} catch (InvalidJsonException | IOException e) {
269281
logger.severe(String.format("Assessment failed with %s records. ", counter));
270282
logger.severe(e.getMessage());
283+
e.printStackTrace();
271284
}
272285
}
273286
}

0 commit comments

Comments
 (0)