Skip to content

Commit

Permalink
Group results in completeness #199
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Mar 14, 2023
1 parent 282232c commit 4a5fdd3
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 65 deletions.
15 changes: 15 additions & 0 deletions common-script
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,9 @@ do_sqlite() {
printf "%s %s> create importable files\n" $(date +"%F %T")
tail -n +2 ${OUTPUT_DIR}/issue-details-normalized.csv > ${OUTPUT_DIR}/issue-details-normalized_noheader.csv
tail -n +2 ${OUTPUT_DIR}/issue-summary.csv > ${OUTPUT_DIR}/issue-summary_noheader.csv
if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
tail -n +2 ${OUTPUT_DIR}/id-groupid.csv > ${OUTPUT_DIR}/id-groupid_noheader.csv
fi

printf "%s %s> import issue details\n" $(date +"%F %T")
sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite << EOF
Expand All @@ -283,9 +286,21 @@ EOF
.import ${OUTPUT_DIR}/issue-summary_noheader.csv issue_summary
EOF

if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
printf "%s %s> import id_groupid\n" $(date +"%F %T")
sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite << EOF
.mode csv
.import ${OUTPUT_DIR}/id-groupid_noheader.csv id_groupid
EOF
fi


printf "%s %s> delete importable files\n" $(date +"%F %T")
rm ${OUTPUT_DIR}/issue-details-normalized_noheader.csv
rm ${OUTPUT_DIR}/issue-summary_noheader.csv
if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
rm ${OUTPUT_DIR}/id-groupid_noheader.csv
fi

if [[ "${HAS_GROUP_PARAM}" == "0" ]]; then
printf "%s %s> index\n" $(date +"%F %T")
Expand Down
4 changes: 4 additions & 0 deletions scripts/sqlite/qa_catalogue.groupped.sqlite.sql
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,8 @@ CREATE TABLE IF NOT EXISTS "issue_details" (
"errorId" INTEGER,
"instances" INTEGER
);
-- Pairs of record identifier and group identifier, one row per pair.
-- Populated by the sqlite import step from id-groupid.csv, whose header is "id,groupId".
CREATE TABLE IF NOT EXISTS "id_groupid" (
"id" TEXT,
"groupId" TEXT
);
COMMIT;
15 changes: 11 additions & 4 deletions src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.marc4j.marc.Record;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Files;
Expand Down Expand Up @@ -50,7 +50,7 @@ public class Completeness extends QACli implements BibliographicInputProcessor,
private CompletenessPlugin plugin;
private RecordFilter recordFilter;
private RecordIgnorator recordIgnorator;

private File idCollectorFile;

public Completeness(String[] args) throws ParseException {
parameters = new CompletenessParameters(args);
Expand All @@ -59,6 +59,10 @@ public Completeness(String[] args) throws ParseException {
recordIgnorator = parameters.getRecordIgnorator();
initializeGroups(parameters.getGroupBy(), parameters.isPica());
readyToProcess = true;
if (doGroups()) {
idCollectorFile = prepareReportFile(parameters.getOutputDir(), "id-groupid.csv");
printToFile(idCollectorFile, CsvUtils.createCsv("id", "groupId"));
}
}

public static void main(String[] args) {
Expand Down Expand Up @@ -103,9 +107,12 @@ public void processRecord(BibliographicRecord bibliographicRecord, int recordNum
RecordCompleteness recordCompleteness = new RecordCompleteness(bibliographicRecord, parameters, completenessDAO, plugin, groupBy);
recordCompleteness.process();

if (groupBy != null)
for (String id : recordCompleteness.getGroupIds())
if (doGroups()) {
for (String id : recordCompleteness.getGroupIds()) {
count(id, completenessDAO.getGroupCounter());
printToFile(idCollectorFile, CsvUtils.createCsv(bibliographicRecord.getId(true), id));
}
}

for (String key : recordCompleteness.getRecordFrequency().keySet()) {
if (groupBy != null) {
Expand Down
26 changes: 26 additions & 0 deletions src/main/java/de/gwdg/metadataqa/marc/cli/QACli.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,21 @@
import de.gwdg.metadataqa.marc.cli.parameters.CommonParameters;
import de.gwdg.metadataqa.marc.utils.BibiographicPath;
import de.gwdg.metadataqa.marc.utils.pica.path.PicaPathParser;
import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

public abstract class QACli {
private static final Logger logger = Logger.getLogger(QACli.class.getCanonicalName());

protected BibiographicPath groupBy = null;

Expand Down Expand Up @@ -61,4 +66,25 @@ public boolean doGroups() {
return groupBy != null;
}

/**
 * Resolves a report file inside the output directory and removes a previously
 * existing file of the same name, so that subsequent appends start from scratch.
 * <p>
 * A failed deletion is logged at SEVERE level; the file handle is returned in any case.
 *
 * @param outputDir The directory the report is written to
 * @param fileName The name of the report file
 * @return The report file (which may or may not exist on disk)
 */
protected File prepareReportFile(String outputDir, String fileName) {
  File reportFile = new File(outputDir, fileName);
  if (reportFile.exists() && !reportFile.delete()) {
    // java.util.logging formats parameters with MessageFormat: the placeholder must be
    // indexed ({0}), and a lone apostrophe would be interpreted as a quote character,
    // hence "has not" instead of "hasn't".
    logger.log(Level.SEVERE, "File {0} has not been deleted", reportFile.getAbsolutePath());
  }
  return reportFile;
}

/**
 * Appends text to a file.
 * <p>
 * The text is encoded with the platform default charset. An {@link IOException}
 * raised while writing is logged at SEVERE level and is not propagated to the caller.
 *
 * @param file The output file
 * @param content The content to append
 */
protected void printToFile(File file, String content) {
  final Charset encoding = Charset.defaultCharset();
  final boolean append = true;
  try {
    FileUtils.writeStringToFile(file, content, encoding, append);
  } catch (IOException ioException) {
    logger.log(Level.SEVERE, "printToFile", ioException);
  }
}

}
22 changes: 0 additions & 22 deletions src/main/java/de/gwdg/metadataqa/marc/cli/ValidatorCli.java
Original file line number Diff line number Diff line change
Expand Up @@ -146,14 +146,6 @@ public void beforeIteration() {
saveParameters("validation.params.json", parameters);
}

/**
 * Resolves a report file inside the output directory and removes a previously
 * existing file of the same name.
 * <p>
 * A failed deletion is logged at SEVERE level; the file handle is returned in any case.
 *
 * @param outputDir The directory the report is written to
 * @param fileName The name of the report file
 * @return The report file (which may or may not exist on disk)
 */
private File prepareReportFile(String outputDir, String fileName) {
  File reportFile = new File(outputDir, fileName);
  if (reportFile.exists() && !reportFile.delete()) {
    // java.util.logging formats parameters with MessageFormat: the placeholder must be
    // indexed ({0}), and a lone apostrophe would be interpreted as a quote character.
    logger.log(Level.SEVERE, "File {0} has not been deleted", reportFile.getAbsolutePath());
  }
  return reportFile;
}

@Override
public void fileOpened(Path currentFile) {
// do nothing
Expand Down Expand Up @@ -588,20 +580,6 @@ private void print(File file, String content) {
}
}

/**
 * Appends text to a file.
 * <p>
 * The text is encoded with the platform default charset. An {@link IOException}
 * raised while writing is never propagated; it is logged at SEVERE level, but only
 * when logging is enabled in the parameters.
 *
 * @param file The output file
 * @param content The content to append
 */
private void printToFile(File file, String content) {
  final Charset encoding = Charset.defaultCharset();
  final boolean append = true;
  try {
    FileUtils.writeStringToFile(file, content, encoding, append);
  } catch (IOException ioException) {
    if (parameters.doLog()) {
      logger.log(Level.SEVERE, "printToFile", ioException);
    }
  }
}

private void updateErrorCollector(String recordId, int errorId) {
if (!validatorDAO.getErrorCollector().containsKey(errorId)) {
validatorDAO.getErrorCollector().put(errorId, new HashSet<>());
Expand Down
82 changes: 81 additions & 1 deletion src/test/java/de/gwdg/metadataqa/marc/cli/CompletenessTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ public void setUp() throws Exception {
"completeness-groups.csv",
"completeness-groupped-marc-elements.csv",
"completeness-groupped-packages.csv",
"completeness.params.json"
"completeness.params.json",
"id-groupid.csv"
);
}

Expand Down Expand Up @@ -179,6 +180,7 @@ public void completeness_pica_groupBy_file() throws Exception {
lineNr++;
}
reader.close();

} else if (outputFile.equals("completeness-groups.csv")) {
output = new File(outputDir, outputFile);
assertTrue(output.exists());
Expand Down Expand Up @@ -234,6 +236,84 @@ public void completeness_pica_groupBy_file() throws Exception {
"all,all,10\n",
actual);

} else if (outputFile.equals("completeness-groupped-packages.csv")) {
output = new File(outputDir, outputFile);
assertTrue(output.exists());
List<String> lines = FileUtils.readLinesFromFile(output.toPath().toString());
assertEquals("group,documenttype,packageid,name,label,iscoretag,count", lines.get(0));
assertEquals("100,Druckschriften (einschließlich Bildbänden),50,0...,PICA+ bibliograhic description,false,1", lines.get(1));
assertEquals("100,Druckschriften (einschließlich Bildbänden),99,unknown,unknown origin,false,1", lines.get(2));
assertEquals("100,all,50,0...,PICA+ bibliograhic description,false,1", lines.get(3));
assertEquals("100,all,99,unknown,unknown origin,false,1", lines.get(4));

} else if (outputFile.equals("id-groupid.csv")) {
output = new File(outputDir, outputFile);
assertTrue(output.exists());
List<String> lines = FileUtils.readLinesFromFile(output.toPath().toString());
assertEquals(76, lines.size());
assertEquals("id,groupId", lines.get(0));
assertEquals("010000011,all", lines.get(1));
assertEquals("010000011,77", lines.get(2));
assertEquals("010000011,2035", lines.get(3));
assertEquals("010000011,70", lines.get(4));
assertEquals("010000011,20", lines.get(5));

} else if (outputFile.equals("completeness.params.json")) {
output = new File(outputDir, outputFile);
assertTrue(output.exists());
String line = Files.readString(output.toPath());
assertTrue(line.contains("{\"args\":[\""));
assertTrue(line.contains("metadata-qa-marc/src/test/resources/pica/pica-with-holdings-info.dat\"],"));
assertTrue(line.contains("\"marcVersion\":\"MARC21\","));
assertTrue(line.contains("\"marcFormat\":\"PICA_NORMALIZED\","));
assertTrue(line.contains("\"dataSource\":\"FILE\","));
assertTrue(line.contains("\"limit\":-1,"));
assertTrue(line.contains("\"offset\":-1,"));
assertTrue(line.contains("\"id\":null,"));
assertTrue(line.contains("\"defaultRecordType\":null,"));
assertTrue(line.contains("\"alephseq\":false,"));
assertTrue(line.contains("\"marcxml\":false,"));
assertTrue(line.contains("\"lineSeparated\":false,"));
assertTrue(line.contains("\"trimId\":false,"));
assertTrue(line.contains("\"outputDir\":\"/"));
assertTrue(line.contains("/metadata-qa-marc/src/test/resources/output\","));
assertTrue(line.contains("\"recordIgnorator\":{\"criteria\":[],\"booleanCriteria\":null,\"empty\":true},"));
assertTrue(line.contains("\"recordFilter\":{\"criteria\":[],\"booleanCriteria\":null,\"empty\":true},"));
assertTrue(line.contains("\"ignorableFields\":{\"fields\":null,\"empty\":true},"));
assertTrue(line.contains("\"stream\":null,"));
assertTrue(line.contains("\"defaultEncoding\":null,"));
assertTrue(line.contains("\"alephseqLineType\":null,"));
assertTrue(line.contains("\"picaIdField\":\"003@$0\","));
assertTrue(line.contains("\"picaSubfieldSeparator\":\"$\","));
assertTrue(line.contains("\"picaSchemaFile\":null,"));
assertTrue(line.contains("\"picaRecordTypeField\":\"002@$0\","));
assertTrue(line.contains("\"schemaType\":\"PICA\","));
assertTrue(line.contains("\"groupBy\":\"001@$0\","));
assertTrue(line.contains("\"groupListFile\":\"/"));
assertTrue(line.contains("metadata-qa-marc/target/test-classes/k10plus-libraries-by-unique-iln.txt\","));
assertTrue(line.contains("\"format\":\"COMMA_SEPARATED\","));
assertTrue(line.contains("\"advanced\":false,"));
assertTrue(line.contains("\"onlyPackages\":false,"));
assertTrue(line.contains("\"pica\":true,"));
assertTrue(line.contains("\"replacementInControlFields\":null,"));
assertTrue(line.contains("\"marc21\":false,"));
assertTrue(line.contains("\"mqaf.version\":\"0.9.0\","));
assertTrue(line.contains("\"qa-catalogue.version\":\"0.7.0-SNAPSHOT\"}"));

} else if (outputFile.equals("libraries.csv")) {
output = new File(outputDir, outputFile);
assertTrue(output.exists());
String actual = Files.readString(output.toPath());
assertEquals("library,count\n", actual);

} else if (outputFile.equals("libraries003.csv")) {
output = new File(outputDir, outputFile);
assertTrue(output.exists());
String actual = Files.readString(output.toPath());
assertEquals("library,count\n", actual);

} else {
fail("Untested file: " + outputFile);
}
output.delete();
assertFalse(outputFile + " should not exist anymore", output.exists());
Expand Down
78 changes: 40 additions & 38 deletions src/test/java/de/gwdg/metadataqa/marc/cli/ValidatorCliTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import static junit.framework.TestCase.assertTrue;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.fail;

public class ValidatorCliTest extends CliTestUtils {

Expand Down Expand Up @@ -209,46 +210,47 @@ public void validate_pica_groupBy() throws Exception {

} else if (outputFile.equals("validation.params.json")) {
assertEquals(1, lines.size());
assertTrue(lines.get(0).contains("\"args\":[\""));
assertTrue(lines.get(0).contains("metadata-qa-marc/src/test/resources/pica/pica-with-holdings-info.dat\"]"));
assertTrue(lines.get(0).contains("\"marcVersion\":\"MARC21\","));
assertTrue(lines.get(0).contains("\"marcFormat\":\"PICA_NORMALIZED\","));
assertTrue(lines.get(0).contains("\"dataSource\":\"FILE\","));
assertTrue(lines.get(0).contains("\"limit\":-1,"));
assertTrue(lines.get(0).contains("\"offset\":-1,"));
assertTrue(lines.get(0).contains("\"id\":null,"));
assertTrue(lines.get(0).contains("\"defaultRecordType\":\"BOOKS\","));
assertTrue(lines.get(0).contains("\"alephseq\":false,"));
assertTrue(lines.get(0).contains("\"marcxml\":false,"));
assertTrue(lines.get(0).contains("\"lineSeparated\":false,"));
assertTrue(lines.get(0).contains("\"trimId\":true,"));
assertTrue(lines.get(0).contains("\"outputDir\":\""));
assertTrue(lines.get(0).contains("metadata-qa-marc/src/test/resources/output\","));
assertTrue(lines.get(0).contains("\"recordIgnorator\":{\"criteria\":[],\"booleanCriteria\":null,\"empty\":true},"));
assertTrue(lines.get(0).contains("\"recordFilter\":{\"criteria\":[],\"booleanCriteria\":null,\"empty\":true},"));
assertTrue(lines.get(0).contains("\"ignorableFields\":{\"fields\":null,\"empty\":true},"));
assertTrue(lines.get(0).contains("\"stream\":null,"));
assertTrue(lines.get(0).contains("\"defaultEncoding\":null,"));
assertTrue(lines.get(0).contains("\"alephseqLineType\":null,"));
assertTrue(lines.get(0).contains("\"picaIdField\":\"003@$0\","));
assertTrue(lines.get(0).contains("\"picaSubfieldSeparator\":\"$\","));
assertTrue(lines.get(0).contains("\"picaSchemaFile\":null,"));
assertTrue(lines.get(0).contains("\"picaRecordTypeField\":\"002@$0\","));
assertTrue(lines.get(0).contains("\"schemaType\":\"PICA\","));
assertTrue(lines.get(0).contains("\"groupBy\":\"001@$0\","));
assertTrue(lines.get(0).contains("\"groupListFile\":null,"));
assertTrue(lines.get(0).contains("\"detailsFileName\":\"issue-details.csv\","));
assertTrue(lines.get(0).contains("\"summaryFileName\":\"issue-summary.csv\","));
assertTrue(lines.get(0).contains("\"format\":\"COMMA_SEPARATED\","));
assertTrue(lines.get(0).contains("\"ignorableIssueTypes\":null,"));
assertTrue(lines.get(0).contains("\"pica\":true,"));
assertTrue(lines.get(0).contains("\"replacementInControlFields\":null,"));
assertTrue(lines.get(0).contains("\"marc21\":false,"));
assertTrue(lines.get(0).contains("\"mqaf.version\":\"0.9.0\","));
assertTrue(lines.get(0).contains("\"qa-catalogue.version\":\"0.7.0-SNAPSHOT\"}"));
String line = lines.get(0);
assertTrue(line.contains("\"args\":[\""));
assertTrue(line.contains("metadata-qa-marc/src/test/resources/pica/pica-with-holdings-info.dat\"]"));
assertTrue(line.contains("\"marcVersion\":\"MARC21\","));
assertTrue(line.contains("\"marcFormat\":\"PICA_NORMALIZED\","));
assertTrue(line.contains("\"dataSource\":\"FILE\","));
assertTrue(line.contains("\"limit\":-1,"));
assertTrue(line.contains("\"offset\":-1,"));
assertTrue(line.contains("\"id\":null,"));
assertTrue(line.contains("\"defaultRecordType\":\"BOOKS\","));
assertTrue(line.contains("\"alephseq\":false,"));
assertTrue(line.contains("\"marcxml\":false,"));
assertTrue(line.contains("\"lineSeparated\":false,"));
assertTrue(line.contains("\"trimId\":true,"));
assertTrue(line.contains("\"outputDir\":\""));
assertTrue(line.contains("metadata-qa-marc/src/test/resources/output\","));
assertTrue(line.contains("\"recordIgnorator\":{\"criteria\":[],\"booleanCriteria\":null,\"empty\":true},"));
assertTrue(line.contains("\"recordFilter\":{\"criteria\":[],\"booleanCriteria\":null,\"empty\":true},"));
assertTrue(line.contains("\"ignorableFields\":{\"fields\":null,\"empty\":true},"));
assertTrue(line.contains("\"stream\":null,"));
assertTrue(line.contains("\"defaultEncoding\":null,"));
assertTrue(line.contains("\"alephseqLineType\":null,"));
assertTrue(line.contains("\"picaIdField\":\"003@$0\","));
assertTrue(line.contains("\"picaSubfieldSeparator\":\"$\","));
assertTrue(line.contains("\"picaSchemaFile\":null,"));
assertTrue(line.contains("\"picaRecordTypeField\":\"002@$0\","));
assertTrue(line.contains("\"schemaType\":\"PICA\","));
assertTrue(line.contains("\"groupBy\":\"001@$0\","));
assertTrue(line.contains("\"groupListFile\":null,"));
assertTrue(line.contains("\"detailsFileName\":\"issue-details.csv\","));
assertTrue(line.contains("\"summaryFileName\":\"issue-summary.csv\","));
assertTrue(line.contains("\"format\":\"COMMA_SEPARATED\","));
assertTrue(line.contains("\"ignorableIssueTypes\":null,"));
assertTrue(line.contains("\"pica\":true,"));
assertTrue(line.contains("\"replacementInControlFields\":null,"));
assertTrue(line.contains("\"marc21\":false,"));
assertTrue(line.contains("\"mqaf.version\":\"0.9.0\","));
assertTrue(line.contains("\"qa-catalogue.version\":\"0.7.0-SNAPSHOT\"}"));

} else {
assertTrue("Unhandlet output: " + outputFile, outputFile.equals(""));
fail("Untested output file: " + outputFile);
}

output.delete();
Expand Down

0 comments on commit 4a5fdd3

Please sign in to comment.