This repository has been archived by the owner on Jan 16, 2021. It is now read-only.
/
PerseusDatafileImporter.groovy
151 lines (128 loc) · 5.5 KB
/
PerseusDatafileImporter.groovy
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
package org.pelagios.graph.importer.perseus
import org.pelagios.graph.PelagiosGraph;
import org.pelagios.graph.builder.GeoAnnotationBuilder;
import org.pelagios.graph.builder.DatasetBuilder;
import org.pelagios.graph.builder.PelagiosGraphImpl;
import org.pelagios.graph.exceptions.DatasetExistsException;
import org.pelagios.graph.importer.AbstractDatasetImporter;
import org.pelagios.graph.importer.Hierarchy;
import com.hp.hpl.jena.rdf.model.Resource
import com.hp.hpl.jena.vocabulary.DC;
/**
* Importer for one Perseus data file. RDF/XML sample:
*
* <oac:Annotation rdf:about="org.perseus:entityoccurrence:1867133:1:Perseus:text:1999.02.0025:book=3:poem=17">
* <oac:hasBody>
* <rdf:Description rdf:about="http://pleiades.stoa.org/places/432839"/>
* </oac:hasBody>
* <oac:hasTarget>
* <rdf:Description rdf:about="http://www.perseus.tufts.edu/hopper/xmlchunk?doc=Perseus:text:1999.02.0025:book=3:poem=17"/>
* </oac:hasTarget>
* <dc:title xml:lang="en">Formiae (Italy)</dc:title>
* <dcterms:creator>Perseus Digital Library</dcterms:creator>
* <dcterms:created>2011-05-19</dcterms:created>
* </oac:Annotation>
*
* To introduce some structuring in the Perseus data, I took the liberty of
* defining two levels of hierarchy, based on the URI format:
*
* org.perseus:entityoccurrence:1867133:1:Perseus:text:1999.02.0025:book=3:poem=17
*
* Level one is all annotations with the same 'text' number (e.g. 1999.02.0025)
*
* Level two is all annotations with same book number (e.g. 3)
*
* @author Rainer Simon
*/
class PerseusDatafileImporter extends AbstractDatasetImporter {

    /**
     * Name given to the dataset node created by {@link #importData}; also
     * used as the first element of every hierarchy built by
     * {@link #getHierarchy}.
     */
    private String name

    /**
     * Creates an importer for a single Perseus data file.
     *
     * @param name the name of the dataset node this importer creates
     * @param rdf the RDF file to import (passed on to the superclass)
     * @param rootNode the existing dataset node that is passed to the graph
     *        together with the newly created dataset (presumably its parent -
     *        confirm against PelagiosGraph.addDataset)
     */
    public PerseusDatafileImporter(String name, File rdf, DatasetBuilder rootNode) {
        super(rdf, rootNode)
        this.name = name
    }

    /**
     * Imports the RDF data into the Pelagios Graph.
     *
     * A new dataset named after this importer is added to the graph first and
     * then replaces {@code rootNode}. Every OAC annotation is turned into a
     * {@link GeoAnnotationBuilder}, grouped by the hierarchy derived from the
     * annotation URI, and all groups are handed to {@code batchAdd} in one go.
     *
     * @param graph the graph
     * @throws DatasetExistsException declared by this method; presumably
     *         raised when adding the new dataset to the graph
     * @throws RuntimeException if a target or body URI in the data is
     *         syntactically invalid
     */
    @Override
    public void importData(PelagiosGraph graph) throws DatasetExistsException {
        // Start by creating the collection root node and making this
        // the root of this dataset
        DatasetBuilder newRoot = new DatasetBuilder(name)
        graph.addDataset(newRoot, rootNode)
        rootNode = newRoot

        HashMap<Hierarchy, List<GeoAnnotationBuilder>> allRecords =
            new HashMap<Hierarchy, List<GeoAnnotationBuilder>>()

        for (Resource oac : listOACAnnotations()) {
            // Annotation URI for building the hierarchy
            Hierarchy h = getHierarchy(oac.toString())

            // Target = data record URN
            String recordURL = oac.getProperty(OAC_HASTARGET).getObject().asResource().getURI()

            // Body = Pleiades Place
            String pleiadesURL = oac.getProperty(OAC_HASBODY).getObject().toString()

            // Title
            String title = oac.getProperty(DC.title).getObject().asLiteral().getString()

            // Create the record and store in memory - we'll batch-add
            // all records to the graph later for added performance
            try {
                List<GeoAnnotationBuilder> records = allRecords.get(h)
                if (records == null) {
                    records = new ArrayList<GeoAnnotationBuilder>()
                    allRecords.put(h, records)
                }

                // The target points at the 'xmlchunk' endpoint; rewrite it to
                // the 'text' endpoint and escape blanks so it parses as a URI
                GeoAnnotationBuilder annotation = new GeoAnnotationBuilder(
                    new URI(recordURL.replace("xmlchunk", "text").replace(" ", "%20")),
                    new URI(pleiadesURL))
                annotation.setLabel(title)
                records.add(annotation)
            } catch (URISyntaxException e) {
                // Only happens in case of data set errors - we have a
                // zero-tolerance policy for those kinds of things
                throw new RuntimeException(e)
            }
        }

        batchAdd(allRecords, graph)
    }

    /**
     * Utility method which strips out the 'hierarchy' from the URI.
     * For example, the URI
     *
     * org.perseus:entityoccurrence:1867974:1:Perseus:text:1999.01.0084:book=12:chapter=36
     *
     * yields the levels: importer name > '1999.01.0084' > 'Book 12' > 'Chapter 36'.
     *
     * The URI is split on ':'. The first six tokens are skipped, the seventh
     * is taken verbatim as the text number, and each following token that
     * starts with 'book=', 'chapter', 'section', 'speech', 'narrative' or
     * 'poem' adds one more level. Other tokens are ignored.
     *
     * @param uri the URI
     * @return the hierarchy
     * @throws RuntimeException if the URI has fewer than 8 ':'-separated tokens
     */
    Hierarchy getHierarchy(String uri) {
        List<String> hierarchy = new ArrayList<String>()
        hierarchy.add(name)

        StringTokenizer tokenizer = new StringTokenizer(uri, ":")
        if (tokenizer.countTokens() < 8) {
            throw new RuntimeException("Illegal URI format - looks like an error in the dataset: " + uri)
        }

        // Skip the first 6 tokens
        for (int i = 0; i < 6; i++) {
            tokenizer.nextToken()
        }

        // E.g. '1999.01.0084'
        String prefix = tokenizer.nextToken()
        hierarchy.add(prefix)

        // Look for 'book', 'chapter', 'section', 'speech', 'narrative' and 'poem'
        while (tokenizer.hasMoreTokens()) {
            String nextToken = tokenizer.nextToken()
            if (nextToken.startsWith('book=')) {
                hierarchy.add('Book ' + valueOf(nextToken))
            } else if (nextToken.startsWith('chapter')) {
                hierarchy.add('Chapter ' + valueOf(nextToken))
            } else if (nextToken.startsWith('section')) {
                hierarchy.add('Section ' + valueOf(nextToken))
            } else if (nextToken.startsWith('speech')) {
                hierarchy.add('Speech ' + valueOf(nextToken))
            } else if (nextToken.startsWith('narrative')) {
                // Fixed: label was 'Narrative' without the separating blank,
                // producing e.g. 'Narrative3' unlike all other levels
                hierarchy.add('Narrative ' + valueOf(nextToken))
            } else if (nextToken.startsWith('poem')) {
                hierarchy.add('Poem ' + valueOf(nextToken))
            }
        }

        return new Hierarchy(hierarchy)
    }

    /**
     * Returns the part of a 'key=value' token after the last '='. If the
     * token contains no '=', the whole token is returned.
     */
    private static String valueOf(String token) {
        return token.substring(token.lastIndexOf('=') + 1)
    }

}