Skip to content

Commit

Permalink
Use icu4j for sort keys
Browse files Browse the repository at this point in the history
  • Loading branch information
marktriggs committed May 11, 2012
1 parent f82303f commit d92ffd6
Show file tree
Hide file tree
Showing 21 changed files with 592 additions and 221 deletions.
1 change: 0 additions & 1 deletion README
Expand Up @@ -134,7 +134,6 @@
<lst name="subjects">
<str name="DBpath">/path/to/your/subjectsbrowse.db</str>
<str name="field">subject-browse</str>
<str name="ignoreDiacritics">yes</str>
<str name="dropChars">[]()',</str>
</lst>
</requestHandler>
Expand Down
Expand Up @@ -23,6 +23,9 @@
import java.util.logging.Logger;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import au.gov.nla.util.Normaliser;
import au.gov.nla.util.BrowseEntry;

class Log
{
private static Logger log ()
Expand Down Expand Up @@ -52,12 +55,14 @@ class HeadingsDB
String path;
long dbVersion;
int totalCount;
Normaliser normaliser;

ReentrantReadWriteLock dbLock = new ReentrantReadWriteLock ();

/**
 * Creates a headings database handle for the given location.
 *
 * Only records the path and obtains a Normaliser from
 * Normaliser.getInstance (); nothing else is initialised here.
 *
 * @param path value stored in the path field
 * @throws Exception declared by the signature; propagated from
 *         Normaliser.getInstance () if that call fails
 */
public HeadingsDB (String path) throws Exception
{
    this.path = path;
    this.normaliser = Normaliser.getInstance ();
}


Expand Down Expand Up @@ -142,7 +147,7 @@ public int getHeadingStart (String from) throws Exception
"order by key " +
"limit 1");

rowStmnt.setString (1, from);
rowStmnt.setBytes (1, normaliser.normalise (from));

ResultSet rs = rowStmnt.executeQuery ();

Expand Down Expand Up @@ -558,8 +563,6 @@ public void queryFinished ()

private void populateItem (BrowseItem item) throws Exception
{
Log.info ("Populating: " + item.heading);

List<String> ids = bibDB.matchingIDs (item.heading);
item.ids = ids;
item.count = ids.size ();
Expand Down Expand Up @@ -618,20 +621,17 @@ class BrowseSource
{
public String DBpath;
public String field;
public String ignoreDiacritics;
public String dropChars;

public Browse browse;


/**
 * Builds a browse source description from its configuration values.
 *
 * Each argument is stored unchanged in the public field of the same
 * name.  The browse field is not assigned by this constructor.
 *
 * @param DBpath           value for the DBpath field
 * @param field            value for the field field
 * @param ignoreDiacritics value for the ignoreDiacritics field
 * @param dropChars        value for the dropChars field
 */
public BrowseSource (String DBpath,
                     String field,
                     String ignoreDiacritics,
                     String dropChars)
{
    this.dropChars = dropChars;
    this.ignoreDiacritics = ignoreDiacritics;
    this.field = field;
    this.DBpath = DBpath;
}
}
Expand Down Expand Up @@ -680,7 +680,6 @@ public void init (NamedList args)
sources.put (source,
new BrowseSource (entry.get ("DBpath"),
entry.get ("field"),
entry.get ("ignoreDiacritics"),
entry.get ("dropChars")));
}
}
Expand All @@ -697,40 +696,6 @@ private int asInt (String s)
}


/**
 * Runs the given string through a freshly constructed DiacriticStripper.
 *
 * @param s the text to process
 * @return whatever DiacriticStripper.fix returns for s
 */
private String handle_diacritics (String s)
{
    return new DiacriticStripper ().fix (s);
}


/**
 * Normalises a heading string before it is used as a lookup value.
 *
 * Steps, in order:
 *   1. every character listed in dropChars is removed (skipped when
 *      dropChars is null);
 *   2. parentheses are removed;
 *   3. a single trailing hyphen is removed, then every remaining hyphen
 *      becomes a space;
 *   4. runs of spaces are collapsed to one space;
 *   5. the string is passed to handle_diacritics when ignoreDiacritics
 *      is false;
 *   6. the result is lower-cased.
 *
 * @param s                the raw heading; must not be null
 * @param ignoreDiacritics when false, handle_diacritics is applied
 * @param dropChars        characters to delete outright, or null for none
 * @return the cleaned, lower-cased string
 */
private String clean (String s, boolean ignoreDiacritics, String dropChars)
{
    String cleaned = s;

    if (dropChars != null) {
        for (char c : dropChars.toCharArray ()) {
            cleaned = cleaned.replace (String.valueOf (c), "");
        }
    }

    cleaned = cleaned.replaceAll ("[\\(\\)]", "");
    cleaned = cleaned.replaceAll ("-$", "");
    cleaned = cleaned.replaceAll ("-", " ");
    cleaned = cleaned.replaceAll (" +", " ");

    // NOTE(review): handle_diacritics runs only when ignoreDiacritics is
    // false -- confirm this polarity is what callers expect.
    if (!ignoreDiacritics) {
        cleaned = handle_diacritics (cleaned);
    }

    // Use a fixed locale so the result does not depend on the JVM's default
    // locale (e.g. Turkish maps 'I' to a dotless i).
    return cleaned.toLowerCase (java.util.Locale.ROOT);
}



@Override
public void handleRequestBody (org.apache.solr.request.SolrQueryRequest req,
org.apache.solr.response.SolrQueryResponse rsp)
Expand Down Expand Up @@ -787,11 +752,7 @@ public void handleRequestBody (org.apache.solr.request.SolrQueryRequest req,
source.browse.reopenDatabasesIfUpdated ();

if (from != null) {
rowid = (source.browse.getId
(clean (from,
(source.ignoreDiacritics != null &&
source.ignoreDiacritics.equals ("yes")),
source.dropChars)));
rowid = (source.browse.getId (from));
}


Expand Down
10 changes: 7 additions & 3 deletions browse-indexing/CreateBrowseSQLite.java
Expand Up @@ -7,13 +7,15 @@

import java.sql.*;

// Note: this Base64 class is the commons-codec version that comes from Solr!
import org.apache.commons.codec.binary.Base64;

class CreateBrowseSQLite

public class CreateBrowseSQLite
{
private Connection outputDB;

private String KEY_SEPARATOR = "\1";
private String RECORD_SEPARATOR = "\2";


/*
Expand Down Expand Up @@ -66,7 +68,9 @@ private void loadHeadings (BufferedReader br)
while ((line = readCRLFLine (br)) != null) {
int sep = line.indexOf (KEY_SEPARATOR.charAt (0));
if (sep >= 0) {
prep.setString (1, line.substring (0, sep));

byte[] key = Base64.decodeBase64 (line.substring (0, sep).getBytes());
prep.setBytes (1, key);
prep.setString (2, line.substring (sep + 1));

prep.addBatch ();
Expand Down
27 changes: 8 additions & 19 deletions browse-indexing/Leech.java
Expand Up @@ -3,6 +3,10 @@
import org.apache.lucene.search.*;
import java.io.*;

import au.gov.nla.util.Normaliser;
import au.gov.nla.util.BrowseEntry;


public class Leech
{
protected IndexReader reader;
Expand All @@ -13,13 +17,6 @@ public class Leech
private Normaliser normaliser;


/**
 * Looks up a setting by name, preferring the process environment.
 *
 * The environment variable named exactly var is consulted first.  If it
 * is not set, the JVM system property whose name is the lower-cased form
 * of var is returned instead.
 *
 * @param var name of the environment variable to look up
 * @return the environment value if set; otherwise the value of the
 *         lower-cased system property, or null if that is not set either
 */
protected String getEnvironment (String var)
{
    String fromEnv = System.getenv (var);

    if (fromEnv != null) {
        return fromEnv;
    }

    // Lower-case with a fixed locale: with a Turkish default locale an
    // upper-case 'I' would otherwise become a dotless i and the property
    // name would never match.
    return System.getProperty (var.toLowerCase (java.util.Locale.ROOT));
}


public Leech (String indexPath,
String field) throws Exception
{
Expand All @@ -28,19 +25,11 @@ public Leech (String indexPath,
this.field = field;
tenum = reader.terms (new Term (field, ""));

if (getEnvironment ("NORMALISER") != null) {
String normaliserClass = getEnvironment ("NORMALISER");

normaliser = (Normaliser) (Class.forName (normaliserClass)
.getConstructor ()
.newInstance ());
} else {
normaliser = new Normaliser ();
}
normaliser = Normaliser.getInstance ();
}


public String buildSortKey (String heading)
public byte[] buildSortKey (String heading)
{
return normaliser.normalise (heading);
}
Expand All @@ -64,14 +53,14 @@ private boolean termExists (Term t)
}


public String[] next () throws Exception
public BrowseEntry next () throws Exception
{
if (tenum.term () != null &&
tenum.term ().field ().equals (this.field)) {
if (termExists (tenum.term ())) {
String term = tenum.term ().text ();
tenum.next ();
return new String[] {buildSortKey (term), term};
return new BrowseEntry (buildSortKey (term), term);
} else {
tenum.next ();
return this.next ();
Expand Down
31 changes: 0 additions & 31 deletions browse-indexing/Normaliser.java

This file was deleted.

17 changes: 12 additions & 5 deletions browse-indexing/PrintBrowseHeadings.java
Expand Up @@ -13,9 +13,13 @@

import java.sql.*;

import au.gov.nla.util.BrowseEntry;

// Note: this Base64 class is the commons-codec version that comes from Solr!
import org.apache.commons.codec.binary.Base64;

class PrintBrowseHeadings

public class PrintBrowseHeadings
{
private Leech bibLeech;
private Leech authLeech;
Expand All @@ -34,18 +38,21 @@ private void loadHeadings (Leech leech,
Predicate predicate)
throws Exception
{
String[] h;
BrowseEntry h;
while ((h = leech.next ()) != null) {
String sort_key = h[0];
String heading = h[1];
byte[] sort_key = h.key;
String heading = h.value;

if (predicate != null &&
!predicate.isSatisfiedBy (heading)) {
continue;
}

if (sort_key != null) {
out.print (sort_key + KEY_SEPARATOR + heading + RECORD_SEPARATOR);
out.print (new String (Base64.encodeBase64 (sort_key)) +
KEY_SEPARATOR +
heading +
RECORD_SEPARATOR);
}
}
}
Expand Down
17 changes: 11 additions & 6 deletions browse-indexing/StoredFieldLeech.java
Expand Up @@ -7,10 +7,13 @@
import org.apache.lucene.index.*;
import org.apache.lucene.document.*;

import au.gov.nla.util.Utils;
import au.gov.nla.util.BrowseEntry;

public class StoredFieldLeech extends Leech
{
int currentDoc = 0;
LinkedList<String[]> buffer;
LinkedList<BrowseEntry> buffer;

String sortField;
String valueField;
Expand All @@ -22,8 +25,8 @@ public StoredFieldLeech (String indexPath, String field) throws Exception
{
super (indexPath, field);

sortField = getEnvironment ("SORTFIELD");
valueField = getEnvironment ("VALUEFIELD");
sortField = Utils.getEnvironment ("SORTFIELD");
valueField = Utils.getEnvironment ("VALUEFIELD");

if (sortField == null || valueField == null) {
throw new IllegalArgumentException ("Both SORTFIELD and " +
Expand All @@ -32,6 +35,8 @@ public StoredFieldLeech (String indexPath, String field) throws Exception
}

fieldSelector = new FieldSelector () {
static final long serialVersionUID = -3547604067655030732L;

public FieldSelectorResult accept (String fieldName) {
if (fieldName.equals (sortField) ||
fieldName.equals (valueField)) {
Expand All @@ -44,7 +49,7 @@ public FieldSelectorResult accept (String fieldName) {


reader = IndexReader.open (FSDirectory.open (new File (indexPath)));
buffer = new LinkedList<String[]> ();
buffer = new LinkedList<BrowseEntry> ();
}


Expand All @@ -57,12 +62,12 @@ private void loadDocument (IndexReader reader, int docid)
String[] value = doc.getValues (valueField);

if (sort_key.length == 1 && value.length == 1) {
buffer.add (new String[] {sort_key[0], value[0]});
buffer.add (new BrowseEntry(buildSortKey(sort_key[0]), value[0]));
}
}


public String[] next () throws Exception
public BrowseEntry next () throws Exception
{
while (buffer.isEmpty ()) {
if (currentDoc < reader.maxDoc ()) {
Expand Down

0 comments on commit d92ffd6

Please sign in to comment.