Use icu4j for sort keys

marktriggs · May 11, 2012 · d92ffd6 · d92ffd6
1 parent f82303f
commit d92ffd6
Show file tree

Hide file tree

Showing 21 changed files with 592 additions and 221 deletions.
diff --git a/README b/README
@@ -134,7 +134,6 @@
        <lst name="subjects">
 	 <str name="DBpath">/path/to/your/subjectsbrowse.db</str>
 	 <str name="field">subject-browse</str>
-         <str name="ignoreDiacritics">yes</str>
          <str name="dropChars">[]()',</str>
        </lst>
     </requestHandler>

diff --git a/browse-handler/java/au/gov/nla/solr/handler/BrowseRequestHandler.java b/browse-handler/java/au/gov/nla/solr/handler/BrowseRequestHandler.java
@@ -23,6 +23,9 @@
 import java.util.logging.Logger;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 
+import au.gov.nla.util.Normaliser;
+import au.gov.nla.util.BrowseEntry;
+
 class Log
 {
     private static Logger log ()
@@ -52,12 +55,14 @@ class HeadingsDB
     String path;
     long dbVersion;
     int totalCount;
+    Normaliser normaliser;
 
     ReentrantReadWriteLock dbLock = new ReentrantReadWriteLock ();
 
     public HeadingsDB (String path) throws Exception
     {
         this.path = path;
+        normaliser = Normaliser.getInstance ();
     }
 
 
@@ -142,7 +147,7 @@ public int getHeadingStart (String from) throws Exception
             "order by key " +
             "limit 1");
 
-        rowStmnt.setString (1, from);
+        rowStmnt.setBytes (1, normaliser.normalise (from));
 
         ResultSet rs = rowStmnt.executeQuery ();
 
@@ -558,8 +563,6 @@ public void queryFinished ()
 
     private void populateItem (BrowseItem item) throws Exception
     {
-        Log.info ("Populating: " + item.heading);
-
         List<String> ids = bibDB.matchingIDs (item.heading);
         item.ids = ids;
         item.count = ids.size ();
@@ -618,20 +621,17 @@ class BrowseSource
 {
     public String DBpath;
     public String field;
-    public String ignoreDiacritics;
     public String dropChars;
 
     public Browse browse;
 
 
     public BrowseSource (String DBpath,
                          String field,
-                         String ignoreDiacritics,
                          String dropChars)
     {
         this.DBpath = DBpath;
         this.field = field;
-        this.ignoreDiacritics = ignoreDiacritics;
         this.dropChars = dropChars;
     }
 }
@@ -680,7 +680,6 @@ public void init (NamedList args)
             sources.put (source,
                          new BrowseSource (entry.get ("DBpath"),
                                            entry.get ("field"),
-                                           entry.get ("ignoreDiacritics"),
                                            entry.get ("dropChars")));
         }
     }
@@ -697,40 +696,6 @@ private int asInt (String s)
     }
 
 
-    private String handle_diacritics (String s)
-    {
-        DiacriticStripper ds = new DiacriticStripper ();
-
-        return ds.fix (s);
-    }
-
-
-    private String clean (String s, boolean ignoreDiacritics, String dropChars)
-    {
-        String cleaned = s;
-
-        if (dropChars != null) {
-            for (int i = 0; i < dropChars.length (); i++) {
-                cleaned = cleaned.replace (String.valueOf
-                                           (dropChars.charAt (i)),
-                                           "");
-            }
-        }
-
-        cleaned = cleaned.replaceAll ("[\\(\\)]", "");
-        cleaned = cleaned.replaceAll ("-$", "");
-        cleaned = cleaned.replaceAll ("-", " ");
-        cleaned = cleaned.replaceAll (" +", " ");
-
-        if (!ignoreDiacritics) {
-            cleaned = handle_diacritics (cleaned);
-        }
-
-        return cleaned.toLowerCase ();
-    }
-
-
-
     @Override
     public void handleRequestBody (org.apache.solr.request.SolrQueryRequest req,
                                    org.apache.solr.response.SolrQueryResponse rsp)
@@ -787,11 +752,7 @@ public void handleRequestBody (org.apache.solr.request.SolrQueryRequest req,
             source.browse.reopenDatabasesIfUpdated ();
 
             if (from != null) {
-                rowid = (source.browse.getId
-                         (clean (from,
-                                 (source.ignoreDiacritics != null &&
-                                  source.ignoreDiacritics.equals ("yes")),
-                                 source.dropChars)));
+                rowid = (source.browse.getId (from));
             }
 
 

diff --git a/browse-indexing/CreateBrowseSQLite.java b/browse-indexing/CreateBrowseSQLite.java
@@ -7,13 +7,15 @@
 
 import java.sql.*;
 
+// Note: this Base64 class (commons-codec) is the version provided by Solr.
+import org.apache.commons.codec.binary.Base64;
 
-class CreateBrowseSQLite
+
+public class CreateBrowseSQLite
 {
     private Connection outputDB;
 
     private String KEY_SEPARATOR = "\1";
-    private String RECORD_SEPARATOR = "\2";
 
 
     /*
@@ -66,7 +68,9 @@ private void loadHeadings (BufferedReader br)
         while ((line = readCRLFLine (br)) != null) {
             int sep = line.indexOf (KEY_SEPARATOR.charAt (0));
             if (sep >= 0) {
-                prep.setString (1, line.substring (0, sep));
+
+                byte[] key = Base64.decodeBase64 (line.substring (0, sep).getBytes());
+                prep.setBytes (1, key);
                 prep.setString (2, line.substring (sep + 1));
 
                 prep.addBatch ();

diff --git a/browse-indexing/Leech.java b/browse-indexing/Leech.java
@@ -3,6 +3,10 @@
 import org.apache.lucene.search.*;
 import java.io.*;
 
+import au.gov.nla.util.Normaliser;
+import au.gov.nla.util.BrowseEntry;
+
+
 public class Leech
 {
     protected IndexReader reader;
@@ -13,13 +17,6 @@ public class Leech
     private Normaliser normaliser;
 
 
-    protected String getEnvironment (String var)
-    {
-        return (System.getenv (var) != null) ?
-            System.getenv (var) : System.getProperty (var.toLowerCase ());
-    }
-
-
     public Leech (String indexPath,
                   String field) throws Exception
     {
@@ -28,19 +25,11 @@ public Leech (String indexPath,
         this.field = field;
         tenum = reader.terms (new Term (field, ""));
 
-        if (getEnvironment ("NORMALISER") != null) {
-            String normaliserClass = getEnvironment ("NORMALISER");
-
-            normaliser = (Normaliser) (Class.forName (normaliserClass)
-                        .getConstructor ()
-                        .newInstance ());
-        } else {
-            normaliser = new Normaliser ();
-        }
+        normaliser = Normaliser.getInstance ();
     }
 
 
-    public String buildSortKey (String heading)
+    public byte[] buildSortKey (String heading)
     {
         return normaliser.normalise (heading);
     }
@@ -64,14 +53,14 @@ private boolean termExists (Term t)
     }
 
 
-    public String[] next () throws Exception
+    public BrowseEntry next () throws Exception
     {
         if (tenum.term () != null &&
             tenum.term ().field ().equals (this.field)) {
             if (termExists (tenum.term ())) {
                 String term = tenum.term ().text ();
                 tenum.next ();
-                return new String[] {buildSortKey (term), term};
+                return new BrowseEntry (buildSortKey (term), term);
             } else {
                 tenum.next ();
                 return this.next ();

diff --git a/browse-indexing/Normaliser.java b/browse-indexing/Normaliser.java
diff --git a/browse-indexing/PrintBrowseHeadings.java b/browse-indexing/PrintBrowseHeadings.java
@@ -13,9 +13,13 @@
 
 import java.sql.*;
 
+import au.gov.nla.util.BrowseEntry;
 
+// Note: this Base64 class (commons-codec) is the version provided by Solr.
+import org.apache.commons.codec.binary.Base64;
 
-class PrintBrowseHeadings
+
+public class PrintBrowseHeadings
 {
     private Leech bibLeech;
     private Leech authLeech;
@@ -34,18 +38,21 @@ private void loadHeadings (Leech leech,
                                Predicate predicate)
         throws Exception
     {
-        String[] h;
+        BrowseEntry h;
         while ((h = leech.next ()) != null) {
-            String sort_key = h[0];
-            String heading = h[1];
+            byte[] sort_key = h.key;
+            String heading = h.value;
 
             if (predicate != null &&
                 !predicate.isSatisfiedBy (heading)) {
                 continue;
             }
 
             if (sort_key != null) {
-                out.print (sort_key + KEY_SEPARATOR + heading + RECORD_SEPARATOR);
+                out.print (new String (Base64.encodeBase64 (sort_key)) +
+                           KEY_SEPARATOR +
+                           heading +
+                           RECORD_SEPARATOR);
             }
         }
     }

diff --git a/browse-indexing/StoredFieldLeech.java b/browse-indexing/StoredFieldLeech.java
@@ -7,10 +7,13 @@
 import org.apache.lucene.index.*;
 import org.apache.lucene.document.*;
 
+import au.gov.nla.util.Utils;
+import au.gov.nla.util.BrowseEntry;
+
 public class StoredFieldLeech extends Leech
 {
     int currentDoc = 0;
-    LinkedList<String[]> buffer;
+    LinkedList<BrowseEntry> buffer;
 
     String sortField;
     String valueField;
@@ -22,8 +25,8 @@ public StoredFieldLeech (String indexPath, String field) throws Exception
     {
         super (indexPath, field);
 
-        sortField = getEnvironment ("SORTFIELD");
-        valueField = getEnvironment ("VALUEFIELD");
+        sortField = Utils.getEnvironment ("SORTFIELD");
+        valueField = Utils.getEnvironment ("VALUEFIELD");
 
         if (sortField == null || valueField == null) {
             throw new IllegalArgumentException ("Both SORTFIELD and " +
@@ -32,6 +35,8 @@ public StoredFieldLeech (String indexPath, String field) throws Exception
         }
 
         fieldSelector = new FieldSelector () {
+                static final long serialVersionUID = -3547604067655030732L;
+
                 public FieldSelectorResult accept (String fieldName) {
                     if (fieldName.equals (sortField) ||
                         fieldName.equals (valueField)) {
@@ -44,7 +49,7 @@ public FieldSelectorResult accept (String fieldName) {
 
 
         reader = IndexReader.open (FSDirectory.open (new File (indexPath)));
-        buffer = new LinkedList<String[]> ();
+        buffer = new LinkedList<BrowseEntry> ();
     }
 
 
@@ -57,12 +62,12 @@ private void loadDocument (IndexReader reader, int docid)
         String[] value = doc.getValues (valueField);
 
         if (sort_key.length == 1 && value.length == 1) {
-            buffer.add (new String[] {sort_key[0], value[0]});
+            buffer.add (new BrowseEntry(buildSortKey(sort_key[0]), value[0]));
         }
     }
 
 
-    public String[] next () throws Exception
+    public BrowseEntry next () throws Exception
     {
         while (buffer.isEmpty ()) {
             if (currentDoc < reader.maxDoc ()) {