Completed basic unit tests for Pimarc

The archive format, with its reader and writer, is now complete. All of the most basic features now have unit tests, and these are passing. The next step is probably to start using this format in Pimlico corpus storage.
markgw · Mar 30, 2020 · 6f6e3e2 · 6f6e3e2
1 parent 036a286
commit 6f6e3e2
Show file tree

Hide file tree

Showing 13 changed files with 252 additions and 12 deletions.
diff --git a/bin/test/pimarc_tests.sh b/bin/test/pimarc_tests.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+# Temporary script to just run unit tests for the Pimarc file format.
+# Once development of the readers/writers is finished, this will be removed.
+# The tests are run by all_unit_tests.sh anyway.
+DIR="$(cd "$( dirname $( readlink -f "${BASH_SOURCE[0]}" ))" && pwd )"
+VIRTUALENV=$DIR/../../lib/test_env $DIR/../python -m unittest discover $DIR/../../src/test/python/pimlicotest/utils/pimarc/ "*.py"
diff --git a/src/python/pimlico/__init__.py b/src/python/pimlico/__init__.py
@@ -20,7 +20,7 @@
     subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'future'])
     # Reload the environment, so we see the newly install package(s)
     import site
-    from imp import reload
+    from importlib import reload
     reload(site)
 
 from pimlico.core.dependencies.base import check_and_install

diff --git a/src/python/pimlico/core/dependencies/python.py b/src/python/pimlico/core/dependencies/python.py
@@ -10,7 +10,7 @@
 """
 from builtins import str
 import sys
-from imp import reload
+from importlib import reload
 from pkgutil import find_loader
 
 import pkg_resources

diff --git a/src/python/pimlico/utils/docs/modulegen.py b/src/python/pimlico/utils/docs/modulegen.py
@@ -26,7 +26,6 @@
 from importlib import import_module
 from pkgutil import iter_modules
 
-from past.builtins import basestring
 from sphinx import __version__
 from sphinx.ext.apidoc import format_heading
 
@@ -449,7 +448,7 @@ def generate_example_config(info, input_types, module_path, minimal=False):
 
 
 def _val_to_config(val):
-    if isinstance(val, basestring):
+    if isinstance(val, str):
         # This is easy: we can just use it
         return val
     # Some types can simply be converted to strings to get a good example
@@ -470,7 +469,7 @@ def _opt_type_to_config(otype):
     if hasattr(otype, "_opt_type_example"):
         return otype._opt_type_example
     if type(otype) is type:
-        if issubclass(otype, basestring):
+        if issubclass(otype, str):
             # Just a string, anything can go here
             return "text"
         elif issubclass(otype, int):

diff --git a/src/python/pimlico/utils/pimarc/__init__.py b/src/python/pimlico/utils/pimarc/__init__.py
@@ -34,6 +34,10 @@
 Restrictions on filenames:
 Filenames may use any unicode characters, excluding EOF, newline and tab.
 
+The standard filename extension for a Pimarc file is `.prc`. This file contains the
+archive's data. A second file, holding the index, is always stored in the same
+location with an identical filename, except that its extension is `.prci`.
+
 """
 from .reader import PimarcReader
 from .writer import PimarcWriter

diff --git a/src/python/pimlico/utils/pimarc/index.py b/src/python/pimlico/utils/pimarc/index.py
@@ -27,6 +27,13 @@ def get_data_start_byte(self, filename):
     def __getitem__(self, item):
         return self.get_metadata_start_byte(item), self.get_data_start_byte(item)
 
+    def __iter__(self):
+        """ Simply iterate over the filenames. You can access the data using these as args to other methods. """
+        return iter(self.filenames)
+
+    def __len__(self):
+        return len(self.filenames)
+
     def append(self, filename, metadata_start, data_start):
         if filename in self.filenames:
             raise DuplicateFilename(filename)

diff --git a/src/python/pimlico/utils/pimarc/reader.py b/src/python/pimlico/utils/pimarc/reader.py
@@ -25,6 +25,7 @@ def open(self):
     def __enter__(self):
         self.archive_file = self.open()
         self.index = PimarcIndex.load(self.index_filename)
+        return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.archive_file.close()
@@ -100,6 +101,9 @@ def iter_files(self):
     def __iter__(self):
         return self.iter_files()
 
+    def __len__(self):
+        return len(self.index)
+
 
 def _read_var_length_data(reader):
     """

diff --git a/src/python/pimlico/utils/pimarc/writer.py b/src/python/pimlico/utils/pimarc/writer.py
@@ -32,7 +32,8 @@ def __init__(self, archive_filename, mode="w"):
 
     def open(self):
         """
-        Open the archive file.
+        Open the archive file. Called by the context manager's enter method, so
+        you don't usually need to call this.
 
         """
         return open(self.archive_filename, mode="ab" if self.append else "wb")
@@ -45,24 +46,38 @@ def __enter__(self):
         else:
             # Create an empty index
             self.index = PimarcIndex()
+        return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.archive_file.close()
         # We need to store the updated index, which is only updated in memory while writing
         self.index.save(self.index_filename)
 
-    def write_file(self, metadata, data):
+    def write_file(self, data, name=None, metadata=None):
         """
         Append a write to the end of the archive. The metadata should be a dictionary
         that can be encoded as JSON (which is how it will be stored). The data should
         be a bytes object.
 
+        If you want to write text files, you should encode the text as UTF-8 to get a
+        bytes object and write that.
+
+        Setting `name=X` is simply a shorthand for setting `metadata["name"]=X`.
+        Either `name` or a metadata dict including the `name` key is required.
+
         """
-        # The file's name should always be in the metadata as "name"
-        try:
-            filename = metadata["name"]
-        except KeyError:
-            raise MetadataError("metadata should include 'name' key")
+        if metadata is None:
+            metadata = {}
+
+        if name is not None:
+            filename = name
+            metadata["name"] = name
+        else:
+            # The file's name should always be in the metadata as "name"
+            try:
+                filename = metadata["name"]
+            except KeyError:
+                raise MetadataError("metadata should include 'name' key")
         # Check where we're up to in the file
         # This tells us where the metadata starts, which will be stored in the index
         metadata_start = self.archive_file.tell()

diff --git a/src/test/python/pimlicotest/utils/pimarc/read.py b/src/test/python/pimlicotest/utils/pimarc/read.py
@@ -0,0 +1,70 @@
+import os
+import tempfile
+import unittest
+
+
+class PimarcReadTest(unittest.TestCase):
+    def setUp(self):
+        from pimlico import TEST_DATA_DIR
+        import os
+
+        self.input_path = os.path.join(TEST_DATA_DIR, "pimarc", "smallex.prc")
+
+
+class OpenArchiveTest(PimarcReadTest):
+    """
+    Just open an archive and do nothing with it.
+
+    """
+    def test_create(self):
+        from pimlico.utils.pimarc import PimarcReader
+
+        with PimarcReader(self.input_path):
+            # Just opened the archive: don't do anything more
+            pass
+
+
+class ReadIndexTest(PimarcReadTest):
+    """
+    Read in an archive's index and check its format.
+
+    We use a pre-prepared index where we know what the filenames should be and check they're right.
+
+    """
+    def test_create(self):
+        from pimlico.utils.pimarc import PimarcReader
+
+        # We know that the filenames in the archive should look like this
+        filename_base = "doc_{}"
+        with PimarcReader(self.input_path) as arc:
+            for i, filename in enumerate(arc.index):
+                expected_fn = filename_base.format(i)
+                self.assertEqual(expected_fn, filename, msg="filename read in archive did not match that expected")
+
+
+class ReadFilesTest(PimarcReadTest):
+    """
+    Read in an archive's index and each of its files.
+
+    We use a pre-prepared archive where we know what the file content should be.
+    Each file contains random data, but starts with a fixed string containing the filename,
+    so we check that this has been correctly read.
+
+    """
+    def test_create(self):
+        from pimlico.utils.pimarc import PimarcReader
+
+        # We know that the files in the archive should start like this
+        file_start_base = "Start of doc_{}"
+        with PimarcReader(self.input_path) as arc:
+            for i, (metadata, file_data) in enumerate(arc):
+                expected_start = file_start_base.format(i)
+                # Decode the UTF-8 encoded data
+                file_text = file_data.decode("utf-8")
+                file_text_start = file_text[:len(expected_start)]
+                self.assertEqual(expected_start, file_text_start,
+                                 msg="text read in file in archive did not start with the expected string")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/src/test/python/pimlicotest/utils/pimarc/write.py b/src/test/python/pimlicotest/utils/pimarc/write.py
@@ -0,0 +1,55 @@
+import os
+import tempfile
+import unittest
+
+
+class PimarcWriterTest(unittest.TestCase):
+    def setUp(self):
+        # Create a temporary directory to use as our storage location
+        self.storage_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        import shutil
+        shutil.rmtree(self.storage_dir)
+
+
+class CreateEmptyArchiveTest(PimarcWriterTest):
+    """
+    Create an empty archive and write to disk.
+
+    """
+    def test_create(self):
+        from pimlico.utils.pimarc import PimarcWriter
+
+        with PimarcWriter(os.path.join(self.storage_dir, "test.prc")):
+            # Created the archive: don't do anything now
+            # The index will be written when we exit the block
+            pass
+
+
+def _generate_random_text(length=300):
+    """
+    Generate a string of `length` random lowercase ASCII letters.
+    """
+    import string
+    import random
+    return "".join(random.choice(string.ascii_lowercase) for i in range(length))
+
+
+class WriteRandomDocumentsTest(PimarcWriterTest):
+    """
+    Create an empty archive and add some randomly-generated text documents.
+
+    """
+    def test_write(self):
+        from pimlico.utils.pimarc import PimarcWriter
+
+        with PimarcWriter(os.path.join(self.storage_dir, "test.prc")) as arc:
+            # Add 5 files to the archive
+            for i in range(5):
+                text = _generate_random_text()
+                arc.write_file(text.encode("utf-8"), "doc_{}".format(i))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/src/test/python/pimlicotest/utils/pimarc/write_read.py b/src/test/python/pimlicotest/utils/pimarc/write_read.py
@@ -0,0 +1,74 @@
+"""
+Test writing out data and reading it back in.
+
+"""
+import os
+import tempfile
+import unittest
+
+
+class PimarcWriteReadTest(unittest.TestCase):
+    def setUp(self):
+        # Create a temporary directory to use as our storage location
+        self.storage_dir = tempfile.mkdtemp()
+        self.archive_path = os.path.join(self.storage_dir, "test.prc")
+
+    def tearDown(self):
+        import shutil
+        shutil.rmtree(self.storage_dir)
+
+
+class WriteEmptyArchiveTest(PimarcWriteReadTest):
+    """
+    Create an empty archive and write to disk, then read it back in and check it's empty.
+
+    """
+    def test_create(self):
+        from pimlico.utils.pimarc import PimarcWriter, PimarcReader
+
+        with PimarcWriter(self.archive_path):
+            # Created the archive: don't do anything now
+            pass
+
+        with PimarcReader(self.archive_path) as arc:
+            self.assertEqual(len(arc), 0)
+            self.assertEqual(len(arc.index), 0)
+
+
+def _generate_random_text(length=300):
+    """
+    Generate a string of `length` random lowercase ASCII letters.
+    """
+    import string
+    import random
+    return "".join(random.choice(string.ascii_lowercase) for i in range(length))
+
+
+class WriteRandomDocumentsTest(PimarcWriteReadTest):
+    """
+    Create an empty archive and add some randomly-generated text documents.
+    Read them back in and check they're the same.
+
+    """
+    def test_write(self):
+        from pimlico.utils.pimarc import PimarcWriter, PimarcReader
+        # Generate some random text
+        files_data = [_generate_random_text() for i in range(5)]
+
+        with PimarcWriter(self.archive_path) as arc:
+            # Add 5 files to the archive
+            for i, text in enumerate(files_data):
+                arc.write_file(text.encode("utf-8"), "doc_{}".format(i))
+
+        with PimarcReader(self.archive_path) as arc:
+            self.assertEqual(len(arc), len(files_data))
+
+            # Read each file back in and check it has the same content
+            for (metadata, file_data), expected_data in zip(arc, files_data):
+                self.assertEqual(file_data.decode("utf-8"), expected_data,
+                                 msg="data read in from archive doesn't match what we wrote out "
+                                     "for the corresponding file")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/data/pimarc/smallex.prc b/test/data/pimarc/smallex.prc
@@ -0,0 +1 @@
+ᅻ≮慭攢㨠≤潣弰≽밂却慲琠潦⁤潣弰⸠灬汵灶捨敧杪汹畴晢慴煭硸癪敢癰癳汩畮慱畦摶橲楫畦瑯潩慺穹楫穯摪睱潩祭汰摶硵晦来湳灬汥浣杩扳摣牨畩晹桹晲湸杹楸瑥汲煮煬慡癲潰空硰歩湭決牬橷桫獹敩汫獴橱瑪癢畩桰牯業敱祲湸捺橨灯湭獵牰獥牨楪硰祹汷敮晭穭牲歲汤楪慳橺穦晫瑴楶煰硥灬楡扳摲癹照摺浰汧浹癱瑬橺桹杧祧桳灷敥略潧歷異楮捡敯晩獭晢獡穧湡橩牰橨異睧潬祭特ᅻ≮慭攢㨠≤潣弱≽밂却慲琠潦⁤潣弱⸠楫牡晵牰桦穥浨捷浡摺扷摸祪畳捫汹潧楣灦捭瑦楦潦畧牦硴捳楪桤癤桫桥煫散摱慫祷癡慷睵扷汪智慦獸浫瑮硯敥慺煭楴晵睶牸祺畳橵扺灥硶扲湷潬浭獸煲祳潩睢楩癨汪瑭灭睴獶慹扦步瑺睤晣潳档祤督牱桲捨桢獭潢扺獴獲楷楴獣杮楱杹硤桶睩湴楩穳獤獬桩數潨摦煥扸湴潯楴汰条煺獸獰摡歴慩潫橯牥穬摸楩楤煤癴杹異歬歺獮晭桰潶癲捱癦慺摦硰ᅻ≮慭攢㨠≤潣弲≽밂却慲琠潦⁤潣弲⸠牧湢癫浲敥穹穢癩慰扸敢硷煩摦癦扬敺楫敧敬扭浥扦橷摡湷祺汶扭汯湪慬煰晫污扩歳摶扪潦慫牬祣楪牥極浤捯摪杢杴穰穦硸畩湸祱睸畫湯扵敭晵祥督獰穪祪汶灴智穳祴扫當楳橡摢楶慲湸汵牴浫慪穯睯汦楡煸獬湱敳汬摦楬摧祣敶桬硫浣牱桵睺湤杰楥步晧潰汲硴汳獮灰煲晳硶歷獫杬楶潪瑷睢湩杵正歰牨瑱扢来瑸扸确条灱慢煪祬橯汧畸慣摹捱扮瑣ᅻ≮慭攢㨠≤潣弳≽밂却慲琠潦⁤潣弳⸠獹獶灺桮瑷灺扵浶硷浬扴橩慳硷扤瑪湥慦癫煪杹潡牲晢癰畱潫浮捶浨瑧睰畩晷浡睵慩晡歩灪敵業杦硨癴杩潢祢祢浭浫晲祷穢杬敳桩桢敳畨祦湸獶穨硡祤硲晶摯灪硢浥湤桷硩獧癩穨祹敵扳慡捦止穰摤牤祡湱祷牬潷据摸确汪穹穥楰橩潵慧畯湦潮歰睨扬番睨潨獫汮牭捣扪浵番煮牯睦煦獯獤桩獭瑬批桬灬硹摲摯景楳牷灰穤晶据摭湩祧牶橣杤硭発灳穸ᅻ≮慭攢㨠≤潣弴≽밂却慲琠潦⁤潣弴⸠牱煥牢獡畩祯晨桫硺煬灳桹桫祸歰穸橧敷硯睯獣穴祮橹睺桵湧牺硶灧穵煥潫敦浨橣癦橶照潲睰慮浰硤晡硤瑹牳歬煭祤潮档慪瑪潺獲整煤灰歨牡整普浶敲物獳摡敮歺灬煬獹此扤捥浥楤歳杩捶湷敩瑸摬汤牧畸潶獥特硷橵摥牦睯潶睡慤晭汸摳歬獵橹畵煨穳湲煳橺灣扥浣灨灬穦硣捦湶穹橱敪桺牺睫穮潡扸火灸睷湢摳穱瑲智据煦慥楦桪煲硦穩獸摧晦瑷
diff --git a/test/data/pimarc/smallex.prci b/test/data/pimarc/smallex.prci
@@ -0,0 +1,5 @@
+doc_0	0	18
+doc_1	336	354
+doc_2	672	690
+doc_3	1008	1026
+doc_4	1344	1362