Skip to content

Commit

Permalink
Completed basic unit tests for Pimarc
Browse files Browse the repository at this point in the history
Archive format with reader and writer now completed. All the most basic features have unit tests now, which are passing.

Now it's probably time to start implementing the use of this format in Pimlico corpus storage.
  • Loading branch information
markgw committed Mar 30, 2020
1 parent 036a286 commit 6f6e3e2
Show file tree
Hide file tree
Showing 13 changed files with 252 additions and 12 deletions.
6 changes: 6 additions & 0 deletions bin/test/pimarc_tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Temporary script to just run unit tests for the Pimarc file format.
# Once development of the readers/writers is finished, this will be removed.
# The tests are run by all_unit_tests.sh anyway.
#
# Every path expansion below is quoted, so that the script still works when the
# checkout lives in a directory whose path contains spaces or glob characters.

# Directory containing this script, with symlinks resolved
DIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
# Run unittest discovery over the Pimarc test directory, using the test virtualenv
VIRTUALENV="$DIR/../../lib/test_env" "$DIR/../python" -m unittest discover "$DIR/../../src/test/python/pimlicotest/utils/pimarc/" "*.py"
2 changes: 1 addition & 1 deletion src/python/pimlico/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'future'])
# Reload the environment, so we see the newly installed package(s)
import site
from imp import reload
from importlib import reload
reload(site)

from pimlico.core.dependencies.base import check_and_install
Expand Down
2 changes: 1 addition & 1 deletion src/python/pimlico/core/dependencies/python.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"""
from builtins import str
import sys
from imp import reload
from importlib import reload
from pkgutil import find_loader

import pkg_resources
Expand Down
5 changes: 2 additions & 3 deletions src/python/pimlico/utils/docs/modulegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
from importlib import import_module
from pkgutil import iter_modules

from past.builtins import basestring
from sphinx import __version__
from sphinx.ext.apidoc import format_heading

Expand Down Expand Up @@ -449,7 +448,7 @@ def generate_example_config(info, input_types, module_path, minimal=False):


def _val_to_config(val):
if isinstance(val, basestring):
if isinstance(val, str):
# This is easy: we can just use it
return val
# Some types can simply be converted to strings to get a good example
Expand All @@ -470,7 +469,7 @@ def _opt_type_to_config(otype):
if hasattr(otype, "_opt_type_example"):
return otype._opt_type_example
if type(otype) is type:
if issubclass(otype, basestring):
if issubclass(otype, str):
# Just a string, anything can go here
return "text"
elif issubclass(otype, int):
Expand Down
4 changes: 4 additions & 0 deletions src/python/pimlico/utils/pimarc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@
Restrictions on filenames:
Filenames may use any unicode characters, excluding EOF, newline and tab.
The standard filename extension for a Pimarc file is `.prc`. This file contains the archive's
data. A second file, containing the index, is always stored in the same location, with an
identical filename, except with the extension `.prci`.
"""
from .reader import PimarcReader
from .writer import PimarcWriter
Expand Down
7 changes: 7 additions & 0 deletions src/python/pimlico/utils/pimarc/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ def get_data_start_byte(self, filename):
def __getitem__(self, item):
return self.get_metadata_start_byte(item), self.get_data_start_byte(item)

def __iter__(self):
    """
    Simply iterate over the filenames in the index.

    You can access the data using these as args to other methods.
    """
    return iter(self.filenames)

def __len__(self):
    """ Number of filenames stored in the index. """
    return len(self.filenames)

def append(self, filename, metadata_start, data_start):
if filename in self.filenames:
raise DuplicateFilename(filename)
Expand Down
4 changes: 4 additions & 0 deletions src/python/pimlico/utils/pimarc/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def open(self):
def __enter__(self):
    """
    Open the archive file and load the index from `index_filename`.

    Returns the reader itself, so that `with PimarcReader(...) as arc` binds
    the reader to `arc`.
    """
    self.archive_file = self.open()
    self.index = PimarcIndex.load(self.index_filename)
    return self

def __exit__(self, exc_type, exc_val, exc_tb):
self.archive_file.close()
Expand Down Expand Up @@ -100,6 +101,9 @@ def iter_files(self):
def __iter__(self):
return self.iter_files()

def __len__(self):
    """ Length of the loaded index, i.e. `len(self.index)`. """
    return len(self.index)


def _read_var_length_data(reader):
"""
Expand Down
29 changes: 22 additions & 7 deletions src/python/pimlico/utils/pimarc/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ def __init__(self, archive_filename, mode="w"):

def open(self):
"""
Open the archive file.
Open the archive file. Called by the context manager's enter method, so
you don't usually need to call this.
"""
return open(self.archive_filename, mode="ab" if self.append else "wb")
Expand All @@ -45,24 +46,38 @@ def __enter__(self):
else:
# Create an empty index
self.index = PimarcIndex()
return self

def __exit__(self, exc_type, exc_val, exc_tb):
self.archive_file.close()
# We need to store the updated index, which is only updated in memory while writing
self.index.save(self.index_filename)

def write_file(self, metadata, data):
def write_file(self, data, name=None, metadata=None):
"""
Append a file to the end of the archive. The metadata should be a dictionary
that can be encoded as JSON (which is how it will be stored). The data should
be a bytes object.
If you want to write text files, you should encode the text as UTF-8 to get a
bytes object and write that.
Setting `name=X` is simply a shorthand for setting `metadata["name"]=X`.
Either `name` or a metadata dict including the `name` key is required.
"""
# The file's name should always be in the metadata as "name"
try:
filename = metadata["name"]
except KeyError:
raise MetadataError("metadata should include 'name' key")
if metadata is None:
metadata = {}

if name is not None:
filename = name
metadata["name"] = name
else:
# The file's name should always be in the metadata as "name"
try:
filename = metadata["name"]
except KeyError:
raise MetadataError("metadata should include 'name' key")
# Check where we're up to in the file
# This tells us where the metadata starts, which will be stored in the index
metadata_start = self.archive_file.tell()
Expand Down
70 changes: 70 additions & 0 deletions src/test/python/pimlicotest/utils/pimarc/read.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import os
import tempfile
import unittest


class PimarcReadTest(unittest.TestCase):
    """
    Base class for the read tests: works out the path to the example archive.
    """
    def setUp(self):
        import os
        from pimlico import TEST_DATA_DIR

        # The small example archive lives in the "pimarc" subdirectory of the test data
        pimarc_data_dir = os.path.join(TEST_DATA_DIR, "pimarc")
        self.input_path = os.path.join(pimarc_data_dir, "smallex.prc")


class OpenArchiveTest(PimarcReadTest):
    """
    Smoke test: open an archive and close it again without reading anything.
    """
    def test_create(self):
        from pimlico.utils.pimarc import PimarcReader

        reader = PimarcReader(self.input_path)
        with reader:
            # Entering and leaving the context is the whole test
            pass


class ReadIndexTest(PimarcReadTest):
    """
    Read in an archive's index and check its format.
    We use a pre-prepared index where we know what the filenames should be and check they're right.
    """
    def test_create(self):
        """
        Check that the index lists exactly the expected filenames, in order.
        """
        from pimlico.utils.pimarc import PimarcReader

        # We know that the filenames in the archive should look like this
        # and that the example archive contains exactly five files
        filename_base = "doc_{}"
        expected_filenames = [filename_base.format(i) for i in range(5)]

        with PimarcReader(self.input_path) as arc:
            filenames = list(arc.index)

        # Check the count first: comparing entry by entry alone would pass
        # vacuously if the index came back empty or truncated
        self.assertEqual(len(expected_filenames), len(filenames),
                         msg="index did not contain the expected number of filenames")
        for expected_fn, filename in zip(expected_filenames, filenames):
            self.assertEqual(expected_fn, filename, msg="filename read in archive did not match that expected")


class ReadFilesTest(PimarcReadTest):
    """
    Read in an archive's index and each of its files.
    We use a pre-prepared index where we know what the file content should be.
    Each file contains random data, but starts with a fixed string containing the filename,
    so we check that this has been correctly read.
    """
    def test_create(self):
        """
        Check the start of every file's content and the number of files read.
        """
        from pimlico.utils.pimarc import PimarcReader

        # We know that the files in the archive should start like this
        file_start_base = "Start of doc_{}"
        # The example archive contains exactly this many files
        expected_num_files = 5

        num_files_read = 0
        with PimarcReader(self.input_path) as arc:
            for i, (metadata, file_data) in enumerate(arc):
                expected_start = file_start_base.format(i)
                # Decode the UTF-8 encoded data
                file_text = file_data.decode("utf-8")
                file_text_start = file_text[:len(expected_start)]
                self.assertEqual(expected_start, file_text_start,
                                 msg="text read in file in archive did not start with the expected string")
                num_files_read += 1

        # Without this, the test would pass vacuously if iteration yielded no files
        self.assertEqual(expected_num_files, num_files_read,
                         msg="did not read the expected number of files from the archive")


if __name__ == "__main__":
unittest.main()
55 changes: 55 additions & 0 deletions src/test/python/pimlicotest/utils/pimarc/write.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import os
import tempfile
import unittest


class PimarcWriterTest(unittest.TestCase):
    """
    Base class for the writer tests: gives each test a scratch directory to write to.
    """
    def setUp(self):
        # A fresh temporary directory serves as the storage location for this test
        scratch_dir = tempfile.mkdtemp()
        self.storage_dir = scratch_dir

    def tearDown(self):
        from shutil import rmtree

        # Remove the scratch directory along with anything written into it
        rmtree(self.storage_dir)


class CreateEmptyArchiveTest(PimarcWriterTest):
    """
    Open a writer on a new archive and close it again without adding any files.
    """
    def test_create(self):
        from pimlico.utils.pimarc import PimarcWriter

        archive_path = os.path.join(self.storage_dir, "test.prc")
        with PimarcWriter(archive_path):
            # Nothing to write: the index is saved when the block is exited
            pass


def _generate_random_text(length=300):
"""
Generate a string made up of random characters.
"""
import string
import random
return "".join(random.choice(string.ascii_lowercase) for i in range(length))


class WriteRandomDocumentsTest(PimarcWriterTest):
    """
    Start a new archive and write a handful of randomly-generated text documents to it.
    """
    def test_write(self):
        from pimlico.utils.pimarc import PimarcWriter

        archive_path = os.path.join(self.storage_dir, "test.prc")
        with PimarcWriter(archive_path) as arc:
            # Five documents, named doc_0 to doc_4
            for doc_num in range(5):
                # The archive stores bytes, so the text is UTF-8 encoded before writing
                doc_bytes = _generate_random_text().encode("utf-8")
                arc.write_file(doc_bytes, "doc_{}".format(doc_num))


if __name__ == "__main__":
unittest.main()
74 changes: 74 additions & 0 deletions src/test/python/pimlicotest/utils/pimarc/write_read.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""
Test writing out data and reading it back in.
"""
import os
import tempfile
import unittest


class PimarcWriteReadTest(unittest.TestCase):
    """
    Base class for the write-then-read tests: sets up a scratch directory and
    the path of the archive to be written inside it.
    """
    def setUp(self):
        # A fresh temporary directory serves as the storage location for this test
        scratch_dir = tempfile.mkdtemp()
        self.storage_dir = scratch_dir
        self.archive_path = os.path.join(scratch_dir, "test.prc")

    def tearDown(self):
        from shutil import rmtree

        # Remove the scratch directory along with anything written into it
        rmtree(self.storage_dir)


class WriteEmptyArchiveTest(PimarcWriteReadTest):
    """
    Write out an archive with no files in it, then read it back and confirm
    that both the reader and its index report zero entries.
    """
    def test_create(self):
        from pimlico.utils.pimarc import PimarcReader, PimarcWriter

        writer = PimarcWriter(self.archive_path)
        with writer:
            # Nothing is written: just create the empty archive
            pass

        reader = PimarcReader(self.archive_path)
        with reader as arc:
            num_files = len(arc)
            self.assertEqual(num_files, 0)
            num_index_entries = len(arc.index)
            self.assertEqual(num_index_entries, 0)


def _generate_random_text(length=300):
"""
Generate a string made up of random characters.
"""
import string
import random
return "".join(random.choice(string.ascii_lowercase) for i in range(length))


class WriteRandomDocumentsTest(PimarcWriteReadTest):
    """
    Write some randomly-generated text documents to a new archive, then read
    the archive back in and check that the content matches what was written.
    """
    def test_write(self):
        from pimlico.utils.pimarc import PimarcReader, PimarcWriter

        # Five random documents to store
        files_data = []
        for _ in range(5):
            files_data.append(_generate_random_text())

        with PimarcWriter(self.archive_path) as arc:
            # Store each document as UTF-8 bytes under the name doc_<n>
            for doc_num, text in enumerate(files_data):
                doc_name = "doc_{}".format(doc_num)
                arc.write_file(text.encode("utf-8"), doc_name)

        with PimarcReader(self.archive_path) as arc:
            num_written = len(files_data)
            self.assertEqual(len(arc), num_written)

            # Walk the archive alongside the original texts and compare the content
            for (metadata, file_data), expected_data in zip(arc, files_data):
                read_text = file_data.decode("utf-8")
                self.assertEqual(read_text, expected_data,
                                 msg="data read in from archive doesn't match what we wrote out "
                                     "for the corresponding file")


if __name__ == "__main__":
unittest.main()
1 change: 1 addition & 0 deletions test/data/pimarc/smallex.prc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ᅻ≮慭攢㨠≤潣弰≽밂却慲琠潦⁤潣弰⸠灬汵灶捨敧杪汹畴晢慴煭硸癪敢癰癳汩畮慱畦摶橲楫畦瑯潩慺穹楫穯摪睱潩祭汰摶硵晦来湳灬汥浣杩扳摣牨畩晹桹晲湸杹楸瑥汲煮煬慡癲潰空硰歩湭決牬橷桫獹敩汫獴橱瑪癢畩桰牯業敱祲湸捺橨灯湭獵牰獥牨楪硰祹汷敮晭穭牲歲汤楪慳橺穦晫瑴楶煰硥灬楡扳摲癹照摺浰汧浹癱瑬橺桹杧祧桳灷敥略潧歷異楮捡敯晩獭晢獡穧湡橩牰橨異睧潬祭特ᅻ≮慭攢㨠≤潣弱≽밂却慲琠潦⁤潣弱⸠楫牡晵牰桦穥浨捷浡摺扷摸祪畳捫汹潧楣灦捭瑦楦潦畧牦硴捳楪桤癤桫桥煫散摱慫祷癡慷睵扷汪智慦獸浫瑮硯敥慺煭楴晵睶牸祺畳橵扺灥硶扲湷潬浭獸煲祳潩睢楩癨汪瑭灭睴獶慹扦步瑺睤晣潳档祤督牱桲捨桢獭潢扺獴獲楷楴獣杮楱杹硤桶睩湴楩穳獤獬桩數潨摦煥扸湴潯楴汰条煺獸獰摡歴慩潫橯牥穬摸楩楤煤癴杹異歬歺獮晭桰潶癲捱癦慺摦硰ᅻ≮慭攢㨠≤潣弲≽밂却慲琠潦⁤潣弲⸠牧湢癫浲敥穹穢癩慰扸敢硷煩摦癦扬敺楫敧敬扭浥扦橷摡湷祺汶扭汯湪慬煰晫污扩歳摶扪潦慫牬祣楪牥極浤捯摪杢杴穰穦硸畩湸祱睸畫湯扵敭晵祥督獰穪祪汶灴智穳祴扫當楳橡摢楶慲湸汵牴浫慪穯睯汦楡煸獬湱敳汬摦楬摧祣敶桬硫浣牱桵睺湤杰楥步晧潰汲硴汳獮灰煲晳硶歷獫杬楶潪瑷睢湩杵正歰牨瑱扢来瑸扸确条灱慢煪祬橯汧畸慣摹捱扮瑣ᅻ≮慭攢㨠≤潣弳≽밂却慲琠潦⁤潣弳⸠獹獶灺桮瑷灺扵浶硷浬扴橩慳硷扤瑪湥慦癫煪杹潡牲晢癰畱潫浮捶浨瑧睰畩晷浡睵慩晡歩灪敵業杦硨癴杩潢祢祢浭浫晲祷穢杬敳桩桢敳畨祦湸獶穨硡祤硲晶摯灪硢浥湤桷硩獧癩穨祹敵扳慡捦止穰摤牤祡湱祷牬潷据摸确汪穹穥楰橩潵慧畯湦潮歰睨扬番睨潨獫汮牭捣扪浵番煮牯睦煦獯獤桩獭瑬批桬灬硹摲摯景楳牷灰穤晶据摭湩祧牶橣杤硭発灳穸ᅻ≮慭攢㨠≤潣弴≽밂却慲琠潦⁤潣弴⸠牱煥牢獡畩祯晨桫硺煬灳桹桫祸歰穸橧敷硯睯獣穴祮橹睺桵湧牺硶灧穵煥潫敦浨橣癦橶照潲睰慮浰硤晡硤瑹牳歬煭祤潮档慪瑪潺獲整煤灰歨牡整普浶敲物獳摡敮歺灬煬獹此扤捥浥楤歳杩捶湷敩瑸摬汤牧畸潶獥特硷橵摥牦睯潶睡慤晭汸摳歬獵橹畵煨穳湲煳橺灣扥浣灨灬穦硣捦湶穹橱敪桺牺睫穮潡扸火灸睷湢摳穱瑲智据煦慥楦桪煲硦穩獸摧晦瑷
5 changes: 5 additions & 0 deletions test/data/pimarc/smallex.prci
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
doc_0 0 18
doc_1 336 354
doc_2 672 690
doc_3 1008 1026
doc_4 1344 1362

0 comments on commit 6f6e3e2

Please sign in to comment.