-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Completed basic unit tests for Pimarc
Archive format with reader and writer now completed. All the most basic features have unit tests now, which are passing. Now it's probably time to start implementing the use of this format in Pimlico corpus storage.
- Loading branch information
Showing
13 changed files
with
252 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
#!/usr/bin/env bash
# Temporary script to just run unit tests for the Pimarc file format.
# Once development of the readers/writers is finished, this will be removed.
# The tests are run by all_unit_tests.sh anyway.

# Directory containing this script, with symlinks resolved.
# Every expansion is quoted so that a checkout path containing spaces
# (or glob characters) is not split or expanded by the shell.
DIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"

# VIRTUALENV is set only in the environment of the python wrapper invocation.
VIRTUALENV="$DIR/../../lib/test_env" "$DIR/../python" -m unittest discover "$DIR/../../src/test/python/pimlicotest/utils/pimarc/" "*.py"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
import os | ||
import tempfile | ||
import unittest | ||
|
||
|
||
class PimarcReadTest(unittest.TestCase):
    """
    Shared base for the reader tests: works out the path of the example archive.
    """
    def setUp(self):
        import os
        from pimlico import TEST_DATA_DIR

        # Example archive, located under the test data directory
        path_parts = (TEST_DATA_DIR, "pimarc", "smallex.prc")
        self.input_path = os.path.join(*path_parts)
|
||
|
||
class OpenArchiveTest(PimarcReadTest):
    """
    Smoke test: open the example archive and leave it again without reading anything.
    """
    def test_create(self):
        from pimlico.utils.pimarc import PimarcReader

        reader = PimarcReader(self.input_path)
        with reader:
            # Entering and leaving the context is the whole test
            pass
|
||
|
||
class ReadIndexTest(PimarcReadTest):
    """
    Read in an archive's index and check its format.
    We use a pre-prepared index where we know what the filenames should be and check they're right.
    """
    def test_create(self):
        from pimlico.utils.pimarc import PimarcReader

        # We know that the filenames in the archive should look like this
        filename_base = "doc_{}"
        with PimarcReader(self.input_path) as arc:
            # Materialize the filenames first so we can check there are any:
            # with an empty index the loop below would make no assertions
            # and the test would pass without testing anything
            filenames = list(arc.index)
            self.assertTrue(filenames, msg="index of the example archive contains no filenames")

            for i, filename in enumerate(filenames):
                expected_fn = filename_base.format(i)
                self.assertEqual(expected_fn, filename, msg="filename read in archive did not match that expected")
|
||
|
||
class ReadFilesTest(PimarcReadTest):
    """
    Read in an archive's index and each of its files.
    We use a pre-prepared index where we know what the file content should be.
    Each file contains random data, but starts with a fixed string containing the filename,
    so we check that this has been correctly read.
    """
    def test_create(self):
        from pimlico.utils.pimarc import PimarcReader

        # We know that the files in the archive should start like this
        file_start_base = "Start of doc_{}"
        # Count the files we actually check, so that a reader that yields
        # nothing cannot make the test pass vacuously
        files_read = 0
        with PimarcReader(self.input_path) as arc:
            for i, (metadata, file_data) in enumerate(arc):
                expected_start = file_start_base.format(i)
                # Decode the UTF-8 encoded data
                file_text = file_data.decode("utf-8")
                file_text_start = file_text[:len(expected_start)]
                self.assertEqual(expected_start, file_text_start,
                                 msg="text read in file in archive did not start with the expected string")
                files_read += 1

        self.assertGreater(files_read, 0, msg="no files were read from the example archive")
|
||
|
||
# Allow this test module to be executed directly as a script.
if __name__ == "__main__": | ||
unittest.main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import os | ||
import tempfile | ||
import unittest | ||
|
||
|
||
class PimarcWriterTest(unittest.TestCase):
    """
    Base for the writer tests: each test gets its own temporary directory,
    which is deleted again afterwards.
    """
    def setUp(self):
        # Fresh temporary directory for archives to be written into
        scratch_dir = tempfile.mkdtemp()
        self.storage_dir = scratch_dir

    def tearDown(self):
        # Remove the directory together with anything the test wrote there
        from shutil import rmtree
        rmtree(self.storage_dir)
|
||
|
||
class CreateEmptyArchiveTest(PimarcWriterTest):
    """
    Write out an archive that has no files in it.
    """
    def test_create(self):
        from pimlico.utils.pimarc import PimarcWriter

        archive_path = os.path.join(self.storage_dir, "test.prc")
        writer = PimarcWriter(archive_path)
        with writer:
            # Nothing is added to the archive
            # The index will be written when we exit the block
            pass
|
||
|
||
def _generate_random_text(length=300): | ||
""" | ||
Generate a string made up of random characters. | ||
""" | ||
import string | ||
import random | ||
return "".join(random.choice(string.ascii_lowercase) for i in range(length)) | ||
|
||
|
||
class WriteRandomDocumentsTest(PimarcWriterTest):
    """
    Write an archive containing a handful of randomly-generated text documents.
    """
    def test_write(self):
        from pimlico.utils.pimarc import PimarcWriter

        num_docs = 5
        archive_path = os.path.join(self.storage_dir, "test.prc")
        with PimarcWriter(archive_path) as arc:
            for doc_num in range(num_docs):
                # Each document is fresh random text, stored UTF-8 encoded
                payload = _generate_random_text().encode("utf-8")
                arc.write_file(payload, "doc_{}".format(doc_num))
|
||
|
||
# Allow this test module to be executed directly as a script.
if __name__ == "__main__": | ||
unittest.main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
""" | ||
Test writing out data and reading it back in. | ||
""" | ||
import os | ||
import tempfile | ||
import unittest | ||
|
||
|
||
class PimarcWriteReadTest(unittest.TestCase):
    """
    Base for the write-then-read tests: each test gets its own temporary
    directory and a path within it for the archive.
    """
    def setUp(self):
        # Fresh temporary directory for the archive to be written into
        scratch_dir = tempfile.mkdtemp()
        self.storage_dir = scratch_dir
        self.archive_path = os.path.join(scratch_dir, "test.prc")

    def tearDown(self):
        # Remove the directory together with anything the test wrote there
        from shutil import rmtree
        rmtree(self.storage_dir)
|
||
|
||
class WriteEmptyArchiveTest(PimarcWriteReadTest):
    """
    Write an archive with nothing in it, then read it back and check that
    both the archive and its index report zero entries.
    """
    def test_create(self):
        from pimlico.utils.pimarc import PimarcReader, PimarcWriter

        writer = PimarcWriter(self.archive_path)
        with writer:
            # Nothing is added to the archive
            pass

        with PimarcReader(self.archive_path) as arc:
            num_files = len(arc)
            self.assertEqual(num_files, 0)
            num_indexed = len(arc.index)
            self.assertEqual(num_indexed, 0)
|
||
|
||
def _generate_random_text(length=300): | ||
""" | ||
Generate a string made up of random characters. | ||
""" | ||
import string | ||
import random | ||
return "".join(random.choice(string.ascii_lowercase) for i in range(length)) | ||
|
||
|
||
class WriteRandomDocumentsTest(PimarcWriteReadTest):
    """
    Write some randomly-generated text documents to a new archive, then
    read the archive back and check each document's content is unchanged.
    """
    def test_write(self):
        from pimlico.utils.pimarc import PimarcReader, PimarcWriter

        # Random text for each of the 5 documents
        files_data = [_generate_random_text() for _ in range(5)]

        with PimarcWriter(self.archive_path) as arc:
            # Store each document UTF-8 encoded under a numbered name
            for doc_num, text in enumerate(files_data):
                payload = text.encode("utf-8")
                arc.write_file(payload, "doc_{}".format(doc_num))

        with PimarcReader(self.archive_path) as arc:
            self.assertEqual(len(arc), len(files_data))

            # Walk the archive in step with what we wrote and compare contents
            for entry, expected_data in zip(arc, files_data):
                _metadata, file_data = entry
                self.assertEqual(file_data.decode("utf-8"), expected_data,
                                 msg="data read in from archive doesn't match what we wrote out "
                                     "for the corresponding file")
|
||
|
||
# Allow this test module to be executed directly as a script.
if __name__ == "__main__": | ||
unittest.main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
ᅻ≮慭攢㨠≤潣弰≽밂却慲琠潦潣弰⸠灬汵灶捨敧杪汹畴晢慴煭硸癪敢癰癳汩畮慱畦摶橲楫畦瑯潩慺穹楫穯摪睱潩祭汰摶硵晦来湳灬汥浣杩扳摣牨畩晹桹晲湸杹楸瑥汲煮煬慡癲潰空硰歩湭決牬橷桫獹敩汫獴橱瑪癢畩桰牯業敱祲湸捺橨灯湭獵牰獥牨楪硰祹汷敮晭穭牲歲汤楪慳橺穦晫瑴楶煰硥灬楡扳摲癹照摺浰汧浹癱瑬橺桹杧祧桳灷敥略潧歷異楮捡敯晩獭晢獡穧湡橩牰橨異睧潬祭特ᅻ≮慭攢㨠≤潣弱≽밂却慲琠潦潣弱⸠楫牡晵牰桦穥浨捷浡摺扷摸祪畳捫汹潧楣灦捭瑦楦潦畧牦硴捳楪桤癤桫桥煫散摱慫祷癡慷睵扷汪智慦獸浫瑮硯敥慺煭楴晵睶牸祺畳橵扺灥硶扲湷潬浭獸煲祳潩睢楩癨汪瑭灭睴獶慹扦步瑺睤晣潳档祤督牱桲捨桢獭潢扺獴獲楷楴獣杮楱杹硤桶睩湴楩穳獤獬桩數潨摦煥扸湴潯楴汰条煺獸獰摡歴慩潫橯牥穬摸楩楤煤癴杹異歬歺獮晭桰潶癲捱癦慺摦硰ᅻ≮慭攢㨠≤潣弲≽밂却慲琠潦潣弲⸠牧湢癫浲敥穹穢癩慰扸敢硷煩摦癦扬敺楫敧敬扭浥扦橷摡湷祺汶扭汯湪慬煰晫污扩歳摶扪潦慫牬祣楪牥極浤捯摪杢杴穰穦硸畩湸祱睸畫湯扵敭晵祥督獰穪祪汶灴智穳祴扫當楳橡摢楶慲湸汵牴浫慪穯睯汦楡煸獬湱敳汬摦楬摧祣敶桬硫浣牱桵睺湤杰楥步晧潰汲硴汳獮灰煲晳硶歷獫杬楶潪瑷睢湩杵正歰牨瑱扢来瑸扸确条灱慢煪祬橯汧畸慣摹捱扮瑣ᅻ≮慭攢㨠≤潣弳≽밂却慲琠潦潣弳⸠獹獶灺桮瑷灺扵浶硷浬扴橩慳硷扤瑪湥慦癫煪杹潡牲晢癰畱潫浮捶浨瑧睰畩晷浡睵慩晡歩灪敵業杦硨癴杩潢祢祢浭浫晲祷穢杬敳桩桢敳畨祦湸獶穨硡祤硲晶摯灪硢浥湤桷硩獧癩穨祹敵扳慡捦止穰摤牤祡湱祷牬潷据摸确汪穹穥楰橩潵慧畯湦潮歰睨扬番睨潨獫汮牭捣扪浵番煮牯睦煦獯獤桩獭瑬批桬灬硹摲摯景楳牷灰穤晶据摭湩祧牶橣杤硭発灳穸ᅻ≮慭攢㨠≤潣弴≽밂却慲琠潦潣弴⸠牱煥牢獡畩祯晨桫硺煬灳桹桫祸歰穸橧敷硯睯獣穴祮橹睺桵湧牺硶灧穵煥潫敦浨橣癦橶照潲睰慮浰硤晡硤瑹牳歬煭祤潮档慪瑪潺獲整煤灰歨牡整普浶敲物獳摡敮歺灬煬獹此扤捥浥楤歳杩捶湷敩瑸摬汤牧畸潶獥特硷橵摥牦睯潶睡慤晭汸摳歬獵橹畵煨穳湲煳橺灣扥浣灨灬穦硣捦湶穹橱敪桺牺睫穮潡扸火灸睷湢摳穱瑲智据煦慥楦桪煲硦穩獸摧晦瑷 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
doc_0 0 18 | ||
doc_1 336 354 | ||
doc_2 672 690 | ||
doc_3 1008 1026 | ||
doc_4 1344 1362 |