/
test_parsers.py
108 lines (80 loc) · 2.9 KB
/
test_parsers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import shutil
from subprocess import PIPE, run
import sys
import pytest
import pikepdf
from pikepdf import Dictionary, Object, Operator, Pdf, Stream, parse_content_stream
from pikepdf._qpdf import StreamParser
# pylint: disable=useless-super-delegation
class PrintParser(StreamParser):
    """Stream parser whose callbacks simply write what they receive to stdout."""

    def __init__(self):
        # Explicit delegation to the base initializer is kept on purpose; the
        # file-level pylint pragma silences useless-super-delegation for it.
        super().__init__()

    def handle_object(self, obj):
        """Print the ``repr`` of *obj*."""
        rendered = repr(obj)
        print(rendered)

    def handle_eof(self):
        """Print the ``--EOF--`` marker."""
        print("--EOF--")
class ExceptionParser(StreamParser):
    """Stream parser whose object callback always raises ``ValueError``."""

    def __init__(self):
        # Explicit delegation to the base initializer is kept on purpose; the
        # file-level pylint pragma silences useless-super-delegation for it.
        super().__init__()

    def handle_object(self, obj):  # pylint: disable=unused-argument
        """Raise ``ValueError`` unconditionally; *obj* is ignored."""
        complaint = "I take exception to this"
        raise ValueError(complaint)

    def handle_eof(self):
        """Print the ``--EOF--`` marker."""
        print("--EOF--")
def test_open_pdf(resources):
    """Parse the first page of ``graph.pdf`` with a ``PrintParser``.

    The test has no assertions; it passes as long as parsing does not raise.
    """
    document = Pdf.open(resources / 'graph.pdf')
    first_page = document.pages[0]
    printer = PrintParser()
    Object._parse_stream(first_page, printer)
def test_parser_exception(resources):
    """Check that parsing with ``ExceptionParser`` raises ``ValueError``.

    The content stream of the first page of ``graph.pdf`` is fed to a parser
    whose ``handle_object`` always raises ``ValueError``.
    """
    pdf = Pdf.open(resources / 'graph.pdf')
    stream = pdf.pages[0]['/Contents']
    # Build the parser outside the ``raises`` block so that only the parse
    # call itself can satisfy the expectation.
    parser = ExceptionParser()
    with pytest.raises(ValueError):
        Object._parse_stream(stream, parser)
@pytest.mark.skipif(shutil.which('pdftotext') is None, reason="poppler not installed")
@pytest.mark.skipif(sys.version_info < (3, 6), reason="subprocess.run on 3.5")
def test_text_filter(resources, outdir):
    """Strip ``Tj`` operators from a page and confirm ``pdftotext`` finds no text.

    Steps:
      1. Run ``pdftotext`` on the input file and require non-empty output.
      2. Rebuild the first page's content stream from the parsed instructions,
         dropping every ``Tj`` instruction.
      3. Save the result to ``outdir / 'notext.pdf'`` and require that
         ``pdftotext`` now produces only whitespace.

    Skipped when ``pdftotext`` is not on PATH or on Python older than 3.6.
    """
    input_pdf = resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf'

    # Ensure the test PDF has text we can find
    proc = run(
        ['pdftotext', str(input_pdf), '-'], check=True, stdout=PIPE, encoding='utf-8'
    )
    assert proc.stdout.strip() != '', "Need input test file that contains text"

    pdf = Pdf.open(input_pdf)
    page = pdf.pages[0]

    # Loop-invariant: build the operator to drop once, not on every iteration.
    show_text = Operator('Tj')

    keep = []
    # NOTE(review): the second argument is presumably the set of operators
    # parse_content_stream should report -- confirm against the pikepdf docs.
    for operands, command in parse_content_stream(
        page, """TJ Tj ' " BT ET Td TD Tm T* Tc Tw Tz TL Tf Tr Ts"""
    ):
        if command == show_text:
            print("skipping Tj")
            continue
        keep.append((operands, command))

    new_stream = Stream(pdf, keep)
    print(new_stream.read_bytes())  # pylint: disable=no-member
    page['/Contents'] = new_stream
    page['/Rotate'] = 90

    output_pdf = outdir / 'notext.pdf'
    # Pass the flag by keyword: a bare positional ``True`` is unreadable and
    # breaks if the parameter becomes keyword-only.
    pdf.save(output_pdf, static_id=True)

    proc = run(
        ['pdftotext', str(output_pdf), '-'],
        check=True,
        stdout=PIPE,
        encoding='utf-8',
    )
    assert proc.stdout.strip() == '', "Expected text to be removed"
def test_invalid_stream_object():
    """Check that ``parse_content_stream`` raises ``TypeError`` for a ``Dictionary``."""
    # Construct the argument outside the ``raises`` block so a TypeError from
    # the Dictionary constructor cannot make the test pass for the wrong reason.
    not_a_stream = Dictionary({"/Hi": 3})
    with pytest.raises(TypeError):
        parse_content_stream(not_a_stream)
# @pytest.mark.parametrize(
# "test_file,expected",
# [
# ("fourpages.pdf", True),
# ("graph.pdf", False),
# ("veraPDF test suite 6-2-10-t02-pass-a.pdf", True),
# ("veraPDF test suite 6-2-3-3-t01-fail-c.pdf", False),
# ('sandwich.pdf', True),
# ],
# )
# def test_has_text(resources, test_file, expected):
# pdf = Pdf.open(resources / test_file)
# for p in pdf.pages:
# page = Page(p)
# assert page.has_text() == expected