-
Notifications
You must be signed in to change notification settings - Fork 0
/
check_pdf.py
174 lines (135 loc) · 5.47 KB
/
check_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# filters output
import argparse
import os
import shutil
import subprocess
import time
import PyPDF2
import re
# TODO: split the DPI check from the color-mode check; add a separate function for PDF extraction
def convert_to_dict(line):
    """Parse one ``name: key=value key=value ...`` record into a dict.

    Args:
        line: The raw record, either as ``bytes`` (decoded with the default
            codec of ``bytes.decode``) or as an already-decoded ``str``.

    Returns:
        A dict holding the text before the first ``:`` under ``'name'`` plus
        one string-valued entry per whitespace-separated ``key=value`` token
        found after that colon. A record without any ``:`` yields a dict
        containing only ``'name'``.

    Raises:
        ValueError: If a token after the colon contains no ``=``.
    """
    if isinstance(line, (bytes, bytearray)):
        line = line.decode()

    # Split on the FIRST colon only, so values that contain ':' (for example
    # Windows paths) are not truncated and a missing colon does not crash.
    name, _, rest = line.partition(':')

    record = {'name': name}
    for item in rest.split():
        # Split on the first '=' only, so a value may itself contain '='.
        key, separator, value = item.partition('=')
        if not separator:
            raise ValueError(f'expected key=value, got {item!r}')
        record[key] = value
    return record
def check_bitmaps():
    """Flag pages whose bitmaps are below 200 dpi or in a non-CMYK colorspace.

    For every page of the module-level ``pdf`` this runs
    ``pdfimages.exe -list`` on the module-level ``filename`` and parses each
    output line with ``convert_to_dict``. The parsed record is expected to
    provide the keys ``hdpi``, ``vdpi`` and ``colorspace``.

    Side effects:
        Appends the 1-based page number, at most once per page, to the
        module-level comma-separated strings ``wrong_dpi_pages`` and
        ``wrong_colorspace_pages``.
    """
    global wrong_dpi_pages
    global wrong_colorspace_pages

    min_dpi = 200
    flagged_colorspaces = ("DeviceRGB", "ICCBased", "DeviceGray")

    for current_page in range(1, len(pdf.pages) + 1):
        wrong_dpi = False
        wrong_colorspace = False

        command = 'pdfimages.exe'
        arguments = ['-list', '-f', f'{current_page}', '-l', f'{current_page}', f'{filename}', f'.tmp\\{current_page}']
        # run() drains stdout while the child executes. Calling wait() on a
        # Popen with a PIPE before reading can deadlock once the pipe fills.
        proc = subprocess.run([command] + arguments, stdout=subprocess.PIPE)

        for line in proc.stdout.splitlines():
            if not line.strip():
                # Blank lines carry no record; skip instead of crashing.
                continue
            # the real code does filter here
            d = convert_to_dict(line)
            horizontal_dpi = round(float(d['hdpi']), 0)
            vertical_dpi = round(float(d['vdpi']), 0)
            color_space = d['colorspace']

            # Each axis must be compared on its own. The previous
            # ``(x or y < 200)`` was true for any non-zero x and therefore
            # flagged every page that contained an image.
            if not wrong_dpi and (horizontal_dpi < min_dpi or vertical_dpi < min_dpi):
                wrong_dpi_pages = (
                    f'{wrong_dpi_pages}, {current_page}' if wrong_dpi_pages else f'{current_page}'
                )
                wrong_dpi = True

            if not wrong_colorspace and color_space in flagged_colorspaces:
                wrong_colorspace_pages = (
                    f'{wrong_colorspace_pages}, {current_page}'
                    if wrong_colorspace_pages
                    else f'{current_page}'
                )
                wrong_colorspace = True

            # Both problems are already recorded for this page, so the
            # remaining images cannot change the result.
            if wrong_dpi and wrong_colorspace:
                break
def check_objects():
    """Flag pages whose content stream sets fill or stroke colors via rg/scn.

    For every page of the module-level ``pdf`` this runs
    ``pdfcpu.exe extract -m content`` on the module-level ``filename`` into
    the ``.tmp`` directory, then searches the extracted text for the
    lowercase ``rg``/``scn`` operators (fills) and the uppercase ``RG``/``SCN``
    operators (outlines).

    Side effects:
        Appends the 1-based page number to the module-level comma-separated
        strings ``rgb_fill_pages`` and ``rgb_outline_pages``.

    Raises:
        FileNotFoundError: If the expected content file was not produced
            for a page.
    """
    global rgb_fill_pages
    global rgb_outline_pages

    # The extracted file is named after the input's base name without its
    # extension. The previous code computed the basename and then overwrote
    # it with splitext(filename), so a path with a directory component
    # produced a wrong location under .tmp.
    name, _extension = os.path.splitext(os.path.basename(filename))

    # Loop-invariant patterns, compiled once.
    fill_pattern = re.compile(r"\d.* rg|\d.* scn")
    outline_pattern = re.compile(r"\d.* RG|\d.* SCN")

    for current_page in range(1, len(pdf.pages) + 1):
        command = 'pdfcpu.exe'
        arguments = ['extract', '-m', 'content', '-p', f'{current_page}', f'{filename}', '.tmp']
        # The tool's console output is not used. Discarding it avoids the
        # deadlock that wait() on an unread PIPE can cause.
        subprocess.run([command] + arguments, stdout=subprocess.DEVNULL)

        # latin-1 maps every byte to a character, so raw bytes in the content
        # stream cannot raise a decode error; the operators searched for are
        # plain ASCII and are unaffected.
        with open(f'.tmp\\{name}_Content_page_{current_page}.txt', 'r', encoding='latin-1') as f:
            contents = f.read()

        if fill_pattern.search(contents) is not None:
            rgb_fill_pages = (
                f'{rgb_fill_pages}, {current_page}' if rgb_fill_pages else f'{current_page}'
            )

        if outline_pattern.search(contents) is not None:
            rgb_outline_pages = (
                f'{rgb_outline_pages}, {current_page}' if rgb_outline_pages else f'{current_page}'
            )
# Script entry: check the PDF named on the command line, print the findings,
# write them to result.log, and report the elapsed time.
start_time = time.time()

parser = argparse.ArgumentParser()
# The input file is mandatory; without it PdfReader would be handed None.
parser.add_argument('filename')
args = parser.parse_args()
filename = args.filename
pdf = PyPDF2.PdfReader(filename)

# Comma-separated page lists filled in by check_objects() / check_bitmaps().
wrong_dpi_pages = ""
wrong_colorspace_pages = ""
rgb_fill_pages = ""
rgb_outline_pages = ""

# Scratch directory used by the external tools.
os.makedirs(".tmp", exist_ok=True)
try:
    check_objects()
    check_bitmaps()

    findings = []
    if rgb_fill_pages:
        findings.append(f'Ostrzeżenie: strona zawiera wypełnienia inne niż CMYK: {rgb_fill_pages}')
    if rgb_outline_pages:
        findings.append(f'Ostrzeżenie: strona zawiera kontury inne niż CMYK: {rgb_outline_pages}')
    if wrong_dpi_pages:
        # Previously this message was printed but never stored, so the
        # preceding message (or nothing at all) ended up in the log.
        findings.append(f'Błąd: strona zawiera bitmapy o rozdzielczości mniejszej niż 200 dpi: {wrong_dpi_pages}')
    if wrong_colorspace_pages:
        findings.append(f'Ostrzeżenie: strona zawiera bitmapy w trybie innym niż CMYK: {wrong_colorspace_pages}')
    if not findings:
        findings.append('Nie znaleziono problemów.')

    # An explicit UTF-8 encoding keeps the Polish messages writable
    # regardless of the system code page.
    with open("result.log", "w", encoding="utf-8") as f:
        for log in findings:
            print(log)
            f.write(f'{log}\n')
finally:
    # Remove the scratch directory even when a check fails.
    shutil.rmtree('.tmp')

end_time = time.time()
elapsed_time = end_time - start_time
print("\nOperacja ukończona w", round(elapsed_time, 2), "s.")