-
Notifications
You must be signed in to change notification settings - Fork 2
/
main.py
415 lines (367 loc) · 13.9 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
from typing import Dict, List, Union, Callable
from PIL import Image
from pathlib import Path
import zlib
import numpy
from pikepdf import Pdf, PdfImage, Name, parse_content_stream, unparse_content_stream, ContentStreamInstruction
from decimal import Decimal
import cv2
import zipfile
import io
import numpy as np
# from quality import improve_text_in_image
import fire
from enum import Enum
from logging import getLogger
from sentry_sdk import capture_message
# Module-level logger named after this module; used for diagnostics throughout the file.
logger = getLogger(__name__)
def w_sentry(fn: Callable, *args, **kwargs) -> bool:
    """
    Invoke fn with the given arguments and report whether the call succeeded.
    Any Exception raised by fn is logged at INFO level and swallowed.
    :param fn: callable to invoke
    :param args: positional arguments forwarded to fn
    :param kwargs: keyword arguments forwarded to fn
    :return: True when fn returned normally, False when it raised
    """
    try:
        fn(*args, **kwargs)
    except Exception as error:
        logger.info("Sentry setup issue", exc_info=error)
        return False
    else:
        return True
class MethodChoice(Enum):
    """Watermark-removal strategies a caller can select."""
    geos = "GEOS"
    colors_replacement = "Replace colors"
    openCV2 = "OpenCV2"

    @staticmethod
    def from_str(label):
        """
        Resolve a member from its name ('geos', 'colors_replacement' or 'openCV2').
        :param label: member name to look up
        :return: the matching MethodChoice member
        :raises NotImplementedError: when label is not one of the member names
        """
        for choice in MethodChoice:
            if label == choice.name:
                return choice
        raise NotImplementedError
def hex_to_rbg(hex_color: str) -> tuple:
    """
    Turn a '#RRGGBB' string into an (r, g, b) tuple of ints.
    example: #FFFFFF to (255, 255, 255)
    :param hex_color: colour string; the first character is skipped and the next six are read as hex pairs
    :return: (red, green, blue)
    """
    digits = hex_color[1:7]
    return tuple(int(digits[start:start + 2], 16) for start in (0, 2, 4))
def rgb_to_hex(rgb_color: tuple) -> str:
    """
    Format an (r, g, b) triple as a lowercase '#rrggbb' string.
    example: (255, 255, 255) to #ffffff
    :param rgb_color: exactly three integer channel values
    :return: hex colour string
    """
    red, green, blue = rgb_color
    return '#' + ''.join('%02x' % channel for channel in (red, green, blue))
def hex_to_rgba(hex_color: str, alpha: int = 255):
    """
    Turn a '#RRGGBB' string into an (r, g, b, a) tuple.
    example: #FFFFFF to (255, 255, 255, 255)
    :param hex_color: colour string parsed by hex_to_rbg
    :param alpha: alpha value appended unchanged as the fourth element
    :return: (red, green, blue, alpha)
    """
    red, green, blue = hex_to_rbg(hex_color)
    return (red, green, blue, alpha)
def rgba_to_hex(rgba_color: tuple):
    """
    Format an (r, g, b, a) quadruple as a lowercase '#rrggbb' string; alpha is discarded.
    example: (255, 255, 255, 128) to #ffffff
    :param rgba_color: exactly four channel values
    :return: hex colour string without the alpha component
    """
    red, green, blue, _alpha = rgba_color
    return '#' + ''.join('%02x' % channel for channel in (red, green, blue))
def replace_images_in_zip(in_zip: Path, out_zip: Path, replacements: Dict[str, io.BytesIO]) -> Path:
    """
    Copy a zip archive, substituting the content of selected entries.
    Every entry is written with its original ZipInfo, so compression method,
    timestamp and attributes are carried over to the new archive.
    :param in_zip: archive to read
    :param out_zip: archive to create (overwritten if it exists)
    :param replacements: entry name -> in-memory buffer holding the new content
    :return: out_zip
    """
    with zipfile.ZipFile(in_zip, 'r') as zin, zipfile.ZipFile(out_zip, 'w') as zout:
        zout.comment = zin.comment  # preserve the archive comment
        for item in zin.infolist():
            replacement = replacements.get(item.filename)
            if replacement is None:
                # Read through the ZipInfo itself, before writestr() rewrites its offsets.
                payload = zin.read(item)
            else:
                payload = replacement.getvalue()
            # Passing the ZipInfo (not just the name) keeps compress_type and date_time.
            zout.writestr(item, payload)
    return out_zip
def remove_watermark_from_cv_image(img: numpy.ndarray) -> numpy.ndarray:
    """
    Whiten the pale, unsaturated pixels of an OpenCV (BGR) image.
    A pixel is kept when its HSV saturation is above 92 or its HSV value is
    128 or below; every other pixel is set to white.
    :param img: BGR image array
    :return: a copy of img with the light background pixels replaced by white
    """
    # split the image into its HSV planes; hue is not needed
    _, saturation, value = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))
    # 255 where the pixel is strongly coloured
    _, saturated = cv2.threshold(saturation, 92, 255, cv2.THRESH_BINARY)
    # 255 where the pixel is bright, then inverted so that dark pixels are 255
    _, bright = cv2.threshold(value, 128, 255, cv2.THRESH_BINARY)
    dark = 255 - bright
    # a pixel survives if it is either strongly coloured or dark
    keep_mask = cv2.add(saturated, dark)
    cleaned = img.copy()
    cleaned[keep_mask == 0] = (255, 255, 255)
    return cleaned
def replace_colors_in_image(image: Image, replacements: Dict[str, str]) -> Image:
    """
    Replace colors in image with new colors from replacements {to_replace_hex: new_hex}
    example = {
        '#f0f0f0': '#FFFFFF',
        '#c0c0c0': '#FFFFFF',
    }
    The image is converted to RGBA first; only the RGB channels are compared and
    rewritten, the alpha channel is left as it is. Replacements are applied in
    dict order against the already-updated pixels.
    :param image: PIL image
    :param replacements: mapping of '#RRGGBB' to replace -> '#RRGGBB' to write
    :return: new RGBA PIL image
    """
    image = image.convert('RGBA')
    data = np.array(image)  # "data" is a height x width x 4 numpy array
    rgb = data[..., :3]  # view on the colour channels; writes go straight into "data"
    for old_c, new_c in replacements.items():
        old_color = hex_to_rbg(old_c)
        new_color = hex_to_rbg(new_c)
        # Compare each channel with its own component (red/green/blue in that order).
        matching_pixels = np.all(rgb == old_color, axis=-1)
        rgb[matching_pixels] = new_color
    return Image.fromarray(data)
def remove_watermark_from_pil_image(image: Image, method_choice: MethodChoice,
                                    replacements: Dict[str, str] = None) -> Image:
    """
    Remove watermark from pil image
    :param image: PIL image to clean
    :param method_choice: MethodChoice.openCV2 selects the HSV-threshold method;
        any other value (including None) selects colour replacement
    :param replacements: {to_replace_hex: new_hex} used by colour replacement;
        when empty or None a built-in set of light greys/blue mapped to white is used
    :return: cleaned PIL image (RGB for the OpenCV method, RGBA for colour replacement)
    """
    if method_choice == MethodChoice.openCV2:
        # COLOR_RGB2BGR needs a 3/4-channel array; palette or greyscale images
        # would otherwise arrive as a 2-D array and make cvtColor fail.
        if image.mode != 'RGB':
            image = image.convert('RGB')
        # noinspection PyTypeChecker
        opencv_image = cv2.cvtColor(numpy.array(image), cv2.COLOR_RGB2BGR)
        opencv_image = remove_watermark_from_cv_image(opencv_image)
        return Image.fromarray(cv2.cvtColor(opencv_image, cv2.COLOR_BGR2RGB))
    return replace_colors_in_image(image, replacements or {
        '#f0f0f0': '#FFFFFF',
        '#c0c0c0': '#FFFFFF',
        '#b4b4fe': '#FFFFFF',
    })
def generate_output_path(input_path: Path) -> Path:
    """
    Derive the default output path: same directory and extension as the input,
    with "_generated" appended to the file stem.
    :param input_path: path of the file being processed
    :return: sibling path named <stem>_generated<suffix>
    """
    generated_name = f"{input_path.stem}_generated{input_path.suffix}"
    return input_path.parent / generated_name
def remove_tj_maj(instructions: List[ContentStreamInstruction], tj_start_with: Union[List[bytes], str]) -> \
        List[ContentStreamInstruction]:
    """
    Remove every "TJ" instruction whose text starts with tj_start_with.

    The non-numeric elements of the TJ array are converted with str() and matched
    in two ways; the instruction is dropped if either matches:
      * character by character across the concatenated elements, or
      * element by element (each array element compared with one prefix item).
    Instructions with less text than the prefix are kept. The input list is not
    modified.
    :param instructions: parsed content-stream instructions
    :param tj_start_with: prefix, either a str (split into single characters) or
        a list of bytes items
    :return: new list without the matching TJ instructions
    """
    if isinstance(tj_start_with, str):
        prefix = [bytes(char, 'utf-8') for char in tj_start_with]
    else:
        prefix = list(tj_start_with)
    prefix_len = len(prefix)

    kept = []
    for instruction in instructions:
        if prefix_len and str(instruction.operator) == 'TJ':
            # Numbers inside a TJ array are kerning adjustments, not text.
            texts = [str(operand) for operand in instruction.operands[0]
                     if not isinstance(operand, (int, float, Decimal))]
            per_char = [bytes(char, 'utf-8') for text in texts for char in text]
            per_operand = [bytes(text, 'utf-8') for text in texts]
            # Slicing never raises, so arrays shorter than the prefix simply do not match.
            if per_char[:prefix_len] == prefix or per_operand[:prefix_len] == prefix:
                continue
        kept.append(instruction)
    return kept
def remove_tjs_min(instructions: List[ContentStreamInstruction], text: Union[List[bytes], str]) -> \
        List[ContentStreamInstruction]:
    """
    Remove runs of "Tj" instructions whose concatenated text equals text.

    Tj operands are accumulated with str(). Each time another operator is met the
    accumulated string is checked: an exact match removes the collected Tj
    instructions, a proper prefix of text keeps accumulating (so the pieces may
    be separated by other operators), anything else restarts the search. A run
    that completes on the very last instruction is removed as well. The input
    list is not modified.
    :param instructions: parsed content-stream instructions
    :param text: text to remove; the comparison relies on str.startswith, so a
        str is expected here
    :return: new list without the matching Tj instructions
    """
    to_drop = set()
    run_indexes = []
    found = ""
    for index, instruction in enumerate(instructions):
        if str(instruction.operator) == 'Tj':
            run_indexes.append(index)
            found += str(instruction.operands[0])
            continue
        if found and found == text:
            to_drop.update(run_indexes)
            run_indexes, found = [], ""
        elif not found or not text.startswith(found):
            # Nothing collected, or the collected text can no longer become a match.
            run_indexes, found = [], ""
    # The stream may end right after the last Tj of a matching run.
    if found and found == text:
        to_drop.update(run_indexes)
    return [instruction for index, instruction in enumerate(instructions) if index not in to_drop]
def remove_by_reversed_orders(instructions: List[ContentStreamInstruction], orders: Dict[str, List[int]]) -> \
        List[ContentStreamInstruction]:
    """
    Remove instructions selected by their position counted from the end, per operator.

    For every operator name in orders, occurrences are numbered 0, 1, 2, ...
    starting from the LAST occurrence in the stream; occurrences whose number is
    listed are dropped. Example: {'f': [0]} removes the last 'f' instruction.
    The input list is not modified.
    :param instructions: parsed content-stream instructions
    :param orders: operator name -> reversed occurrence numbers to remove
    :return: new list, in original order, without the selected instructions
    """
    seen = {}
    kept_reversed = []
    for instruction in reversed(instructions):
        op = str(instruction.operator)
        occurrence = seen.get(op, 0)
        if occurrence not in orders.get(op, ()):
            kept_reversed.append(instruction)
        seen[op] = occurrence + 1
    kept_reversed.reverse()
    return kept_reversed
def remove_watermark_from_geos_pdf(input_file: Path, output_file: Path) -> str:
    """
    Remove watermark from pdf (exported from any GEOS app) and save to output_file.

    For every page the content stream is parsed and filtered, then written back:
      1. the "VERSION EVALUATION" text drawn with Tj operators is removed;
      2. if that removed nothing, the last 'f' instruction of the page is removed
         instead and a warning is logged and reported through capture_message;
      3. TJ text starting with "Trial - " (plain or in the two-byte form listed
         below) or with "UnRegistered" is removed.
    :param input_file: pdf to read
    :param output_file: pdf to write
    :return: output_file as a string
    """
    # The context manager closes the source document even if a page fails to process.
    with Pdf.open(input_file) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            page_instructions = parse_content_stream(page)
            previous_len = len(page_instructions)

            # VERSION EVALUATION
            page_instructions = remove_tjs_min(page_instructions, "VERSION EVALUATION")
            if len(page_instructions) == previous_len:
                # Fallback: no matching text was found, so drop the last 'f' instruction instead.
                page_instructions = remove_by_reversed_orders(page_instructions, {
                    'f': [0, ],
                })
                message = f"{input_file} at page {page_number} => we used 'f' operator to remove watermark."
                logger.warning(message)
                w_sentry(capture_message, message)

            # Trial - XXX
            page_instructions = remove_tj_maj(page_instructions, "Trial - ")
            page_instructions = remove_tj_maj(page_instructions,
                                              [b'\x007', b'\x00U', b'\x00L', b'\x00D', b'\x00O', b'\x00\x03',
                                               b'\x00\x10', b'\x00\x03'])

            # UnRegistered
            page_instructions = remove_tj_maj(page_instructions, "UnRegistered")

            # save page
            new_content_stream = unparse_content_stream(page_instructions)
            page.Contents = pdf.make_stream(new_content_stream)  # override page contents
        pdf.save(output_file)
    return str(output_file)
def remove_watermark_from_pdf(input_file: Path, output_file: Path, method_choice: MethodChoice = None) -> str:
    """
    Remove watermark from pdf and save to output_file

    With MethodChoice.geos the page content streams are filtered (see
    remove_watermark_from_geos_pdf). Otherwise every image of every page is
    extracted, cleaned with remove_watermark_from_pil_image and written back as
    Flate-compressed 8-bit RGB samples.
    :param input_file: pdf to read
    :param output_file: pdf to write
    :param method_choice: removal method; None falls back to colour replacement
    :return: output_file as a string
    """
    if method_choice == MethodChoice.geos:
        return remove_watermark_from_geos_pdf(input_file, output_file)

    # The context manager closes the source document even if an image fails to process.
    with Pdf.open(input_file) as pdf:
        for page in pdf.pages:
            for image_key in page.images.keys():
                raw_image = page.images[image_key]
                pil_image = PdfImage(raw_image).as_pil_image()
                pil_image = remove_watermark_from_pil_image(pil_image, method_choice)
                # The cleaned image may be RGBA; the raw samples written below must
                # match the colour space declared on the image stream.
                if pil_image.mode != 'RGB':
                    pil_image = pil_image.convert('RGB')
                raw_image.write(zlib.compress(pil_image.tobytes()), filter=Name("/FlateDecode"))
                raw_image.ColorSpace = Name("/DeviceRGB")
                raw_image.BitsPerComponent = 8
        pdf.save(output_file)
    return str(output_file)
def remove_watermark_from_docx(input_file: Path, output_file: Path, method_choice: MethodChoice = None) -> str:
    """
    Remove watermark from docx and save to output_file

    Every entry under word/media/ is opened as an image, cleaned with
    remove_watermark_from_pil_image and re-encoded as PNG (whatever its original
    extension); the archive is then copied to output_file with those entries
    replaced.
    :param input_file: docx to read
    :param output_file: docx to write
    :param method_choice: removal method forwarded to remove_watermark_from_pil_image
    :return: output_file as a string
    """
    replacements = {}
    # The context manager closes the archive even if an image fails to process.
    with zipfile.ZipFile(input_file) as z:
        # get all files in word/media/ directory
        images = [name for name in z.namelist() if name.startswith('word/media/')]
        for image in images:
            with z.open(image) as f:
                pil_image = Image.open(f)
                # pil_image = improve_text_in_image(pil_image)
                pil_image = remove_watermark_from_pil_image(pil_image, method_choice)
                image_file_tmp = io.BytesIO()
                pil_image.save(image_file_tmp, format="PNG")
                replacements[image] = image_file_tmp
    return str(replace_images_in_zip(input_file, output_file, replacements))
def remove_watermark_from_image(input_file: Path, output_file: Path, method_choice: MethodChoice) -> str:
    """
    Remove watermark from image
    :param input_file: image to read
    :param output_file: image to write; the format is chosen from its extension
    :param method_choice: removal method forwarded to remove_watermark_from_pil_image
    :return: output_file as a string
    """
    # Close the source file as soon as the cleaned copy has been produced.
    with Image.open(input_file) as source_image:
        pil_image = remove_watermark_from_pil_image(source_image, method_choice)
    # Colour replacement yields RGBA, which the JPEG writer rejects; drop the
    # alpha/palette before saving to a JPEG target.
    if Path(output_file).suffix.lower() in ('.jpg', '.jpeg') and pil_image.mode in ('RGBA', 'LA', 'P'):
        pil_image = pil_image.convert('RGB')
    pil_image.save(output_file)
    return str(output_file)
def main(input_file: Union[str, Path], output_file: Union[str, Path] = None, method_choice: MethodChoice = None) -> str:
    """
    Entry point: remove the watermark from one pdf, docx, png, jpg or jpeg file.
    :param input_file: file to process
    :param output_file: destination; defaults to <stem>_generated<suffix> next to
        the input. An existing file at that path is replaced.
    :param method_choice: a MethodChoice member, or its name as a string
        ('geos', 'colors_replacement', 'openCV2') as passed from the command line
    :return: path of the written file as a string
    :raises FileNotFoundError: if input_file does not exist
    :raises NotImplementedError: if method_choice is a string that names no method
    :raises ValueError: if the file type is unsupported or output equals input
    """
    input_path = Path(input_file)
    if not input_path.exists():
        raise FileNotFoundError(f"File {input_path} does not exist")

    # The command line delivers the method as text; without this conversion the
    # comparisons against MethodChoice members downstream would never match.
    if isinstance(method_choice, str):
        method_choice = MethodChoice.from_str(method_choice)

    # Validate the type before touching anything on disk.
    suffix = input_path.suffix.lower()
    if suffix not in (".pdf", ".docx", ".png", ".jpg", ".jpeg"):
        raise ValueError(f"Unsupported file type: {input_path.suffix}")

    # generate output path
    output_path = Path(output_file) if output_file else generate_output_path(input_path)

    # Removing the output below would otherwise delete the input itself.
    if output_path.resolve() == input_path.resolve():
        raise ValueError(f"Output file must differ from input file: {input_path}")

    # remove output file if it exists
    if output_path.exists():
        output_path.unlink()

    # remove watermark
    if suffix == ".pdf":
        return remove_watermark_from_pdf(input_path, output_path, method_choice)
    if suffix == ".docx":
        return remove_watermark_from_docx(input_path, output_path, method_choice)
    return remove_watermark_from_image(input_path, output_path, method_choice)
def mmain(input_files: List[Union[str, Path]], output_dir: Union[str, Path] = None,
          method_choice: MethodChoice = None) -> List[str]:
    """
    Entry point for several files: run main() on each input.
    :param input_files: files to process
    :param output_dir: directory receiving the results (str or Path); when omitted
        each result is written next to its own input file
    :param method_choice: removal method forwarded to main()
    :return: written file paths, in the order of input_files
    """
    output_files = []
    for input_file in input_files:
        input_path = Path(input_file)
        # Resolve the target per file so a missing output_dir never pins every
        # result to the first input's directory; Path() also accepts a str.
        target_dir = Path(output_dir) if output_dir else input_path.parent
        output_file = target_dir / f"{input_path.stem}_generated{input_path.suffix}"
        output_files.append(main(input_file, output_file, method_choice))
    return output_files
# When run as a script, expose main() as a command-line interface through python-fire.
if __name__ == "__main__":
    fire.Fire(main)