-
Notifications
You must be signed in to change notification settings - Fork 2
image_quality_check #647
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
image_quality_check #647
Changes from all commits
d6b163c
88f199c
5ecde02
52d1afe
e783ed9
5cc96ec
456e238
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,7 +7,7 @@ | |
| from pymongo import MongoClient | ||
| from utils import convert_to | ||
|
|
||
| from .db_types import User, Presentation, Check, Consumers, Logs | ||
| from .db_types import User, Presentation, Check, Consumers, Logs, Image | ||
|
|
||
| client = MongoClient("mongodb://mongodb:27017") | ||
| db = client['pres-parser-db'] | ||
|
|
@@ -21,11 +21,32 @@ | |
| logs_collection = db.create_collection( | ||
| 'logs', capped=True, size=5242880) if not db['logs'] else db['logs'] | ||
| celery_check_collection = db['celery_check'] # collection for mapping celery_task to check | ||
| images_collection = db['images'] # коллекция для хранения изображений | ||
|
|
||
|
|
||
def get_client():
    """Return the module-level ``MongoClient`` instance created at import time."""
    return client
|
|
||
def get_images(check_id):
    """Return every stored image of a check as a list of ``Image`` objects.

    Queries ``images_collection`` for documents whose ``check_id`` field
    equals ``str(check_id)`` and wraps each document in ``Image``.

    Args:
        check_id: Identifier of the check; converted with ``str()`` before
            it is used in the query.

    Returns:
        list: ``Image`` wrappers for the matching documents, or an empty
        list when no document matches.
    """
    # ``Collection.find`` always returns a cursor (never ``None``), so the
    # previous ``is not None`` guard and its ``return None`` branch were
    # unreachable; callers always received a list.
    # NOTE(review): save_image_to_db passes ``check_id`` to ``Image`` without
    # ``str()``; unless ``Image`` normalises it, confirm the stored type
    # actually matches this string query.
    return [
        Image(document)
        for document in images_collection.find({'check_id': str(check_id)})
    ]
|
|
||
| def save_image_to_db(check_id, image_data, caption, image_size): | ||
| image = Image({ | ||
| 'check_id': check_id, | ||
| 'image_data': image_data, | ||
| 'caption': caption, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. откуда берется подпись / что она из себя представляет? есть ли у всех изображений - или это подпись "Рис. 2 - ...", которую кто-то может не указать? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Да, это подпись "Рис. 2 - ..." |
||
| 'image_size': image_size | ||
| }) | ||
| images_collection.insert_one(image.pack()) | ||
| print(str(check_id) + " " + str(caption)) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Удалите отладочные комментарии |
||
|
|
||
|
|
||
| # Returns user if user was created and None if already exists | ||
| def add_user(username, password_hash='', is_LTI=False): | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| from ..base_check import BaseReportCriterion, answer | ||
| import cv2 | ||
| import numpy as np | ||
|
|
||
| class ImageQualityCheck(BaseReportCriterion): | ||
| label = "Проверка качества изображений" | ||
| description = '' | ||
| id = 'image_quality_check' | ||
| # необходимо подобрать min_laplacian и min_entropy | ||
def __init__(self, file_info, min_laplacian=10, min_entropy=1):
    """Set up the image quality criterion.

    Args:
        file_info: Passed unchanged to the base criterion constructor.
        min_laplacian: Lowest accepted Laplacian score for an image
            (the default still needs tuning).
        min_entropy: Lowest accepted entropy score for an image
            (the default still needs tuning).
    """
    super().__init__(file_info)
    # Images are taken from the parsed file object exposed as ``self.file``.
    self.images = self.file.images
    self.min_laplacian, self.min_entropy = min_laplacian, min_entropy
    # Scores of the image currently being inspected; filled by find_params.
    self.laplacian_score = self.entropy_score = None
|
|
||
| def check(self): | ||
| deny_list = [] | ||
| if self.images: | ||
| for img in self.images: | ||
| image_array = np.frombuffer(img.image_data, dtype=np.uint8) | ||
| img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR) | ||
|
|
||
| if img_cv is None: | ||
| deny_list.append(f"Изображение с подписью '{img.caption}' не может быть обработано.<br>") | ||
| continue | ||
|
|
||
| self.find_params(img_cv) | ||
|
|
||
| if self.laplacian_score is None or self.entropy_score is None: | ||
| deny_list.append(f"Изображение с подписью '{img.caption}' не может быть обработано.<br>") | ||
| continue | ||
|
|
||
| if self.laplacian_score < self.min_laplacian: | ||
| deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкий показатель лапласиана: {self.laplacian_score} (минимум {self.min_laplacian}).<br>") | ||
|
|
||
| if self.entropy_score < self.min_entropy: | ||
| deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкую энтропию: {self.entropy_score} (минимум {self.min_entropy}).<br>") | ||
|
Comment on lines
+35
to
+39
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Рядовой студент точно не будет знать (или гуглить), что такое лапласиан и энтропия - возможно, стоит сделать пояснение (= пользователь должен понять, что и как ему исправить) |
||
| else: | ||
| return answer(False, 'Изображения не найдены!') | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Гипотетически работа может быть без рисунков (и вроде как это не будет нарушением) |
||
| if deny_list: | ||
| return answer(False, f'Изображения нечитаемы! <br>{"".join(deny_list)}') | ||
| else: | ||
| return answer(True, 'Изображения корректны!') | ||
|
|
||
def find_params(self, image):
    """Compute the Laplacian-variance and entropy scores of an image.

    The results are stored in ``self.laplacian_score`` and
    ``self.entropy_score`` and also returned.

    Args:
        image: Decoded BGR image array (it is converted with
            ``cv2.COLOR_BGR2GRAY``), or ``None``.

    Returns:
        tuple: ``(laplacian_score, entropy_score)``; ``(None, None)`` when
        ``image`` is ``None`` or has no pixels.
    """
    # Reset first so scores of a previously processed image can never be
    # mistaken for the scores of this one when we bail out early.
    self.laplacian_score = None
    self.entropy_score = None
    if image is None or image.size == 0:
        return None, None

    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Variance of the Laplacian response of the grayscale image.
    self.laplacian_score = cv2.Laplacian(gray_image, cv2.CV_64F).var()

    # Shannon entropy (base 2) of the 256-bin intensity histogram; the small
    # epsilon keeps log2 finite for empty bins.
    hist, _ = np.histogram(gray_image.ravel(), bins=256, range=(0, 256))
    probabilities = hist / hist.sum()
    self.entropy_score = -np.sum(probabilities * np.log2(probabilities + 1e-10))
    return self.laplacian_score, self.entropy_score
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,18 +8,39 @@ | |
| from main.reports.md_uploader import MdUploader | ||
| from utils import convert_to | ||
|
|
||
| logger = logging.getLogger('root_logger') | ||
| from os.path import basename | ||
| from app.db.db_methods import add_check | ||
| from app.db.db_types import Check | ||
|
|
||
| logger = logging.getLogger('root_logger') | ||
|
|
||
| def parse(filepath, pdf_filepath): | ||
| from app.db.db_methods import files_info_collection | ||
|
|
||
| tmp_filepath = filepath.lower() | ||
| try: | ||
| if tmp_filepath.endswith(('.odp', '.ppt', '.pptx')): | ||
| new_filepath = filepath | ||
| if tmp_filepath.endswith(('.odp', '.ppt')): | ||
| logger.info(f"Презентация {filepath} старого формата. Временно преобразована в pptx для обработки.") | ||
| new_filepath = convert_to(filepath, target_format='pptx') | ||
| file_object = PresentationPPTX(new_filepath) | ||
|
|
||
| presentation = PresentationPPTX(new_filepath) | ||
|
|
||
| check = Check({ | ||
| 'filename': basename(new_filepath), | ||
| }) | ||
|
|
||
| file_id = 0 | ||
| file = files_info_collection.find_one({'name': basename(new_filepath)}) | ||
| if file: | ||
| file_id = file['_id'] | ||
|
|
||
| check_id = add_check(file_id, check) | ||
| presentation.extract_images_with_captions(check_id) | ||
| file_object = presentation | ||
|
Comment on lines
+30
to
+41
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Не совсем понимаю происходящее, но проверки по имени файла (и затем поиск её по нему) - не подходит, поскольку студенты могут загружать файл с одним и тем же названием по несколько раз (при этом содержимое разное) |
||
|
|
||
|
|
||
| elif tmp_filepath.endswith(('.doc', '.odt', '.docx', )): | ||
| new_filepath = filepath | ||
| if tmp_filepath.endswith(('.doc', '.odt')): | ||
|
|
@@ -28,7 +49,19 @@ def parse(filepath, pdf_filepath): | |
|
|
||
| docx = DocxUploader() | ||
| docx.upload(new_filepath, pdf_filepath) | ||
|
|
||
| check = Check({ | ||
| 'filename': basename(new_filepath), | ||
| }) | ||
|
|
||
| file_id = 0 | ||
| file = files_info_collection.find_one({'name': basename(new_filepath)}) | ||
| if file: | ||
| file_id = file['_id'] | ||
|
|
||
| check_id = add_check(file_id, check) | ||
| docx.parse() | ||
| docx.extract_images_with_captions(check_id) | ||
| file_object = docx | ||
|
Comment on lines
+53
to
65
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. избавьтесь от дублирования кода (с блоком выше) |
||
|
|
||
| elif tmp_filepath.endswith('.md' ): | ||
|
|
@@ -54,4 +87,4 @@ def save_to_temp_file(file): | |
| temp_file.write(file.read()) | ||
| temp_file.close() | ||
| file.seek(0) | ||
| return temp_file.name | ||
| return temp_file.name | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -242,6 +242,71 @@ def show_chapters(self, work_type): | |
| chapters_str += " " + header["text"] + "<br>" | ||
| return chapters_str | ||
|
|
||
| def extract_images_with_captions(self, check_id): | ||
| from app.db.db_methods import save_image_to_db, get_images | ||
|
|
||
| emu_to_cm = 360000 | ||
| image_found = False | ||
| image_data = None | ||
| if not self.images: | ||
| # Проход по всем параграфам документа | ||
| for i, paragraph in enumerate(self.file.paragraphs): | ||
| width_emu = None | ||
| height_emu = None | ||
| # Проверяем, есть ли в параграфе встроенные объекты | ||
| for run in paragraph.runs: | ||
| if "graphic" in run._element.xml: # может быть изображение | ||
|
|
||
| # Извлечение бинарных данных изображения | ||
| image_streams = run._element.findall('.//a:blip', namespaces={ | ||
| 'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Правильно ли я понимаю, что 2006 тут и дальше - это год какого-то стандарта? не может ли в документе быть другого? (и вдруг мы что-то пропустим) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Да, 2006 в пространствах имен указывает на год, когда были определены стандарты Office Open XML. |
||
| for image_stream in image_streams: | ||
| embed_id = image_stream.get( | ||
| '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed') | ||
| if embed_id: | ||
| image_found = True | ||
| image_part = self.file.part.related_parts[embed_id] | ||
| image_data = image_part.blob | ||
| extent = run._element.find('.//wp:extent', namespaces={ | ||
| 'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}) | ||
| if extent is not None: | ||
| width_emu = int(extent.get('cx')) | ||
| height_emu = int(extent.get('cy')) | ||
| width_cm = width_emu / emu_to_cm | ||
| height_cm = height_emu / emu_to_cm | ||
| # Если мы уже нашли изображение, ищем следующий непустой параграф для подписи | ||
| if image_found: | ||
| # Переход к следующему параграфу | ||
| next_paragraph_index = i + 1 | ||
|
|
||
| # Проверяем, есть ли следующий параграф | ||
| if next_paragraph_index < len(self.file.paragraphs): | ||
| while next_paragraph_index < len(self.file.paragraphs): | ||
| next_paragraph = self.file.paragraphs[next_paragraph_index] | ||
| next_paragraph_text = next_paragraph.text.strip() | ||
|
|
||
| # Проверка, не содержит ли следующий параграф также изображение | ||
| contains_image = any( | ||
| "graphic" in run._element.xml for run in next_paragraph.runs | ||
| ) | ||
|
|
||
| # Если параграф не содержит изображения и текст не пуст, то это подпись | ||
| if not contains_image and next_paragraph_text: | ||
| # Сохраняем изображение и его подпись | ||
| save_image_to_db(check_id, image_data, next_paragraph_text, (width_cm, height_cm)) | ||
| break | ||
| else: | ||
| save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm)) | ||
| break | ||
| else: | ||
| save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm)) | ||
|
|
||
| image_found = False # Сброс флага, чтобы искать следующее изображение | ||
| image_data = None # Очистка данных изображения | ||
| self.images = get_images(check_id) | ||
|
|
||
|
|
||
|
|
||
|
|
||
| def main(args): | ||
| file = args.file | ||
|
|
||
HadronCollider marked this conversation as resolved.
Show resolved
Hide resolved
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,3 +35,4 @@ filetype==1.2.0 | |
| language-tool-python==2.8.1 | ||
| markdown==3.4.4 | ||
| md2pdf==1.0.1 | ||
| opencv-python==4.5.5.64 | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
возможно, полезно будет сохранять ещё какую-то информацию об изображении (страница, где она расположена или другая информация, которая поможет найти это изображение в тексте?)