In [15]:
#---------------------------------------------------------------
# imports
#---------------------------------------------------------------
## conda activate bengali_ai
import os, errno
import cv2
import glob
import shutil
import argparse
from IPython.display import Video, Image
import pytube
from pytube import YouTube 
import zipfile
from paddleocr import PaddleOCR
import sys
sys.path.append('../')
from coreLib.utils import create_dir, is_supported, LOG_INFO, video_to_images, zipdir,\
    calculate_ssim, calculate_hausdorff_distance, inpaintredBox, viz_img_pair,\
         count_matched_bboxes, unique_frames_to_pdf
from coreLib.config import Hparams

### data path

In [16]:

##---------------------------------
# Directory that holds the input video (joined with the video name below);
# the generated .zip archives are also moved here at the end of the notebook.
data_dir        =   "../datas/"
# Video to process. A value accepted by is_supported() is handed to pytube's
# YouTube(); anything else is treated as a file name inside data_dir.
# Alternative local file: "Compiler.mp4" (a presentation slide video)
isYouTube_video =   "video_MLT_pdf.mp4"
# Alternative YouTube source: "https://www.youtube.com/watch?v=k6lCD0iVExo"
# Root directory for the extracted frames and the sub-directories created under it.
imgs_dir        =   "../images/"
# Passed to unique_frames_to_pdf() as the PDF output location.
output_pdf_path =   "../outputs/"
##---------------------------------

In [17]:
## Make sure the frame output directory "imgs_dir" exists
# exist_ok=True keeps this cell safe to re-run. Unlike swallowing EEXIST by hand,
# it still raises when the path exists but is not a directory.
os.makedirs(imgs_dir, exist_ok=True)

In [18]:
 ## youtube video presentations
def _select_fps(duration_sec):
    """Return the frame-sampling rate for a video lasting ``duration_sec`` seconds.

    Longer videos are sampled more sparsely to keep the later filtering stages fast:
    0.5 for 1800 s or more, 0.75 for durations strictly between 1200 s and 1800 s,
    and 1.0 otherwise.
    """
    if duration_sec >= 1800:
        return 0.5
    if duration_sec > 1200:
        return 0.75
    return 1.0


if is_supported(isYouTube_video):
    # Source accepted by is_supported(): fetch it with pytube.
    yt = YouTube(isYouTube_video)
    # this method will download the highest resolution that video is available
    yt_video = yt.streams.get_highest_resolution()
    file_name = yt_video.download()

    # Replace any previous download of the same video inside data_dir.
    os.makedirs(data_dir, exist_ok=True)
    _file_name = os.path.basename(file_name)
    file_exist = os.path.join(data_dir, _file_name)
    if os.path.exists(file_exist):
        os.remove(file_exist)
    shutil.move(file_name, data_dir)
    file_name = _file_name
    filename = file_exist
    LOG_INFO(f"{filename}",mcolor="green") 

    # yt.length is the video duration in seconds.
    fps = _select_fps(yt.length)
else:
    # Local file: expected to live inside data_dir.
    filename = os.path.join(data_dir, isYouTube_video) 
    LOG_INFO(f"{filename}",mcolor="green") 
    video = cv2.VideoCapture(filename)
    try:
        # CAP_PROP_POS_MSEC is the *current position* (0 right after opening), not
        # the duration, so derive the duration from frame count and native frame rate.
        native_fps = video.get(cv2.CAP_PROP_FPS)
        frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT)
    finally:
        video.release()
    # A capture that failed to open reports 0 for both properties; fall back to 0 s.
    length = frame_count / native_fps if native_fps > 0 else 0.0
    fps = _select_fps(length)

[32m#LOG     :[0m[32m../datas/video_MLT_pdf.mp4[0m


### extract key frames (setting fps)

In [19]:
## Controllable Parameters
hp = Hparams(filename, fps)
LOG_INFO(f"{hp.frames_per_second}", mcolor="green")

if hp.embed_video:
    # A bare expression inside an `if` body is not the cell's last expression, so
    # it is never rendered; the player has to be shown with an explicit display().
    from IPython.display import display
    display(Video(filename, embed=True))

## Sample the video at hp.frames_per_second; these frames feed the unique-slide search
frames, images_path = video_to_images(filename, imgs_dir, frames_per_second=hp.frames_per_second)
LOG_INFO(f"Frames Number: {len(frames)}", mcolor="green")
LOG_INFO(f"Images Path: {images_path}", mcolor="green")

# Archive the extracted frames (images.zip in the working directory) so that the
# frame folder itself can be removed later.
with zipfile.ZipFile('images.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipdir(images_path, zipf)

## List the extracted frames, ordered by the number formed from the digits in each file name
images = os.listdir(images_path)
images.sort(key=lambda f: int(''.join(filter(str.isdigit, f))))

[32m#LOG     :[0m[32m1.0[0m
[32m#LOG     :[0m[32mFrames Number: 232[0m
[32m#LOG     :[0m[32mImages Path: ../images/video_MLT_pdf[0m


### Clean annotations from the extracted frames, if any exist

In [20]:
## Make sure "imgs_dir/rmv_annotation" exists
temp_img_dir = os.path.join(imgs_dir, "rmv_annotation")
# exist_ok=True keeps this cell safe to re-run. Unlike swallowing EEXIST by hand,
# it still raises when the path exists but is not a directory.
os.makedirs(temp_img_dir, exist_ok=True)

In [6]:
## Run every extracted key frame through inpaintredBox and store the result
for frame_name in images:
    source_frame = cv2.imread(os.path.join(images_path, frame_name))
    # inpaintredBox is the annotation-removal helper from coreLib.utils
    cleaned_frame = inpaintredBox(source_frame)
    # The result goes to temp_img_dir under the same file name; the original
    # frames in images_path are left untouched.
    cv2.imwrite(os.path.join(temp_img_dir, frame_name), cleaned_frame)


### FILTER - 1 (apply SSIM)

In [7]:
## FILTER - 1: hand each pair of consecutive frames to calculate_ssim
filtered_images_path = create_dir(imgs_dir, 'filtered')
for current_name, next_name in zip(images, images[1:]):
    img1 = cv2.imread(os.path.join(images_path, current_name))
    img2 = cv2.imread(os.path.join(images_path, next_name))
    # calculate_ssim receives the output directory and the current frame's name;
    # what it writes there is decided inside coreLib.utils.
    score = calculate_ssim(hp, img1, img2, filtered_images_path, current_name)

In [8]:
## Frames found in the "filtered" directory, ordered by the number formed from the digits of each name
images = sorted(
    os.listdir(filtered_images_path),
    key=lambda name: int(''.join(ch for ch in name if ch.isdigit())),
)

# Archive the filtered frames as filtered.zip in the working directory
filtered_archive = zipfile.ZipFile('filtered.zip', 'w', zipfile.ZIP_DEFLATED)
with filtered_archive:
    zipdir(filtered_images_path, filtered_archive)

### FILTER - 2 (PaddleOCR: apply deep learning method for detecting unique frames)

In [9]:
## Declare the PaddleOCR detector and (optionally) visualize one frame pair
# Visualization source: https://gist.github.com/mstankie/71e49f628beac320953e0460b8ee78c2
ocr = PaddleOCR(use_angle_cls=True, lang='ar',use_gpu = True)  
if hp.visualize_img_pairs:
    viz_img_pair(ocr=ocr, images=images, filtered_images_path=filtered_images_path, i=4, j=5)

## Detect the unique, informative frames with the PaddleOCR text detector and
## save them in the "unique" folder
unique_images_path = create_dir(imgs_dir, 'unique')
files = os.listdir(filtered_images_path)
files.sort(key=lambda f: int(''.join(filter(str.isdigit, f))))


def _read_rgb(path):
    """Read the image at ``path`` and return it in RGB channel order.

    Returns None (after logging the problem) when the file cannot be read or
    converted, so callers can skip the frame instead of handing bad data to the
    detector.
    """
    bgr = cv2.imread(path)
    if bgr is None:
        LOG_INFO(f"could not read image: {path}", mcolor="red")
        return None
    try:
        return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    except Exception as e:
        LOG_INFO(f"{e}", mcolor="red")
        return None


for i in range(len(files)-1):
    # The reference frame does not change inside the inner loop: read it once.
    img = _read_rgb(os.path.join(filtered_images_path, files[i]))
    if img is None:
        continue

    # Best match count seen so far for frame i; a later frame must reach it to be kept.
    prev = 1
    for j in range(i+1, len(files)):
        img1 = _read_rgb(os.path.join(filtered_images_path, files[j]))
        if img1 is None:
            continue

        # The second return value is not needed here; it is bound to a named
        # variable so that IPython's `_` (last output) is not overwritten.
        count, _minimum = count_matched_bboxes(hp, img, img1, detector=ocr)
        if count >= prev:
            prev = count
            # Frame j is stored under frame i's file name.
            # NOTE(review): img1 is in RGB order at this point while cv2.imwrite
            # expects BGR, so the stored file has red/blue swapped. Left unchanged
            # because later steps may rely on it - confirm against
            # unique_frames_to_pdf before converting back.
            cv2.imwrite(os.path.join(unique_images_path, files[i]), img1)
        else:
            break

## Optionally export the unique frames to PDF before the final filtering step
if hp.withoutfinal_filter:
    unique_frames_to_pdf(hp, unique_images_path, output_pdf_path, True) 


[2023/06/12 18:22:29] ppocr DEBUG: Namespace(alpha=1.0, benchmark=False, beta=1.0, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='/home/rezwan/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, crop_res_save_dir='./output', det=True, det_algorithm='DB', det_box_type='quad', det_db_box_thresh=0.6, det_db_score_mode='fast', det_db_thresh=0.3, det_db_unclip_ratio=1.5, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_limit_side_len=960, det_limit_type='max', det_model_dir='/home/rezwan/.paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer', det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_score_thresh=0.5, draw_img_save_dir='./inference_results', drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model_dir=None, e2e_pgnet_mode='fast', e2e_pgnet_score_thresh=

### FILTER - 3

In [10]:
# final filtering
# At this point a few redundant frames can still remain. They do not always look
# redundant (complex animations and the like), but key features such as a
# mid-to-high bounding-box overlap show that they are. This last pass detects
# and removes them.
images = os.listdir(unique_images_path)
images.sort(key=lambda f: int(''.join(filter(str.isdigit, f))))

for idx in range(len(images) - 1):
    current_path = os.path.join(unique_images_path, images[idx])
    next_path = os.path.join(unique_images_path, images[idx+1])
    image = cv2.imread(current_path)
    image1 = cv2.imread(next_path)
    if image is None or image1 is None:
        # cv2.imread returns None for unreadable files; skip the pair rather than
        # passing None to the comparison helpers.
        LOG_INFO(f"could not read image pair: {current_path}, {next_path}", mcolor="red")
        continue

    if hp.is_ssim:
        score = calculate_ssim(hp, image, image1, unique_images_path, images[idx], write_img=False)
        if score > hp.ssim_threshold:
            # Score above the threshold: drop the earlier frame of the pair.
            os.remove(current_path)
            continue

    try:
        img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        img1 = cv2.cvtColor(image1, cv2.COLOR_BGR2RGB)
    except Exception as e:
        LOG_INFO(f"{e}",mcolor="red") 
        # Without this, the comparison below would run on the arrays left over
        # from the previous iteration.
        continue

    count, minimum = count_matched_bboxes(hp, img, img1, detector=ocr)
    if count >= minimum * hp.conf_thr:
        # Enough matched boxes relative to `minimum`: drop the earlier frame.
        os.remove(current_path)

LOG_INFO(f"Final Filtering Length: {len(os.listdir(unique_images_path))}", mcolor="green") 

[32m#LOG     :[0m[32mFinal Filtering Length: 5[0m


### Save the final filtered PDF

In [11]:
## Save final PDF
# Pass the frames still present in unique_images_path to unique_frames_to_pdf,
# with output_pdf_path as the output location. The optional fourth positional
# argument is left at its default here.
unique_frames_to_pdf(hp, unique_images_path, output_pdf_path)

[32m#LOG     :[0m[32mConverting Images to pdf...[0m
[32m#LOG     :[0m[32mPDF Created![0m
[32m#LOG     :[0m[32mPDF saved at: ../outputs/video_MLT_pdf.mp4.pdf[0m


In [12]:
## Archive the remaining unique frames as unique.zip in the working directory
unique_archive = zipfile.ZipFile('unique.zip', 'w', zipfile.ZIP_DEFLATED)
with unique_archive:
    zipdir(unique_images_path, unique_archive)

### extra: removing frames and moving .zip

In [13]:
## Remove the folders of extracted images when hp.rmdir is set
if hp.rmdir:
    # glob.glob yields imgs_dir only if it exists, so a missing folder is a no-op
    for matched_path in glob.glob(imgs_dir):
        shutil.rmtree(matched_path)

In [14]:
## Move the generated .zip archives from the working directory into "data_dir"
for archive_name in ('images.zip', 'filtered.zip', 'unique.zip'):
    source_archive = f'./{archive_name}'
    if not os.path.exists(source_archive):
        # Nothing to move (for example, this cell was already run). Skip before
        # touching data_dir so that a previously moved archive is not deleted.
        LOG_INFO(f"{source_archive} not found, skipping", mcolor="red")
        continue

    # Replace any archive of the same name left in data_dir by an earlier run.
    _file_exist = os.path.join(data_dir, archive_name)
    if os.path.exists(_file_exist):
        os.remove(_file_exist)
    shutil.move(source_archive, data_dir)

'../datas/unique.zip'