# Youtube Ad Skipper

In [1]:
# Optical Character Recognition (OCR) Library Install
#1. Install GTK for Windows- https://github.com/tschoonj/GTK-for-Windows-Runtime-Environment-Installer/releases
#2. Create a new venv
# DocTR for PyTorch
#3.pip install "python-doctr[torch]" pyautogui
#Ref: https://github.com/mindee/doctr/issues/701

import os


def prepend_to_path(folder):
    """Put ``folder`` at the front of the PATH environment variable exactly once.

    Any existing PATH entry equal to ``folder`` (compared with ``os.path.normcase``) is
    dropped before the folder is prepended, so re-running this notebook cell does not keep
    growing PATH with duplicate entries. All other entries keep their relative order.

    Args:
        folder: Directory to place first on PATH.
    """
    normalized_folder = os.path.normcase(folder)
    other_entries = [
        entry
        for entry in os.environ.get('PATH', '').split(os.pathsep)
        if os.path.normcase(entry) != normalized_folder
    ]
    os.environ['PATH'] = os.pathsep.join([folder] + other_entries)


# Location of the GTK runtime binaries installed in step 1 of the instructions above.
GTK_FOLDER = r'C:\Program Files\GTK3-Runtime Win64\bin'
prepend_to_path(GTK_FOLDER)

# Environment flags set before docTR is imported in the next cell.
os.environ['USE_TORCH'] = '1'
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

In [2]:
# Import needed libraries
import time
import datetime
import pyautogui
from PIL import Image
import cv2
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
import torch

In [3]:
# Keep PyAutoGUI's fail-safe mode switched on: when FAILSAFE is True, moving the mouse to the
# upper-left corner of the screen aborts the program. This is the manual way to stop the
# automation while it is controlling the mouse.
pyautogui.FAILSAFE = True

In [4]:
# Load the OCR model from docTR: https://mindee.github.io/doctr/using_doctr/using_models.html
# Alternative architecture pair, kept for reference:
#docTR_ocr_model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)

# Half-precision (FP16) is a binary floating-point format that occupies 16 bits in memory.
## Advantages: faster inference, less memory usage
### Ref: https://mindee.github.io/doctr/v0.7.0/using_doctr/using_model_export.html
docTR_ocr_model = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)

if torch.cuda.is_available():
    # Move the model to the GPU and convert its weights to FP16.
    docTR_ocr_model = docTR_ocr_model.cuda().half()
else:
    # Without a CUDA device `.cuda()` raises, so keep the model on the CPU in its default
    # precision instead of failing the whole notebook.
    print("CUDA is not available: running the OCR model on CPU in full precision (slower).")

# Name of the automation flow; it becomes the sub-folder used for this flow's logs.
flow_name = "yt_ad_skip"  # get from config yaml

# Root folders: captured screenshots are the flow inputs, cropped segments are the flow outputs.
screenshot_store_path_prefix = "./logs/logs_flow_input/"
scr_segments_store_path_prefix = "./logs/logs_flow_output/"

# Per-flow folders underneath each root.
flow_screenshot_store_path = os.path.join(screenshot_store_path_prefix, flow_name)
flow_scr_segments_store_path = os.path.join(scr_segments_store_path_prefix, flow_name)

# Announce and create each folder in turn (inputs first, then outputs).
# Folders that already exist are left as they are.
for announcement, log_directory in (
    ("Creating Directory for Inputs:", flow_screenshot_store_path),
    ("Creating Directory for Outputs:", flow_scr_segments_store_path),
):
    print(announcement, log_directory)
    os.makedirs(log_directory, exist_ok=True)

Creating Directory for Inputs: ./logs/logs_flow_input/yt_ad_skip
Creating Directory for Outputs: ./logs/logs_flow_output/yt_ad_skip


## Get Youtube Screenshots every second. Check if "Skip" button is present and click on it

Note: OCR takes ~4 seconds on CPU and ~1 second on a GPU with FP16 precision. Depending on the hardware used to run the program, this demo may therefore not satisfy the constraint of taking an action every second.
- Run the OCR pipeline on a CUDA GPU to achieve that goal.
- Current pipeline can be used as-is for RPA tasks for which few seconds of latency is acceptable

In [8]:
# For the demo: wait until the skip button appears on the YouTube player, then run this cell
# to trigger the automation. The flow runs for this many steps (one screenshot per step).
timeInSec_run_flow_step=1

# For the demo: time to wait after the user triggers this cell, so that they can switch the
# active window to the YouTube player and afterwards verify whether the click was successful.
timeInSec_manually_switch_youtube_window=10
time.sleep(timeInSec_manually_switch_youtube_window)

# Keyword searched for in the OCR output; matched case-insensitively as a substring of a word.
search_keyword= "skip"

# Report the device and precision the model is really running on instead of assuming CUDA FP16.
ocr_model_param = next(docTR_ocr_model.parameters())
ocr_precision_label = {torch.float16: "FP16", torch.float32: "FP32"}.get(ocr_model_param.dtype, str(ocr_model_param.dtype))
ocr_backend_label = f"{ocr_model_param.device.type.upper()} {ocr_precision_label}"

# Screen size in the coordinate system used by pyautogui.click(). On scaled (HiDPI) displays
# this can differ from the pixel size of the screenshot, so click positions are rescaled below.
screen_width, screen_height = pyautogui.size()


def relative_box_to_pixels(geometry, page_height, page_width):
    """Convert a relative word box into absolute pixel coordinates.

    Args:
        geometry: Word geometry from the exported OCR result, indexed as
            ``[[xmin, ymin], [xmax, ymax]]`` with values relative to the page size.
        page_height: Page height in pixels.
        page_width: Page width in pixels.

    Returns:
        ``[xmin, ymin, xmax, ymax]`` as integers, clamped to the page so the values are always
        valid (non-negative, in-range) slice bounds for the screenshot image.
    """
    def to_pixel(relative_value, page_extent):
        return min(max(int(round(relative_value * page_extent)), 0), page_extent)

    return [
        to_pixel(geometry[0][0], page_width),
        to_pixel(geometry[0][1], page_height),
        to_pixel(geometry[1][0], page_width),
        to_pixel(geometry[1][1], page_height),
    ]


for step_num in range(timeInSec_run_flow_step):
    time.sleep(1)
    curr_scr_shot_filename = os.path.join(flow_screenshot_store_path, str(step_num) + '.png')
    # Passing a filename makes PyAutoGUI save the screenshot to disk.
    pyautogui.screenshot(curr_scr_shot_filename)

    # Perform OCR on the screenshot; time it with a monotonic clock.
    ocr_start_time = time.perf_counter()
    curr_screenshot_doc = DocumentFile.from_images(curr_scr_shot_filename)
    ocr_result = docTR_ocr_model(curr_screenshot_doc)
    ocr_time_duration_milli_sec = round((time.perf_counter() - ocr_start_time) * 1000)
    print(f"OCR on the Screenshot is Complete. Time taken for OCR with {ocr_backend_label} is {ocr_time_duration_milli_sec} ms.")

    # Export the OCR results and flatten the first (only) page into a list of words.
    json_output = ocr_result.export()
    first_page = json_output['pages'][0]
    # Page dimensions are used as (height, width): x scales with [1], y with [0].
    page_height, page_width = first_page['dimensions']
    all_words_ocr = [word for block in first_page['blocks'] for line in block['lines'] for word in line['words']]

    # Word coordinates in absolute pixels, format [xmin, ymin, xmax, ymax].
    all_words_coords = [relative_box_to_pixels(word['geometry'], page_height, page_width) for word in all_words_ocr]

    # Create directory for current_step outputs if not exists
    curr_step_directory = "step_"+str(step_num)
    flow_curr_step_scr_segments_path= os.path.join(flow_scr_segments_store_path, curr_step_directory)
    print("Creating Directory for Current Step Outputs:",flow_curr_step_scr_segments_path)
    os.makedirs(flow_curr_step_scr_segments_path, exist_ok=True)

    # cv2.imread returns None instead of raising when it cannot read the file. The crops are
    # only debugging artifacts, so a failed read must not prevent the click.
    curr_screenshot_image = cv2.imread(curr_scr_shot_filename)
    if curr_screenshot_image is None:
        print("Could not read the screenshot with OpenCV; skipping word crops for:", curr_scr_shot_filename)

    keyword_match_locs= []
    for word_ind, (curr_word, curr_word_location) in enumerate(zip(all_words_ocr, all_words_coords)):
        if curr_screenshot_image is not None:
            x1, y1, x2, y2 = curr_word_location
            cropped_image = curr_screenshot_image[y1:y2, x1:x2]
            # A degenerate box yields an empty crop, which cv2.imwrite refuses to write.
            if cropped_image.size > 0:
                cropped_image_filename = os.path.join(flow_curr_step_scr_segments_path, str(word_ind) + '.png')
                cv2.imwrite(cropped_image_filename, cropped_image)

        if search_keyword in curr_word['value'].lower():
            keyword_match_locs.append(curr_word_location)

    # To-do: Multiple matches of Keyword is not handled currently, so only click when the
    # match is unambiguous and say why nothing was clicked otherwise.
    if len(keyword_match_locs) != 1:
        print(f"Expected exactly one match for the keyword '{search_keyword}' but found {len(keyword_match_locs)}; not clicking.")
        continue

    keyword_location= keyword_match_locs[0]
    print("Found the keyword- ", search_keyword, "at: ", keyword_location)
    xmin, ymin, xmax, ymax= keyword_location
    cx = (xmin + xmax) // 2
    cy = (ymin + ymax) // 2

    # Map the screenshot pixel position onto screen coordinates (a no-op when both sizes
    # match), then move the mouse there and click.
    click_x = round(cx * screen_width / page_width)
    click_y = round(cy * screen_height / page_height)
    pyautogui.click(click_x, click_y)
    

OCR on the Screenshot is Complete. Time taken for OCR with CUDA FP16 is 863 ms.
Creating Directory for Current Step Outputs: ./logs/logs_flow_output/yt_ad_skip\step_0
Found the keyword-  skip at:  [1146, 459, 1189, 482]


In [None]:
# Uncomment to see the OCR Results with Bounding Boxes for all Detections
#ocr_result.show()