In [1]:
import numpy as np
import xgboost as xgb
import torch

In [2]:
import cv2
import mediapipe as mp
from math import hypot
import numpy as np
import pygetwindow as gw
import pyautogui
import time

In [3]:
model = torch.jit.load("cnn_embedder_ts.pt", map_location="cpu")
model.eval()

RecursiveScriptModule(
  original_name=CNNEmbedder
  (features): RecursiveScriptModule(
    original_name=Sequential
    (0): RecursiveScriptModule(original_name=Conv2d)
    (1): RecursiveScriptModule(original_name=ReLU)
    (2): RecursiveScriptModule(original_name=MaxPool2d)
    (3): RecursiveScriptModule(original_name=Conv2d)
    (4): RecursiveScriptModule(original_name=ReLU)
    (5): RecursiveScriptModule(original_name=MaxPool2d)
  )
  (embed): RecursiveScriptModule(original_name=Linear)
  (classifier): RecursiveScriptModule(original_name=Linear)
)

In [4]:
xgbc = xgb.XGBClassifier()
xgbc.load_model("xgb_embedder.json") 

In [5]:
def predict_from_list(matrix_list):
    arr = np.array(matrix_list, dtype=np.float32)
    if arr.ndim == 2:
        arr = arr[np.newaxis,...]
    tensor = torch.from_numpy(arr).unsqueeze(1)  # (N,1,H,W)
    with torch.no_grad():
        _, emb = model(tensor)
    emb_np = emb.numpy()
    return xgbc.predict(emb_np)

In [11]:
# --- MediaPipe setup ---
mpHands = mp.solutions.hands
hands = mpHands.Hands(
    static_image_mode=False,
    model_complexity=1,
    min_detection_confidence=0.75,
    min_tracking_confidence=0.75,
    max_num_hands=1
)
Draw = mp.solutions.drawing_utils

# --- Gesture thresholds (in pixels) -- 
up_val = 10    # below this = “pinch” → scroll down
down_val  = 10    # above this = “unpinch” → scroll up
DEBOUNCE_MS = 500   # ignore new scrolls for this many ms

last_scroll_time = 0

# --- Camera setup ---
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("Could not open webcam")

def find_pdf_window():
    """Return the first window ending with '.pdf', or None."""
    for w in gw.getAllTitles():
        if w and '.pdf' in w.lower():
            print("window found")
            return gw.getWindowsWithTitle(w)[0]
    return None

# wait until user opens a PDF
print("Waiting for a PDF window to appear…")
pdf_win = None
while pdf_win is None:
    pdf_win = find_pdf_window()
    time.sleep(0.5)
print(f"Found PDF: '{pdf_win.title}'")
pdf_win.activate()  # bring it to front
print("--- Main loop ---")
# --- Main loop ---
p1_0 = 0
p2_0 = 0
pts_0 = np.zeros((21,6))
t0 = time.time()
pts = np.zeros((21,6))
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.flip(frame, 1)
        frameRGB = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        result = hands.process(frameRGB)
        
        if result.multi_hand_landmarks:
            handlm = result.multi_hand_landmarks[0]
            h, w, _ = frame.shape
            t1 = time.time()
            for idx, lm in enumerate(handlm.landmark):
                x_px, y_px = int(lm.x * w), int(lm.y * h)
                row_to_add = np.array([idx, x_px, y_px,t1 - t0,0,0])
                pts[int(idx)] = row_to_add
                
            Draw.draw_landmarks(frame, handlm, mpHands.HAND_CONNECTIONS)
            t0  = t1
            points = pts.copy()
            points[:,1:3] = pts[:,1:3] - pts_0[:,1:3]
            points[:,4] = points[:,1] / points[:,3]
            points[:,5] = points[:,2] / points[:,3]
            
            pred = predict_from_list(points)
            pts_0 = points.copy()
            print(pred)
            now = time.time() * 1000
            if now - last_scroll_time > DEBOUNCE_MS:
                if pred == 2:
                    pyautogui.scroll(-300)   # scroll down
                    last_scroll_time = now
                    print('down')
                elif pred == 1:
                    pyautogui.scroll(300)    # scroll up
                    last_scroll_time = now
                    print('up')

        cv2.imshow('PDF Scroll Control', frame)
        if cv2.waitKey(1) & 0xFF == 27:
            break

finally:
    cap.release()
    cv2.destroyAllWindows()


Waiting for a PDF window to appear…
window found
Found PDF: 'IET Microwaves Antenna   Prop - 2017 - Catarinucci - Microwave characterisation of polylactic acid for 3D‐printed.pdf - WPS Office'
--- Main loop ---
[0]
[2]
down
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[0]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]
[0]
[1]
[0]
[1]
[0]
[1]
up
[0]
[1]


KeyboardInterrupt: 

In [8]:
po[:,1:3] = pts[:,1:3] - pts_0[:,1:3]

NameError: name 'po' is not defined

In [None]:
p = np.array([1,2,3,5,3,4,5,34,35,45,4,3,46,457,56,86,78,678,3412,43,42,35,346,56,768,76,978,9,78,45,23,523,5,34,453,576,8,769,768,978,97,8,567,46,7,8,9,0])
p = p.reshape(6,8)
p

array([[   1,    2,    3,    5,    3,    4,    5,   34],
       [  35,   45,    4,    3,   46,  457,   56,   86],
       [  78,  678, 3412,   43,   42,   35,  346,   56],
       [ 768,   76,  978,    9,   78,   45,   23,  523],
       [   5,   34,  453,  576,    8,  769,  768,  978],
       [  97,    8,  567,   46,    7,    8,    9,    0]])

In [None]:
p[:,6]

array([  5,  56, 346,  23, 768,   9])