In [None]:
!pip install requests pandas
!pip install feedparser
!pip install pyoai
!pip install opencv-python imagehash Pillow



Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m81.3/81.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=abf1694b5121250cc00a953112a715daeeea3aac932ae04be084d49e3b174e1c
  Stored in directory: /root/.cache/pip/wheels/3b/25/2a/105d6a15df6914f4d15047691c6c28f9052cc1173e40285d03
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0


In [None]:
import requests
from lxml import etree
from time import sleep
import re
from collections import defaultdict

def get_year(date_str):
    """Extract the first plausible year (1800-2100) from a free-form date string.

    Patterns are tried in priority order. Within one pattern every match is
    examined, so an implausible four-digit number early in the string does not
    hide a valid year that appears later.

    Args:
        date_str: Text that may contain a year, e.g. "1935-05-12",
            "1930-1940" or "circa 1942". May be None or empty.

    Returns:
        The year as an int, or None when no candidate within 1800..2100 exists.
    """
    if not date_str:
        return None

    # Each pattern captures the COMPLETE four-digit year in group 1. The century
    # alternation is non-capturing; capturing it would yield only "18"/"19"/"20".
    patterns = [
        r'\b((?:18|19|20)\d{2})\b',  # standalone YYYY in a plausible century
        r'(\d{4})-\d{2}-\d{2}',      # YYYY-MM-DD
        r'(\d{4})-\d{4}',            # YYYY-YYYY range (start year)
        r'[cC]irca\s(\d{4})',        # Circa YYYY
        r'\b(\d{4})\b',              # any standalone four-digit number
    ]

    for pattern in patterns:
        for match in re.finditer(pattern, date_str):
            year = int(match.group(1))
            if 1800 <= year <= 2100:  # Sanity check
                return year
    return None

def count_vehicle_terms(texts, term_counts):
    """Tally which vehicle terms appear (as substrings) in the given texts.

    For every text, each vehicle term contained in its lower-cased form
    increments ``term_counts[term]`` by one (once per text, not once per
    occurrence).

    Args:
        texts: Iterable of strings to inspect; may be empty or None.
        term_counts: Mutable mapping updated in place with ``+= 1``
            (the callers pass a ``defaultdict(int)``).

    Returns:
        True when at least one term was found in at least one text, else False.
    """
    if not texts:
        return False

    vehicle_terms = (
        'auto', 'voertuig', 'wagen', 'kar',
        'vliegtuig', 'schip', 'schepen',
        'vaartuig', 'schuit', 'motor', 'fiets',
    )

    matched = False
    for raw in texts:
        haystack = raw.lower()
        # Keep the term order of the tuple so counters are created in the same order.
        hits = [candidate for candidate in vehicle_terms if candidate in haystack]
        for candidate in hits:
            term_counts[candidate] += 1
        matched = matched or bool(hits)
    return matched

def parse_resolution(resolution_str):
    """Parse a resolution string into a ``(width, height)`` tuple in pixels.

    Recognised forms include "1920x1080", "1920\u00d71080", "1920*1080",
    "1920x1080p", "1920x1080 pixels" and a bare "1080p" (for which a 16:9
    aspect ratio is assumed to derive the width).

    Args:
        resolution_str: Text that may contain a resolution; may be None/empty.

    Returns:
        ``(width, height)`` as ints when a pattern matches and both dimensions
        are at least 160 pixels, otherwise None.
    """
    if not resolution_str:
        return None

    # Every pattern uses the named groups read below. The multiplication sign is
    # written as an escape (\u00d7) so the pattern survives encoding mishaps.
    patterns = [
        r'(?P<width>\d{3,})\s*[x\u00d7]\s*(?P<height>\d{3,})(?!\d)',     # 1920x1080
        r'(?P<width>\d{3,})\s*[x\u00d7]\s*(?P<height>\d{3,})\s*[pP]',    # 1920x1080p
        r'(?P<height>\d{3,})\s*[pP]',                                    # 1080p
        r'(?P<width>\d{3,})\s*[x\u00d7]\s*(?P<height>\d{3,})\s*pixels',  # 1920x1080 pixels
        r'(?P<width>\d{3,})\s*[*x\u00d7]\s*(?P<height>\d{3,})',          # 1920*1080
    ]

    # Both dimensions must reach this value, so e.g. 160x120 is rejected.
    MIN_RESOLUTION = 160

    for pattern in patterns:
        match = re.search(pattern, resolution_str)
        if not match:
            continue

        groups = match.groupdict()
        height = int(groups['height'])
        if 'width' in groups:
            width = int(groups['width'])
        else:
            width = int(height * 16 / 9)  # Assume 16:9 if only height given

        # Validate resolution makes sense; otherwise try the next pattern
        if width >= MIN_RESOLUTION and height >= MIN_RESOLUTION:
            return (width, height)
    return None

def get_pixel_count(resolution):
    """Return width * height for a resolution, or infinity when none is given.

    Args:
        resolution: Sequence whose first two items are width and height, or a
            falsy value (None, empty tuple) when the resolution is unknown.

    Returns:
        The product of the first two items, or ``float('inf')`` for a falsy input.
    """
    if not resolution:
        return float('inf')
    width, height = resolution[0], resolution[1]
    return width * height

def main():
    """Scan the Open Beelden OAI-PMH feed and print statistics for 1930-1949 records.

    Pages through ``ListRecords`` with resumption tokens. For every record it
    tracks the lowest resolution that can be parsed from the metadata, extracts
    a year, and for records dated 1930-1949 counts vehicle terms in the title,
    description, subject and coverage fields. Scanning stops when the feed is
    exhausted or after ``MAX_UNCHANGED_BATCHES`` consecutive batches without a
    new vehicle match. Any error ends the scan; the report printed afterwards
    covers whatever was processed up to that point.
    """
    base_url = 'https://www.openbeelden.nl/feeds/oai/'
    namespaces = {
        'oai': 'http://www.openarchives.org/OAI/2.0/',
        'dc': 'http://purl.org/dc/elements/1.1/',
        'ebucore': 'urn:ebu:metadata-schema:ebucore'
    }
    # Seconds before a stalled request is abandoned; without it a hung
    # connection would block the scan indefinitely.
    REQUEST_TIMEOUT = 30

    # Counters
    total_records = 0
    matching_records = 0
    no_date_count = 0
    alt_date_matches = 0
    malformed_dates = 0
    vehicle_matches = 0
    term_counts = defaultdict(int)

    # Resolution tracking
    lowest_resolution = None
    lowest_resolution_record = None
    lowest_resolution_field = None

    # Auto-stop configuration
    MAX_UNCHANGED_BATCHES = 5
    unchanged_batches = 0
    last_vehicle_count = 0

    vehicle_check_fields = [
        'dc:title', 'dc:description',
        'dc:subject', 'dc:coverage'
    ]

    # XPath expressions probed (in order) for a resolution string
    resolution_fields_to_check = [
        './/ebucore:width/text()',  # EBUCore standard
        './/ebucore:height/text()',
        './/dc:format/text()',      # Common format field
        './/dc:description[contains(., "resolution")]/text()'
    ]

    # Fields consulted for a year when dc:date yields none
    alt_date_fields = ['dc:coverage', 'dc:temporal', 'dc:dateAccepted', 'dc:dateCopyrighted']

    params = {'verb': 'ListRecords', 'metadataPrefix': 'oai_dc', 'set': 'openimages'}

    print("Starting enhanced collection scan...")
    print("Now checking for videos from 1930-1949 that contain vehicle terms\n")
    print("Tracking terms: auto, voertuig, wagen, kar, vliegtuig, schip, schepen, vaartuig, schuit, motor, fiets")
    print(f"Will auto-stop after {MAX_UNCHANGED_BATCHES} batches with no new vehicle matches")

    try:
        while True:
            response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT)
            # Fail loudly on HTTP errors instead of parsing an error page as XML
            response.raise_for_status()
            root = etree.fromstring(response.content)
            records = root.xpath('//oai:ListRecords/oai:record', namespaces=namespaces)

            if not records:
                break

            batch_size = len(records)
            total_records += batch_size
            batch_matches = 0
            batch_vehicle_matches = 0

            for record in records:
                # Check resolution in various possible fields; first parsable value wins
                resolution = None
                resolution_field = None
                for field in resolution_fields_to_check:
                    for value in record.xpath(field, namespaces=namespaces):
                        resolution = parse_resolution(value)
                        if resolution:
                            resolution_field = field
                            break
                    if resolution:
                        break

                # Track lowest resolution (compared by total pixel count)
                if resolution and (lowest_resolution is None or
                                   get_pixel_count(resolution) < get_pixel_count(lowest_resolution)):
                    lowest_resolution = resolution
                    lowest_resolution_record = record.xpath('.//dc:identifier/text()', namespaces=namespaces)
                    lowest_resolution_field = resolution_field

                # Standard processing for date and vehicle terms
                dates = record.xpath('.//dc:date/text()', namespaces=namespaces)
                year = None

                for date_str in dates:
                    year = get_year(date_str)
                    if year:
                        break

                if not year:
                    for field in alt_date_fields:
                        alt_dates = record.xpath(f'.//{field}/text()', namespaces=namespaces)
                        for date_str in alt_dates:
                            year = get_year(date_str)
                            if year:
                                alt_date_matches += 1
                                break
                        if year:
                            break

                if not dates and not year:
                    no_date_count += 1
                elif dates and not year:
                    malformed_dates += 1

                if year and 1930 <= year <= 1949:
                    batch_matches += 1

                    all_texts = []
                    for field in vehicle_check_fields:
                        all_texts.extend(record.xpath(f'.//{field}/text()', namespaces=namespaces))

                    if count_vehicle_terms(all_texts, term_counts):
                        batch_vehicle_matches += 1

            matching_records += batch_matches
            vehicle_matches += batch_vehicle_matches

            # Check for auto-stop condition
            if vehicle_matches == last_vehicle_count:
                unchanged_batches += 1
                if unchanged_batches >= MAX_UNCHANGED_BATCHES:
                    print("\nAuto-stop triggered: No new vehicle matches in last",
                          MAX_UNCHANGED_BATCHES, "batches")
                    break
            else:
                unchanged_batches = 0
                last_vehicle_count = vehicle_matches

            print(f"Batch: {batch_size} records | {batch_matches} 1930-1949 records | "
                  f"{batch_vehicle_matches} vehicle term matches | "
                  f"Total: {total_records} | Cumulative vehicle matches: {vehicle_matches} | "
                  f"Unchanged batches: {unchanged_batches}/{MAX_UNCHANGED_BATCHES}")

            # An absent or empty resumption token marks the last page
            token = root.xpath('//oai:resumptionToken/text()', namespaces=namespaces)
            if not token or not token[0]:
                break

            params = {'verb': 'ListRecords', 'resumptionToken': token[0]}
            sleep(0.5)

    except Exception as e:
        # Best effort: report the problem and still print the partial statistics
        print(f"\nError occurred: {e}")

    # Final report
    print("\n=== FINAL RESULTS ===")
    print(f"Total records processed: {total_records}")
    print(f"Records from 1930-1949: {matching_records}")
    print(f"Records from 1930-1949 containing vehicle terms: {vehicle_matches}")

    print("\n=== TERM COUNTS ===")
    for term, count in sorted(term_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"{term}: {count}")

    print("\n=== VIDEO RESOLUTION ===")
    if lowest_resolution:
        print(f"Lowest resolution found: {lowest_resolution[0]}x{lowest_resolution[1]}")
        if lowest_resolution_record:
            print(f"Found in record: {lowest_resolution_record[0]}")
        print(f"Found in field: {lowest_resolution_field}")
    else:
        print("No resolution information found in metadata")

    print("\n=== OTHER STATS ===")
    print(f"Records with no date field: {no_date_count}")
    print(f"Records with malformed dates: {malformed_dates}")
    print(f"Matches found in alternative date fields: {alt_date_matches}")
    # max(1, ...) guards against division by zero when nothing was processed
    print(f"Percentage matching (1930-1949): {matching_records/max(1,total_records)*100:.1f}%")
    print(f"Percentage with vehicle terms from matching years: {vehicle_matches/max(1,matching_records)*100:.1f}%")

if __name__ == '__main__':
    main()

Starting enhanced collection scan...
Now checking for videos from 1930-1949 that contain vehicle terms

Tracking terms: auto, voertuig, wagen, kar, vliegtuig, schip, schepen, vaartuig, schuit, motor, fiets
Will auto-stop after 5 batches with no new vehicle matches
Batch: 100 records | 18 1930-1949 records | 9 vehicle term matches | Total: 100 | Cumulative vehicle matches: 9 | Unchanged batches: 0/5
Batch: 100 records | 21 1930-1949 records | 6 vehicle term matches | Total: 200 | Cumulative vehicle matches: 15 | Unchanged batches: 0/5
Batch: 100 records | 24 1930-1949 records | 18 vehicle term matches | Total: 300 | Cumulative vehicle matches: 33 | Unchanged batches: 0/5
Batch: 100 records | 39 1930-1949 records | 20 vehicle term matches | Total: 400 | Cumulative vehicle matches: 53 | Unchanged batches: 0/5
Batch: 100 records | 99 1930-1949 records | 24 vehicle term matches | Total: 500 | Cumulative vehicle matches: 77 | Unchanged batches: 0/5
Batch: 100 records | 99 1930-1949 records |

Video finder including objects
# New section

In [None]:
!pip install imagehash



# Video cutter

In [None]:
import cv2
import numpy as np
import os
from PIL import Image
import imagehash

# === PARAMETERS ===
# Input folder scanned for video files and output folder for the extracted frames.
video_folder = 'test'
output_folder = 'test_extracted'
os.makedirs(output_folder, exist_ok=True)  # Create the output folder up front; no error if it already exists

edge_margin = 30                # Width, in pixel columns, of the left and right edge strips checked for motion
edge_motion_threshold = 0.1     # A frame is only considered when (edge motion / total motion) is below this ratio
frame_step = 2                  # Only frames whose index is a multiple of this value are processed
hash_similarity_threshold = 5   # A frame is a duplicate when its hash difference to the last saved frame is strictly below this
output_resolution = (640, 640)  # Size passed to cv2.resize after the centre square crop

def is_similar_to_last(frame, last_hash):
    """Compare a frame's perceptual hash with the hash of the last saved frame.

    Args:
        frame: Image array in OpenCV BGR channel order (it is converted with
            ``cv2.COLOR_BGR2RGB`` before hashing).
        last_hash: Hash of the previously saved frame, or None when nothing
            has been saved yet.

    Returns:
        A ``(is_similar, current_hash)`` tuple. ``is_similar`` is False when
        ``last_hash`` is None; otherwise it is True when the absolute hash
        difference is strictly below ``hash_similarity_threshold``.
    """
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame_hash = imagehash.phash(Image.fromarray(rgb_frame))

    if last_hash is not None:
        distance = abs(frame_hash - last_hash)
        return distance < hash_similarity_threshold, frame_hash

    # Nothing saved yet, so the frame cannot be a duplicate.
    return False, frame_hash

def center_crop_to_square(image):
    """Return the centred square region of an image array.

    The side of the square equals the smaller of the image's first two
    dimensions; the crop is centred along the longer one.

    Args:
        image: Array-like with ``.shape`` whose first two entries are
            height and width and which supports 2-D slicing.

    Returns:
        The sliced centre region of ``image``.
    """
    rows, cols = image.shape[:2]
    side = rows if rows < cols else cols
    top = (rows - side) // 2
    left = (cols - side) // 2
    return image[top:top + side, left:left + side]

def process_video(video_path, video_name, saved_idx_start):
    """Extract de-duplicated, low-edge-motion frames from one video as JPEGs.

    Every ``frame_step``-th frame is compared with the previously processed
    frame. When the share of motion in the left/right edge strips
    (``edge_margin`` columns each) is below ``edge_motion_threshold`` and the
    frame is not similar to the last saved one, it is centre-cropped to a
    square, resized to ``output_resolution`` and written to ``output_folder``.

    Args:
        video_path: Path of the video file to open.
        video_name: File name of the video; its stem prefixes the frame files.
        saved_idx_start: First index used in the numbered output file names.

    Returns:
        The number of frames successfully written (0 if the video cannot be opened).
    """
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print(f"‚ùå Failed to open {video_path}")
        cap.release()
        return 0  # No frames saved

    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    duration = frame_count / fps if fps else 0
    print(f"üìº Video Info: FPS={fps:.2f}, Frames={frame_count}, Duration={duration:.2f}s")

    video_basename = os.path.splitext(video_name)[0]
    prev_gray = None
    frame_idx = 0
    saved_idx = saved_idx_start
    last_saved_hash = None
    frames_saved = 0

    # try/finally guarantees the capture handle is released even when decoding,
    # hashing or writing raises part-way through the video.
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                print(f"üîö End of video or read error at frame {frame_idx}")
                break

            # Skipped frames do not update prev_gray, so motion is always
            # measured between two processed frames.
            if frame_idx % frame_step != 0:
                frame_idx += 1
                continue

            if frame_idx % 500 == 0:
                print(f"üìç Processing frame {frame_idx}...")

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            if prev_gray is not None:
                diff = cv2.absdiff(gray, prev_gray)
                motion_map = np.sum(diff, axis=0)  # per-column motion totals

                total_motion = np.sum(motion_map)
                if total_motion == 0:
                    edge_motion_ratio = 0
                else:
                    left_motion = np.sum(motion_map[:edge_margin])
                    right_motion = np.sum(motion_map[-edge_margin:])
                    edge_motion_ratio = (left_motion + right_motion) / total_motion

                if edge_motion_ratio < edge_motion_threshold:
                    is_similar, current_hash = is_similar_to_last(frame, last_saved_hash)
                    if not is_similar:
                        # Crop to square and resize
                        cropped = center_crop_to_square(frame)
                        resized = cv2.resize(cropped, output_resolution)

                        # Create filename with video name and frame number
                        filename = os.path.join(output_folder, f"{video_basename}_frame_{saved_idx:06d}.jpg")
                        # Only count the frame (and advance the reference hash)
                        # when the image was actually written to disk.
                        if cv2.imwrite(filename, resized):
                            saved_idx += 1
                            frames_saved += 1
                            last_saved_hash = current_hash
                        else:
                            print(f"Failed to write {filename}")

            prev_gray = gray
            frame_idx += 1
    finally:
        cap.release()

    print(f"‚úÖ Done: {frame_idx} frames processed, {frames_saved} saved.")
    return frames_saved

# === PROCESS ALL VIDEOS ===
# os.listdir() returns entries in arbitrary order; sorting makes the processing
# order, and therefore the running frame index of each video, identical on
# every run.
global_idx = 0
for filename in sorted(os.listdir(video_folder)):
    if not filename.lower().endswith(('.mp4', '.mov', '.avi', '.mkv')):
        continue  # Not a supported video container

    video_path = os.path.join(video_folder, filename)
    print(f"\nüöÄ Starting: {filename}")
    frames_saved = process_video(video_path, filename, global_idx)
    global_idx += frames_saved  # Continue numbering where the previous video stopped

print("\nüéâ All videos processed.")


üöÄ Starting: WEEKNUMMER672-HRE00015290_3254000_3492000.mp4
üìº Video Info: FPS=25.00, Frames=5949, Duration=237.96s
üìç Processing frame 0...
üìç Processing frame 500...
üìç Processing frame 1000...
üìç Processing frame 1500...
üìç Processing frame 2000...
üìç Processing frame 2500...
üìç Processing frame 3000...
üìç Processing frame 3500...
üìç Processing frame 4000...
üìç Processing frame 4500...
üìç Processing frame 5000...
üìç Processing frame 5500...
üîö End of video or read error at frame 5949
‚úÖ Done: 5949 frames processed, 143 saved.

üöÄ Starting: WEEKNUMMER584-HRE0000E904_181000_321000.mp4
üìº Video Info: FPS=25.00, Frames=3499, Duration=139.96s
üìç Processing frame 0...
üìç Processing frame 500...
üìç Processing frame 1000...
üìç Processing frame 1500...
üìç Processing frame 2000...
üìç Processing frame 2500...
üìç Processing frame 3000...
üîö End of video or read error at frame 3499
‚úÖ Done: 3499 frames processed, 253 saved.

üöÄ Starting: WEEK

In [None]:
from google.colab import files
import shutil

# Folders whose contents are archived and offered for download
folders = ['test_extracted', 'veld_extracted']

# Pass 1: create "<folder>.zip" next to each folder, containing that folder's contents
for folder in folders:
    shutil.make_archive(base_name=folder, format='zip', root_dir=folder)

# Pass 2: hand every archive to the Colab download helper
for folder in folders:
    files.download(folder + ".zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloader

In [None]:
import requests
from lxml import etree
from time import sleep
import re
from collections import defaultdict
import os
from urllib.parse import urlparse, urljoin

def get_year(date_str):
    """Extract the first plausible year (1800-2100) from a free-form date string.

    Patterns are tried in priority order. Within one pattern every match is
    examined, so an implausible four-digit number early in the string does not
    hide a valid year that appears later.

    Args:
        date_str: Text that may contain a year, e.g. "1935-05-12",
            "1930-1940" or "circa 1942". May be None or empty.

    Returns:
        The year as an int, or None when no candidate within 1800..2100 exists.
    """
    if not date_str:
        return None

    # Each pattern captures the COMPLETE four-digit year in group 1. The century
    # alternation is non-capturing; capturing it would yield only "18"/"19"/"20".
    patterns = [
        r'\b((?:18|19|20)\d{2})\b',  # standalone YYYY in a plausible century
        r'(\d{4})-\d{2}-\d{2}',      # YYYY-MM-DD
        r'(\d{4})-\d{4}',            # YYYY-YYYY range (start year)
        r'[cC]irca\s(\d{4})',        # Circa YYYY
        r'\b(\d{4})\b',              # any standalone four-digit number
    ]

    for pattern in patterns:
        for match in re.finditer(pattern, date_str):
            year = int(match.group(1))
            if 1800 <= year <= 2100:  # Sanity check
                return year
    return None

def count_vehicle_terms(texts, term_counts):
    """Tally which vehicle terms appear (as substrings) in the given texts.

    For every text, each vehicle term contained in its lower-cased form
    increments ``term_counts[term]`` by one (once per text, not once per
    occurrence).

    Args:
        texts: Iterable of strings to inspect; may be empty or None.
        term_counts: Mutable mapping updated in place with ``+= 1``
            (the callers pass a ``defaultdict(int)``).

    Returns:
        True when at least one term was found in at least one text, else False.
    """
    if not texts:
        return False

    vehicle_terms = (
        'auto', 'voertuig', 'wagen', 'kar',
        'vliegtuig', 'schip', 'schepen',
        'vaartuig', 'schuit', 'motor', 'fiets',
    )

    matched = False
    for raw in texts:
        haystack = raw.lower()
        # Keep the term order of the tuple so counters are created in the same order.
        hits = [candidate for candidate in vehicle_terms if candidate in haystack]
        for candidate in hits:
            term_counts[candidate] += 1
        matched = matched or bool(hits)
    return matched

def parse_resolution(resolution_str):
    """Parse a resolution string into a ``(width, height)`` tuple in pixels.

    Recognised forms include "1920x1080", "1920\u00d71080", "1920*1080",
    "1920x1080p", "1920x1080 pixels" and a bare "1080p" (for which a 16:9
    aspect ratio is assumed to derive the width).

    Args:
        resolution_str: Text that may contain a resolution; may be None/empty.

    Returns:
        ``(width, height)`` as ints when a pattern matches and both dimensions
        are at least 160 pixels, otherwise None.
    """
    if not resolution_str:
        return None

    # Every pattern uses the named groups read below. The multiplication sign is
    # written as an escape (\u00d7) so the pattern survives encoding mishaps.
    patterns = [
        r'(?P<width>\d{3,})\s*[x\u00d7]\s*(?P<height>\d{3,})(?!\d)',     # 1920x1080
        r'(?P<width>\d{3,})\s*[x\u00d7]\s*(?P<height>\d{3,})\s*[pP]',    # 1920x1080p
        r'(?P<height>\d{3,})\s*[pP]',                                    # 1080p
        r'(?P<width>\d{3,})\s*[x\u00d7]\s*(?P<height>\d{3,})\s*pixels',  # 1920x1080 pixels
        r'(?P<width>\d{3,})\s*[*x\u00d7]\s*(?P<height>\d{3,})',          # 1920*1080
    ]

    # Both dimensions must reach this value, so e.g. 160x120 is rejected.
    MIN_RESOLUTION = 160

    for pattern in patterns:
        match = re.search(pattern, resolution_str)
        if not match:
            continue

        groups = match.groupdict()
        height = int(groups['height'])
        if 'width' in groups:
            width = int(groups['width'])
        else:
            width = int(height * 16 / 9)  # Assume 16:9 if only height given

        # Validate resolution makes sense; otherwise try the next pattern
        if width >= MIN_RESOLUTION and height >= MIN_RESOLUTION:
            return (width, height)
    return None

def get_pixel_count(resolution):
    """Return width * height for a resolution, or infinity when none is given.

    Args:
        resolution: Sequence whose first two items are width and height, or a
            falsy value (None, empty tuple) when the resolution is unknown.

    Returns:
        The product of the first two items, or ``float('inf')`` for a falsy input.
    """
    if not resolution:
        return float('inf')
    width, height = resolution[0], resolution[1]
    return width * height

def get_video_url(record, namespaces):
    """Determine the Open Beelden page URL for an OAI record.

    Lookup order:
      1. the first ``dc:identifier`` that is an http(s) openbeelden.nl link,
      2. the first ``dc:relation`` that is an http(s) openbeelden.nl link,
      3. a URL built from the last ``:``-separated part of the OAI header
         identifier.

    Args:
        record: Element-like object exposing ``xpath(expr, namespaces=...)``.
        namespaces: Prefix-to-URI mapping forwarded to ``xpath``.

    Returns:
        The URL string, or None when none of the three sources yields one.
    """
    def _is_site_link(value):
        # An absolute http(s) URL that mentions the Open Beelden domain
        return value.startswith('http') and 'openbeelden.nl' in value.lower()

    # Identifiers take precedence over relations
    for expression in ('.//dc:identifier/text()', './/dc:relation/text()'):
        for candidate in record.xpath(expression, namespaces=namespaces):
            if _is_site_link(candidate):
                return candidate

    # Last resort: derive the media URL from the OAI header identifier
    header_ids = record.xpath('.//oai:header/oai:identifier/text()', namespaces=namespaces)
    if not header_ids:
        return None
    media_id = header_ids[0].split(':')[-1]
    return f"https://openbeelden.nl/media/{media_id}"

def download_video(page_url, identifier, download_dir='downloaded_videos2'):
    """Download the MP4 referenced by a video page into ``download_dir``.

    The page HTML is fetched first and searched for an MP4 source via
    ``<video><source>`` tags, then absolute ``.mp4`` URLs in the page text,
    then download/``.mp4`` anchors. The video is streamed into a temporary
    ``.part`` file and only renamed to its final name once it is complete, so
    an interrupted run never leaves a truncated file that a later run would
    report as "Already exists".

    Args:
        page_url: URL of the HTML page describing the video.
        identifier: Record identifier; sanitised to build the file name.
        download_dir: Target directory, created when missing.

    Returns:
        The path of the saved (or already present) file, or None when no video
        source is found, the download fails, or the file is under 1 KB.
    """
    os.makedirs(download_dir, exist_ok=True)

    # Create safe filename: anything but word characters and '-' becomes '_'
    safe_id = re.sub(r'[^\w\-]', '_', str(identifier))
    filename = f"{safe_id}.mp4"
    filepath = os.path.join(download_dir, filename)
    partial_path = filepath + '.part'

    if os.path.exists(filepath):
        print(f"Already exists: {filename}")
        return filepath

    try:
        print(f"\nFetching video page: {page_url}")

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html'
        }

        # First get the HTML page
        response = requests.get(page_url, headers=headers, timeout=30)
        response.raise_for_status()

        # Parse the HTML to find video source
        html = etree.HTML(response.text)

        video_src = None

        # Method 1: Look for <video> source tags
        for src in html.xpath('//video/source/@src'):
            if src.endswith('.mp4'):
                video_src = src
                break

        # Method 2: Look for direct MP4 links in the page
        if not video_src:
            mp4_links = re.findall(r'(https?://[^\s"\'<>]+?\.mp4)', response.text)
            if mp4_links:
                video_src = mp4_links[0]

        # Method 3: Look for download buttons
        if not video_src:
            download_links = html.xpath('//a[contains(@href,"download") or contains(@href,".mp4")]/@href')
            for link in download_links:
                if link.endswith('.mp4'):
                    video_src = link
                    break

        if not video_src:
            print("Could not find video source in page")
            return None

        # Handle relative URLs
        if not video_src.startswith('http'):
            video_src = urljoin(page_url, video_src)

        print(f"Found video source: {video_src}")

        # Now download the actual video
        print(f"Downloading video...")

        with requests.get(video_src, stream=True, timeout=30, headers=headers) as r:
            r.raise_for_status()
            total_size = int(r.headers.get('content-length', 0))

            with open(partial_path, 'wb') as f:
                downloaded = 0
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        if total_size > 0:
                            print(f"Progress: {downloaded}/{total_size} bytes ({downloaded/total_size:.1%})", end='\r')
                        else:
                            print(f"Progress: {downloaded} bytes", end='\r')

        # Reject implausibly small files (less than 1 KB)
        file_size = os.path.getsize(partial_path)
        if file_size < 1024:
            print("\nDownloaded file is too small to be valid")
            return None

        # Publish the finished download under its final name in one step
        os.replace(partial_path, filepath)
        print(f"\nSuccessfully saved: {filepath} ({file_size/1024:.1f} KB)")
        return filepath

    except requests.exceptions.RequestException as e:
        print(f"\nNetwork error downloading video: {str(e)}")
    except Exception as e:
        print(f"\nError downloading video: {str(e)}")
    finally:
        # Runs on every exit path (including Ctrl-C): discard any partial data
        if os.path.exists(partial_path):
            os.remove(partial_path)
    return None

def main():
    """Collect 1930-1949 Open Beelden videos whose metadata mentions vehicle terms.

    Pages through the OAI-PMH ``ListRecords`` feed. Every record dated
    1930-1949 (from ``dc:date``) whose title, description, subject or coverage
    contains a vehicle term is passed to ``download_video``. The lowest
    resolution parsed from ``dc:format`` is tracked along the way. Scanning
    stops when the feed is exhausted, after ``MAX_UNCHANGED_BATCHES``
    consecutive batches without a new match, after ``MAX_FETCH_RETRIES``
    consecutive failed fetches, or on Ctrl-C. A summary is always printed.
    """
    base_url = 'https://www.openbeelden.nl/feeds/oai/'
    namespaces = {
        'oai': 'http://www.openarchives.org/OAI/2.0/',
        'dc': 'http://purl.org/dc/elements/1.1/',
        'ebucore': 'urn:ebu:metadata-schema:ebucore'
    }

    # Counters
    total_records = 0
    matching_records = 0
    vehicle_matches = 0
    downloaded_count = 0
    batch_number = 0
    term_counts = defaultdict(int)

    # Resolution tracking
    lowest_resolution = None
    lowest_resolution_record = None

    # Configuration
    MAX_UNCHANGED_BATCHES = 5
    MAX_FETCH_RETRIES = 5  # consecutive fetch/parse failures tolerated before giving up
    unchanged_batches = 0
    failed_fetches = 0
    last_vehicle_count = 0
    vehicle_check_fields = ['dc:title', 'dc:description', 'dc:subject', 'dc:coverage']
    params = {'verb': 'ListRecords', 'metadataPrefix': 'oai_dc', 'set': 'openimages'}

    print("Starting Open Beelden video collector...")
    print("Target: 1930-1949 videos containing vehicle terms")
    print("Tracking terms:", ', '.join(['auto', 'voertuig', 'wagen', 'kar', 'vliegtuig',
                                      'schip', 'schepen', 'vaartuig', 'schuit', 'motor', 'fiets']))
    print(f"Auto-stop after {MAX_UNCHANGED_BATCHES} batches with no new matches\n")

    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/xml'
    })

    try:
        while True:
            try:
                print("\nFetching next batch of records...")
                response = session.get(base_url, params=params, timeout=30)
                response.raise_for_status()
                root = etree.fromstring(response.content)
            except (requests.RequestException, etree.ParseError) as e:
                failed_fetches += 1
                print(f"Error fetching/parsing batch: {str(e)}")
                # The same request is retried, so a persistent failure would
                # otherwise loop forever.
                if failed_fetches >= MAX_FETCH_RETRIES:
                    print(f"Giving up after {MAX_FETCH_RETRIES} consecutive failed fetches")
                    break
                sleep(5)
                continue
            failed_fetches = 0

            records = root.xpath('//oai:ListRecords/oai:record', namespaces=namespaces)

            if not records:
                print("No more records found")
                break

            batch_size = len(records)
            first_record_number = total_records + 1
            total_records += batch_size
            batch_number += 1
            batch_matches = 0
            batch_vehicle_matches = 0
            batch_downloads = 0

            for record_number, record in enumerate(records, start=first_record_number):
                # Get year
                year = None
                for date_str in record.xpath('.//dc:date/text()', namespaces=namespaces):
                    year = get_year(date_str)
                    if year:
                        break

                if year and 1930 <= year <= 1949:
                    batch_matches += 1

                    # Check for vehicle terms
                    texts = []
                    for field in vehicle_check_fields:
                        texts.extend(record.xpath(f'.//{field}/text()', namespaces=namespaces))

                    if count_vehicle_terms(texts, term_counts):
                        batch_vehicle_matches += 1

                        # Try to download
                        video_page_url = get_video_url(record, namespaces)
                        if video_page_url:
                            print(f"\nFound matching video page: {video_page_url}")
                            identifiers = record.xpath('.//dc:identifier/text()', namespaces=namespaces)
                            # Fall back to the record's running number, which is
                            # unique per record, so file names cannot collide.
                            identifier = identifiers[0] if identifiers else str(record_number)

                            if download_video(video_page_url, identifier):
                                batch_downloads += 1
                        else:
                            print("Could not download video for matching record")

                # Check resolution (optional)
                for res_str in record.xpath('.//dc:format/text()', namespaces=namespaces):
                    res = parse_resolution(res_str)
                    if res and (lowest_resolution is None or get_pixel_count(res) < get_pixel_count(lowest_resolution)):
                        lowest_resolution = res
                        lowest_resolution_record = record.xpath('.//dc:title/text()', namespaces=namespaces)[:1]

            # Update counters
            matching_records += batch_matches
            vehicle_matches += batch_vehicle_matches
            downloaded_count += batch_downloads

            # Check auto-stop condition
            if vehicle_matches == last_vehicle_count:
                unchanged_batches += 1
                if unchanged_batches >= MAX_UNCHANGED_BATCHES:
                    print(f"\nAuto-stop: No new matches in {MAX_UNCHANGED_BATCHES} batches")
                    break
            else:
                unchanged_batches = 0
                last_vehicle_count = vehicle_matches

            # Progress report (explicit counter: batches may differ in size)
            print(f"\nBatch {batch_number} complete:")
            print(f"- Records processed: {batch_size} (Total: {total_records})")
            print(f"- 1930-1949 matches: {batch_matches} (Total: {matching_records})")
            print(f"- Vehicle term matches: {batch_vehicle_matches} (Total: {vehicle_matches})")
            print(f"- Downloads this batch: {batch_downloads} (Total: {downloaded_count})")
            print(f"- Unchanged batches: {unchanged_batches}/{MAX_UNCHANGED_BATCHES}")

            # Get next batch
            token = root.xpath('//oai:resumptionToken/text()', namespaces=namespaces)
            if not token:
                print("No more batches available")
                break
            params = {'verb': 'ListRecords', 'resumptionToken': token[0]}
            sleep(2)  # Be polite with delay between requests

    except KeyboardInterrupt:
        print("\nProcess interrupted by user")
    except Exception as e:
        print(f"\nFatal error: {str(e)}")
    finally:
        session.close()

    # Final report
    print("\n=== COLLECTION STATISTICS ===")
    print(f"Total records processed: {total_records}")
    # max(1, ...) guards against division by zero when nothing was processed
    print(f"Records from 1930-1949: {matching_records} ({matching_records/max(1,total_records)*100:.1f}%)")
    print(f"Records with vehicle terms: {vehicle_matches} ({vehicle_matches/max(1,matching_records)*100:.1f}%)")
    print(f"Videos successfully downloaded: {downloaded_count}")

    print("\n=== TERM FREQUENCIES ===")
    for term, count in sorted(term_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"{term}: {count}")

    if lowest_resolution:
        title = lowest_resolution_record[0] if lowest_resolution_record else "Unknown"
        print(f"\nLowest resolution video: {lowest_resolution[0]}x{lowest_resolution[1]} - {title}")

if __name__ == '__main__':
    main()

Download + cutter

In [None]:
import requests
from lxml import etree
from time import sleep
import re
from collections import defaultdict
import os
from urllib.parse import urlparse, urljoin
import cv2
import numpy as np
from PIL import Image
import imagehash

# === VIDEO COLLECTION PARAMETERS ===
# Directory download_video() saves MP4 files into (its default download_dir).
DOWNLOAD_DIR = 'collected_videos'
# Directory extract_frames() writes the JPEG frames into.
FRAME_OUTPUT_DIR = 'extracted_frames'
# Default max_to_keep for cleanup_videos(): older videos beyond this count are deleted.
MAX_VIDEOS_TO_KEEP = 5  # Number of videos to keep in storage at once
# main() deletes a video right after extraction once at least this many frames were saved.
MIN_FRAMES_PER_VIDEO = 5  # Minimum frames to extract before considering deletion

# === FRAME EXTRACTION PARAMETERS ===
# Width, in pixel columns, of the left and right border strips whose motion is measured.
EDGE_MARGIN = 30                # Pixels at frame edges to check motion
# A frame is only kept when (edge motion / total motion) is strictly below this ratio.
EDGE_MOTION_THRESHOLD = 0.1     # Lower = stricter; less edge motion allowed
FRAME_STEP = 2                  # Process every nth frame
# A frame counts as a duplicate of the last saved one when the hash difference is
# strictly below this value, so saved frames always differ by at least this much.
HASH_SIMILARITY_THRESHOLD = 5   # Max allowed hash difference (0 = exact same image)
# (width, height) passed to cv2.resize after the centre crop.
OUTPUT_RESOLUTION = (640, 640)  # Final resolution (after crop and resize)

def get_year(date_str):
    """Extract a plausible year (1800-2100) from a free-form date string.

    Patterns are tried from most to least specific, and every match of a
    pattern is considered, so an implausible four-digit number early in the
    string (e.g. a reel number) does not hide a valid year further on.

    Args:
        date_str: Text that may contain a year, e.g. "1935", "1935-05-12",
            "1930-1949" or "circa 1941". May be None or empty.

    Returns:
        The first year found that lies within 1800-2100 as an int, or None
        when the input is empty or no such year is present.
    """
    if not date_str:
        return None

    patterns = [
        # The whole year must be the captured group; capturing only the
        # century prefix ("19") would always fail the range check below.
        r'\b((?:18|19|20)\d{2})\b',  # Standalone year 1800-2099
        r'(\d{4})-\d{2}-\d{2}',      # YYYY-MM-DD
        r'(\d{4})-\d{4}',            # YYYY-YYYY range (first year)
        r'[cC]irca\s(\d{4})',        # Circa YYYY
        r'\b(\d{4})\b',              # Any standalone four-digit number
    ]

    for pattern in patterns:
        for match in re.finditer(pattern, date_str):
            year = int(match.group(1))
            if 1800 <= year <= 2100:  # Sanity check
                return year
    return None

def count_vehicle_terms(texts, term_counts):
    """Tally which Dutch vehicle keywords occur in the given texts.

    Matching is a case-insensitive substring test, and a keyword is counted
    at most once per text no matter how often it appears in that text.

    Args:
        texts: Iterable of strings to scan; may be None or empty.
        term_counts: Mapping of keyword -> count, updated in place. It must
            supply a default for unseen keys (e.g. ``defaultdict(int)``).

    Returns:
        True if at least one keyword was found in any text, otherwise False.
    """
    if not texts:
        return False

    keywords = (
        'auto', 'voertuig', 'wagen', 'kar',
        'vliegtuig', 'schip', 'schepen',
        'vaartuig', 'schuit', 'motor', 'fiets',
    )

    matched = False
    for raw_text in texts:
        haystack = raw_text.lower()
        for keyword in keywords:
            if keyword not in haystack:
                continue
            term_counts[keyword] += 1
            matched = True
    return matched

def get_video_url(record, namespaces):
    """Find the Open Beelden page URL for a harvested record.

    The ``dc:identifier`` values are checked first, then the ``dc:relation``
    values; the first one that starts with "http" and mentions
    "openbeelden.nl" (case-insensitive) wins. If neither field has such a
    link, a media URL is built from the last ':'-separated part of the OAI
    header identifier.

    Args:
        record: Record element exposing ``xpath(expr, namespaces=...)``.
        namespaces: Prefix -> URI mapping covering the ``dc`` and ``oai``
            prefixes used in the queries.

    Returns:
        The URL as a string, or None when the record has no usable link and
        no OAI header identifier.
    """
    # Same acceptance rule for both fields; identifiers take precedence.
    for field_query in ('.//dc:identifier/text()', './/dc:relation/text()'):
        for candidate in record.xpath(field_query, namespaces=namespaces):
            if candidate.startswith('http') and 'openbeelden.nl' in candidate.lower():
                return candidate

    # Last resort: derive the media page from the OAI header identifier.
    header_ids = record.xpath('.//oai:header/oai:identifier/text()', namespaces=namespaces)
    if not header_ids:
        return None

    item_id = header_ids[0].split(':')[-1]
    return f"https://openbeelden.nl/media/{item_id}"

def download_video(page_url, identifier, download_dir=DOWNLOAD_DIR):
    """Download the MP4 behind a video page and return its local path.

    The HTML page at ``page_url`` is fetched and searched for an MP4 source:
    ``<video><source>`` tags first, then absolute ``.mp4`` URLs anywhere in
    the page text, then download/``.mp4`` anchors. The video is streamed to a
    temporary ``<name>.mp4.part`` file and only renamed to its final name once
    the transfer is complete, so an interrupted or truncated download is never
    picked up later by the "already exists" shortcut.

    Args:
        page_url: URL of the HTML page that embeds the video.
        identifier: Record identifier; sanitised into the output file name.
        download_dir: Target directory, created if missing.

    Returns:
        Path of the downloaded (or previously downloaded) ``.mp4`` file, or
        None when no source was found, the transfer failed or was incomplete,
        or the result is smaller than 1 KB.
    """
    os.makedirs(download_dir, exist_ok=True)

    # Create safe filename
    safe_id = re.sub(r'[^\w\-]', '_', str(identifier))
    filename = f"{safe_id}.mp4"
    filepath = os.path.join(download_dir, filename)
    partial_path = f"{filepath}.part"

    if os.path.exists(filepath):
        print(f"Already exists: {filename}")
        return filepath

    try:
        print(f"\nFetching video page: {page_url}")

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html'
        }

        # First get the HTML page
        response = requests.get(page_url, headers=headers, timeout=30)
        response.raise_for_status()

        # Parse the HTML to find video source
        html = etree.HTML(response.text)

        # Try to find video source in multiple ways
        video_src = None

        # Method 1: Look for <video> source tags
        for src in html.xpath('//video/source/@src'):
            if src.endswith('.mp4'):
                video_src = src
                break

        # Method 2: Look for direct MP4 links in the page
        if not video_src:
            mp4_links = re.findall(r'(https?://[^\s"\'<>]+?\.mp4)', response.text)
            if mp4_links:
                video_src = mp4_links[0]

        # Method 3: Look for download buttons
        if not video_src:
            download_links = html.xpath('//a[contains(@href,"download") or contains(@href,".mp4")]/@href')
            for link in download_links:
                if link.endswith('.mp4'):
                    video_src = link
                    break

        if not video_src:
            print("Could not find video source in page")
            return None

        # Handle relative URLs
        if not video_src.startswith('http'):
            video_src = urljoin(page_url, video_src)

        print(f"Found video source: {video_src}")

        # Now download the actual video
        print("Downloading video...")

        downloaded = 0
        with requests.get(video_src, stream=True, timeout=30, headers=headers) as r:
            r.raise_for_status()
            total_size = int(r.headers.get('content-length', 0))
            # Content-Length counts bytes on the wire; iter_content yields
            # decoded bytes, so only compare when the body is not compressed.
            size_is_checkable = total_size > 0 and not r.headers.get('content-encoding')

            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        if total_size > 0:
                            print(f"Progress: {downloaded}/{total_size} bytes ({downloaded/total_size:.1%})", end='\r')
                        else:
                            print(f"Progress: {downloaded} bytes", end='\r')

        # A connection that closes early does not necessarily raise, so
        # compare against the advertised size when we can.
        if size_is_checkable and downloaded < total_size:
            print(f"\nIncomplete download: received {downloaded} of {total_size} bytes")
            return None

        # Verify minimum file size (MP4 header is at least 8 bytes)
        file_size = os.path.getsize(partial_path)
        if file_size < 1024:  # At least 1KB
            print("\nDownloaded file is too small to be valid")
            return None

        # Publish under the final name only now that the file is complete.
        os.replace(partial_path, filepath)
        print(f"\nSuccessfully saved: {filepath} ({file_size/1024:.1f} KB)")
        return filepath

    except requests.exceptions.RequestException as e:
        print(f"\nNetwork error downloading video: {str(e)}")
    except Exception as e:
        print(f"\nError downloading video: {str(e)}")
    finally:
        # Runs on every exit path, including KeyboardInterrupt (which the
        # handlers above do not catch), so no partial file is left behind.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError as e:
                print(f"\nCould not remove partial download {partial_path}: {str(e)}")

    return None

def is_similar_to_last(frame, last_hash):
    """Compare a frame's perceptual hash with the hash of the last saved frame.

    Args:
        frame: Image array in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2RGB`` before hashing).
        last_hash: Hash of the previously saved frame, or None if nothing
            has been saved yet.

    Returns:
        A ``(is_similar, current_hash)`` tuple. ``is_similar`` is True when
        the difference between the two hashes is strictly below
        ``HASH_SIMILARITY_THRESHOLD``; it is always False when ``last_hash``
        is None. ``current_hash`` is the hash computed for ``frame``.
    """
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    current_hash = imagehash.phash(Image.fromarray(rgb_frame))

    if last_hash is not None:
        distance = abs(current_hash - last_hash)
        return distance < HASH_SIMILARITY_THRESHOLD, current_hash

    # Nothing to compare against yet, so the frame cannot be a duplicate.
    return False, current_hash

def center_crop_to_square(image):
    """Return the centred square region of an image array.

    The side length is the smaller of the first two dimensions (rows,
    columns); any trailing dimensions are kept as they are.

    Args:
        image: Array-like with a ``shape`` attribute and 2-D slicing, laid
            out as (rows, columns, ...).

    Returns:
        The ``side x side`` slice taken from the middle of ``image``.
    """
    rows, cols = image.shape[:2]
    side = min(rows, cols)
    top = (rows - side) // 2
    left = (cols - side) // 2
    return image[top:top + side, left:left + side]

def extract_frames(video_path, video_name):
    """Save distinct, centrally-focused frames of a video as JPEG files.

    Every ``FRAME_STEP``-th frame is compared with the previously sampled
    frame. A frame is kept when the share of the absolute grey-level change
    that falls in the left and right ``EDGE_MARGIN`` pixel columns is below
    ``EDGE_MOTION_THRESHOLD`` and the frame is not similar (per
    ``is_similar_to_last``) to the last frame that was saved. Kept frames are
    centre-cropped to a square, resized to ``OUTPUT_RESOLUTION`` and written
    to ``FRAME_OUTPUT_DIR`` as ``<video basename>_frame_<NNNNNN>.jpg``.

    Args:
        video_path: Path of the video file to read.
        video_name: File name whose stem prefixes the output frame names.

    Returns:
        Number of frames actually written to disk (0 if the video could not
        be opened).
    """
    cap = cv2.VideoCapture(video_path)
    frames_saved = 0

    try:
        if not cap.isOpened():
            print(f"❌ Failed to open {video_path}")
            return 0

        # Do not rely on the caller having created the output directory:
        # cv2.imwrite reports a missing directory only through its return value.
        os.makedirs(FRAME_OUTPUT_DIR, exist_ok=True)

        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        duration = frame_count / fps if fps else 0
        print(f"📼 Video Info: {os.path.basename(video_path)} - FPS={fps:.2f}, Frames={frame_count}, Duration={duration:.2f}s")

        video_basename = os.path.splitext(video_name)[0]
        prev_gray = None
        frame_idx = 0
        last_saved_hash = None

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                print(f"🔚 End of video or read error at frame {frame_idx}")
                break

            if frame_idx % FRAME_STEP != 0:
                frame_idx += 1
                continue

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            # The first sampled frame has nothing to be compared with and is
            # therefore never saved.
            if prev_gray is not None:
                diff = cv2.absdiff(gray, prev_gray)
                # Per-column total change between the two sampled frames.
                motion_map = np.sum(diff, axis=0)

                total_motion = np.sum(motion_map)
                if total_motion == 0:
                    edge_motion_ratio = 0
                else:
                    left_motion = np.sum(motion_map[:EDGE_MARGIN])
                    right_motion = np.sum(motion_map[-EDGE_MARGIN:])
                    edge_motion_ratio = (left_motion + right_motion) / total_motion

                if edge_motion_ratio < EDGE_MOTION_THRESHOLD:
                    is_similar, current_hash = is_similar_to_last(frame, last_saved_hash)
                    if not is_similar:
                        # Crop to square and resize
                        cropped = center_crop_to_square(frame)
                        resized = cv2.resize(cropped, OUTPUT_RESOLUTION)

                        # Create filename with video name and frame number
                        filename = os.path.join(FRAME_OUTPUT_DIR, f"{video_basename}_frame_{frames_saved:06d}.jpg")
                        # Only count frames that really reached the disk; the
                        # caller deletes the video based on this count.
                        if cv2.imwrite(filename, resized):
                            frames_saved += 1
                            last_saved_hash = current_hash
                        else:
                            print(f"Failed to write frame: {filename}")

            prev_gray = gray
            frame_idx += 1
    finally:
        # Release the capture even if decoding or writing raised.
        cap.release()

    print(f"✅ Extracted {frames_saved} frames from {os.path.basename(video_path)}")
    return frames_saved

def cleanup_videos(download_dir, max_to_keep=MAX_VIDEOS_TO_KEEP):
    """Delete the oldest videos so that at most ``max_to_keep`` remain.

    Only files ending in .mp4, .mov, .avi or .mkv (case-insensitive) are
    considered; age is judged by modification time.

    Args:
        download_dir: Directory holding the downloaded videos. A missing
            directory is treated as already clean.
        max_to_keep: Number of newest videos to keep. 0 removes every video;
            negative values are treated as 0.

    Returns:
        None. Each deletion, or failure to delete, is reported on stdout.
    """
    if not os.path.isdir(download_dir):
        return

    video_files = [f for f in os.listdir(download_dir) if f.lower().endswith(('.mp4', '.mov', '.avi', '.mkv'))]

    # Work with an explicit count: the slice video_files[:-max_to_keep] is
    # empty when max_to_keep == 0 (because -0 == 0), which would keep every
    # file instead of removing them all.
    excess = len(video_files) - max(0, max_to_keep)
    if excess <= 0:
        return

    # Sort by modification time (oldest first)
    video_files.sort(key=lambda x: os.path.getmtime(os.path.join(download_dir, x)))

    # Delete oldest files
    for video_file in video_files[:excess]:
        try:
            filepath = os.path.join(download_dir, video_file)
            os.remove(filepath)
            print(f"🗑️ Deleted video to save space: {video_file}")
        except Exception as e:
            print(f"Error deleting video {video_file}: {str(e)}")

def main():
    """Run the Open Beelden collection and frame-extraction pipeline.

    Pages through the OAI-PMH ``ListRecords`` feed, keeps records dated
    1930-1949 whose title, description, subject or coverage mentions a
    vehicle term, downloads each matching video, extracts frames from it and
    prints per-batch and final statistics.

    The loop ends when the feed has no more records or resumption token, when
    ``MAX_UNCHANGED_BATCHES`` consecutive batches add no new vehicle matches,
    when ``MAX_FETCH_RETRIES`` consecutive fetches fail, or when the user
    interrupts it. The final report and video cleanup always run afterwards.
    """
    base_url = 'https://www.openbeelden.nl/feeds/oai/'
    namespaces = {
        'oai': 'http://www.openarchives.org/OAI/2.0/',
        'dc': 'http://purl.org/dc/elements/1.1/',
        'ebucore': 'urn:ebu:metadata-schema:ebucore'
    }

    # Counters
    total_records = 0
    matching_records = 0
    vehicle_matches = 0
    downloaded_count = 0
    frames_extracted = 0
    batch_number = 0
    term_counts = defaultdict(int)

    # Configuration
    MAX_UNCHANGED_BATCHES = 5
    MAX_FETCH_RETRIES = 5  # Consecutive failed fetches tolerated before giving up
    unchanged_batches = 0
    fetch_failures = 0
    last_vehicle_count = 0
    vehicle_check_fields = ['dc:title', 'dc:description', 'dc:subject', 'dc:coverage']
    params = {'verb': 'ListRecords', 'metadataPrefix': 'oai_dc', 'set': 'openimages'}

    # Create output directories
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    os.makedirs(FRAME_OUTPUT_DIR, exist_ok=True)

    print("Starting Open Beelden video collection and frame extraction pipeline...")
    print("Configuration:")
    print(f"- Max videos to keep: {MAX_VIDEOS_TO_KEEP}")
    print(f"- Frame extraction settings: Step={FRAME_STEP}, EdgeMargin={EDGE_MARGIN}, MotionThreshold={EDGE_MOTION_THRESHOLD}")
    print(f"- Output resolution: {OUTPUT_RESOLUTION[0]}x{OUTPUT_RESOLUTION[1]}")
    print(f"- Minimum frames per video before deletion: {MIN_FRAMES_PER_VIDEO}\n")

    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/xml'
    })

    try:
        while True:
            try:
                print("\nFetching next batch of records...")
                response = session.get(base_url, params=params, timeout=30)
                response.raise_for_status()
                root = etree.fromstring(response.content)
            except (requests.RequestException, etree.ParseError) as e:
                fetch_failures += 1
                print(f"Error fetching/parsing batch: {str(e)}")
                # Retry the same request, but not forever: a permanently
                # failing endpoint would otherwise loop without end.
                if fetch_failures >= MAX_FETCH_RETRIES:
                    print(f"Giving up after {fetch_failures} consecutive failed fetches")
                    break
                sleep(5)
                continue
            fetch_failures = 0

            records = root.xpath('//oai:ListRecords/oai:record', namespaces=namespaces)

            if not records:
                print("No more records found")
                break

            batch_number += 1
            batch_size = len(records)
            total_records += batch_size
            batch_matches = 0
            batch_vehicle_matches = 0
            batch_downloads = 0
            batch_frames = 0

            # Running 1-based position of each record across all batches; used
            # as a unique fallback name when a record has no dc:identifier.
            first_record_number = total_records - batch_size + 1
            for record_number, record in enumerate(records, start=first_record_number):
                # Get year
                year = None
                for date_str in record.xpath('.//dc:date/text()', namespaces=namespaces):
                    year = get_year(date_str)
                    if year:
                        break

                if year and 1930 <= year <= 1949:
                    batch_matches += 1

                    # Check for vehicle terms
                    texts = []
                    for field in vehicle_check_fields:
                        texts.extend(record.xpath(f'.//{field}/text()', namespaces=namespaces))

                    if count_vehicle_terms(texts, term_counts):
                        batch_vehicle_matches += 1

                        # Try to download
                        video_page_url = get_video_url(record, namespaces)
                        if video_page_url:
                            print(f"\nFound matching video page: {video_page_url}")
                            identifiers = record.xpath('.//dc:identifier/text()', namespaces=namespaces)
                            identifier = identifiers[0] if identifiers else str(record_number)

                            video_path = download_video(video_page_url, identifier)
                            if video_path:
                                batch_downloads += 1

                                # Extract frames from the downloaded video
                                video_name = os.path.basename(video_path)
                                frames_saved = extract_frames(video_path, video_name)
                                batch_frames += frames_saved
                                frames_extracted += frames_saved

                                # Delete the video right away once enough frames were extracted
                                if frames_saved >= MIN_FRAMES_PER_VIDEO:
                                    try:
                                        os.remove(video_path)
                                        print(f"🗑️ Deleted video after extracting {frames_saved} frames")
                                    except Exception as e:
                                        print(f"Error deleting video: {str(e)}")

                                # Clean up if we have too many videos
                                cleanup_videos(DOWNLOAD_DIR)
                        else:
                            print("Could not determine video page URL for matching record")

            # Update counters
            matching_records += batch_matches
            vehicle_matches += batch_vehicle_matches
            downloaded_count += batch_downloads

            # Check auto-stop condition
            if vehicle_matches == last_vehicle_count:
                unchanged_batches += 1
                if unchanged_batches >= MAX_UNCHANGED_BATCHES:
                    print(f"\nAuto-stop: No new matches in {MAX_UNCHANGED_BATCHES} batches")
                    break
            else:
                unchanged_batches = 0
                last_vehicle_count = vehicle_matches

            # Progress report
            print(f"\nBatch {batch_number} complete:")
            print(f"- Records processed: {batch_size} (Total: {total_records})")
            print(f"- 1930-1949 matches: {batch_matches} (Total: {matching_records})")
            print(f"- Vehicle term matches: {batch_vehicle_matches} (Total: {vehicle_matches})")
            print(f"- Videos downloaded: {batch_downloads} (Total: {downloaded_count})")
            print(f"- Frames extracted: {batch_frames} (Total: {frames_extracted})")
            print(f"- Unchanged batches: {unchanged_batches}/{MAX_UNCHANGED_BATCHES}")

            # Get next batch
            token = root.xpath('//oai:resumptionToken/text()', namespaces=namespaces)
            if not token:
                print("No more batches available")
                break
            # A resumption request carries only the verb and the token.
            params = {'verb': 'ListRecords', 'resumptionToken': token[0]}
            sleep(2)  # Be polite with delay between requests

    except KeyboardInterrupt:
        print("\nProcess interrupted by user")
    except Exception as e:
        print(f"\nFatal error: {str(e)}")

    # Final report
    print("\n=== PIPELINE STATISTICS ===")
    print(f"Total records processed: {total_records}")
    # max(1, ...) guards the percentages against division by zero.
    print(f"Records from 1930-1949: {matching_records} ({matching_records/max(1,total_records)*100:.1f}%)")
    print(f"Records with vehicle terms: {vehicle_matches} ({vehicle_matches/max(1,matching_records)*100:.1f}%)")
    print(f"Videos downloaded: {downloaded_count}")
    print(f"Frames extracted: {frames_extracted}")

    print("\n=== TERM FREQUENCIES ===")
    for term, count in sorted(term_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"{term}: {count}")

    # Clean up any remaining videos
    print("\nCleaning up remaining video files...")
    cleanup_videos(DOWNLOAD_DIR, max_to_keep=0)

# Entry point: start the download and frame-extraction pipeline when this cell
# is executed (a notebook kernel executes cells with __name__ set to '__main__').
if __name__ == '__main__':
    main()


Starting Open Beelden video collection and frame extraction pipeline...
Configuration:
- Max videos to keep: 5
- Frame extraction settings: Step=2, EdgeMargin=30, MotionThreshold=0.1
- Output resolution: 640x640
- Minimum frames per video before deletion: 5


Fetching next batch of records...

Found matching video page: https://openbeelden.nl/media/1480471

Fetching video page: https://openbeelden.nl/media/1480471
Found video source: https://openbeelden.nl/files/14/80/1481683.1480475.HET_NEDERLAND-FHD00Z029MC.mp4
Downloading video...
Progress: 81134935 bytes
Successfully saved: collected_videos/PGM26730.mp4 (79233.3 KB)
📼 Video Info: PGM26730.mp4 - FPS=25.00, Frames=15897, Duration=635.88s
🔚 End of video or read error at frame 15897
✅ Extracted 695 frames from PGM26730.mp4
🗑️ Deleted video after extracting 695 frames

Found matching video page: https://openbeelden.nl/media/1480391

Fetching video page: https://openbeelden.nl/media/1480391
Found video source: https://openbe

In [None]:
import shutil
from google.colab import files

# Zip the output directory into 'extracted_frames.zip' (created relative to the
# current working directory).
# NOTE(review): the absolute path '/content/extracted_frames' only matches the
# relative FRAME_OUTPUT_DIR ('extracted_frames') used by the pipeline cell when
# the working directory is /content — confirm before running elsewhere.
shutil.make_archive('extracted_frames', 'zip', '/content/extracted_frames')

# Download the zip file
# files.download comes from google.colab, so this cell needs a Colab runtime.
files.download('extracted_frames.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>