<!-- TABS -->
# Apply a chunker for search

:::note
Note that applying a chunker is ***not*** mandatory for search.
If your data is already chunked (e.g. short text snippets or audio) or if you
are searching through something like images, which can't be chunked, then this
won't be necessary.
:::

In [None]:
# <tab: Text>
from pinnacledb import objectmodel

CHUNK_SIZE = 200

@objectmodel(flatten=True, model_update_kwargs={'document_embedded': False})
def chunker(text):
    """Split ``text`` into chunks of at most ``CHUNK_SIZE`` words.

    The text is tokenised on whitespace; each chunk is the corresponding
    run of words re-joined with single spaces. Returns a list of strings
    (empty when ``text`` contains no words).
    """
    words = text.split()
    pieces = []
    for start in range(0, len(words), CHUNK_SIZE):
        window = words[start:start + CHUNK_SIZE]
        pieces.append(' '.join(window))
    return pieces

In [None]:
# <tab: PDF>
!pip install PyPDF2
from pinnacledb import objectmodel

CHUNK_SIZE = 500

@objectmodel(flatten=True, model_update_kwargs={'document_embedded': False})
def chunker(pdf_file):
    """Extract the text of a PDF and split it into fixed-size character chunks.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts (the test cell of
            this notebook passes a file path).

    Returns:
        A list of strings, each at most ``CHUNK_SIZE`` characters long,
        covering the text of all pages joined by blank lines.
    """
    # The cell only pip-installs PyPDF2 and never imports it, so import it
    # here; otherwise `PyPDF2.PdfReader` below raises NameError.
    import PyPDF2

    reader = PyPDF2.PdfReader(pdf_file)
    print(f'Number of pages {len(reader.pages)}')

    # Pages are separated by a blank line before chunking, so a chunk may
    # span a page boundary.
    text = '\n\n'.join(page.extract_text() for page in reader.pages)
    return [text[i:i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]

In [None]:
# <testing: >
!curl -O 'https://arxiv.org/pdf/2303.08774.pdf?fbclid=IwAR2XS6JT2NLIP4MjFn9npot34FhddoqStNbLwIvWETf5ZGlCPsIbuYneo8s&mibextid=Zxz2cZ'
chunks = chunker('2303.08774.pdf')
len(chunks)

In [None]:
# <tab: Video>
!pip install opencv-python
import cv2
import tqdm
from PIL import Image
from pinnacledb.ext.pillow import pil_image
# `objectmodel` is the decorator applied below; the original import list only
# brought in `ObjectModel`, so running this cell on its own raised NameError.
from pinnacledb import ObjectModel, Schema, objectmodel


@objectmodel(
    flatten=True,
    model_update_kwargs={'document_embedded': False},
    output_schema=Schema(identifier='output-schema', fields={'image': pil_image}),
)
def chunker(video_file):
    """Sample every 10th frame of a video and return the frames as PIL images.

    Args:
        video_file: Passed directly to ``cv2.VideoCapture``.

    Returns:
        A list of dicts, one per sampled frame, with the keys

        * ``'image'``: the frame as a ``PIL.Image`` (channel order reversed
          from what OpenCV decoded, i.e. BGR -> RGB);
        * ``'current_timestamp'``: ``frame_count // fps``, or ``None`` when
          the capture reports an FPS of 0.
    """
    # Keep one frame out of every `sample_freq` frames.
    sample_freq = 10

    cap = cv2.VideoCapture(video_file)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_count = 0
    extracted_frames = []
    progress = tqdm.tqdm()

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # No more frames (or the capture could not be read).
                break

            if frame_count % sample_freq == 0:
                # Guard against a reported FPS of 0, which would otherwise
                # raise ZeroDivisionError.
                current_timestamp = frame_count // fps if fps else None
                extracted_frames.append({
                    'image': Image.fromarray(frame[:, :, ::-1]),  # Convert BGR to RGB
                    'current_timestamp': current_timestamp,
                })
            frame_count += 1
            progress.update(1)
    finally:
        # Always release the capture and close the progress bar, even if
        # decoding or image conversion fails part-way through.
        progress.close()
        cap.release()
        cv2.destroyAllWindows()

    return extracted_frames

In [None]:
# <tab: Audio>
from pinnacledb import objectmodel, Schema

CHUNK_SIZE = 10  # in seconds

@objectmodel(
    flatten=True,
    model_update_kwargs={'document_embedded': False},
    # NOTE(review): `datatype` is not defined in this cell; it must already
    # exist in the notebook namespace (presumably the audio datatype) — confirm.
    output_schema=Schema(identifier='output-schema', fields={'audio': datatype}),
)
def chunker(audio):
    """Split an audio signal into consecutive chunks of ``CHUNK_SIZE`` seconds.

    Args:
        audio: An indexable pair; ``audio[1]`` is the sliceable sample data
            and ``audio[0]`` is carried over unchanged into every chunk.
            NOTE(review): ``audio[0]`` is assumed to be the sample rate in
            samples per second — confirm against the data loader.

    Returns:
        A list of ``(audio[0], chunk)`` tuples, where each ``chunk`` is a
        slice of ``audio[1]``.
    """
    sample_rate, samples = audio[0], audio[1]

    # CHUNK_SIZE is declared in seconds, so convert it to a number of samples.
    # `max(1, ...)` keeps the range step valid for very small products.
    step = max(1, int(CHUNK_SIZE * sample_rate))

    # Iterate over the samples (audio[1]); the original iterated over
    # `len(audio)`, i.e. the length of the pair, and so produced at most a
    # single chunk.
    return [
        (sample_rate, samples[i:i + step])
        for i in range(0, len(samples), step)
    ]

Now we apply this chunker to the data by wrapping the chunker in a `Listener`:

In [None]:
from pinnacledb import Listener

# Wrap the chunker in a Listener.
# NOTE(review): `select` is not defined in this snippet — presumably the query
# built earlier in the notebook; confirm.
upstream_listener = Listener(
    model=chunker,
    select=select,
    # NOTE(review): presumably the document field handed to the chunker — confirm.
    key='x',
)

# NOTE(review): `db` is likewise expected to exist already in the notebook.
db.add(upstream_listener)
# Keep the listener's `outputs` value under the name `indexing_key`.
indexing_key = upstream_listener.outputs