# Logan Kinajil-Moran CSC259 Final Project

# Video Compression

In [206]:
import os

video_name = "akiyo_cif.y4m"
video_path = f"sample_videos/{video_name}"

# Get initial size (using Mac Byte Division)
file_size = os.path.getsize(video_path) / (1000 * 1000) 

print(f"Original Size of \"{video_name}\": {file_size:.2f}MB")

Original Size of "akiyo_cif.y4m": 45.62MB


## Getting Frames into an Array

In [207]:
import numpy as np

def get_vid_info(yuv_filename):
    frame_marker = b'FRAME\n' 

    with open(yuv_filename, "rb") as file:
        content = file.read()

    # Convert the content to a string
    content_str = content.decode("utf-8", errors="ignore")

    # Find the end of the main header
    header_end = content_str.find("\n") + 1  # Include the newline character

    width = int(content_str.split()[1].strip("W"))
    height = int(content_str.split()[2].strip("H"))
    y_plane_size = width * height
    uv_plane_size = width * height
    frame_size = y_plane_size + 2 * uv_plane_size

    # Read YUV frames
    frames = []
    with open(yuv_filename, "rb") as yuv_file:
        # Skip the header
        header = yuv_file.read(int(header_end))
        while True:
            marker = yuv_file.read(len(frame_marker))
            frame_data = yuv_file.read(frame_size)
            if len(frame_data) < frame_size:
                break

            frames.append(frame_data)

    return frames, header, width, height

frames, header, width, height = get_vid_info("sample_videos/akiyo_cif.y4m")

print("Y4M Video Header:", header)


Y4M Video Header: b'YUV4MPEG2 W352 H288 F30000:1001 Ip A128:117\n'


# Methods for handling video and frames

In [208]:
import os

def load_frame(frame, width=width, height=height):
    y_size = width * height
    uv_size = (width) * (height)
    
    y = np.frombuffer(frame[0:y_size], dtype=np.uint8).reshape((height, width))
    u = np.frombuffer(frame[y_size:y_size + uv_size], dtype=np.uint8).reshape((height, width))
    v = np.frombuffer(frame[y_size + uv_size:], dtype=np.uint8).reshape((height, width))
    
    return y, u, v

def frames_into_bytes(frames):
    byte_frames = []
    for frame in frames:
        y, u, v = frame
        byte_frames.append(y.astype(np.uint8).tobytes() +
                            u.astype(np.uint8).tobytes() +
                            v.astype(np.uint8).tobytes())
    return byte_frames


def get_output_video(frames, output_name, width=width, height=height, framerate="30000:1001", chroma="C420"):
    frame_marker = b'FRAME\n'
    output_path = f"output_videos/{output_name}.y4m"
    header = f"YUV4MPEG2 W{width} H{height} F{framerate} p A128:117\n".encode('utf-8')

    with open(f"output_videos/{output_name}.y4m", "wb") as output_video:
        output_video.write(header)
        for frame in frames:
            output_video.write(frame_marker)
            output_video.write(frame)

    print(f"Size of {output_name}.y4m: {os.path.getsize(output_path) / (1000 * 1000):.2f}")


# Chroma Subsampling

In [209]:
import numpy as np

def chroma_subsample_420(plane):
    return plane[::2, ::2] 

def chroma_upsample_420(plane):
    upsampled = np.zeros((height, width), dtype=plane.dtype)
    
    for i in range(height // 2):
        for j in range(width // 2):
            y_idx = i * 2
            x_idx = j * 2
            upsampled[y_idx:y_idx+2, x_idx:x_idx+2] = plane[i,j]
    
    return upsampled

def apply_chroma_subsampling(frames, chroma_subsampling_type):
    subsampled_frames = []

    for frame in frames:
        y, u, v = load_frame(frame)
        u_sub = chroma_subsample_420(u)
        v_sub = chroma_subsample_420(v)
        subsampled_frames.append(y.astype(np.uint8).tobytes() +
                                    u_sub.astype(np.uint8).tobytes() +
                                    v_sub.astype(np.uint8).tobytes())
    
    return subsampled_frames

# Reducing the Frame Rate
Note: This must be manually reduced in the video output function to match new frames

In [217]:
def reduce_frame_rate(frames, rate):
    return frames[::rate]

def two_thirds_frame_rate(frames):
    # Retain 2/3 of the frames
    total_frames = len(frames)
    keep_indices = [i for i in range(total_frames) if i % 3 != 2]
    return [frames[i] for i in keep_indices]


# Inter Frame Encoding

In [211]:
import numpy as np

BLOCK_SIZE = 16
SEARCH_RANGE = 8

# Motion compensation and estimation
def motion_estimation(frame1, frame2, block_size=BLOCK_SIZE, search_range=SEARCH_RANGE):
    motion_vectors = np.zeros((frame1.shape[0] // block_size, frame1.shape[1] // block_size, 2), dtype=int)
    height, width = frame1.shape

    for i in range(0, height, block_size):
        for j in range(0, width, block_size):
            current_block = frame1[i:i+block_size, j:j+block_size]
            best_match = (0, 0)
            min_sad = float('inf')

            for dx in range(-search_range, search_range + 1):
                for dy in range(-search_range, search_range + 1):
                    ref_x, ref_y = i + dx, j + dy
                    if ref_x < 0 or ref_y < 0 or ref_x + block_size > height or ref_y + block_size > width:
                        continue
                    ref_block = frame2[ref_x:ref_x+block_size, ref_y:ref_y+block_size]
                    sad = np.sum(np.abs(current_block - ref_block))
                    if sad < min_sad:
                        min_sad = sad
                        best_match = (dx, dy)

            motion_vectors[i // block_size, j // block_size] = best_match
    return motion_vectors

def motion_compensation(frame, motion_vectors, block_size=BLOCK_SIZE):
    predicted_frame = np.zeros_like(frame)

    height, width = frame.shape
    for i in range(0, height, block_size):
        for j in range(0, width, block_size):
            dx, dy = motion_vectors[i // block_size, j // block_size]
            ref_x, ref_y = i + dx, j + dy
            predicted_frame[i:i+block_size, j:j+block_size] = frame[ref_x:ref_x+block_size, ref_y:ref_y+block_size]

    return predicted_frame

# Residual Calculations
def calculate_residual(actual_frame, predicted_frame):
    return actual_frame - predicted_frame

def reconstruct_frame(predicted_frame, residual):
    reconstructed_frame = predicted_frame + residual
    return np.clip(reconstructed_frame, 0, 255).astype(np.uint8)

# DCT and Quantization

In [212]:
from scipy.fftpack import dct, idct

DCT_BLOCK_SIZE = 8

# JPEG standard quantization matrix
QUANTIZATION_MATRIX = np.array([
    [16, 11, 10, 16, 24, 40, 51, 61],
    [12, 12, 14, 19, 26, 58, 60, 55],
    [14, 13, 16, 24, 40, 57, 69, 56],
    [14, 17, 22, 29, 51, 87, 80, 62],
    [18, 22, 37, 56, 68, 109, 103, 77],
    [24, 35, 55, 64, 81, 104, 113, 92],
    [49, 64, 78, 87, 103, 121, 120, 101],
    [72, 92, 95, 98, 112, 100, 103, 99]
])

# converts frame to frequency domain
def apply_dct(block):
    return dct(dct(block.T, norm='ortho').T, norm='ortho')

# converts frequency domain to spatial domain
def apply_idct(dct_block):
    return idct(idct(dct_block.T, norm='ortho').T, norm='ortho')

def quantize_dct(dct_block, quantization_matrix):
    return np.round(dct_block / quantization_matrix).astype(np.int16)

def dequantize_dct(quantized_block, quantization_matrix):
    return (quantized_block * quantization_matrix).astype(np.int16)

def run_length_encode(block):
    flat_block = np.array(block.flatten(), dtype=int)
    encoded = []
    prev_value = flat_block[0]
    count = 1

    for value in flat_block[1:]:
        if value == prev_value:
            count += 1
        else:
            encoded.append((int(prev_value), count))
            prev_value = value
            count = 1
    encoded.append((int(prev_value), count))
    return encoded

def run_length_decode(encoded_block):
    decoded_block = []
    for value, run_length in encoded_block:
        decoded_block.extend([value] * run_length)
    return np.array(decoded_block, dtype=int)

# Performing DCT and Quantization to entire frames
def encode_frame(frame):
    residual_dct_quantized = np.zeros_like(frame, dtype=float)
    encoded_blocks = []

    for i in range(0, height, DCT_BLOCK_SIZE):
        for j in range(0, width, DCT_BLOCK_SIZE):
            block = frame[i:i+DCT_BLOCK_SIZE, j:j+DCT_BLOCK_SIZE]
            dct_block = apply_dct(block)
            quantized_block = quantize_dct(dct_block, QUANTIZATION_MATRIX)
            residual_dct_quantized[i:i+DCT_BLOCK_SIZE, j:j+DCT_BLOCK_SIZE] = quantized_block.astype(int)
            encoded_blocks.append(run_length_encode(quantized_block))

    return encoded_blocks

def decode_frame(encoded_blocks, height, width):
    reconstructed_frame = np.zeros((height, width), dtype=float)
    block_idx = 0

    for i in range(0, height, DCT_BLOCK_SIZE):
        for j in range(0, width, DCT_BLOCK_SIZE):
            decoded_block_flat = run_length_decode(encoded_blocks[block_idx])
            decoded_block = decoded_block_flat.reshape(DCT_BLOCK_SIZE, DCT_BLOCK_SIZE)
            dequantize = dequantize_dct(decoded_block, QUANTIZATION_MATRIX)
            idct = apply_idct(dequantize)
            reconstructed_frame[i:i+DCT_BLOCK_SIZE, j:j+DCT_BLOCK_SIZE] = idct
            block_idx += 1

    return reconstructed_frame.astype(int)

# Main Video Compression Pipeline

In [218]:
from tqdm import tqdm

def pipeline(frames, width, height):
    # reduce frame rate (if desired)
    # frames_to_process = reduce_frame_rate(frames, 1) 

    frames_to_process = two_thirds_frame_rate(frames)

    # load keyframes
    key_frame_y, key_frame_u, key_frame_v = load_frame(frames_to_process[0], width, height)
    stored_to_write = [(key_frame_y, key_frame_u, key_frame_v)]
    compressed_frames = [(key_frame_y, key_frame_u, key_frame_v)]  
    num_to_process = len(frames_to_process)
    
    for i in tqdm(range(1, num_to_process)):
        current_frame_y, current_frame_u, current_frame_v = load_frame(frames_to_process[i], width, height)
        prev_y, prev_u, prev_v = load_frame(frames_to_process[i-1], width, height)
        
        motion_vectors = motion_estimation(prev_y, current_frame_y)

        predicted_frame_y = motion_compensation(prev_y, motion_vectors)
        residual_y = calculate_residual(current_frame_y, predicted_frame_y)

        encoded_blocks_y = encode_frame(residual_y)

        subsampled_u = chroma_subsample_420(current_frame_u)
        subsampled_v = chroma_subsample_420(current_frame_v)

        # At this point, we can store compressed information
        stored_to_write.append((motion_vectors, subsampled_u, subsampled_v, encoded_blocks_y))

        reconstructed_frame_u = chroma_upsample_420(subsampled_u)
        reconstructed_frame_v = chroma_upsample_420(subsampled_v)

        reconstructed_residual_y = decode_frame(encoded_blocks_y, height, width)
        reconstructed_frame_y = reconstruct_frame(predicted_frame_y, reconstructed_residual_y)

        reconstructed_frame_y = np.clip(reconstructed_frame_y, 0, 255)
        reconstructed_frame_u = np.clip(reconstructed_frame_u, 0, 255)
        reconstructed_frame_v = np.clip(reconstructed_frame_v, 0, 255)

        reconstructed_frame = (
            reconstructed_frame_y,
            reconstructed_frame_u,
            reconstructed_frame_v
        )

        compressed_frames.append(reconstructed_frame)
    
    return compressed_frames, stored_to_write

frames_to_process, store = pipeline(frames, width, height)

100%|██████████| 99/99 [04:01<00:00,  2.44s/it]


# Writing and Reading Stored Compressed Information

In [None]:
import struct

def write_compressed_file(store, filename):
    with open(filename, "wb") as f:
        for keyframe in store[0]:
            for row in keyframe:
                for value in row:
                    f.write(struct.pack("B", value))
        
        # Encoded frames
        for frame in range(1, len(store)):
            encoded_blocks = store[frame][3]
            motion_vectors = store[frame][0]
            
            u_frame = store[frame][1]
            v_frame = store[frame][2]

            # Write motion vectors
            for row in motion_vectors:
                for mv in row:
                    f.write(struct.pack("bb", mv[0], mv[1]))

            # Write U and V frames
            for row in u_frame:
                for value in row:
                    f.write(struct.pack("B", value))

            for row in v_frame:
                for value in row:
                    f.write(struct.pack("B", value)) 
            
            # Write encoded blocks
            for block in encoded_blocks:
                # print("block_len", len(block))
                f.write(struct.pack("B", len(block)))
                for combo in block:
                    f.write(struct.pack("bB", combo[0], combo[1]))

def read_compressed_file(filename, height, width, block_size):
    keyframes = []
    store = []
    motion_vector_rows = height // block_size
    motion_vector_cols = width // block_size
    
    with open(filename, "rb") as f:
        for _ in range(3):  # We want 3 keyframes
            keyframe = []
            for _ in range(height): 
                row = []
                for _ in range(width): 
                    value = struct.unpack("B", f.read(1))[0]
                    row.append(value)
                keyframe.append(row)
            keyframes.append(keyframe)
        
        while True:
            try:
                encoded_blocks = []
                motion_vectors = np.zeros((motion_vector_rows, motion_vector_cols, 2), dtype=np.int64)
                u_frame = np.zeros((height // 2, width // 2), dtype=np.int64)
                v_frame = np.zeros((height // 2, width // 2), dtype=np.int64)

                # Read motion vectors
                for i in range(motion_vector_rows):
                    for j in range(motion_vector_cols):
                        mv = struct.unpack("bb", f.read(2))
                        motion_vectors[i, j, 0], motion_vectors[i, j, 1] = mv

                # Read U frame
                for i in range(height // 2): 
                    for j in range(width // 2):
                        value = struct.unpack("B", f.read(1))[0]
                        u_frame[i, j] = value

                # Read V frame
                for i in range(height // 2): 
                    for j in range(width // 2): 
                        value = struct.unpack("B", f.read(1))[0]
                        v_frame[i, j] = value

                # Read encoded blocks
                for _ in range(height * width // block_size):
                    block = []
                    rle_size = struct.unpack("B", f.read(1))[0]
                    # print("rle_size", rle_size)
                    for _ in range(rle_size):
                        length, value = struct.unpack("bB", f.read(2))  # Read run-length (1 byte) and value (1 byte)
                        block.append((length, value))
                    encoded_blocks.append(block)

                store.append((motion_vectors, u_frame, v_frame, encoded_blocks))
                
            except struct.error:
                break

    return keyframes, store

def store_to_frames(keyframes, store):
    # converts compressed frames from read file into uncompressed frames for video viewing
    frames = []
    keyframe_y, keyframe_u, keyframe_v = keyframes
    frames.append((np.array(keyframe_y), np.array(keyframe_u), np.array(keyframe_v)))

    for i, _ in enumerate(store):
        motion_vectors, u_frame, v_frame, encoded_blocks = store[i]
        prev_y, prev_u, prev_v = frames[i]
        predicted_frame_y = motion_compensation(prev_y, motion_vectors)

        residual_y = decode_frame(encoded_blocks, height, width)
        reconstructed_frame_y = reconstruct_frame(predicted_frame_y, residual_y)

        reconstructed_frame_u = chroma_upsample_420(u_frame)
        reconstructed_frame_v = chroma_upsample_420(v_frame)

        reconstructed_frame_y = np.clip(reconstructed_frame_y, 0, 255)
        reconstructed_frame_u = np.clip(reconstructed_frame_u, 0, 255)
        reconstructed_frame_v = np.clip(reconstructed_frame_v, 0, 255)
        
        reconstructed_frame = (
            reconstructed_frame_y,
            reconstructed_frame_u,
            reconstructed_frame_v
        )

        frames.append(reconstructed_frame)

    return frames

write_compressed_file(store, "test.compressed")