# Autoencoder Video Webcam

In [4]:
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D
from tensorflow.keras.models import Model
import tkinter as tk
from PIL import Image, ImageTk
import threading
import queue
import time

# Define the autoencoder model with input size matching the original video resolution
def build_autoencoder(input_shape):
    input_img = Input(shape=input_shape)  # Input shape will match the video resolution (e.g., 480x640x3 for 640x480 images)

    # Encoder
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(input_img)
    x = MaxPooling2D((2, 2), padding='same')(x)
    x = Conv2D(16, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2), padding='same')(x)
    x = Conv2D(16, (3, 3), activation='relu', padding='same')(x)
    encoded = MaxPooling2D((2, 2), padding='same')(x)

    # Decoder
    x = Conv2D(16, (3, 3), activation='relu', padding='same')(encoded)
    x = UpSampling2D((2, 2))(x)
    x = Conv2D(16, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)
    decoded = Conv2D(3, (3, 3), activation='sigmoid', padding='same')(x)  # Output shape matches input

    autoencoder = Model(input_img, decoded)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')

    return autoencoder

# Initialize video capture and retrieve the frame size
cap = cv2.VideoCapture(0)

# Debugging: Check if webcam is opened properly
if not cap.isOpened():
    print("Error: Could not open webcam.")
    exit()

ret, frame = cap.read()
if not ret:
    print("Error: Could not read from the webcam.")
    exit()

# Get the frame dimensions
frame_height, frame_width = frame.shape[:2]

# Get framerate
framerate = cap.get(cv2.CAP_PROP_FPS)

# Initialize the autoencoder with the original frame size
autoencoder = build_autoencoder((frame_height, frame_width, 3))

# Preprocess the frame for the autoencoder
def preprocess_frame(frame):
    normalized = frame.astype('float32') / 255.0  # Normalize pixel values
    return np.expand_dims(normalized, axis=0)  # Add batch dimension

# Postprocess the autoencoded frame for display
def postprocess_frame(frame):
    frame = np.squeeze(frame, axis=0)  # Remove batch dimension
    frame = (frame * 255).astype('uint8')  # Rescale pixel values
    return frame  # Return the frame without resizing

# Frame buffer
frame_queue = queue.Queue(maxsize=100)

# Create Tkinter window before any Tkinter variables or widgets
window = tk.Tk()
window.title("Live RGB Autoencoder Training")

# Now you can define Tkinter variables
training_loss = tk.DoubleVar(value=0.0)  # Variable to hold the training loss
resolution_var = tk.StringVar()  # To display the resolution
framerate_var = tk.StringVar()  # To display the framerate
scale_factor = tk.DoubleVar(value=0.25)  # Variable to hold the scaling factor

# Set initial values for resolution and framerate
resolution_var.set(f"Resolution: {frame_width}x{frame_height}")
framerate_var.set(f"Framerate: {framerate} FPS")

# Labels for the images
original_label = tk.Label(window)
original_label.grid(row=0, column=0, padx=10, pady=10)
decoded_label = tk.Label(window)
decoded_label.grid(row=0, column=1, padx=10, pady=10)

# Label for displaying the training loss
loss_label = tk.Label(window, textvariable=training_loss, font=("Helvetica", 12))
loss_label.grid(row=1, column=0, columnspan=2, pady=10)

# Label for displaying resolution
resolution_label = tk.Label(window, textvariable=resolution_var, font=("Helvetica", 12))
resolution_label.grid(row=2, column=0, columnspan=2, pady=5)

# Label for displaying framerate
framerate_label = tk.Label(window, textvariable=framerate_var, font=("Helvetica", 12))
framerate_label.grid(row=3, column=0, columnspan=2, pady=5)

# Dropdown for selecting the scaling factor
scaling_options = [0.25, 0.5, 0.75, 1.0]
scale_menu = tk.OptionMenu(window, scale_factor, *scaling_options)
scale_menu.grid(row=4, column=0, columnspan=2, pady=5)

# Training thread
def training_thread(autoencoder, frame_queue, stop_event, training_loss):
    while not stop_event.is_set():
        if not frame_queue.empty():
            batch = []
            while not frame_queue.empty() and len(batch) < 32:  # Batch size of 32
                frame = frame_queue.get()
                batch.append(frame)
            
            if batch:
                batch = np.vstack(batch)  # Combine frames into a batch
                loss = autoencoder.train_on_batch(batch, batch)
                training_loss.set(loss)  # Update the training loss in the GUI
                print(f"Training loss: {loss:.6f}")
        else:
            time.sleep(0.01)

def update():
    ret, frame = cap.read()

    # Debugging: Check if frames are being captured
    if not ret:
        print("Error: Could not read frame from webcam.")
        window.after(10, update)  # Try again after 10ms
        return

    # Flip the frame horizontally for a mirror effect
    frame = cv2.flip(frame, 1)

    # Preprocess the frame for display in Tkinter (original)
    original_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB

    # Preprocess the frame for the autoencoder
    preprocessed_frame = preprocess_frame(frame)

    # Add the frame to the frame queue for training
    if not frame_queue.full():
        frame_queue.put(preprocessed_frame)

    # Predict the autoencoded frame
    autoencoded_frame = autoencoder.predict(preprocessed_frame, verbose=0)

    # Postprocess the autoencoded frame for display (keep 1:1 resolution)
    decoded_image = postprocess_frame(autoencoded_frame)

    # Resize the images according to the scaling factor
    scale = scale_factor.get()
    display_width = int(frame_width * scale)
    display_height = int(frame_height * scale)

    # Resize both original and decoded images
    original_image_resized = cv2.resize(original_image, (display_width, display_height))
    decoded_image_resized = cv2.resize(decoded_image, (display_width, display_height))

    # Convert images to ImageTk for display in Tkinter
    original_image_pil = Image.fromarray(original_image_resized)
    decoded_image_pil = Image.fromarray(decoded_image_resized)
    original_image_tk = ImageTk.PhotoImage(image=original_image_pil)
    decoded_image_tk = ImageTk.PhotoImage(image=decoded_image_pil)

    # Update the labels with the new images
    original_label.config(image=original_image_tk)
    original_label.image = original_image_tk
    decoded_label.config(image=decoded_image_tk)
    decoded_label.image = decoded_image_tk

    window.after(10, update)  # Update the frame every 10ms

# Event to signal the training thread to stop
stop_event = threading.Event()

# Start the training thread
trainer = threading.Thread(target=training_thread, args=(autoencoder, frame_queue, stop_event, training_loss))
trainer.start()

# Start the update loop
update()

# Define a function to handle window closure
def on_closing():
    print("Closing application...")
    stop_event.set()  # Signal the training thread to stop
    trainer.join()    # Wait for the training thread to finish
    cap.release()
    cv2.destroyAllWindows()
    window.destroy()

# Add a close button
close_button = tk.Button(window, text="Close", command=on_closing)
close_button.grid(row=5, column=0, columnspan=2, pady=10)

# Bind the window close event
window.protocol("WM_DELETE_WINDOW", on_closing)

# Start the Tkinter event loop
window.mainloop()


Error: Could not open webcam.
Error: Could not read from the webcam.


[ WARN:0@52.992] global cap_v4l.cpp:999 open VIDEOIO(V4L2:/dev/video0): can't open camera by index
[ERROR:0@52.992] global obsensor_uvc_stream_channel.cpp:158 getStreamChannelGroup Camera index out of range


AttributeError: 'NoneType' object has no attribute 'shape'

: 

# Transformer Video Encoded/Decoder

In [3]:
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Add, Embedding, Reshape
from tensorflow.keras.models import Model
import tkinter as tk
from PIL import Image, ImageTk
import threading
import queue
import time
import os

# Check for GPU availability
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"GPUs Available: {len(gpus)}")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
else:
    print("No GPUs available.")

# Define the Transformer-based autoencoder model
def build_autoencoder(image_size=128):
    patch_size = 16   # Size of each patch
    num_patches = (image_size // patch_size) ** 2
    projection_dim = 64
    num_heads = 4
    transformer_units = [projection_dim * 2, projection_dim]
    transformer_layers = 4

    class Patches(tf.keras.layers.Layer):
        def __init__(self, patch_size):
            super(Patches, self).__init__()
            self.patch_size = patch_size

        def call(self, images):
            batch_size = tf.shape(images)[0]
            patches = tf.image.extract_patches(
                images=images,
                sizes=[1, self.patch_size, self.patch_size, 1],
                strides=[1, self.patch_size, self.patch_size, 1],
                rates=[1, 1, 1, 1],
                padding='VALID',
            )
            patch_dims = patches.shape[-1]
            patches = tf.reshape(patches, [batch_size, -1, patch_dims])
            return patches

    class PatchEncoder(tf.keras.layers.Layer):
        def __init__(self, num_patches, projection_dim):
            super(PatchEncoder, self).__init__()
            self.num_patches = num_patches
            self.projection = Dense(units=projection_dim)
            self.position_embedding = Embedding(
                input_dim=num_patches, output_dim=projection_dim
            )

        def call(self, patches):
            positions = tf.range(start=0, limit=self.num_patches, delta=1)
            encoded = self.projection(patches) + self.position_embedding(positions)
            return encoded

    input_shape = (image_size, image_size, 3)
    inputs = Input(shape=input_shape)

    patches = Patches(patch_size)(inputs)
    encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)

    for _ in range(transformer_layers):
        x1 = LayerNormalization(epsilon=1e-6)(encoded_patches)
        attention_output = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(x1, x1)
        x2 = Add()([attention_output, encoded_patches])
        x3 = LayerNormalization(epsilon=1e-6)(x2)
        x3 = Dense(transformer_units[0], activation='relu')(x3)
        x3 = Dense(transformer_units[1], activation='relu')(x3)
        encoded_patches = Add()([x3, x2])

    latent_space = encoded_patches
    x = Dense(patch_size * patch_size * 3)(latent_space)
    x = Reshape((num_patches, patch_size, patch_size, 3))(x)

    def combine_patches(patches, image_size, patch_size):
        batch_size = tf.shape(patches)[0]
        n_patches = image_size // patch_size
        patches = tf.reshape(patches, (batch_size, n_patches, n_patches, patch_size, patch_size, 3))
        patches = tf.transpose(patches, perm=[0, 1, 3, 2, 4, 5])
        patches = tf.reshape(patches, (batch_size, image_size, image_size, 3))
        return patches

    reconstructed = tf.keras.layers.Lambda(
        lambda x: combine_patches(x, image_size, patch_size))(x)

    autoencoder = Model(inputs=inputs, outputs=reconstructed)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')
    encoder = Model(inputs=inputs, outputs=latent_space)

    return autoencoder, encoder

autoencoder, encoder = build_autoencoder()

def preprocess_frame(frame, image_size):
    resized = cv2.resize(frame, (image_size, image_size))
    normalized = resized.astype('float32') / 255.0
    return np.expand_dims(normalized, axis=0)

def postprocess_frame(frame, image_size):
    frame = np.squeeze(frame, axis=0)
    frame = (frame * 255).astype('uint8')
    return frame

frame_queue = queue.Queue(maxsize=100)

def training_thread(autoencoder, frame_queue, stop_event):
    while not stop_event.is_set():
        if not frame_queue.empty():
            batch = []
            while not frame_queue.empty() and len(batch) < 8:
                frame = frame_queue.get()
                batch.append(frame)
            if batch:
                batch = np.vstack(batch)
                loss = autoencoder.train_on_batch(batch, batch)
                print(f"Training loss: {loss:.6f}")
        else:
            time.sleep(0.01)
    print("Training thread exiting...")

window = tk.Tk()
window.title("Live RGB Autoencoder Training with Transformer")

def on_closing():
    print("Closing application...")
    stop_event.set()
    cap.release()
    window.destroy()

# Get the current working directory and check the video file exists
current_dir = os.getcwd()
video_path = os.path.join(current_dir, 'OurPlanet.mp4')  # Adjust to your file

# Check if the video file exists
if not os.path.exists(video_path):
    print(f"Video file not found: {video_path}")
else:
    print(f"Opening video: {video_path}")

cap = cv2.VideoCapture(video_path)

# Check if video capture is opened successfully
if not cap.isOpened():
    print("Error: Cannot open video file")

original_label = tk.Label(window)
original_label.grid(row=0, column=0, padx=10, pady=10)
decoded_label = tk.Label(window)
decoded_label.grid(row=0, column=1, padx=10, pady=10)

close_button = tk.Button(window, text="Close", command=on_closing)
close_button.grid(row=4, column=0, columnspan=2, pady=10)

bandwidth_label = tk.Label(window, text="Calculating bandwidth...")
bandwidth_label.grid(row=3, column=0, columnspan=2, pady=10)

display_sizes = [(320, 240), (640, 480), (800, 600)]
selected_display_size = tk.StringVar()
selected_display_size.set(f"{display_sizes[1][0]}x{display_sizes[1][1]}")

def update_display_size(*args):
    size_str = selected_display_size.get()
    width, height = map(int, size_str.split('x'))
    print(f"Selected display size: {width}x{height}")

size_options = [f"{w}x{h}" for w, h in display_sizes]
size_menu = tk.OptionMenu(window, selected_display_size, *size_options, command=update_display_size)
size_menu.grid(row=1, column=0, columnspan=2, pady=10)

image_sizes = [64, 128, 256, 512]
selected_image_size = tk.IntVar()
selected_image_size.set(image_sizes[1])

def update_image_size(*args):
    global trainer
    size = selected_image_size.get()
    print(f"Selected image size: {size}")
    stop_event.set()
    trainer.join()
    global autoencoder, encoder
    autoencoder, encoder = build_autoencoder(image_size=size)
    with frame_queue.mutex:
        frame_queue.queue.clear()
    stop_event.clear()
    trainer = threading.Thread(target=training_thread, args=(autoencoder, frame_queue, stop_event), daemon=True)
    trainer.start()

image_size_menu = tk.OptionMenu(window, selected_image_size, *image_sizes, command=update_image_size)
image_size_menu.grid(row=2, column=0, columnspan=2, pady=10)

def update():
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame or end of video. Restarting video.")
        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)  # Restart the video
        ret, frame = cap.read()

    # Preprocessing and flipping the frame
    frame = cv2.flip(frame, 1)
    size_str = selected_display_size.get()
    display_width, display_height = map(int, size_str.split('x'))
    image_size = selected_image_size.get()
    
    # Resize and convert the frame for original image display
    original_image = cv2.resize(frame, (display_width, display_height))
    original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)
    
    # Preprocess the frame for the autoencoder
    preprocessed_frame = preprocess_frame(frame, image_size)

    # Add preprocessed frame to the training queue if space is available
    if not frame_queue.full():
        frame_queue.put(preprocessed_frame)

    # Generate the autoencoded frame using the model
    autoencoded_frame = autoencoder.predict(preprocessed_frame, verbose=0)

    # Get the latent representation size and compression ratio
    latent_representation = encoder.predict(preprocessed_frame, verbose=0)
    latent_size_bits = latent_representation.size * 32  # 32 bits per float
    input_size_bits = preprocessed_frame.size * 32
    compression_ratio = input_size_bits / latent_size_bits if latent_size_bits != 0 else 0

    # Update the bandwidth and compression ratio label
    bandwidth_label.config(text=f"Latent Space Size: {latent_size_bits / 1024:.2f} Kb\n"
                                f"Compression Ratio: {compression_ratio:.2f}")

    # Postprocess the autoencoded frame for display
    decoded_image = postprocess_frame(autoencoded_frame, image_size)
    decoded_image = cv2.resize(decoded_image, (display_width, display_height))
    
    # Convert to PIL format for Tkinter display
    original_image_pil = Image.fromarray(original_image)
    decoded_image_pil = Image.fromarray(decoded_image)
    
    # Convert the PIL images to ImageTk format
    original_image_tk = ImageTk.PhotoImage(image=original_image_pil)
    decoded_image_tk = ImageTk.PhotoImage(image=decoded_image_pil)

    # Display the original and decoded images in the Tkinter labels
    original_label.config(image=original_image_tk)
    original_label.image = original_image_tk  # Keep reference to avoid garbage collection
    decoded_label.config(image=decoded_image_tk)
    decoded_label.image = decoded_image_tk  # Keep reference to avoid garbage collection

    # Schedule the next update
    window.after(10, update)

stop_event = threading.Event()
trainer = threading.Thread(target=training_thread, args=(autoencoder, frame_queue, stop_event), daemon=True)
trainer.start()
update()
window.protocol("WM_DELETE_WINDOW", on_closing)
window.mainloop()


GPUs Available: 1


RuntimeError: Physical devices cannot be modified after being initialized

# Semantic Communication Transformer Encoder

In [None]:
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Conv2D, UpSampling2D, Dense, Flatten, Reshape,
    LeakyReLU, LayerNormalization, MultiHeadAttention, Dropout,
    TimeDistributed
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tkinter as tk
from PIL import Image, ImageTk
import threading
import queue
import time
import os
import sys

# Ensure that any deprecation warnings or errors are visible
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

# Constants
CHUNK_SIZE = 8  # Number of frames per chunk
FRAME_CHANNELS = 3

# Default Frame Size
DEFAULT_FRAME_SIZE = (128, 128)

# Preprocess and postprocess functions
def preprocess_frame(frame, frame_height, frame_width):
    resized = cv2.resize(frame, (frame_width, frame_height))  # Resize to selected frame size
    normalized = resized.astype('float32') / 255.0  # Normalize pixel values
    return normalized  # Shape: (frame_height, frame_width, 3)

def postprocess_frame(frame):
    frame = (frame * 255).astype('uint8')  # Rescale pixel values
    return cv2.resize(frame, (640, 480))  # Resize for display

# Frame buffers
frame_queue = queue.Queue(maxsize=100)  # For training frames
prediction_queue = queue.Queue(maxsize=10)  # For frames to predict
display_queue = queue.Queue(maxsize=10)  # For frames to display

# Losses dictionary
losses_lock = threading.Lock()
losses_dict = {'total_loss': 0}

class Autoencoder:
    def __init__(self, frame_height, frame_width):
        self.frame_height = frame_height
        self.frame_width = frame_width
        self.encoder = self.build_encoder()
        self.decoder = self.build_decoder()
        self.optimizer = Adam(learning_rate=0.0002, beta_1=0.5)
        self.autoencoder = self.build_autoencoder()
        self.autoencoder.compile(optimizer=self.optimizer, loss='mse')

    def build_encoder(self):
        input_img = Input(shape=(CHUNK_SIZE, self.frame_height, self.frame_width, FRAME_CHANNELS))
        x = TimeDistributed(Conv2D(32, (3,3), strides=(2,2), padding='same', activation='relu'))(input_img)
        x = TimeDistributed(Conv2D(64, (3,3), strides=(2,2), padding='same', activation='relu'))(x)
        x = TimeDistributed(Conv2D(128, (3,3), strides=(2,2), padding='same', activation='relu'))(x)
        x = TimeDistributed(Flatten())(x)
        # Flatten the temporal dimension
        x = Flatten()(x)
        # Latent representation
        latent = Dense(256, activation='relu')(x)
        encoder_model = Model(input_img, latent, name='encoder')
        return encoder_model

    def build_decoder(self):
        latent_input = Input(shape=(256,))
        x = Dense(CHUNK_SIZE * (self.frame_height // 8) * (self.frame_width // 8) * 128, activation='relu')(latent_input)
        x = Reshape((CHUNK_SIZE, self.frame_height // 8, self.frame_width // 8, 128))(x)
        x = TimeDistributed(UpSampling2D((2,2)))(x)  # 16x16
        x = TimeDistributed(Conv2D(64, (3,3), padding='same', activation='relu'))(x)
        x = TimeDistributed(UpSampling2D((2,2)))(x)  # 32x32
        x = TimeDistributed(Conv2D(32, (3,3), padding='same', activation='relu'))(x)
        x = TimeDistributed(UpSampling2D((2,2)))(x)  # 64x64
        reconstructed_output = TimeDistributed(Conv2D(FRAME_CHANNELS, (3,3), padding='same', activation='sigmoid'))(x)
        decoder_model = Model(latent_input, reconstructed_output, name='decoder')
        return decoder_model

    def build_autoencoder(self):
        input_frames = Input(shape=(CHUNK_SIZE, self.frame_height, self.frame_width, FRAME_CHANNELS))
        encoded = self.encoder(input_frames)
        decoded = self.decoder(encoded)
        autoencoder_model = Model(input_frames, decoded, name='autoencoder')
        return autoencoder_model

    def train_step(self, batch_chunks):
        loss = self.autoencoder.train_on_batch(batch_chunks, batch_chunks)
        return loss

    def predict(self, frames):
        reconstructed = self.autoencoder.predict(frames)
        return reconstructed

# Initialize default frame size
FRAME_HEIGHT, FRAME_WIDTH = DEFAULT_FRAME_SIZE

# Initialize the models
autoencoder = Autoencoder(FRAME_HEIGHT, FRAME_WIDTH)

def training_thread(autoencoder, frame_queue, stop_event):
    while not stop_event.is_set():
        if frame_queue.qsize() >= 1:
            batch_chunks = []
            while len(batch_chunks) < 1:
                chunk = frame_queue.get()
                batch_chunks.append(chunk)
            batch_chunks = np.stack(batch_chunks)  # Shape: (batch_size, CHUNK_SIZE, H, W, C)

            # Train the autoencoder
            loss = autoencoder.train_step(batch_chunks)

            # Store the loss in global variables for display
            with losses_lock:
                losses_dict['total_loss'] = loss

        else:
            # If queue is empty, wait a bit
            time.sleep(0.01)

def prediction_thread(autoencoder, prediction_queue, display_queue, stop_event):
    while not stop_event.is_set():
        try:
            chunk = prediction_queue.get(timeout=0.1)  # Shape: (CHUNK_SIZE, H, W, C)
            chunk = np.expand_dims(chunk, axis=0)  # Shape: (1, CHUNK_SIZE, H, W, C)
            # Predict the reconstructed frames
            reconstructed_images = autoencoder.predict(chunk)
            reconstructed_images = reconstructed_images[0]  # Shape: (CHUNK_SIZE, H, W, C)

            # Postprocess frames for display
            original_frames = chunk[0]  # Shape: (CHUNK_SIZE, H, W, C)
            reconstructed_frames = reconstructed_images  # Shape: (CHUNK_SIZE, H, W, C)

            # Display only the first frame in the chunk
            original_frame = original_frames[0]
            reconstructed_frame = reconstructed_frames[0]

            original_frame_disp = postprocess_frame(original_frame)
            original_frame_disp = cv2.cvtColor(original_frame_disp, cv2.COLOR_BGR2RGB)

            reconstructed_frame_disp = postprocess_frame(reconstructed_frame)

            # Compute bandwidth (optional: adjust as needed)
            # Original frame size in bits
            original_frame_size_bits = FRAME_HEIGHT * FRAME_WIDTH * FRAME_CHANNELS * 8 * CHUNK_SIZE
            # Latent representation size in bits
            latent_size_bits = 256 * 32  # Assuming float32
            # Compression ratio
            compression_ratio = original_frame_size_bits / latent_size_bits if latent_size_bits != 0 else 0

            # Semantic size in KB
            semantic_size_kb = latent_size_bits / 8 / 1024  # bits to KB

            # Put the frames and bandwidth info in the display queue
            display_queue.put((original_frame_disp, reconstructed_frame_disp, compression_ratio, semantic_size_kb))

        except queue.Empty:
            continue

# Tkinter window setup
window = tk.Tk()
window.title("Live Semantic Communication with Autoencoder")

# Get the current working directory and check if the video file exists
current_dir = os.getcwd()
video_path = os.path.join(current_dir, 'OurPlanet.mp4')  # Adjust to your file

if not os.path.exists(video_path):
    print(f"Error: Video file not found at {video_path}")
    sys.exit()
else:
    print(f"Opening video: {video_path}")

# Capture video from the file
cap = cv2.VideoCapture(video_path)

# Check if video capture is opened successfully
if not cap.isOpened():
    print("Error: Cannot open video file")
    sys.exit()

# Labels for the images
original_label = tk.Label(window)
original_label.grid(row=0, column=0, padx=10, pady=10)
decoded_label = tk.Label(window)
decoded_label.grid(row=0, column=1, padx=10, pady=10)

# Label for losses
loss_label = tk.Label(window, text="Losses:")
loss_label.grid(row=1, column=0, padx=10, pady=10, columnspan=2)

# Label for bandwidth
bandwidth_label = tk.Label(window, text="Bandwidth Info:")
bandwidth_label.grid(row=2, column=0, padx=10, pady=10, columnspan=2)

# Add Close button
def on_closing():
    global running
    print("Closing application...")
    running = False
    stop_event.set()  # Signal the threads to stop
    trainer.join(timeout=2)    # Wait for the training thread to finish
    predictor.join(timeout=2)  # Wait for the prediction thread to finish
    cap.release()
    window.destroy()

close_button = tk.Button(window, text="Close", command=on_closing)
close_button.grid(row=5, column=0, columnspan=2, pady=10)

running = True  # Flag to control the update loop

# Buffers for accumulating frames into chunks
frame_buffer = []

# Frame sizes options
FRAME_SIZES = [(64,64), (128,128), (256,256)]
selected_frame_size = tk.StringVar()
selected_frame_size.set(f"{FRAME_HEIGHT}x{FRAME_WIDTH}")

def update_frame_size(*args):
    global FRAME_HEIGHT, FRAME_WIDTH, autoencoder
    size_str = selected_frame_size.get()
    FRAME_HEIGHT, FRAME_WIDTH = map(int, size_str.split('x'))
    print(f"Selected frame size: {FRAME_HEIGHT}x{FRAME_WIDTH}")

    # Reinitialize autoencoder with new frame size
    stop_event.set()
    trainer.join(timeout=2)
    predictor.join(timeout=2)
    with frame_queue.mutex:
        frame_queue.queue.clear()
    with prediction_queue.mutex:
        prediction_queue.queue.clear()
    with display_queue.mutex:
        display_queue.queue.clear()
    global autoencoder
    autoencoder = Autoencoder(FRAME_HEIGHT, FRAME_WIDTH)
    stop_event.clear()
    start_threads()

size_options = [f"{w}x{h}" for w, h in FRAME_SIZES]
size_menu = tk.OptionMenu(window, selected_frame_size, *size_options, command=update_frame_size)
size_menu.grid(row=3, column=0, columnspan=2, pady=10)

def start_threads():
    global trainer, predictor
    trainer = threading.Thread(target=training_thread, args=(autoencoder, frame_queue, stop_event))
    trainer.start()
    predictor = threading.Thread(target=prediction_thread, args=(autoencoder, prediction_queue, display_queue, stop_event))
    predictor.start()

def update_gui():
    global frame_buffer
    if not running:
        return
    ret, frame = cap.read()
    if not ret:
        # If video ends or frame can't be grabbed, restart the video
        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
        ret, frame = cap.read()

    if ret:
        # Flip the frame horizontally for a mirror effect
        frame = cv2.flip(frame, 1)

        # Preprocess the frame
        preprocessed_frame = preprocess_frame(frame, FRAME_HEIGHT, FRAME_WIDTH)
        frame_buffer.append(preprocessed_frame)

        # When we have enough frames for a chunk
        if len(frame_buffer) == CHUNK_SIZE:
            chunk = np.stack(frame_buffer)  # Shape: (CHUNK_SIZE, H, W, C)

            # Add to the frame queue for training
            if not frame_queue.full():
                frame_queue.put(chunk)

            # Add to the prediction queue
            if not prediction_queue.full():
                prediction_queue.put(chunk)

            frame_buffer = []  # Reset buffer

    # Update the images if available
    try:
        original_image, decoded_image, compression_ratio, semantic_size_kb = display_queue.get_nowait()

        # Convert images to ImageTk for display in Tkinter
        original_image_pil = Image.fromarray(original_image)
        decoded_image_pil = Image.fromarray(decoded_image)
        original_image_tk = ImageTk.PhotoImage(image=original_image_pil)
        decoded_image_tk = ImageTk.PhotoImage(image=decoded_image_pil)

        # Update the labels with the new images
        original_label.config(image=original_image_tk)
        original_label.image = original_image_tk
        decoded_label.config(image=decoded_image_tk)
        decoded_label.image = decoded_image_tk

        # Update the loss label
        with losses_lock:
            loss_text = f"Total Loss: {losses_dict['total_loss']:.6f}"
        loss_label.config(text=loss_text)

        # Update the bandwidth label
        bandwidth_text = f"Compression Ratio: {compression_ratio:.2f}\nSemantic Size: {semantic_size_kb:.2f} KB"
        bandwidth_label.config(text=bandwidth_text)

    except queue.Empty:
        pass

    window.after(10, update_gui)  # Update the frame every 10ms

# Event to signal the threads to stop
stop_event = threading.Event()

# Start the threads
start_threads()

# Bind the window close event
window.protocol("WM_DELETE_WINDOW", on_closing)

# Start the update loop
update_gui()  # Start the update loop
window.mainloop()


2024-10-09 15:05:10.304326: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-09 15:05:10.416918: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-09 15:05:10.464193: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-09 15:05:10.477576: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-09 15:05:10.556193: I tensorflow/core/platform/cpu_feature_guar

Opening video: /mnt/c/Users/keplarV4/Downloads/Github/Graduate School/Semantic-Communication-Robot/OurPlanet.mp4


I0000 00:00:1728500715.144031  163778 service.cc:146] XLA service 0x7fd7d8005680 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1728500715.144059  163778 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 Ti, Compute Capability 8.9
2024-10-09 15:05:15.394820: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-10-09 15:05:15.660753: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


I0000 00:00:1728500717.502069  163778 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16

# Transformer Semantic Communication Testing

In [1]:
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Conv2D, Conv2DTranspose, Flatten, Dense,
    LeakyReLU, Add, PReLU, UpSampling2D, BatchNormalization, Reshape
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tkinter as tk
from PIL import Image, ImageTk
import time
import sys

# Constants
CHUNK_SIZE = 8  # Reduced batch size to avoid OOM
FRAME_CHANNELS = 3
DEFAULT_FRAME_SIZE = (128, 128)  # Reduced frame size
LATENT_DIM = 128  # Size of the latent space
RECONSTRUCTION_ITERATIONS = 100
RECONSTRUCTION_LEARNING_RATE = 0.01

# Preprocess and postprocess functions
def preprocess_frame(frame, frame_height, frame_width):
    resized = cv2.resize(frame, (frame_width, frame_height))  
    normalized = resized.astype('float32') / 255.0  
    return normalized  

def postprocess_frame(frame):
    frame = np.clip(frame, 0, 1)
    frame = (frame * 255).astype('uint8')  
    return cv2.resize(frame, (640, 480))  

# Residual Block for Generator
def residual_block(input_layer, filters=64):
    x = Conv2D(filters, (3, 3), padding='same')(input_layer)
    x = PReLU(shared_axes=[1, 2])(x)
    x = Conv2D(filters, (3, 3), padding='same')(x)
    
    # Adjust the number of channels in the shortcut to match 'filters'
    if input_layer.shape[-1] != filters:
        shortcut = Conv2D(filters, (1, 1), padding='same')(input_layer)
    else:
        shortcut = input_layer
    
    x = Add()([x, shortcut])  # Skip connection
    return x

# Generator: Single Latent Vector Input
def build_generator(latent_dim, frame_height, frame_width):
    input_latent = Input(shape=(latent_dim,), name='gen_latent_input')  # Single latent vector

    # Project and reshape
    x = Dense(256 * (frame_height // 16) * (frame_width // 16), activation='relu')(input_latent)
    x = Reshape((frame_height // 16, frame_width // 16, 256))(x)

    # Upsampling blocks
    for filters in [256, 128, 64, 32]:
        x = Conv2DTranspose(filters, (4, 4), strides=2, padding='same', activation='relu')(x)
        x = BatchNormalization()(x)

    # Final output layer
    generated_output = Conv2D(FRAME_CHANNELS, (3, 3), padding='same', activation='sigmoid', name='gen_output')(x)

    return Model(input_latent, generated_output, name='generator')

# Discriminator for GAN: Classifies whether an image is real or fake
def build_discriminator(frame_height, frame_width):
    input_img = Input(shape=(frame_height, frame_width, FRAME_CHANNELS), name='disc_input')
    
    x = Conv2D(64, (3, 3), strides=(2, 2), padding='same', name='disc_conv1')(input_img)  # (64,64,64)
    x = LeakyReLU(alpha=0.2, name='disc_lrelu1')(x)
    
    x = Conv2D(128, (3, 3), strides=(2, 2), padding='same', name='disc_conv2')(x)  # (32,32,128)
    x = LeakyReLU(alpha=0.2, name='disc_lrelu2')(x)
    
    x = Conv2D(256, (3, 3), strides=(2, 2), padding='same', name='disc_conv3')(x)  # (16,16,256)
    x = LeakyReLU(alpha=0.2, name='disc_lrelu3')(x)
    
    x = Flatten(name='disc_flatten')(x)  # (16*16*256,)
    x = Dense(1, activation='sigmoid', name='disc_output')(x)  # Outputs whether the image is real or fake
    
    return Model(input_img, x, name='discriminator')

# GAN Model (Generator + Discriminator)
class GANModel:
    def __init__(self, frame_height, frame_width, latent_dim=128):
        self.frame_height = frame_height
        self.frame_width = frame_width
        self.latent_dim = latent_dim
        self.generator = build_generator(latent_dim, frame_height, frame_width)
        self.discriminator = build_discriminator(frame_height, frame_width)
        
        # Compile Discriminator
        self.discriminator.compile(
            optimizer=Adam(learning_rate=0.0001, beta_1=0.1),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )
        
        # Make Discriminator non-trainable when training Generator
        self.discriminator.trainable = False
        
        # GAN Input
        gen_input = Input(shape=(latent_dim,), name='gan_latent_input')
        gen_output = self.generator(gen_input)
        gan_output = self.discriminator(gen_output)
        
        # Compile GAN
        self.gan_model = Model(gen_input, gan_output, name='GAN')
        self.gan_model.compile(
            optimizer=Adam(learning_rate=0.0001, beta_1=0.1),
            loss='binary_crossentropy'
        )
        
        # Reset Discriminator to trainable
        self.discriminator.trainable = True
        
        # Optimizers
        self.optimizer_g = Adam(learning_rate=0.0001, beta_1=0.1)
        self.optimizer_d = Adam(learning_rate=0.0001, beta_1=0.1)
        self.bce_loss = tf.keras.losses.BinaryCrossentropy()
    
    def train_step(self, real_frames):
        batch_size = real_frames.shape[0]
        
        # Labels
        real_labels = np.ones((batch_size, 1)) * 0.9  # Label smoothing for real
        fake_labels = np.zeros((batch_size, 1))       # Fake labels

        # ---------------------
        #  Train Discriminator
        # ---------------------
        with tf.GradientTape() as tape:
            # Generate fake frames
            random_latent_vectors = tf.random.normal(shape=(batch_size, self.latent_dim))
            fake_frames = self.generator(random_latent_vectors, training=True)
            
            # Discriminator predictions
            real_predictions = self.discriminator(real_frames, training=True)
            fake_predictions = self.discriminator(fake_frames, training=True)
            
            # Compute loss
            real_loss = self.bce_loss(real_labels, real_predictions)
            fake_loss = self.bce_loss(fake_labels, fake_predictions)
            d_loss = real_loss + fake_loss

        # Compute gradients and update Discriminator
        d_gradients = tape.gradient(d_loss, self.discriminator.trainable_variables)
        self.optimizer_d.apply_gradients(zip(d_gradients, self.discriminator.trainable_variables))
        
        # ---------------------
        #  Train Generator
        # ---------------------
        with tf.GradientTape() as tape:
            # Generate fake frames
            fake_frames = self.generator(random_latent_vectors, training=True)
            # Discriminator predictions on fake frames
            fake_predictions = self.discriminator(fake_frames, training=True)
            # Generator tries to fool the discriminator
            g_loss = self.bce_loss(real_labels, fake_predictions)

        # Compute gradients and update Generator
        g_gradients = tape.gradient(g_loss, self.generator.trainable_variables)
        self.optimizer_g.apply_gradients(zip(g_gradients, self.generator.trainable_variables))
        
        return d_loss.numpy(), g_loss.numpy()
    
    def summarize_models(self):
        print("Discriminator Summary:")
        self.discriminator.summary()
        print("\nGenerator Summary:")
        self.generator.summary()
        print("\nGAN Model Summary:")
        self.gan_model.summary()

# GAN Model (apprentice)
class GANModel:
    def __init__(self, frame_height, frame_width, latent_dim=128):
        self.frame_height = frame_height
        self.frame_width = frame_width
        self.latent_dim = latent_dim
        self.generator = build_generator(latent_dim, frame_height, frame_width)
        self.discriminator = build_discriminator(frame_height, frame_width)
        
        # Compile Discriminator
        self.discriminator.compile(
            optimizer=Adam(learning_rate=0.0001, beta_1=0.1),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )
        
        # Make Discriminator non-trainable when training Generator
        self.discriminator.trainable = False
        
        # GAN Input
        gen_input = Input(shape=(latent_dim,), name='gan_latent_input')
        gen_output = self.generator(gen_input)
        gan_output = self.discriminator(gen_output)
        
        # Compile GAN
        self.gan_model = Model(gen_input, gan_output, name='GAN')
        self.gan_model.compile(
            optimizer=Adam(learning_rate=0.0001, beta_1=0.1),
            loss='binary_crossentropy'
        )
        
        # Reset Discriminator to trainable
        self.discriminator.trainable = True
        
        # Optimizers
        self.optimizer_g = Adam(learning_rate=0.0001, beta_1=0.1)
        self.optimizer_d = Adam(learning_rate=0.0001, beta_1=0.1)
        self.bce_loss = tf.keras.losses.BinaryCrossentropy()
    
    def train_step(self, real_frames):
        batch_size = real_frames.shape[0]
        
        # Labels
        real_labels = np.ones((batch_size, 1)) * 0.9  # Label smoothing for real
        fake_labels = np.zeros((batch_size, 1))       # Fake labels

        # ---------------------
        #  Train Discriminator
        # ---------------------
        with tf.GradientTape() as tape:
            # Generate fake frames
            random_latent_vectors = tf.random.normal(shape=(batch_size, self.latent_dim))
            fake_frames = self.generator(random_latent_vectors, training=True)
            
            # Discriminator predictions
            real_predictions = self.discriminator(real_frames, training=True)
            fake_predictions = self.discriminator(fake_frames, training=True)
            
            # Compute loss
            real_loss = self.bce_loss(real_labels, real_predictions)
            fake_loss = self.bce_loss(fake_labels, fake_predictions)
            d_loss = real_loss + fake_loss

        # Compute gradients and update Discriminator
        d_gradients = tape.gradient(d_loss, self.discriminator.trainable_variables)
        self.optimizer_d.apply_gradients(zip(d_gradients, self.discriminator.trainable_variables))
        
        # ---------------------
        #  Train Generator
        # ---------------------
        with tf.GradientTape() as tape:
            # Generate fake frames
            fake_frames = self.generator(random_latent_vectors, training=True)
            # Discriminator predictions on fake frames
            fake_predictions = self.discriminator(fake_frames, training=True)
            # Generator tries to fool the discriminator
            g_loss = self.bce_loss(real_labels, fake_predictions)

        # Compute gradients and update Generator
        g_gradients = tape.gradient(g_loss, self.generator.trainable_variables)
        self.optimizer_g.apply_gradients(zip(g_gradients, self.generator.trainable_variables))
        
        return d_loss.numpy(), g_loss.numpy()
    
    def summarize_models(self):
        print("Discriminator Summary:")
        self.discriminator.summary()
        print("\nGenerator Summary:")
        self.generator.summary()
        print("\nGAN Model Summary:")
        self.gan_model.summary()

# Reconstruction Function (CNN Inversion)
def reconstruct_image_iCNN(target_image, generator, latent_dim, iterations=500, learning_rate=0.1):
    latent_vector = tf.Variable(tf.random.normal((1, latent_dim)), trainable=True)
    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)
    loss_history = []
    
    # Preprocess target image
    target_image_processed = preprocess_frame(target_image, FRAME_HEIGHT, FRAME_WIDTH)
    target_image_processed = np.expand_dims(target_image_processed, axis=0)  # Add batch dimension
    
    for i in range(iterations):
        with tf.GradientTape() as tape:
            generated_image = generator(latent_vector, training=True)
            loss = tf.reduce_mean(tf.square(target_image_processed - generated_image))
        
        grads = tape.gradient(loss, [latent_vector])
        optimizer.apply_gradients(zip(grads, [latent_vector]))
        loss_history.append(loss.numpy())
        
        if i % 50 == 0:
            print(f"Reconstruction Iteration {i}, Loss: {loss.numpy()}")
    
    final_image = generator(latent_vector, training=False).numpy()[0]
    final_image = postprocess_frame(final_image)
    return final_image, loss_history

# Initialize frame dimensions
FRAME_HEIGHT, FRAME_WIDTH = DEFAULT_FRAME_SIZE

# Initialize GAN models
gan = GANModel(FRAME_HEIGHT, FRAME_WIDTH, latent_dim=LATENT_DIM)

# Tkinter window setup
window = tk.Tk()
window.title("Teacher-Apprentice GAN Frame Reconstruction")

# Labels for displaying original and reconstructed frames
original_label = tk.Label(window, text="Original Frame")
original_label.grid(row=0, column=0, padx=10, pady=10)
original_image_label = tk.Label(window)
original_image_label.grid(row=1, column=0, padx=10, pady=10)

gan_label = tk.Label(window, text="GAN Reconstructed")
gan_label.grid(row=0, column=1, padx=10, pady=10)
gan_image_label = tk.Label(window)
gan_image_label.grid(row=1, column=1, padx=10, pady=10)

diff_label = tk.Label(window, text="Diffusion Reconstructed")
diff_label.grid(row=0, column=2, padx=10, pady=10)
diff_image_label = tk.Label(window)
diff_image_label.grid(row=1, column=2, padx=10, pady=10)

cnn_label = tk.Label(window, text="CNN Inversion Reconstructed")
cnn_label.grid(row=0, column=3, padx=10, pady=10)
cnn_image_label = tk.Label(window)
cnn_image_label.grid(row=1, column=3, padx=10, pady=10)

# Add Close button
def on_closing():
    global running
    print("Closing application...")
    running = False
    cap.release()
    window.destroy()

close_button = tk.Button(window, text="Close", command=on_closing)
close_button.grid(row=5, column=0, columnspan=4, pady=10)

running = True  # Flag to control the update loop

# Buffers for accumulating frames into chunks
frame_buffer = []

# Capture video
video_path = 'OurPlanet.mp4'  # Replace with your video file
cap = cv2.VideoCapture(video_path)

# Ensure video file is opened successfully
if not cap.isOpened():
    print(f"Error: Cannot open video file {video_path}")
    sys.exit()
else:
    print(f"Video file {video_path} opened successfully")

# Frame processing function
def update():
    global frame_buffer
    if not running:
        return
    try:
        ret, frame = cap.read()
        if not ret:
            print("End of video or cannot read the frame, restarting...")
            cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
            ret, frame = cap.read()

        if ret:
            # Flip and preprocess the frame
            frame = cv2.flip(frame, 1)
            preprocessed_frame = preprocess_frame(frame, FRAME_HEIGHT, FRAME_WIDTH)
            frame_buffer.append(preprocessed_frame)

            # Display the original frame
            original_frame_disp = cv2.cvtColor(cv2.resize(frame, (640, 480)), cv2.COLOR_BGR2RGB)
            original_image_pil = Image.fromarray(original_frame_disp)
            original_image_tk = ImageTk.PhotoImage(image=original_image_pil)
            original_image_label.config(image=original_image_tk)
            original_image_label.image = original_image_tk  # Keep reference

            # Proceed if buffer is full
            if len(frame_buffer) == CHUNK_SIZE:
                chunk = np.stack(frame_buffer)
                frame_buffer = []

                # Train GAN
                d_loss, g_loss = gan.train_step(chunk)
                print(f"Discriminator Loss: {d_loss}, Generator Loss: {g_loss}")

                # Generate GAN reconstructed frame
                random_latent = np.random.normal(0, 1, (1, LATENT_DIM))
                reconstructed_output = gan.generator.predict(random_latent)
                gan_reconstructed_disp = postprocess_frame(reconstructed_output[0])
                gan_image_pil = Image.fromarray(gan_reconstructed_disp)
                gan_image_tk = ImageTk.PhotoImage(image=gan_image_pil)
                gan_image_label.config(image=gan_image_tk)
                gan_image_label.image = gan_image_tk  # Keep reference

                # Diffusion Model Reconstruction (Gaussian Blur)
                diffusion_reconstructed = cv2.GaussianBlur(gan_reconstructed_disp, (15, 15), 0)
                diffusion_reconstructed_disp = cv2.cvtColor(diffusion_reconstructed, cv2.COLOR_BGR2RGB)
                diff_image_pil = Image.fromarray(diffusion_reconstructed_disp)
                diff_image_tk = ImageTk.PhotoImage(image=diff_image_pil)
                diff_image_label.config(image=diff_image_tk)
                diff_image_label.image = diff_image_tk  # Keep reference

                # CNN Inversion Reconstruction
                cnn_reconstructed, loss_history = reconstruct_image_iCNN(
                    frame, gan.generator, LATENT_DIM, iterations=RECONSTRUCTION_ITERATIONS, 
                    learning_rate=RECONSTRUCTION_LEARNING_RATE
                )
                cnn_reconstructed_disp = cv2.cvtColor(cnn_reconstructed, cv2.COLOR_BGR2RGB)
                cnn_image_pil = Image.fromarray(cnn_reconstructed_disp)
                cnn_image_tk = ImageTk.PhotoImage(image=cnn_image_pil)
                cnn_image_label.config(image=cnn_image_tk)
                cnn_image_label.image = cnn_image_tk  # Keep reference

    except Exception as e:
        print(f"Error during update: {e}")

    finally:
        # Schedule the next update
        window.after(10, update)

# Start the update loop
update()

# Start the Tkinter event loop
window.mainloop()


2024-10-08 21:01:51.602073: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-08 21:01:51.609792: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-08 21:01:51.618640: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-08 21:01:51.621274: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-08 21:01:51.628335: I tensorflow/core/platform/cpu_feature_guar

Video file TopGun.mp4 opened successfully


2024-10-08 21:01:53.634950: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8902
W0000 00:00:1728435713.678812  816673 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1728435713.691920  816673 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1728435713.692321  816673 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1728435713.692666  816673 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1728435713.693011  816673 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1728435713.693368  816673 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1728435713.693730  816673 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1728435713.694131  816673 gpu_t

Discriminator Loss: 1.3891417980194092, Generator Loss: 0.7797466516494751


I0000 00:00:1728435715.478115  816777 service.cc:146] XLA service 0x7f0b74004220 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1728435715.478142  816777 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 Ti, Compute Capability 8.9
2024-10-08 21:01:55.483339: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 647ms/step


I0000 00:00:1728435716.052981  816777 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1728435716.074374  816673 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1728435716.074978  816673 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1728435716.075520  816673 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1728435716.076069  816673 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1728435716.076513  816673 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1728435716.077180  816673 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1728435716.077656  816673 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W000

Reconstruction Iteration 0, Loss: 0.17006440460681915
Reconstruction Iteration 50, Loss: 0.16550908982753754
Discriminator Loss: 1.3209625482559204, Generator Loss: 0.8743060231208801
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Reconstruction Iteration 0, Loss: 0.24998271465301514
Reconstruction Iteration 50, Loss: 0.24652697145938873
Discriminator Loss: 1.3600549697875977, Generator Loss: 0.8643596172332764
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Reconstruction Iteration 0, Loss: 0.1681423783302307
Reconstruction Iteration 50, Loss: 0.16024257242679596
Discriminator Loss: 1.3373438119888306, Generator Loss: 0.844652533531189
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Reconstruction Iteration 0, Loss: 0.15814833343029022
Reconstruction Iteration 50, Loss: 0.15202195942401886
Discriminator Loss: 1.2952708005905151, Generator Loss: 0.8925613164901733
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

In [7]:
import cv2
import numpy as np
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.layers import Input, Conv2D
from tensorflow.keras.models import Model

# Teacher Class
class Teacher:
    def __init__(self):
        self.model = None  # Teacher doesn't directly hold a model anymore
    
    def teach(self, apprentice, features):
        # Teacher guiding the apprentice (Apprentice has the model)
        apprentice.train(features)

# Apprentice Class
class Apprentice:
    def __init__(self):
        # Apprentice initializes both the VGG16 model and the autoencoder
        self.vgg_model = self.load_vgg_model()
        self.autoencoder = self.create_autoencoder()

    def load_vgg_model(self):
        # Load VGG16 model and use a higher-resolution layer for feature extraction
        vgg_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
        vgg_model = Model(inputs=vgg_model.input, outputs=vgg_model.get_layer('block3_conv3').output)
        return vgg_model

    def create_autoencoder(self):
        # Define a deeper autoencoder with a higher capacity for reconstruction
        input_img = Input(shape=(56, 56, 256))  # Ensure the input shape matches the feature map size from VGG16

        # Encoder part
        x = Conv2D(128, (3, 3), activation='relu', padding='same')(input_img)
        x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
        encoded = Conv2D(32, (3, 3), activation='relu', padding='same')(x)

        # Decoder part (expand layers to learn richer features)
        x = Conv2D(64, (3, 3), activation='relu', padding='same')(encoded)
        x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
        # Final layer still has 256 channels for matching the input during training
        decoded = Conv2D(256, (3, 3), activation='linear', padding='same')(x)

        autoencoder = Model(input_img, decoded)
        autoencoder.compile(optimizer='adam', loss='mean_squared_error')  # Using MSE for better pixel-level reconstruction
        
        return autoencoder

    def extract_features(self, frame):
        # Function to extract features from frames using the modified VGG16 model
        frame = cv2.resize(frame, (224, 224))  # Ensure frame is resized to 224x224, the input size for VGG16
        frame = np.expand_dims(frame, axis=0)  # Add batch dimension
        frame = preprocess_input(frame)  # Preprocess input for VGG16
        features = self.vgg_model.predict(frame)  # Extract features from VGG16
        return features

    def train(self, features):
        # Apprentice trains the autoencoder
        self.autoencoder.train_on_batch(features, features)

    def try_reconstruct(self, features):
        # Apprentice reconstructs the frame using the autoencoder
        return self.autoencoder.predict(features)

# Function to visualize reconstructed frames in RGB
def visualize_reconstruction(reconstructed_features):
    # Reduce the channels from 256 to 3 for visualization
    reconstructed_frame = reconstructed_features[0, :, :, :3]  # Take the first 3 channels for RGB display

    # Resize the reconstructed frame to 224x224 to match the original input size
    reconstructed_frame = cv2.resize(reconstructed_frame, (224, 224))

    # Ensure reconstructed frame is in the range [0, 255] for display
    reconstructed_frame = np.clip(reconstructed_frame, 0, 1)  # Clip values between 0 and 1
    reconstructed_frame = (reconstructed_frame * 255).astype(np.uint8)  # Rescale to 255 for display

    return reconstructed_frame

# Global variable for checking the window close button
close_window = False

# Function to handle mouse clicks (for closing the window)
def click_close(event, x, y, flags, param):
    global close_window
    if event == cv2.EVENT_LBUTTONDOWN:
        close_window = True

# Cancel button in the GUI window
def add_cancel_button_below(combined_frame):
    # Create a new blank space below the frame for the button
    button_frame = np.zeros((50, combined_frame.shape[1], 3), dtype=np.uint8)
    
    # Add the cancel button text onto the blank space
    button_position = (combined_frame.shape[1] // 2 - 50, 30)
    cv2.rectangle(button_frame, (button_position[0] - 10, 10), (button_position[0] + 90, 40), (0, 0, 255), -1)
    cv2.putText(button_frame, "Close", button_position, cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)
    
    # Concatenate the button frame to the original frame (below it)
    combined_frame_with_button = np.vstack((combined_frame, button_frame))
    
    return combined_frame_with_button

# Initialize video capture from the video file
video_path = 'OurPlanet.mp4'  # Path to your video
cap = cv2.VideoCapture(video_path)

# Window setup with mouse callback for closing
cv2.namedWindow("Raw vs Reconstructed", cv2.WINDOW_NORMAL)
cv2.setMouseCallback("Raw vs Reconstructed", click_close)

# Real-time processing loop
batch_size = 5
frames_buffer = []
teacher = Teacher()
apprentice = Apprentice()

while True:  # Loop video indefinitely
    ret, frame = cap.read()
    
    if not ret:
        # Loop back to the beginning of the video if end is reached
        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
        continue

    # Collect frames in a buffer to train the autoencoder in batches
    frames_buffer.append(frame)

    if len(frames_buffer) == batch_size:
        # Extract features from the batch of frames using the Apprentice's VGG16 model
        features_batch = np.vstack([apprentice.extract_features(f) for f in frames_buffer])

        # Normalize feature batch to range [0, 1]
        features_batch = features_batch / 255.0

        # Teacher teaches Apprentice with the extracted feature batch
        teacher.teach(apprentice, features_batch)

        # Apprentice attempts to reconstruct the last frame from the batch
        reconstructed_features = apprentice.try_reconstruct(features_batch[-1:])

        # Visualize the reconstructed frame (reduce to RGB for display)
        reconstructed_frame = visualize_reconstruction(reconstructed_features)

        # Display the original and reconstructed frame side by side
        combined_frame = np.hstack((cv2.resize(frames_buffer[-1], (224, 224)), reconstructed_frame))

        # Add cancel button below the video feed
        combined_frame_with_button = add_cancel_button_below(combined_frame)
        
        cv2.imshow('Raw vs Reconstructed', combined_frame_with_button)

        # Reset the buffer
        frames_buffer = []

    # Check if the user pressed 'q' or clicked the window close button
    if cv2.waitKey(1) & 0xFF == ord('q') or close_window:
        break

# Cleanup and close
cap.release()
cv2.destroyAllWindows()


2024-10-09 10:28:58.850513: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-09 10:28:58.984453: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-09 10:28:59.050330: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-09 10:28:59.066284: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-09 10:28:59.166453: I tensorflow/core/platform/cpu_feature_guar

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step


I0000 00:00:1728484144.920754   38073 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 367ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

: 

In [3]:
import tkinter as tk
import cv2  # OpenCV for video handling
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import time

# Teacher Class (for feature extraction and encoding)
class Teacher:
    def __init__(self, video_path):
        self.video_path = video_path
        self.video_capture = cv2.VideoCapture(video_path)

        # Get original video properties (resolution and frame rate)
        self.original_width = int(self.video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        self.original_height = int(self.video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        self.frame_rate = self.video_capture.get(cv2.CAP_PROP_FPS)

        # Build the encoder model
        self.encoder_model = self.build_encoder()

    def build_encoder(self):
        """ Builds the encoder part of a convolutional autoencoder. """
        input_img = layers.Input(shape=(self.original_height, self.original_width, 1))  # Input size based on video

        # Encoder
        x = layers.Conv2D(32, (3, 3), activation='relu', padding='same')(input_img)
        x = layers.MaxPooling2D((2, 2), padding='same')(x)
        x = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(x)
        encoded = layers.MaxPooling2D((2, 2), padding='same')(x)

        # Encoder model
        encoder_model = models.Model(input_img, encoded)
        return encoder_model

    def frame_feature_extraction(self):
        """ Extracts and processes frames from the video. """
        success, frame = self.video_capture.read()
        if success:
            gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            gray_frame = gray_frame.astype('float32') / 255.0  # Normalize pixel values
            gray_frame = np.reshape(gray_frame, (self.original_height, self.original_width, 1))  # Reshape for model
            return frame, gray_frame
        else:
            # Loop the video by resetting to the first frame
            self.video_capture.set(cv2.CAP_PROP_POS_FRAMES, 0)
            return self.frame_feature_extraction()

    def encoder(self, gray_frame):
        """ Uses the encoder model to extract latent features. """
        latent_features = self.encoder_model.predict(np.array([gray_frame]))
        return latent_features

# Apprentice Class (for decoding the latent features)
class Apprentice:
    def __init__(self, original_height, original_width):
        self.decoder_model = self.build_decoder(original_height, original_width)

    def build_decoder(self, height, width):
        """ Builds the decoder part of a convolutional autoencoder. """
        encoded_input = layers.Input(shape=(height // 4, width // 4, 64))  # Input shape should match the encoder's output

        # Decoder
        x = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(encoded_input)
        x = layers.UpSampling2D((2, 2))(x)
        x = layers.Conv2D(32, (3, 3), activation='relu', padding='same')(x)
        x = layers.UpSampling2D((2, 2))(x)
        decoded = layers.Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

        # Decoder model
        decoder_model = models.Model(encoded_input, decoded)
        return decoder_model

    def decoder(self, latent_features):
        """ Decodes the latent features to reconstruct the frame. """
        decoded_frame = self.decoder_model.predict(latent_features)
        return decoded_frame[0]

# GUI Class for handling the custom interface
class GUI:
    def __init__(self, master):
        self.master = master
        self.master.title("Teacher-Apprentice Video Processing")

        # Initialize the Teacher and Apprentice classes
        self.teacher = Teacher('OurPlanet.mp4')  # Hardcoded video path
        self.apprentice = Apprentice(self.teacher.original_height, self.teacher.original_width)

        # Start processing video automatically
        self.process_video()

    def process_video(self):
        """ Process the video by extracting, encoding, decoding, and reconstructing frames. """
        while True:
            start_time = time.time()

            # Teacher extracts frame and encodes it
            raw_frame, gray_frame = self.teacher.frame_feature_extraction()
            if raw_frame is None:
                break  # End of video

            # Encode the frame using the Teacher's encoder
            latent_features = self.teacher.encoder(gray_frame)

            # Decode the latent features using the Apprentice's decoder
            decoded_frame = self.apprentice.decoder(latent_features)

            # Display raw and reconstructed frames side by side
            if decoded_frame is not None:
                # Convert decoded frame back to BGR (since it's grayscale)
                decoded_bgr_frame = (decoded_frame * 255).astype(np.uint8)
                decoded_bgr_frame = cv2.cvtColor(decoded_bgr_frame, cv2.COLOR_GRAY2BGR)

                # Ensure decoded frame matches the resolution of the original frame
                decoded_bgr_frame_resized = cv2.resize(decoded_bgr_frame, (self.teacher.original_width, self.teacher.original_height))

                # Concatenate the original and decoded frames horizontally
                combined_frame = np.hstack((raw_frame, decoded_bgr_frame_resized))

                # Show the combined frame
                cv2.imshow("Raw vs Reconstructed", combined_frame)

                # Frame rate control to match the original video's FPS
                time_taken = time.time() - start_time
                delay = max(1, int((1.0 / self.teacher.frame_rate - time_taken) * 1000))
                if cv2.waitKey(delay) & 0xFF == ord('q'):  # Press 'q' to exit
                    break

        cv2.destroyAllWindows()

# Main code to run the application
if __name__ == "__main__":
    root = tk.Tk()
    app = GUI(root)
    root.mainloop()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 759ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

KeyboardInterrupt: 

In [1]:
import cv2
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Function to convert OpenCV frame to PyTorch tensor
def frame_to_tensor(frame):
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame = frame.astype(np.float32) / 255.0
    frame = np.transpose(frame, (2, 0, 1))
    tensor = torch.from_numpy(frame).unsqueeze(0)  # Add batch dimension
    return tensor

# Function to convert tensor to OpenCV image
def tensor_to_cv2_image(tensor):
    tensor = tensor.squeeze(0).detach().cpu()
    tensor = torch.clamp(tensor, 0, 1)
    np_img = tensor.numpy()
    np_img = np.transpose(np_img, (1, 2, 0))
    np_img = cv2.cvtColor(np_img, cv2.COLOR_RGB2BGR)
    np_img = (np_img * 255).astype(np.uint8)
    return np_img

# Function to overlay text with a black outline
def put_text_with_outline(img, text, position, font_scale=1, font_color=(255, 255, 255), outline_color=(0, 0, 0), thickness=2, outline_thickness=3):
    font = cv2.FONT_HERSHEY_SIMPLEX
    cv2.putText(img, text, position, font, font_scale, outline_color, outline_thickness, lineType=cv2.LINE_AA)
    cv2.putText(img, text, position, font, font_scale, font_color, thickness, lineType=cv2.LINE_AA)

# Encoder Class
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, 3, stride=2, padding=1)  # Downsample
        self.context_conv = nn.Conv2d(1, 64, 3, stride=2, padding=1)  # Downsample context
        self.gdn = nn.GroupNorm(32, 64)
        self.resblock = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, padding=1)
        )
    
    def forward(self, x, context):
        x = self.conv1(x)  # [B, 64, H/2, W/2]
        context = self.context_conv(context)  # [B, 64, H/2, W/2]
        x = x + context  # Combine features
        x = self.gdn(x)
        x = self.resblock(x)
        return x  # Latent representation

# Decoder Class
class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.deconv1 = nn.ConvTranspose2d(64, 64, 3, stride=2, padding=1, output_padding=1)  # Upsample
        self.gdn_dec = nn.GroupNorm(32, 64)
        self.resblock_dec = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, padding=1)
        )
        self.conv_out = nn.Conv2d(64, 3, kernel_size=3, stride=1, padding=1)  # Reconstruct to 3 channels
    
    def forward(self, latent):
        x = self.deconv1(latent)  # [B, 64, H, W]
        x = self.gdn_dec(x)
        x = self.resblock_dec(x)
        x = self.conv_out(x)  # [B, 3, H, W]
        return x

# Autoencoder Class
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
    
    def forward(self, x, context):
        latent = self.encoder(x, context)
        reconstructed = self.decoder(latent)
        return reconstructed, latent

# Main video loop with bandwidth calculation
def play_video_and_train():
    video_path = 'OurPlanet.mp4'
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Cannot open video file {video_path}")
        return

    # Frame rate synchronization
    fps = cap.get(cv2.CAP_PROP_FPS)
    wait_time = int(1000 / fps) if fps > 0 else 1

    # Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Autoencoder().to(device)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
            continue

        # Convert frame to tensor
        input_tensor = frame_to_tensor(frame).to(device)
        context_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        context_frame = context_frame.astype(np.float32) / 255.0
        context_tensor = torch.from_numpy(context_frame).unsqueeze(0).unsqueeze(0).to(device)

        # Forward pass
        optimizer.zero_grad()
        output, latent = model(input_tensor, context_tensor)
        loss = loss_fn(output, input_tensor)
        loss.backward()
        optimizer.step()

        # Bandwidth calculation
        frame_height, frame_width = frame.shape[:2]
        original_bandwidth = (frame_height * frame_width * 3 * 32) / 1e6  # Mbps for input frame
        latent_bandwidth = (latent.numel() * 32) / 1e6  # Mbps for latent space

        # Convert output tensor to OpenCV image
        decoded_frame = tensor_to_cv2_image(output)

        # Ensure both frames have the same height and width
        if decoded_frame.shape != frame.shape:
            decoded_frame = cv2.resize(decoded_frame, (frame.shape[1], frame.shape[0]))

        # Concatenate frames and display info
        combined_frame = np.hstack((frame, decoded_frame))
        put_text_with_outline(combined_frame, f"Loss: {loss.item():.4f}", (10, 30))
        # put_text_with_outline(combined_frame, f"Original Bandwidth: {original_bandwidth:.2f} Mbps", (10, 60))
        # put_text_with_outline(combined_frame, f"Latent Bandwidth: {latent_bandwidth:.2f} Mbps", (10, 90))

        # Display the frame
        cv2.imshow('Original | Decoded with Stats', combined_frame)

        if cv2.waitKey(wait_time) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    play_video_and_train()


# Check for available GPU

In [6]:
# Check for GPU availability
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"GPUs Available: {len(gpus)}")
    for gpu in gpus:
        print(f" - {gpu}")
        # Optional: Limit GPU memory growth
        tf.config.experimental.set_memory_growth(gpu, True)
else:
    print("No GPUs available.")


No GPUs available.


In [7]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  0


In [1]:
import cv2
import numpy as np
import tkinter as tk
from PIL import Image, ImageTk

# Initialize ORB detector
orb = cv2.ORB_create(nfeatures=1000)  # Increase features for better matching

# Initialize video capture and retrieve the frame size
cap = cv2.VideoCapture(0)

# Debugging: Check if webcam is opened properly
if not cap.isOpened():
    print("Error: Could not open webcam.")
    exit()

ret, frame = cap.read()
if not ret:
    print("Error: Could not read from the webcam.")
    exit()

# Get the frame dimensions
frame_height, frame_width = frame.shape[:2]

# Get framerate
framerate = cap.get(cv2.CAP_PROP_FPS)

# Knowledge base to store descriptors of previously seen frames
knowledge_base = []
buffer_size = 10  # Maximum number of stored descriptors

# Tkinter setup
window = tk.Tk()
window.title("Frame Similarity Detection")

# Tkinter variables
seen_before_accuracy_var = tk.StringVar(value="Seen Before Accuracy: 0.00%")  # Variable to hold the similarity percentage
resolution_var = tk.StringVar()  # To display the resolution
framerate_var = tk.StringVar()  # To display the framerate
scale_factor = tk.DoubleVar(value=0.5)  # Variable to hold the scaling factor

# Set initial values for resolution and framerate
resolution_var.set(f"Resolution: {frame_width}x{frame_height}")
framerate_var.set(f"Framerate: {framerate:.2f} FPS")

# Labels for the images
original_label = tk.Label(window)
original_label.grid(row=0, column=0, padx=10, pady=10)

# Label for displaying the seen before accuracy
accuracy_label = tk.Label(window, textvariable=seen_before_accuracy_var, font=("Helvetica", 12))
accuracy_label.grid(row=1, column=0, pady=10)

# Label for displaying resolution
resolution_label = tk.Label(window, textvariable=resolution_var, font=("Helvetica", 12))
resolution_label.grid(row=2, column=0, pady=5)

# Label for displaying framerate
framerate_label = tk.Label(window, textvariable=framerate_var, font=("Helvetica", 12))
framerate_label.grid(row=3, column=0, pady=5)

# Dropdown for selecting the scaling factor
scaling_options = [0.25, 0.5, 0.75, 1.0]
scale_menu = tk.OptionMenu(window, scale_factor, *scaling_options)
scale_menu.grid(row=4, column=0, pady=5)

# BFMatcher to compare ORB descriptors (use NORM_HAMMING for ORB)
bf = cv2.BFMatcher(cv2.NORM_HAMMING)

# Feature extraction function using ORB
def extract_features(frame):
    # Convert frame to grayscale
    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    # Detect keypoints and compute descriptors
    keypoints, descriptors = orb.detectAndCompute(gray_frame, None)
    return keypoints, descriptors

# Function to compare a new frame's descriptors with the knowledge base
def compare_with_knowledge_base(new_keypoints, new_descriptors, knowledge_base):
    max_similarity = 0
    if new_descriptors is None or len(new_descriptors) == 0:
        return max_similarity
    # Iterate over stored descriptors in the knowledge base
    for stored_keypoints, stored_descriptors in knowledge_base:
        if stored_descriptors is not None and len(stored_descriptors) > 0:
            # Use knnMatch to find the 2 nearest neighbors
            matches = bf.knnMatch(new_descriptors, stored_descriptors, k=2)
            good_matches = []
            # Apply ratio test
            for m in matches:
                if len(m) == 2:
                    if m[0].distance < 0.75 * m[1].distance:
                        good_matches.append(m[0])
            # Calculate similarity as ratio of good matches to total keypoints
            similarity = len(good_matches) / len(new_keypoints)
            max_similarity = max(max_similarity, similarity)
    return max_similarity

# Function to store a new descriptor in the knowledge base if it's new
def store_if_new(keypoints, descriptors, knowledge_base, similarity_threshold=0.7):
    similarity = compare_with_knowledge_base(keypoints, descriptors, knowledge_base)
    if similarity < similarity_threshold:  # If similarity is low, store the frame as new
        knowledge_base.append((keypoints, descriptors))
        if len(knowledge_base) > buffer_size:  # Maintain buffer size
            knowledge_base.pop(0)
        return similarity, True
    return similarity, False

# Update function to capture and process frames
def update():
    try:
        ret, frame = cap.read()

        if not ret:
            print("Error: Could not read frame from webcam.")
            window.after(10, update)
            return

        # Flip the frame horizontally for a mirror effect
        frame = cv2.flip(frame, 1)

        # Resize frame for faster computation
        scale = scale_factor.get()
        display_width = int(frame_width * scale)
        display_height = int(frame_height * scale)

        if display_width == 0 or display_height == 0:
            print("Error: Display width or height is zero.")
            window.after(10, update)
            return

        # Extract features from the current frame
        keypoints, descriptors = extract_features(frame)

        if descriptors is not None and len(keypoints) > 0:
            # Compare the frame's descriptors with the knowledge base and store if new
            similarity, is_new = store_if_new(keypoints, descriptors, knowledge_base)

            # Calculate seen before accuracy as a percentage
            seen_before_accuracy = similarity * 100

            if is_new:
                print("New frame stored in knowledge base.")
            else:
                print("Frame is similar to previous data.")

            # Update the seen before accuracy variable in the GUI
            seen_before_accuracy_var.set(f"Seen Before Accuracy: {seen_before_accuracy:.2f}%")
        else:
            print("No features detected")
            seen_before_accuracy_var.set("Seen Before Accuracy: N/A")

        # Convert frame from BGR to RGB for display in Tkinter
        frame_resized = cv2.resize(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), (display_width, display_height))
        original_image_pil = Image.fromarray(frame_resized)
        original_image_tk = ImageTk.PhotoImage(image=original_image_pil)

        # Update the labels with the new images
        original_label.config(image=original_image_tk)
        original_label.image = original_image_tk  # Prevent garbage collection

        window.after(10, update)  # Update the frame every 10ms

    except Exception as e:
        print(f"An error occurred: {e}")
        window.after(10, update)

# Event to signal the closing of the app
def on_closing():
    print("Closing application...")
    cap.release()
    cv2.destroyAllWindows()
    window.destroy()

# Add a close button
close_button = tk.Button(window, text="Close", command=on_closing)
close_button.grid(row=5, column=0, pady=10)

# Bind the window close event
window.protocol("WM_DELETE_WINDOW", on_closing)

# Start the update loop
update()

# Start the Tkinter event loop
window.mainloop()


New frame stored in knowledge base.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to previous data.
Frame is similar to

Exception in Tkinter callback
Traceback (most recent call last):
  File "c:\Users\keplarV4\AppData\Local\Programs\Python\Python311\Lib\tkinter\__init__.py", line 1948, in __call__
    return self.func(*args)
           ^^^^^^^^^^^^^^^^
  File "C:\Users\keplarV4\AppData\Local\Temp\ipykernel_30808\2010831647.py", line 172, in on_closing
    cv2.destroyAllWindows()
cv2.error: OpenCV(4.10.0) D:\a\opencv-python\opencv-python\opencv\modules\highgui\src\window.cpp:1295: error: (-2:Unspecified error) The function is not implemented. Rebuild the library with Windows, GTK+ 2.x or Cocoa support. If you are on Ubuntu or Debian, install libgtk2.0-dev and pkg-config, then re-run cmake or configure script in function 'cvDestroyAllWindows'



Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not

Exception in Tkinter callback
Traceback (most recent call last):
  File "c:\Users\keplarV4\AppData\Local\Programs\Python\Python311\Lib\tkinter\__init__.py", line 1948, in __call__
    return self.func(*args)
           ^^^^^^^^^^^^^^^^
  File "C:\Users\keplarV4\AppData\Local\Temp\ipykernel_30808\2010831647.py", line 172, in on_closing
    cv2.destroyAllWindows()
cv2.error: OpenCV(4.10.0) D:\a\opencv-python\opencv-python\opencv\modules\highgui\src\window.cpp:1295: error: (-2:Unspecified error) The function is not implemented. Rebuild the library with Windows, GTK+ 2.x or Cocoa support. If you are on Ubuntu or Debian, install libgtk2.0-dev and pkg-config, then re-run cmake or configure script in function 'cvDestroyAllWindows'



Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not

Exception in Tkinter callback
Traceback (most recent call last):
  File "c:\Users\keplarV4\AppData\Local\Programs\Python\Python311\Lib\tkinter\__init__.py", line 1948, in __call__
    return self.func(*args)
           ^^^^^^^^^^^^^^^^
  File "C:\Users\keplarV4\AppData\Local\Temp\ipykernel_30808\2010831647.py", line 172, in on_closing
    cv2.destroyAllWindows()
cv2.error: OpenCV(4.10.0) D:\a\opencv-python\opencv-python\opencv\modules\highgui\src\window.cpp:1295: error: (-2:Unspecified error) The function is not implemented. Rebuild the library with Windows, GTK+ 2.x or Cocoa support. If you are on Ubuntu or Debian, install libgtk2.0-dev and pkg-config, then re-run cmake or configure script in function 'cvDestroyAllWindows'



Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not read frame from webcam.
Error: Could not

KeyboardInterrupt: 

: 