In [2]:
import pickle
import numpy as np

In [3]:
import numpy as np
import pickle

# Set random seed for reproducibility
np.random.seed(1337)

# Define paths
image_path = '/home/maria/Documents/HarvardData/Images'
session_ims = pickle.load(open('/home/maria/Documents/HarvardData/processed_sessions_v3/Bo220226/session_images.p','rb'))

# Construct full image paths
image_paths = np.array([f"{image_path}/{im.split('/')[2]}" for im in session_ims])

# Total number of images
n_total = len(session_ims)
print(f"Total number of images: {n_total}")

# Define the number of training samples
n_train = 1000

# Ensure that n_train does not exceed n_total
if n_train > n_total:
    raise ValueError("Number of training samples exceeds the total number of available images.")

# Randomly select unique training indices without replacement
training_path_inds = np.random.choice(n_total, size=n_train, replace=False)
training_paths = image_paths[training_path_inds]

# Determine test indices as those not in training_path_inds
test_inds = np.setdiff1d(np.arange(n_total), training_path_inds)
test_paths = image_paths[test_inds]

# Print shapes to verify
print(f"Training indices shape: {training_path_inds.shape}")  # Should be (1000,)
print(f"Number of test samples: {len(test_paths)}")           # Should be n_total - 1000

# Optional: Verify no overlap between training and test sets
overlap = np.intersect1d(training_paths, test_paths)
print(f"Number of overlapping images between training and test sets: {len(overlap)}")  # Should be 0


Total number of images: 1250
Training indices shape: (1000,)
Number of test samples: 250
Number of overlapping images between training and test sets: 0


In [1]:
import os
import numpy as np
import pickle
from PIL import Image
from torchvision import transforms

# ================================
# Step 1: Define Data Augmentation Pipeline
# ================================

# Define your data augmentation pipeline (excluding normalization)
augmentation_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),            # 50% chance to flip horizontally
    transforms.RandomRotation(degrees=15),             # Rotate by ±15 degrees
    transforms.ColorJitter(brightness=0.2,              # Adjust brightness
                           contrast=0.2,                # Adjust contrast
                           saturation=0.2,              # Adjust saturation
                           hue=0.1)                     # Adjust hue
    # Add more augmentations here if desired
])

# ================================
# Step 2: Create Output Directory
# ================================

# Define the output directory for augmented images
output_dir = '/home/maria/Documents/HarvardData/Augmentations'

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# ================================
# Step 3: Load Training Image Paths
# ================================

# Load session image identifiers
session_ims = pickle.load(open('/home/maria/Documents/HarvardData/processed_sessions_v3/Bo220226/session_images.p','rb'))

# Function to find the correct image path with existing extension
def find_image_path(session_ims, base_image_dir, possible_extensions):
    image_paths = []
    for p in session_ims:
        # Extract the filename after 'OOD_monkey_data/Images/'
        if 'OOD_monkey_data/Images/' in p:
            filename = p.split('OOD_monkey_data/Images/')[-1]
        else:
            filename = os.path.basename(p)  # Fallback to basename
        
        base_name, ext = os.path.splitext(filename)
        
        # Try each possible extension until a file is found
        found_path = None
        for ext_candidate in possible_extensions:
            candidate_path = os.path.join(base_image_dir, base_name + ext_candidate)
            if os.path.exists(candidate_path):
                found_path = candidate_path
                break
        
        if found_path is None:
            # If no matching file is found, warn and skip
            print(f"Warning: No matching file found for base name: {base_name}")
            # Optionally, append a placeholder or handle as needed
            # image_paths.append('/path/to/placeholder.jpg') # Uncomment if using placeholders
        else:
            image_paths.append(found_path)
    
    return np.array(image_paths)

# Define base image directory and possible extensions
base_image_dir = '/home/maria/Documents/HarvardData/Images'
possible_extensions = ['.jpg', '.JPG', '.png', '.PNG']

# Get the array of valid image paths
image_paths = find_image_path(session_ims, base_image_dir, possible_extensions)
print(f"Total valid images found: {len(image_paths)}")

# Define the number of training samples
n_train = 1000

# Ensure that n_train does not exceed the total number of images
if n_train > len(image_paths):
    raise ValueError("Number of training samples exceeds the total number of available images.")

# Generate unique training indices without replacement
training_path_inds = np.random.choice(len(image_paths), size=n_train, replace=False)
training_paths = image_paths[training_path_inds]

# Determine test indices as the complement of training indices
test_inds = np.setdiff1d(np.arange(len(image_paths)), training_path_inds)
test_paths = image_paths[test_inds]

print(f"Number of training samples: {len(training_paths)}")  # Should be 1000
print(f"Number of test samples: {len(test_paths)}")          # Should be len(image_paths) - 1000

# Optional: Verify no overlap
overlap = np.intersect1d(training_paths, test_paths)
print(f"Number of overlapping images between training and test sets: {len(overlap)}")  # Should be 0

# ================================
# Step 4: Apply Augmentations and Save Augmented Images
# ================================

# Initialize a counter for naming augmented images
counter = 1

# Iterate through each image in the training set
for path in training_paths:
    try:
        # Open the image and ensure it's in RGB format
        image = Image.open(path).convert('RGB')
        
        # Apply the augmentation transformations
        augmented_image = augmentation_transform(image)
        
        # Extract the original file extension
        _, ext = os.path.splitext(path)
        ext = ext.lower()  # Ensure the extension is in lowercase (e.g., '.jpg', '.png')
        
        # Define the new filename (e.g., '1.jpg', '2.png', etc.)
        new_filename = f"{counter}{ext}"
        
        # Define the full path to save the augmented image
        save_path = os.path.join(output_dir, new_filename)
        
        # Save the augmented image
        augmented_image.save(save_path)
        
        # Increment the counter
        counter += 1
        
    except Exception as e:
        print(f"Error processing image {path}: {e}")

print(f"Augmented images have been saved to {output_dir}")


Total valid images found: 1250
Number of training samples: 1000
Number of test samples: 250
Number of overlapping images between training and test sets: 0
Augmented images have been saved to /home/maria/Documents/HarvardData/Augmentations


In [4]:
import os
import numpy as np
from PIL import Image
from transformers import ViTImageProcessor, ViTModel
import torch

# ================================
# Step 1: Define Paths and Initialize ViT
# ================================

# Define the path to the augmented images
augmentations_dir = '/home/maria/Documents/HarvardData/Augmentations'

# Define the model name
model_name = 'google/vit-base-patch16-224'

# Initialize the processor and model
processor = ViTImageProcessor.from_pretrained(model_name)
model = ViTModel.from_pretrained(model_name)

# Set the model to evaluation mode
model.eval()

# Move the model to GPU if available for faster processing
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# ================================
# Step 2: List and Sort Augmented Image Files
# ================================

# Define possible image extensions
possible_extensions = ['.jpg', '.JPG', '.png', '.PNG']

# List all image files in the augmentations directory with the specified extensions
image_files = [f for f in os.listdir(augmentations_dir) if os.path.splitext(f)[1] in possible_extensions]

# Function to extract numerical value from filename for sorting
def extract_number(filename):
    name, _ = os.path.splitext(filename)
    try:
        return int(name)
    except ValueError:
        return name  # If filename isn't a number, sort lexicographically

# Sort image files numerically
image_files_sorted = sorted(image_files, key=extract_number)

print(f"Total augmented images found: {len(image_files_sorted)}")

# ================================
# Step 3: Embed Images and Extract CLS Tokens
# ================================

# Initialize a list to store embeddings
embeddings = []

# Iterate through each image file
for idx, image_file in enumerate(image_files_sorted, start=1):
    image_path = os.path.join(augmentations_dir, image_file)
    try:
        # Open the image and ensure it's in RGB format
        image = Image.open(image_path).convert('RGB')
        
        # Process the image using the ViTImageProcessor
        inputs = processor(images=image, return_tensors="pt")
        
        # Move inputs to the same device as the model
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        # Forward pass through the ViT model to get outputs
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Extract the CLS token from the pooler_output
        cls_token = outputs.pooler_output.squeeze().cpu().numpy()  # Shape: [hidden_size]
        
        # Append the CLS token to the embeddings list
        embeddings.append(cls_token)
        
        # Print progress every 100 images
        if idx % 100 == 0 or idx == len(image_files_sorted):
            print(f"Processed {idx}/{len(image_files_sorted)} images")
    
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")

# ================================
# Step 4: Compile and Save Embeddings
# ================================

# Convert the list of embeddings to a NumPy array
embeddings_array = np.vstack(embeddings)  # Shape: [num_images, hidden_size]

print(f"Embeddings shape: {embeddings_array.shape}")  # Example: (5000, 768)

# Define the path to save the embeddings
save_path = 'Bo220226_augmentations.npy'

# Save the embeddings array to a .npy file
np.save(save_path, embeddings_array)

print(f"Embeddings have been saved to {save_path}")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total augmented images found: 1000
Processed 100/1000 images
Processed 200/1000 images
Processed 300/1000 images
Processed 400/1000 images
Processed 500/1000 images
Processed 600/1000 images
Processed 700/1000 images
Processed 800/1000 images
Processed 900/1000 images
Processed 1000/1000 images
Embeddings shape: (1000, 768)
Embeddings have been saved to Bo220226_augmentations.npy


In [8]:
import os
import numpy as np
import pickle
from PIL import Image
from transformers import ViTImageProcessor, ViTModel
import torch
from torch.utils.data import Dataset, DataLoader

# ================================
# Step 1: Define Allowed Extensions
# ================================

# Define possible image extensions
possible_extensions = ['.jpg', '.JPG', '.png', '.PNG']

# ================================
# Step 2: Define Function to Find Correct Image Path
# ================================

def find_correct_image_path(base_dir, base_name, allowed_extensions):
    """
    Given a base directory and base image name, find the correct file path by trying allowed extensions.

    Args:
        base_dir (str): Directory where images are stored.
        base_name (str): Base name of the image (without extension).
        allowed_extensions (list): List of allowed file extensions.

    Returns:
        str or None: Full path to the image if found, else None.
    """
    for ext in allowed_extensions:
        candidate_path = os.path.join(base_dir, base_name + ext)
        if os.path.exists(candidate_path):
            return candidate_path
    print(f"Warning: No matching file found for base name: {base_name}")
    return None

# ================================
# Step 3: Extract Base Names from session_ims
# ================================

# Example: Load session image identifiers (replace with your actual data loading)
session_ims = pickle.load(open('/home/maria/Documents/HarvardData/processed_sessions_v3/Bo220226/session_images.p','rb'))

# Define base image directory
base_image_dir = '/home/maria/Documents/HarvardData/Images'

# Function to extract base names from session_ims
def extract_base_names(session_ims, prefix='OOD_monkey_data/Images/'):
    base_names = []
    for p in session_ims:
        if prefix in p:
            filename = p.split(prefix)[-1]
            base_name, _ = os.path.splitext(filename)
            base_names.append(base_name)
        else:
            # If prefix not found, use the basename without extension
            base_name = os.path.splitext(os.path.basename(p))[0]
            base_names.append(base_name)
    return base_names

# Extract base names
base_names = extract_base_names(session_ims)

# ================================
# Step 4: Split into Training and Test Sets
# ================================

n_train = 1000
np.random.seed(1337)
train_indices = np.random.choice(len(base_names), size=n_train, replace=False)
test_indices = np.setdiff1d(np.arange(len(base_names)), train_indices)

training_base_names = [base_names[i] for i in train_indices]
test_base_names = [base_names[i] for i in test_indices]

# ================================
# Step 5: Filter Image Paths with Correct Extensions
# ================================

def filter_image_paths(base_dir, base_names, allowed_extensions):
    """
    Given a list of base names, find the correct image paths by trying allowed extensions.

    Args:
        base_dir (str): Directory where images are stored.
        base_names (list): List of base image names (without extension).
        allowed_extensions (list): List of allowed file extensions.

    Returns:
        list: List of full image paths with correct extensions.
    """
    image_paths = []
    for base_name in base_names:
        correct_path = find_correct_image_path(base_dir, base_name, allowed_extensions)
        if correct_path:
            image_paths.append(correct_path)
        # If not found, it's already handled in find_correct_image_path
    return image_paths

# Get filtered training and test paths
filtered_training_paths = filter_image_paths(base_image_dir, training_base_names, possible_extensions)
filtered_test_paths = filter_image_paths(base_image_dir, test_base_names, possible_extensions)

print(f"Number of filtered training images: {len(filtered_training_paths)}")
print(f"Number of filtered test images: {len(filtered_test_paths)}")

# ================================
# Step 6: Initialize Processor and Model
# ================================

# Define the model name
model_name = 'google/vit-base-patch16-224'

# Initialize the processor and model
processor = ViTImageProcessor.from_pretrained(model_name)
model = ViTModel.from_pretrained(model_name)

# Set the model to evaluation mode
model.eval()

# Move the model to GPU if available for faster processing
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# ================================
# Step 7: Define the Embedding Function
# ================================

def embed_images(image_paths, processor, model, device, batch_size=32):
    """
    Embed images using a Vision Transformer (ViT) model and extract CLS tokens.

    Args:
        image_paths (list): List of image file paths to embed.
        processor (ViTImageProcessor): The image processor for ViT.
        model (ViTModel): The pre-trained ViT model.
        device (torch.device): The device to run the model on (CPU or GPU).
        batch_size (int, optional): Number of images to process in a batch. Defaults to 32.

    Returns:
        np.ndarray: Array of CLS token embeddings with shape [num_images, hidden_size].
    """
    class ImageDataset(Dataset):
        def __init__(self, image_paths, processor):
            self.image_paths = image_paths
            self.processor = processor

        def __len__(self):
            return len(self.image_paths)

        def __getitem__(self, idx):
            image_path = self.image_paths[idx]
            try:
                image = Image.open(image_path).convert('RGB')  # Ensure 3 channels
            except Exception as e:
                print(f"Error loading image {image_path}: {e}")
                # Return a black image as a placeholder
                image = Image.new('RGB', (224, 224))
            return image

    # Initialize the dataset and dataloader with a custom collate_fn
    dataset = ImageDataset(image_paths, processor)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=lambda x: x  # Custom collate_fn to return list of PIL Images
    )

    embeddings = []

    # Iterate through the dataloader
    for batch_idx, images in enumerate(dataloader, start=1):
        # Process the batch using the processor
        inputs = processor(images=images, return_tensors="pt")

        # Move inputs to the appropriate device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Forward pass through the model
        with torch.no_grad():
            outputs = model(**inputs)

        # Extract the CLS tokens
        cls_tokens = outputs.pooler_output.squeeze().cpu().numpy()  # Shape: [batch_size, hidden_size]

        embeddings.append(cls_tokens)

        # Print progress every 100 batches or on the last batch
        if batch_idx % 100 == 0 or batch_idx == len(dataloader):
            print(f"Processed {batch_idx}/{len(dataloader)} batches")

    # Concatenate all embeddings into a single array
    embeddings_array = np.vstack(embeddings)  # Shape: [num_images, hidden_size]

    return embeddings_array

# ================================
# Step 8: Embed and Save Training Images
# ================================

# Define the path where training embeddings will be saved
training_embeddings_path = 'Bo220226_training_set.npy'

# Embed the training images
print("Starting embedding of training images...")
training_embeddings = embed_images(
    image_paths=filtered_training_paths,
    processor=processor,
    model=model,
    device=device,
    batch_size=32  # Adjust based on your system's capabilities
)
print(f"Training embeddings shape: {training_embeddings.shape}")  # Expected: [n_train_filtered, hidden_size]

# Save the embeddings to a .npy file
np.save(training_embeddings_path, training_embeddings)
print(f"Training embeddings have been saved to {training_embeddings_path}")

# ================================
# Step 9: Embed and Save Test Images
# ================================

# Define the path where test embeddings will be saved
test_embeddings_path = 'Bo220226_test_set.npy'

# Embed the test images
print("Starting embedding of test images...")
test_embeddings = embed_images(
    image_paths=filtered_test_paths,
    processor=processor,
    model=model,
    device=device,
    batch_size=32  # Adjust based on your system's capabilities
)
print(f"Test embeddings shape: {test_embeddings.shape}")  # Expected: [n_test_filtered, hidden_size]

# Save the embeddings to a .npy file
np.save(test_embeddings_path, test_embeddings)
print(f"Test embeddings have been saved to {test_embeddings_path}")

# ================================
# Step 10: Verification of Saved Embeddings
# ================================

# Load the training embeddings
loaded_training_embeddings = np.load('Bo220226_training_set.npy')
print(f"Loaded training embeddings shape: {loaded_training_embeddings.shape}")  # Should match [n_train_filtered, hidden_size]

# Load the test embeddings
loaded_test_embeddings = np.load('Bo220226_test_set.npy')
print(f"Loaded test embeddings shape: {loaded_test_embeddings.shape}")  # Should match [n_test_filtered, hidden_size]

# Optional: Inspect a few embeddings
print("Sample Training Embeddings:")
print(loaded_training_embeddings[:5])

print("Sample Test Embeddings:")
print(loaded_test_embeddings[:5])


Number of filtered training images: 1000
Number of filtered test images: 250


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting embedding of training images...
Processed 32/32 batches
Training embeddings shape: (1000, 768)
Training embeddings have been saved to Bo220226_training_set.npy
Starting embedding of test images...
Processed 8/8 batches
Test embeddings shape: (250, 768)
Test embeddings have been saved to Bo220226_test_set.npy
Loaded training embeddings shape: (1000, 768)
Loaded test embeddings shape: (250, 768)
Sample Training Embeddings:
[[ 0.29927626 -0.6662772  -0.06322875 ...  0.5529889   0.20184693
   0.7659803 ]
 [-0.41798595 -0.90310955  0.65945077 ...  0.32807255 -0.07639293
   0.5208216 ]
 [-0.48322454 -0.3857926  -0.5082047  ...  0.64021987  0.7675375
   0.8217118 ]
 [-0.56429875 -0.4239243  -0.4356303  ... -0.2282228   0.5815877
   0.1060072 ]
 [ 0.5453199  -0.41448903  0.81488144 ... -0.42007238  0.84903145
  -0.35274485]]
Sample Test Embeddings:
[[-0.6807091   0.14434983 -0.17385224 ... -0.06038163  0.5956547
  -0.64094853]
 [-0.0127363   0.12923259 -0.29310498 ... -0.11654445  0.2