In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torchvision.models import resnet50
from PIL import Image
from tqdm import tqdm

In [2]:
class LocalizationNetwork(nn.Module):
    """Small CNN that regresses the six parameters of a 2-D affine transform.

    Two conv + ReLU + max-pool stages feed a 32-unit hidden layer and a
    6-unit output layer; the output is reshaped to ``(batch, 2, 3)``.

    Args:
        input_channels: Number of channels of the input tensor.
    """

    def __init__(self, input_channels):
        super(LocalizationNetwork, self).__init__()
        self.conv1 = nn.Conv2d(input_channels, 8, kernel_size=7)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(8, 10, kernel_size=5)

        # The flattened conv output size depends on the input resolution, so
        # fc1's in_features is inferred on the first forward pass. A lazy layer
        # (instead of a placeholder swapped out behind a Python-side flag) keeps
        # fc1 a regular registered submodule whose weights survive a
        # state_dict save/load round trip.
        self.fc1 = nn.LazyLinear(32)
        self.fc2 = nn.Linear(32, 6)  # 6 affine parameters

        # Zero weights + identity bias: the network outputs the identity
        # transform [[1, 0, 0], [0, 1, 0]] until fc2 is trained or loaded.
        with torch.no_grad():
            self.fc2.weight.zero_()
            self.fc2.bias.copy_(torch.tensor([1, 0, 0, 0, 1, 0], dtype=torch.float))

    def forward(self, x):
        """Return affine parameters of shape ``(batch, 2, 3)`` for input ``x``."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))

        x = torch.flatten(x, 1)  # (batch, channels * height * width)
        x = F.relu(self.fc1(x))
        theta = self.fc2(x)
        return theta.view(-1, 2, 3)


class WeightingModule(nn.Module):
    """Gaussian radial weighting computed from sampling-grid coordinates.

    For each grid location the weight is
    ``1 + lambda_param * exp(-(gx**2 + gy**2) / (2 * sigma**2))``, where
    ``gx``/``gy`` are the first two entries along the grid's last axis and
    ``lambda_param`` is a learnable scalar initialised to 0.5.

    Args:
        sigma: Width of the Gaussian (plain attribute, not a parameter).
    """

    def __init__(self, sigma=0.5):
        super().__init__()
        self.sigma = sigma
        self.lambda_param = nn.Parameter(torch.tensor(0.5))

    def forward(self, grid):
        """Return the weights for ``grid`` with a trailing singleton axis appended."""
        gx = grid[..., 0]
        gy = grid[..., 1]
        squared_radius = gx ** 2 + gy ** 2

        two_sigma_sq = 2 * self.sigma ** 2
        gaussian = torch.exp(-squared_radius / two_sigma_sq)

        weight = 1 + self.lambda_param * gaussian
        return weight[..., None]


class DEFTModule(nn.Module):
    """Affine warp of the input followed by a per-pixel Gaussian re-weighting.

    The localization network predicts affine parameters from ``x``; ``x`` is
    resampled on the resulting grid and multiplied by weights that the
    weighting module derives from that same grid.

    Args:
        input_channels: Channel count of the input, passed to the localization network.
        sigma: Gaussian width passed to the weighting module.
    """

    def __init__(self, input_channels, sigma=0.5):
        super().__init__()
        self.localization = LocalizationNetwork(input_channels)
        self.weighting = WeightingModule(sigma)

    def forward(self, x):
        """Return the warped and weighted tensor, same shape as ``x``."""
        affine_params = self.localization(x)
        sampling_grid = F.affine_grid(affine_params, x.size(), align_corners=False)
        warped = F.grid_sample(x, sampling_grid, align_corners=False)

        # Weights come out as (N, H, W, 1); move the singleton axis to the
        # channel position so they line up with the (N, C, H, W) image.
        channel_first_weight = self.weighting(sampling_grid).permute(0, 3, 1, 2)
        n_channels = x.shape[1]
        if n_channels > 1:
            # Same weight map for every channel.
            channel_first_weight = channel_first_weight.expand(-1, n_channels, -1, -1)

        return warped * channel_first_weight

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ImageNet-pretrained ResNet-50 used as a frozen feature extractor.
# Passing a boolean to `weights` is deprecated in torchvision's multi-weight
# API; "IMAGENET1K_V1" names the checkpoint the legacy boolean resolved to.
resnet_model = resnet50(weights="IMAGENET1K_V1")
resnet_model = torch.nn.Sequential(*list(resnet_model.children())[:-1])  # Remove classification layer
resnet_model.eval().to(device)

# Load DEFT Module
# NOTE: no checkpoint is loaded here, so the module runs with the parameters
# set in its constructors.
deft_model = DEFTModule(input_channels=3).to(device)  # Input channels = 3 (RGB)
deft_model.eval()

###########################################################
# Define Image Transformation
###########################################################

# Resize to 224x224, convert to a tensor, then normalise each RGB channel
# with the mean/std constants below.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])



In [4]:
# ------------------------------
# Paths for RGB frames & labels
# ------------------------------
rgb_path = "D:/Datasets/Datasets/EPIC_Kitchen/RGB/P01_04"
label_csv_path = "D:/Datasets/Datasets/EPIC_Kitchen/Label/P01_04.csv"
# Destination CSV for the extracted features. The previous value
# "..Features/..." was missing its path separators and did not match the
# location the notebook actually saves to.
output_csv = "../../Features/Feature_P01_04_tsne.csv"

# Label table; the extraction loop reads its StartFrame, EndFrame and
# Action_class columns.
labels_df = pd.read_csv(label_csv_path)

In [6]:

def extract_features(image_path, transform, device=device):
    """Extract ResNet50 features from a single image after DEFT transformation.

    Args:
        image_path: Path of the image file to read.
        transform: Callable applied to the RGB PIL image; its result must be a
            tensor (``unsqueeze`` and ``to`` are called on it).
        device: Device the input tensor is moved to. The default is the value
            the module-level ``device`` had when this function was defined.

    Returns:
        NumPy array holding the squeezed output of ``resnet_model``, or
        ``None`` if any step raised (the error is printed and the frame is
        skipped, so one bad file does not abort a long extraction run).
    """
    try:
        # Close the file handle deterministically instead of leaving it to the
        # garbage collector; convert() returns an independent image, so it is
        # still usable after the `with` block exits.
        with Image.open(image_path) as source_image:
            rgb_image = source_image.convert('RGB')

        batch = transform(rgb_image).unsqueeze(0).to(device)  # Add batch axis, move to target device

        # Inference only: no gradients needed.
        with torch.no_grad():
            warped = deft_model(batch)  # Pass through DEFT Module
            features = resnet_model(warped).squeeze().cpu().numpy()  # Extract ResNet50 features

        return features
    except Exception as e:
        # Deliberate best-effort: report and let the caller skip this frame.
        print(f"Skipping frame: {image_path} due to error: {e}")
        return None

# ------------------------------
# Sampling Setup
# ------------------------------
S = 1  # Sampling stride: keep every S-th frame (1 = every frame)
features_list = []
all_frames = sorted(os.listdir(rgb_path))[::S]

# ------------------------------
# Extract RGB Features Only
# ------------------------------
for frame in tqdm(all_frames, desc="Extracting RGB Features with DEFT"):
    rgb_frame_path = os.path.join(rgb_path, frame)
    rgb_features = extract_features(rgb_frame_path, transform, device)
    if rgb_features is None:
        continue  # extraction failed; extract_features already printed why

    # Frame index = the text between the last '_' and the first '.' after it.
    frame_number = int(frame.rsplit('_', 1)[-1].partition('.')[0])

    # Label segments whose [StartFrame, EndFrame] range contains this frame.
    covers_frame = (labels_df['StartFrame'] <= frame_number) & (labels_df['EndFrame'] >= frame_number)
    label_row = labels_df[covers_frame]

    # First matching segment wins; -1 marks a frame with no label.
    action_class = -1 if label_row.empty else label_row.iloc[0]['Action_class']

    features_list.append([frame, action_class, *rgb_features.tolist()])




Extracting RGB Features with DEFT: 100%|███████████████████████████████████████████| 6308/6308 [07:00<00:00, 14.98it/s]


In [8]:
# Destination CSV for the extracted features; this cell (re)assigns
# output_csv right before the save step reads it.
output_csv = "../../Features/Feature_P01_04_tsne.csv"

In [9]:
# ----------------- Save to CSV -----------------
if len(features_list) == 0:
    raise ValueError("No valid features extracted. Check paths and formats.")

# Each collected row is [frame, action_class, *features]. Take the feature
# count from the rows themselves rather than from the loop's leftover
# `rgb_features`, which is None whenever the last processed frame failed.
num_features = len(features_list[0]) - 2
columns = ["Frame", "Action_class"] + [f"Feature_{i}" for i in range(num_features)]
df = pd.DataFrame(features_list, columns=columns)

# Create the parent directory only when output_csv has a directory component
# (os.makedirs("") would raise for a bare filename).
output_dir = os.path.dirname(output_csv)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df.to_csv(output_csv, index=False)
print(f"\n✅ Feature extraction completed. Saved to: {output_csv}")


✅ Feature extraction completed. Saved to: ../../Features/Feature_P01_04_tsne.csv
