In [1]:
# Import Library Files

import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torchvision.models import resnet50
from PIL import Image
from tqdm import tqdm

In [2]:
#----
#1. DEFT Module Components
#----

class LocalizationNetwork(nn.Module):
    """Predict one 2x3 affine matrix per input image.

    Two conv + max-pool stages feed a small fully connected head whose last
    layer outputs 6 values, reshaped to ``(batch, 2, 3)``.

    The input width of ``fc1`` depends on the spatial size of the images, so
    it is declared as ``nn.LazyLinear`` and sized by PyTorch on the first
    forward pass. Unlike swapping in a fresh ``nn.Linear`` inside ``forward``,
    the lazy layer is registered from construction time, which means:

    * ``state_dict()`` / ``load_state_dict()`` round-trip correctly, and a
      loaded ``fc1`` is not re-initialised on the next forward pass;
    * no throw-away placeholder parameters are ever exposed via
      ``parameters()``.

    Args:
        input_channels: Number of channels of the input images.
    """

    def __init__(self, input_channels):
        super(LocalizationNetwork, self).__init__()
        self.conv1 = nn.Conv2d(input_channels, 8, kernel_size=7)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(8, 10, kernel_size=5)

        # in_features is inferred from the first batch that reaches this layer.
        self.fc1 = nn.LazyLinear(32)
        self.fc2 = nn.Linear(32, 6)  # 6 affine parameters

        # Start from the identity transform: zero weights make the output
        # equal to the bias, which holds the row-major 2x3 identity matrix.
        with torch.no_grad():
            self.fc2.weight.zero_()
            self.fc2.bias.copy_(torch.tensor([1, 0, 0, 0, 1, 0], dtype=torch.float))

    def forward(self, x):
        """Return affine parameters of shape ``(batch, 2, 3)`` for ``x``."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))

        # Keep the batch dimension, flatten everything else.
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc1(x))
        theta = self.fc2(x)
        return theta.view(-1, 2, 3)


class WeightingModule(nn.Module):
    """Gaussian centre-weighting computed from sampling-grid coordinates.

    For every grid location ``(g0, g1)`` (the first two entries of the last
    dimension) the weight is ``1 + lambda * exp(-(g0^2 + g1^2) / (2 * sigma^2))``.
    ``lambda`` is a learnable scalar initialised to 0.5; ``sigma`` is fixed.

    Args:
        sigma: Width of the Gaussian.
    """

    def __init__(self, sigma=0.5):
        super().__init__()
        self.lambda_param = nn.Parameter(torch.tensor(0.5))
        self.sigma = sigma

    def forward(self, grid):
        """Return weights shaped like ``grid`` with the last dimension replaced by 1."""
        squared_radius = grid[..., 0] ** 2 + grid[..., 1] ** 2
        gaussian = torch.exp(-squared_radius / (2 * self.sigma ** 2))
        boosted = 1 + self.lambda_param * gaussian
        return boosted.unsqueeze(-1)


class DEFTModule(nn.Module):
    """Affine warp of the input followed by a per-location weighting.

    A ``LocalizationNetwork`` predicts an affine matrix for each image, the
    image is resampled on the resulting grid, and the resampled image is
    multiplied by the weights that ``WeightingModule`` derives from that grid.

    Args:
        input_channels: Number of channels of the input images.
        sigma: Forwarded to ``WeightingModule``.
    """

    def __init__(self, input_channels, sigma=0.5):
        super().__init__()
        self.localization = LocalizationNetwork(input_channels)
        self.weighting = WeightingModule(sigma)

    def forward(self, x):
        """Return the warped and weighted tensor, same shape as ``x``."""
        affine_params = self.localization(x)
        sampling_grid = F.affine_grid(affine_params, x.size(), align_corners=False)
        warped = F.grid_sample(x, sampling_grid, align_corners=False)

        # The weighting comes back channel-last with a single channel; move it
        # to channel-first so it broadcasts across every channel of `warped`.
        channel_first_weight = self.weighting(sampling_grid).permute(0, 3, 1, 2)
        return warped * channel_first_weight

In [3]:
#----
#2. ResNet50 Model for Feature Extraction
#----

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ImageNet-pretrained ResNet50 used as a fixed feature extractor.
# The weights are named explicitly: passing a boolean to `weights` goes through
# torchvision's deprecated legacy path (it warns and resolves to IMAGENET1K_V1).
resnet_model = resnet50(weights="IMAGENET1K_V1")
# Drop the final fully connected layer so the model ends at global average pooling.
resnet_model = torch.nn.Sequential(*list(resnet_model.children())[:-1])
resnet_model = resnet_model.to(device).eval()

# DEFT module for 3-channel input.
# NOTE(review): no checkpoint is loaded for it in this cell, so it runs with the
# parameters it was constructed with — confirm that this is intended.
deft_model = DEFTModule(input_channels=3).to(device)
deft_model.eval()

###########################################################
# Define Image Transformation
###########################################################

# Resize to 224x224, convert to a tensor and apply per-channel normalisation.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])



In [4]:
#----
#3. Feature Extraction Function with DEFT
#----

def extract_features(image_path, transform, device=device):
    """Extract a feature vector from one image file, applying DEFT first.

    The image is read as RGB, passed through ``transform``, then through the
    module-level ``deft_model`` and ``resnet_model`` with gradients disabled.

    Args:
        image_path: Path of the image file to read.
        transform: Callable turning a PIL image into a tensor without a batch
            dimension (one is added here).
        device: Device the input tensor is moved to. The models are
            module-level objects and are not moved by this function, so this
            must be the device they already live on.

    Returns:
        The squeezed model output as a NumPy array, or ``None`` if anything
        failed (the error is printed and the frame is skipped).
    """
    try:
        # The context manager releases the underlying file handle as soon as
        # the pixel data has been converted, even if decoding fails part-way.
        with Image.open(image_path) as raw_image:
            rgb_image = raw_image.convert('RGB')

        # Add the batch dimension and move to the target device (CPU or GPU).
        batch = transform(rgb_image).unsqueeze(0).to(device)

        with torch.no_grad():
            batch = deft_model(batch)  # DEFT warp + weighting
            features = resnet_model(batch).squeeze().cpu().numpy()

        return features
    except Exception as e:
        # Deliberate best-effort: one unreadable frame must not abort a long run.
        print(f"Skipping frame: {image_path} due to error: {e}")
        return None


###########################################################
# Paths to RGB and Optical Flow Directories
###########################################################

# Dataset location and the video being processed. Every input and output path
# below is derived from these two values, so switching video is a one-line change.
DATASET_ROOT = "D:/Datasets/Datasets/EPIC_Kitchen"
VIDEO_ID = "P01_04"

#rgb_path = f"{DATASET_ROOT}/RGB/{VIDEO_ID}"  # RGB frames are not used in this optical-flow-only run
flow_u_path = f"{DATASET_ROOT}/OpticalFlow/{VIDEO_ID}/u"
flow_v_path = f"{DATASET_ROOT}/OpticalFlow/{VIDEO_ID}/v"

label_csv_path = f"{DATASET_ROOT}/Label/{VIDEO_ID}.csv"
labels_df = pd.read_csv(label_csv_path)

# Fail early, with a clear message, if the label file lacks a column that the
# extraction loop reads.
_required_label_columns = {"StartFrame", "EndFrame", "Verb_class", "Noun_class", "Action_class"}
_missing_label_columns = _required_label_columns - set(labels_df.columns)
if _missing_label_columns:
    raise KeyError(f"{label_csv_path} is missing columns: {sorted(_missing_label_columns)}")

output_csv = f"../SavedFeatures/{VIDEO_ID}/Feature_{VIDEO_ID}_with_DEFT_OpticalOnly.csv"




In [5]:

#---
#4. Extract Features with DEFT & Save
#----

###########################################################
# Extract Features with DEFT & Save (Only Optical Flow)
###########################################################

S = 10  # Sampling stride: keep every 10th optical-flow frame
META_COLUMNS = ["Frame", "Verb_class", "Noun_class", "Action_class"]
features_list = []

# The u directory drives the frame list; a frame that cannot be read from
# either the u or the v directory is skipped by the check inside the loop.
all_frames = sorted(os.listdir(flow_u_path))[::S]

for frame in tqdm(all_frames, desc="Extracting Features with DEFT"):
    flow_u_frame_path = os.path.join(flow_u_path, frame)
    flow_v_frame_path = os.path.join(flow_v_path, frame)

    # Extract Optical Flow Features (u and v)
    flow_u_features = extract_features(flow_u_frame_path, transform, device)
    flow_v_features = extract_features(flow_v_frame_path, transform, device)

    if flow_u_features is None or flow_v_features is None:
        continue

    combined_features = np.concatenate([flow_u_features, flow_v_features])  # u followed by v

    # File names look like "frame_00025.jpg": take the digits after the last underscore.
    # NOTE(review): confirm that flow frame numbers use the same index space as
    # StartFrame/EndFrame in the label file.
    frame_number = int(frame.split('_')[-1].split('.')[0])
    label_row = labels_df[(labels_df['StartFrame'] <= frame_number) & (labels_df['EndFrame'] >= frame_number)]

    if not label_row.empty:
        # When several annotations cover the frame, the first one wins.
        first_match = label_row.iloc[0]
        verb_class = first_match['Verb_class']
        noun_class = first_match['Noun_class']
        action_class = first_match['Action_class']
    else:
        # NOTE(review): 0 is the fallback for unlabelled frames; if 0 is also a
        # valid class id, these rows cannot be told apart from real class-0 samples.
        verb_class, noun_class, action_class = 0, 0, 0

    features_list.append([frame, verb_class, noun_class, action_class] + combined_features.tolist())

# Handle case where no features are extracted
if len(features_list) == 0:
    raise ValueError("No valid features extracted. Please check the dataset paths and feature extraction function.")

# Size the feature columns from the stored rows rather than from whatever the
# loop variable happened to hold after the last iteration.
num_features = len(features_list[0]) - len(META_COLUMNS)
columns = META_COLUMNS + [f"Feature_{i}" for i in range(num_features)]
df = pd.DataFrame(features_list, columns=columns)

# Create the target directory on first use so a long extraction run is not
# lost to a missing folder at the very end.
output_dir = os.path.dirname(output_csv)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_csv, index=False)

print(f"Feature extraction completed! Saved to {output_csv}")


Extracting Features with DEFT: 100%|█████████████████████████████████████████████████| 632/632 [00:21<00:00, 29.05it/s]


Feature extraction completed! Saved to ../SavedFeatures/P01_04/Feature_P01_04_with_DEFT_OpticalOnly.csv


In [9]:
###########################################################
# Load a saved feature table and display it for inspection
###########################################################
# NOTE(review): this reads the P01_02 feature file, whereas the extraction cell
# of this notebook writes the P01_04 file (see `output_csv`) — confirm that
# inspecting a different video here is intended.
saved_features_csv = "../SavedFeatures/P01_02/Feature_P01_02_with_DEFT_OpticalOnly.csv"
f = pd.read_csv(saved_features_csv)
f

Unnamed: 0,Frame,Verb_class,Noun_class,Action_class,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,...,Feature_4086,Feature_4087,Feature_4088,Feature_4089,Feature_4090,Feature_4091,Feature_4092,Feature_4093,Feature_4094,Feature_4095
0,frame_00000.jpg,0,0,0,0.181404,0.056219,0.053478,0.031063,0.181965,0.351112,...,0.106878,0.013853,0.009342,0.056958,0.063259,0.021437,0.086363,0.262430,0.094472,0.118424
1,frame_00025.jpg,0,0,0,0.022181,0.088211,0.004550,0.085271,0.033596,0.067121,...,0.327617,0.014935,0.000000,0.305141,0.001381,0.001874,0.000000,0.034230,0.014367,0.260073
2,frame_00050.jpg,0,0,0,0.025713,0.033317,0.058570,0.005479,0.015454,0.034139,...,0.049529,0.000000,0.000000,0.079821,0.022303,0.013848,0.000000,0.094496,0.031575,0.130171
3,frame_00075.jpg,0,0,0,0.040327,0.325566,0.315692,0.208348,0.247595,0.147013,...,0.671565,0.060788,0.034933,0.484660,0.151212,0.013090,0.017168,0.073464,0.249105,0.252965
4,frame_00100.jpg,0,0,0,0.090518,0.024996,0.133374,0.262419,0.131047,0.038132,...,0.175967,0.000000,0.008986,0.529819,0.235216,0.022762,0.043977,0.038770,0.207423,0.218875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598,frame_14950.jpg,0,0,0,0.080504,0.389140,0.302234,0.184829,0.114453,0.447288,...,0.526207,0.304720,0.171931,0.332172,0.088557,0.046952,0.223528,0.157193,0.004135,0.019685
599,frame_14975.jpg,18,1,26,0.024022,0.309356,0.065558,0.030800,0.031074,0.016492,...,0.122563,0.076290,0.024317,0.072923,0.028743,0.014175,0.002236,0.030144,0.057334,0.011936
600,frame_15000.jpg,18,1,26,0.032779,0.703668,0.266799,0.164839,0.334441,0.363864,...,0.234756,0.371410,0.052024,0.043649,0.120711,0.005368,0.103120,0.080062,0.029838,0.427716
601,frame_15025.jpg,18,1,26,0.049387,0.254889,0.365956,0.108505,0.070281,0.102155,...,0.963594,0.277108,0.401132,0.564930,0.078906,0.013146,0.047243,0.010052,0.020860,0.224987
