In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torchvision.models import resnet50
from PIL import Image
from tqdm import tqdm

In [2]:
###########################################################
# Define DEFT Module Components
###########################################################

class LocalizationNetwork(nn.Module):
    """Small CNN that regresses one 2x3 affine matrix per input sample.

    Two conv -> ReLU -> max-pool stages are followed by a two-layer MLP whose
    six outputs are reshaped to ``(-1, 2, 3)``. The final layer is created
    with zero weights and the bias ``[1, 0, 0, 0, 1, 0]``, so a freshly
    constructed network outputs the identity transform for every input.

    Args:
        input_channels: Number of channels of the tensors passed to ``forward``.
    """

    def __init__(self, input_channels):
        super(LocalizationNetwork, self).__init__()
        self.conv1 = nn.Conv2d(input_channels, 8, kernel_size=7)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(8, 10, kernel_size=5)

        # The flattened conv output size depends on the input resolution, so
        # fc1's in_features is inferred from the first batch. LazyLinear keeps
        # fc1 a registered submodule from construction onwards, which means:
        #   * load_state_dict() works on a fresh instance (the lazy weights are
        #     materialised from the checkpoint shapes), and
        #   * the layer is never silently replaced after weights were loaded.
        self.fc1 = nn.LazyLinear(32)
        self.fc2 = nn.Linear(32, 6)  # 6 affine parameters

        # Initialize weights for identity transformation
        self.fc2.weight.data.zero_()
        self.fc2.bias.data.copy_(torch.tensor([1, 0, 0, 0, 1, 0], dtype=torch.float))

    def forward(self, x):
        """Return affine parameters of shape ``(-1, 2, 3)`` for the batch ``x``.

        ``x`` must be accepted by ``nn.Conv2d(input_channels, ...)`` and be
        large enough to survive a 7x7 conv, a 5x5 conv and two 2x2 poolings.
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))

        x = x.flatten(1)  # keep the batch axis, flatten everything else
        x = F.relu(self.fc1(x))
        theta = self.fc2(x)
        return theta.view(-1, 2, 3)

class WeightingModule(nn.Module):
    """Gaussian weighting of grid coordinates around the origin.

    For each grid location with coordinates ``(gx, gy)`` taken from the last
    axis of ``grid`` the weight is

        1 + lambda_param * exp(-(gx**2 + gy**2) / (2 * sigma**2))

    Args:
        sigma: Width of the Gaussian (plain attribute, not a parameter).

    ``lambda_param`` is a learnable scalar initialised to 0.5.
    """

    def __init__(self, sigma=0.5):
        super().__init__()
        self.sigma = sigma
        self.lambda_param = nn.Parameter(torch.tensor(0.5))

    def forward(self, grid):
        """Return the weights for ``grid`` with a trailing singleton axis appended."""
        gx = grid[..., 0]
        gy = grid[..., 1]
        squared_radius = gx ** 2 + gy ** 2

        two_sigma_squared = 2 * self.sigma ** 2
        gaussian = torch.exp(-squared_radius / two_sigma_squared)

        return (1 + self.lambda_param * gaussian).unsqueeze(-1)

class DEFTModule(nn.Module):
    """Affine warp of the input followed by a Gaussian spatial re-weighting.

    ``LocalizationNetwork`` predicts a 2x3 affine matrix per sample; the input
    is resampled on the resulting grid and multiplied by the weights that
    ``WeightingModule`` computes from that same sampling grid.

    Args:
        input_channels: Channel count handed to ``LocalizationNetwork``.
        sigma: Gaussian width handed to ``WeightingModule``.
    """

    def __init__(self, input_channels, sigma=0.5):
        super().__init__()
        self.localization = LocalizationNetwork(input_channels)
        self.weighting = WeightingModule(sigma)

    def forward(self, x):
        """Return the warped and weighted version of ``x`` (same size as ``x``)."""
        affine = self.localization(x)
        sampling_grid = F.affine_grid(affine, x.size(), align_corners=False)
        warped = F.grid_sample(x, sampling_grid, align_corners=False)

        # The weighting output carries a trailing singleton axis; move it to
        # the channel position so the multiplication broadcasts the same
        # per-pixel weight across every channel of `warped`.
        spatial_weight = self.weighting(sampling_grid).permute(0, 3, 1, 2)
        return warped * spatial_weight



In [3]:

###########################################################
# Load ResNet50 Model for Feature Extraction
###########################################################
# Use CUDA when it is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Keep every child module of the pretrained ResNet50 except the last one
# (the classification layer), then switch to inference mode on `device`.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour
# of `weights=` - confirm against the installed version.
resnet_model = torch.nn.Sequential(*list(resnet50(pretrained=True).children())[:-1])
resnet_model = resnet_model.to(device).eval()

# DEFT module for 3-channel (RGB) input, also in inference mode.
# NOTE(review): no checkpoint is loaded into `deft_model` here, so it runs with
# its constructor-time initialisation - confirm that this is intended.
deft_model = DEFTModule(input_channels=3).to(device).eval()

###########################################################
# Define Image Transformation
###########################################################
# Resize to 224x224, convert to a tensor, then normalise each channel with the
# mean / std values listed below.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)




In [4]:
###########################################################
# Feature Extraction Function with DEFT
###########################################################
def extract_features(image_path, transform, device=device):
    """Extract ResNet50 features from a single image after DEFT transformation.

    Args:
        image_path: Path of the image file to open.
        transform: Callable applied to the RGB PIL image; its result must
            support ``.unsqueeze(0).to(device)``.
        device: Device the image tensor is moved to. Defaults to the
            notebook-level ``device``.

    Returns:
        The output of ``resnet_model`` for the DEFT-transformed image, with all
        size-1 dimensions squeezed out, as a NumPy array; ``None`` if any step
        raised (a message naming the file and the error is printed instead).
    """
    try:
        # The context manager releases the file handle deterministically, even
        # when decoding fails; convert() returns an independent image that
        # stays usable after the file is closed.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        # Transform, add a batch axis and move to the requested device
        image = transform(image).unsqueeze(0).to(device)

        # Inference only: no gradients are needed
        with torch.no_grad():
            image = deft_model(image)  # Pass through DEFT Module
            features = resnet_model(image).squeeze().cpu().numpy()  # Extract ResNet50 features

        return features
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller skip the frame
        print(f"Skipping frame: {image_path} due to error: {e}")
        return None

In [5]:
###########################################################
# Paths to RGB Directory and Label File
###########################################################
# Directory whose entries are listed and processed as frames
rgb_path = "D:/Datasets/Datasets/ADL/Frames/P_16"

# Destination CSV for the extracted features
output_csv = "../SavedFeatures/Feature_P_16_RGB_ADL_Sampled.csv"

# Annotation table; later cells read its StartFrame / EndFrame / ActionLabel /
# ActionName columns to label each frame
label_csv_path = "D:/Datasets/Datasets/ADL/Label/P_16.csv"
labels_df = pd.read_csv(label_csv_path)

In [7]:
###########################################################
# Extract Features with DEFT & Save
###########################################################
S = 40  # Sampling stride: keep every S-th frame of the sorted listing
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff")
features_list = []

# Drop non-image entries *before* striding, so that a stray file or folder in
# the frame directory cannot shift which frames end up being sampled.
frame_files = sorted(
    name for name in os.listdir(rgb_path)
    if name.lower().endswith(IMAGE_EXTENSIONS)
)
all_frames = frame_files[::S]

for frame in tqdm(all_frames, desc="Extracting Features with DEFT (RGB only)"):
    rgb_frame_path = os.path.join(rgb_path, frame)

    # extract_features returns None (after printing a message) when the image
    # could not be processed; such frames are left out of the output.
    rgb_features = extract_features(rgb_frame_path, transform, device)

    if rgb_features is not None:
        # Frame number = text between the last '_' and the following '.',
        # e.g. "frame_00035.jpg" -> 35
        frame_number = int(frame.split('_')[-1].split('.')[0])

        # Match frame to action label: interval bounds are inclusive on both
        # ends; if several intervals contain the frame, the first row wins.
        label_row = labels_df[(labels_df['StartFrame'] <= frame_number) & (labels_df['EndFrame'] >= frame_number)]

        if not label_row.empty:
            action_label = label_row.iloc[0]['ActionLabel']
            action_name = label_row.iloc[0]['ActionName']
        else:
            action_label, action_name = 0, "Unknown"  # Default if no label found

        # One output row: frame name, label id, label name, then the feature values
        features_list.append([frame, action_label, action_name] + rgb_features.tolist())



Extracting Features with DEFT (RGB only): 100%|██████████████████████████████████████| 630/630 [00:38<00:00, 16.53it/s]


In [8]:
# Handle case where no valid features were extracted
if len(features_list) == 0:
    raise ValueError("No valid features extracted. Please check the dataset paths and feature extraction function.")

# Every stored row is [Frame, ActionLabel, ActionName, <feature values...>].
# Derive the feature count from a stored row rather than from the loop variable
# `rgb_features`, which is None whenever the last sampled frame was skipped.
meta_columns = ["Frame", "ActionLabel", "ActionName"]
n_features = len(features_list[0]) - len(meta_columns)

# Create DataFrame and Save to CSV
columns = meta_columns + [f"Feature_{i}" for i in range(n_features)]
df = pd.DataFrame(features_list, columns=columns)

# Make sure the destination folder exists so the write cannot fail after the
# (slow) extraction has already finished.
output_dir = os.path.dirname(output_csv)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_csv, index=False)

print(f"Feature extraction completed! Saved to {output_csv}")

Feature extraction completed! Saved to ../SavedFeatures/Feature_P_16_RGB_ADL_Sampled.csv


In [9]:
# Read the saved feature CSV back in as a sanity check. `output_csv` is reused
# instead of repeating the path literal, so this cell always inspects the file
# the notebook writes, even if the output path is changed later.
data = pd.read_csv(output_csv)
data

Unnamed: 0,Frame,ActionLabel,ActionName,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,...,Feature_2038,Feature_2039,Feature_2040,Feature_2041,Feature_2042,Feature_2043,Feature_2044,Feature_2045,Feature_2046,Feature_2047
0,frame_00000.jpg,0,Unknown,0.104695,0.256306,0.334437,0.182553,0.507135,0.074522,0.409498,...,0.060401,0.062081,0.058476,0.145272,0.218072,0.008137,0.289075,0.072365,0.427008,0.229325
1,frame_00035.jpg,0,Unknown,0.211213,0.151590,0.117814,0.206189,0.505562,0.530281,0.458472,...,0.013891,0.025040,0.090195,0.069375,0.401190,0.044569,0.407820,0.102777,0.664284,0.188430
2,frame_00070.jpg,0,Unknown,0.107257,0.174991,0.375604,0.185740,0.319380,0.111444,0.465034,...,0.086704,0.023958,0.144146,0.182177,0.258346,0.017575,0.180100,0.062648,0.355485,0.208585
3,frame_00105.jpg,0,Unknown,0.198727,0.100481,0.351153,0.793076,0.336696,0.405042,0.121309,...,0.275317,0.076463,0.279628,0.225130,0.084788,0.063282,0.350621,0.361533,0.095574,0.491754
4,frame_00140.jpg,0,Unknown,0.038138,0.136935,0.848973,0.367541,0.274080,0.127369,0.086837,...,0.260670,0.199391,0.091241,0.216012,0.294984,0.029791,0.336792,0.181065,0.126237,0.418347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,frame_25025.jpg,0,Unknown,0.271282,0.476112,0.507719,0.628058,2.539046,0.887115,1.467239,...,0.547776,0.213527,0.165025,0.238795,0.963839,0.114923,0.679489,0.334174,0.288653,0.641225
716,frame_25060.jpg,0,Unknown,0.432795,0.261141,0.354827,0.680143,1.985080,0.077441,0.793684,...,0.685189,0.245055,1.048054,0.573189,0.792878,0.151473,0.384160,0.323690,0.809488,0.711264
717,frame_25095.jpg,0,Unknown,0.114935,0.159134,0.752584,0.557782,2.182549,0.061547,0.207901,...,0.504393,0.271641,0.159893,0.381986,0.766466,0.157465,0.248829,0.036971,0.230230,0.322887
718,frame_25130.jpg,0,Unknown,0.213673,0.155224,0.232236,0.314137,2.240488,0.227642,0.058016,...,0.574212,0.152679,0.179946,0.162456,0.402464,0.076672,0.200465,0.051658,0.460327,0.038638


In [10]:
# Show the annotation table that was loaded together with the dataset paths and
# used for the frame-to-label matching. Displaying that frame (rather than
# re-reading a second hard-coded copy of the path) guarantees that what is
# shown is exactly what the labelling used.
labels_df

Unnamed: 0,StartFrame,EndFrame,ActionLabel,ActionName
0,406,2668,3,brushing teeth
1,2755,2987,6,drying hands/face
2,3045,3596,1,combing hair
3,3625,4350,5,washing hands/face
4,4379,4669,6,drying hands/face
5,5046,6641,14,drinking water/bottle
6,6873,7656,10,washing dishes
7,7656,8874,12,making tea
8,9135,13079,16,making hot food
9,13079,14239,18,eating food/snack
