In [36]:
from torchvision import models
from torch import nn
import torch
import numpy as np
import os
from PIL import Image
from torch.utils.data import Dataset,DataLoader
import json
import shutil
import torch.nn.functional as F
from torchvision import transforms
from torchsummary import summary
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
class CustomDataset(Dataset):
    def __init__(self, data_path,transform=None):
        classes = [
  "person",
  "rider",
  "car",
  "truck",
  "bus",
  "train",
  "motor",
  "bike",
  "traffic light",
  "traffic sign"
]
        self.transform = transform
        self.data=[]
        for file_name in sorted(os.listdir(data_path)):
            img_path = data_path+"/"+file_name
            label_pth = img_path.replace("images","labels").replace(".jpg",".json")
            labels=[]
            with open(label_pth, "r") as label_file:
                label = json.load(label_file)
                objects = label["frames"][0]["objects"]
                for obj in objects:
                    if "box2d" in obj:
                        category = obj["category"]
                        category_num = classes.index(category)
                        box = obj["box2d"]
                        x1, y1 = box["x1"], box["y1"]
                        x2, y2 = box["x2"], box["y2"]
                        labels.append([category_num,x1,y1,x2,y2])
            self.data.append([img_path,labels])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path, labels = self.data[idx]
        image = Image.open(img_path).convert("RGB")

        return image, labels

class Transform(Dataset):
    def __init__(self, base_dataset, transform):
        self.base_dataset = base_dataset
        self.transform = transform

    def __len__(self):
        return len(self.base_dataset)

    def __getitem__(self, idx):
        (image_1, image_2), labels = self.base_dataset[idx]  # (image_1, image_2) ve labels al
        if self.transform:
            image_1 = self.transform(image_1)  # İlk kareye transform uygula
            image_2 = self.transform(image_2)  # İkinci kareye transform uygula
        return (image_1, image_2), labels

def custom_collate_fn(batch):
    images = [item[0] for item in batch]
    labels = [item[1] for item in batch]

    images = torch.stack(images, dim=0)  # [B, C, H, W]
    return images, labels

In [4]:
class CustomDataset(Dataset):
    def __init__(self, data_path, transform=None):
        self.classes = [
            "person", "rider", "car", "truck", "bus",
            "train", "motor", "bike", "traffic light", "traffic sign"
        ]
        self.transform = transform
        self.data = []
        
        # Get sorted list of image files
        file_names = sorted(os.listdir(data_path))
        
        # Create pairs of consecutive frames
        for i in range(len(file_names) - 1):  # -1 to ensure we have pairs
            img_path_1 = os.path.join(data_path, file_names[i])
            img_path_2 = os.path.join(data_path, file_names[i + 1])
            
            # Load labels for both frames
            label_path_1 = img_path_1.replace("images", "labels").replace(".jpg", ".json")
            label_path_2 = img_path_2.replace("images", "labels").replace(".jpg", ".json")
            
            labels_1 = []
            labels_2 = []
            
            # Load labels for first frame
            with open(label_path_1, "r") as label_file:
                label = json.load(label_file)
                objects = label["frames"][0]["objects"]
                for obj in objects:
                    if "box2d" in obj:
                        category = obj["category"]
                        category_num = self.classes.index(category)
                        box = obj["box2d"]
                        x1, y1 = box["x1"], box["y1"]
                        x2, y2 = box["x2"], box["y2"]
                        labels_1.append([category_num, x1, y1, x2, y2])
            
            # Load labels for second frame
            with open(label_path_2, "r") as label_file:
                label = json.load(label_file)
                objects = label["frames"][0]["objects"]
                for obj in objects:
                    if "box2d" in obj:
                        category = obj["category"]
                        category_num = self.classes.index(category)
                        box = obj["box2d"]
                        x1, y1 = box["x1"], box["y1"]
                        x2, y2 = box["x2"], box["y2"]
                        labels_2.append([category_num, x1, y1, x2, y2])
            
            self.data.append([(img_path_1, img_path_2), (labels_1, labels_2)])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        (img_path_1, img_path_2), (labels_1, labels_2) = self.data[idx]
        image_1 = Image.open(img_path_1).convert("RGB")
        image_2 = Image.open(img_path_2).convert("RGB")
        
        if self.transform:
            image_1 = self.transform(image_1)
            image_2 = self.transform(image_2)
        
        return (image_1, image_2), (labels_1, labels_2)

def custom_collate_fn(batch):
    # Separate images and labels for both frames
    images_1 = [item[0][0] for item in batch]  # First frame
    images_2 = [item[0][1] for item in batch]  # Second frame
    labels_1 = [item[1][0] for item in batch]  # Labels for first frame
    labels_2 = [item[1][1] for item in batch]  # Labels for second frame

    # Stack images to create [B, T, C, H, W]
    images_1 = torch.stack(images_1, dim=0)  # [B, C, H, W]
    images_2 = torch.stack(images_2, dim=0)  # [B, C, H, W]
    images = torch.stack([images_1, images_2], dim=1)  # [B, T=2, C, H, W]

    return images, (labels_1, labels_2)

In [5]:
#data_path = "/content/drive/MyDrive/BDD100k/images" #colab
data_path = "C:/Users/Mehmet/Desktop/BDD100k/images" #lcoal
train_path = data_path+"/train"
validation_path = data_path+"/val"
test_path = data_path+"/test"

image_size=300
batch_size=1

transform = transforms.Compose([
    transforms.Resize((image_size, image_size)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

#train_dataset = Transform(CustomDataset(train_path),transform)
#test_dataset  = Transform(CustomDataset(test_path),transform)
val_dataset   = Transform(CustomDataset(validation_path),transform)

#train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=0,collate_fn=custom_collate_fn)
#test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,pin_memory=True, num_workers=0,collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=0,collate_fn=custom_collate_fn)

In [37]:
class SpatialAttention(nn.Module):
    def __init__(self, kernel_size=7):
        super(SpatialAttention, self).__init__()
        padding = (kernel_size - 1) // 2
        self.conv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        avg_out = torch.mean(x, dim=1, keepdim=True)
        max_out, _ = torch.max(x, dim=1, keepdim=True)
        x_cat = torch.cat([avg_out, max_out], dim=1)
        x_out = self.conv1(x_cat)
        attention_map = self.sigmoid(x_out)
        return x * attention_map

class EncoderBackBone(nn.Module):
    def __init__(self):
        super(EncoderBackBone,self).__init__()
        efficient = models.efficientnet_b3()
        self.features = efficient.features
        self.SAttention=SpatialAttention()
    def forward(self,x):         # x B,T,C,H,W
        x_t0 = x[:, 0, :, :, :]  # İlk kare: [B, C, H, W]
        x_t1 = x[:, 1, :, :, :]
        outs = []
        x_t0_out = x_t0
        x_t1_out = x_t1
        # t0 için özellikler
        for i, block in enumerate(self.features):
            x_t0_out = block(x_t0_out)
            x_t1_out = block(x_t1_out)
            if i > 2:  # C3’ten sonrası için Spatial Attention
                x_t0_out = x_t0_out * self.SAttention(x_t0_out)
                x_t1_out = x_t1_out * self.SAttention(x_t1_out)
            if i in [3, 5, 7]:  #object detection tutarlılık için sadece t0 frame C3, C4, C5 ları ekleniyor 
                
                out = F.interpolate(x_t0_out, size=256, mode='bilinear', align_corners=False)
                outs.append(out)
        x_out = torch.cat([x_t0_out, x_t1_out], dim=1)
          
        return [x_out, outs]   #torch.Size([32, 3072, 8, 8])      3,32,48,32,32

class DepthwiseSeparableConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super(DepthwiseSeparableConv, self).__init__()
        self.depthwise = nn.Conv2d(in_channels, in_channels, kernel_size,
                                 stride, padding, groups=in_channels, bias=False)
        self.pointwise = nn.Conv2d(in_channels, out_channels, 1, bias=False)
        self.bn = nn.BatchNorm2d(out_channels)
        self.swish = nn.SiLU()

    def forward(self, x):
        x = self.depthwise(x)
        x = self.pointwise(x)
        x = self.bn(x)
        return self.swish(x)

class BiFPNBlock(nn.Module):
    def __init__(self, channels, epsilon=1e-4):
        super(BiFPNBlock, self).__init__()
        self.epsilon = epsilon
        self.channels = channels

        # Convolution layers for each level
        self.conv_p3 = DepthwiseSeparableConv(channels, channels)
        self.conv_p4 = DepthwiseSeparableConv(channels, channels)
        self.conv_p5 = DepthwiseSeparableConv(channels, channels)
        self.conv_p6 = DepthwiseSeparableConv(channels, channels)
        self.conv_p7 = DepthwiseSeparableConv(channels, channels)

        # Weight parameters for feature fusion
        self.w1 = nn.Parameter(torch.ones(2))
        self.w2 = nn.Parameter(torch.ones(2))
        self.w3 = nn.Parameter(torch.ones(2))
        self.w4 = nn.Parameter(torch.ones(2))
        self.w5 = nn.Parameter(torch.ones(3))
        self.w6 = nn.Parameter(torch.ones(3))
        self.w7 = nn.Parameter(torch.ones(3))
        self.w8 = nn.Parameter(torch.ones(2))

    def forward(self, inputs):
        P3, P4, P5, P6, P7 = inputs

        
        # Bottom-up pathway
        w1 = F.relu(self.w1)
        P6_td = (w1[0] * P6 + w1[1] * self.up_sampling(P7, P6.shape[-2:])) / (w1.sum() + self.epsilon)
        P6_td = self.conv_p6(P6_td)

        w2 = F.relu(self.w2)
        P5_td = (w2[0] * P5 + w2[1] * self.up_sampling(P6_td, P5.shape[-2:])) / (w2.sum() + self.epsilon)
        P5_td = self.conv_p5(P5_td)

        w3 = F.relu(self.w3)
        P4_td = (w3[0] * P4 + w3[1] * self.up_sampling(P5_td, P4.shape[-2:])) / (w3.sum() + self.epsilon)
        P4_td = self.conv_p4(P4_td)

        # Top-down pathway
        w4 = F.relu(self.w4)
        P3_out = (w4[0] * P3 + w4[1] * self.up_sampling(P4_td, P3.shape[-2:])) / (w4.sum() + self.epsilon)
        P3_out = self.conv_p3(P3_out)

        

        w5 = F.relu(self.w5)
        P4_out = (w5[0] * P4 + w5[1] * P4_td + w5[2] * self.down_sampling(P3_out, P4.shape[-2:])) / (w5.sum() + self.epsilon)
        P4_out = self.conv_p4(P4_out)

        w6 = F.relu(self.w6)
        P5_out = (w6[0] * P5 + w6[1] * P5_td + w6[2] * self.down_sampling(P4_out, P5.shape[-2:])) / (w6.sum() + self.epsilon)
        P5_out = self.conv_p5(P5_out)

        w7 = F.relu(self.w7)
        P6_out = (w7[0] * P6 + w7[1] * P6_td + w7[2] * self.down_sampling(P5_out, P6.shape[-2:])) / (w7.sum() + self.epsilon)
        P6_out = self.conv_p6(P6_out)

        w8 = F.relu(self.w8)
        P7_out = (w8[0] * P7 + w8[1] * self.down_sampling(P6_out, P7.shape[-2:])) / (w8.sum() + self.epsilon)
        P7_out = self.conv_p7(P7_out)

        return [P3_out, P4_out, P5_out, P6_out, P7_out]

    def up_sampling(self, x, target_size):
        return F.interpolate(x, size=target_size, mode='nearest')

    def down_sampling(self, x, target_size):
        if x.shape[-2:] == target_size:
            return x
        stride = x.shape[-1] // target_size[-1]
        kernel_size = stride
        return F.max_pool2d(x, kernel_size=kernel_size, stride=stride)

class BiFPN(nn.Module):
    def __init__(self, in_channels_list, out_channels=256, num_blocks=3):
        super(BiFPN, self).__init__()
        self.out_channels = out_channels
        self.num_blocks = num_blocks

        # Input projection layers
        self.input_convs = nn.ModuleList([
            nn.Conv2d(in_ch, out_channels, 1, bias=False)
            for in_ch in in_channels_list
        ])

        # Additional P6 and P7 layers
        self.p6_conv = nn.Conv2d(in_channels_list[-1], out_channels, 3, stride=2, padding=1)
        self.p7_conv = nn.Conv2d(out_channels, out_channels, 3, stride=2, padding=1)

        # BiFPN blocks
        self.bifpn_blocks = nn.ModuleList([
            BiFPNBlock(out_channels) for _ in range(num_blocks)
        ])

    def forward(self, inputs):
        # Project input features
        features = []
        for i, feat in enumerate(inputs):
            features.append(self.input_convs[i](feat))

        # Create P6 and P7
        P6 = self.p6_conv(inputs[-1])
        P7 = self.p7_conv(P6)

        # Initial feature list
        pyramid_features = features + [P6, P7]

        # Apply BiFPN blocks
        for block in self.bifpn_blocks:
            pyramid_features = block(pyramid_features)

        return pyramid_features


  
class BiFPNDepthMap(nn.Module):
    def __init__(self):
        super(BiFPNDepthMap, self).__init__()
        
        self.conv1x1 = nn.Conv2d(256, 1, kernel_size=1)
        
    def forward(self, bifpn_features):
        H_max = max(feat.shape[2] for feat in bifpn_features)  
        W_max = max(feat.shape[3] for feat in bifpn_features)  

        depth_maps = []
        for feature in bifpn_features:
            depth = self.conv1x1(feature)
            upsampled = F.interpolate(depth, size=(H_max, W_max), mode='bilinear', align_corners=False)  # [B, 1, H_max, W_max]
            depth_maps.append(upsampled)

        dep_map = torch.cat(depth_maps, dim=1)
        dep_map = torch.mean(dep_map, dim=1, keepdim=True)  

        return dep_map

class PoseNetwork(nn.Module):
    def __init__(self, backbone_channels=384, bifpn_channels=128):  # EfficientNet-B3 C5 channels + BiFPN features
        super(PoseNetwork, self).__init__()
        
        # Backbone features processing
        self.backbone_conv = nn.Sequential(
            nn.Conv2d(backbone_channels, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten()
        )
        self.depth_conv = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten()
        )
        # BiFPN + Backbone fusion
        self.fusion_fc = nn.Sequential(
            nn.Linear(256, 256),  # Backbone + BiFPN features
            nn.ReLU(inplace=True),
            nn.Linear(256, 128),
            nn.ReLU(inplace=True)
        )
        
        # Pose prediction: 6 DOF
        self.pose_head = nn.Linear(128, 6)
        
    def forward(self, backbone_features, bifpn_downsampled):
        backbone_feat = self.backbone_conv(backbone_features)  # [B, 128]
        depth_feat = self.depth_conv(bifpn_downsampled)
        # Fuse backbone and BiFPN features
        fused = torch.cat([backbone_feat, depth_feat], dim=1)  # [B, 256]
        fused = self.fusion_fc(fused)  # [B, 128]
        
        # Pose prediction
        pose = self.pose_head(fused)  # [B, 6]
        
        return pose

class SharedMultiScaleModel(nn.Module):
    """Detection and Depth heads"""
    def __init__(self, num_classes, in_channels=256):
        super().__init__()
        
        # Shared processing
        self.shared_conv = nn.Conv2d(in_channels, 128, 3, padding=1)
        
        # Task heads
        self.depth_head = nn.Sequential(
            nn.Conv2d(128, 64, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 1, 1),
            nn.Sigmoid()
        )
        
        self.cls_head = nn.Sequential(
            nn.Conv2d(128, 64, 3, padding=1), 
            nn.ReLU(inplace=True),
            nn.Conv2d(64, num_classes, 1)
        )
        
        self.reg_head = nn.Sequential(
            nn.Conv2d(128, 64, 3, padding=1),
            nn.ReLU(inplace=True), 
            nn.Conv2d(64, 4, 1)
        )
        
        # Learnable fusion weights
        self.depth_weights = nn.Parameter(torch.ones(5))
        self.cls_weights = nn.Parameter(torch.ones(5))
        self.reg_weights = nn.Parameter(torch.ones(5))
        
    def weighted_fusion(self, features, weights, target_size):
        weights = F.softmax(weights, dim=0)
        
        fused = None
        for feat, weight in zip(features, weights):
            if feat.shape[2:] != target_size:
                feat = F.interpolate(feat, size=target_size, mode='bilinear', align_corners=False)
            
            if fused is None:
                fused = weight * feat
            else:
                fused += weight * feat
                
        return fused
    
    def forward(self, x):
        # Shared processing
        processed_features = [self.shared_conv(feat) for feat in x]
        
        # Predictions for each level
        depth_preds = [self.depth_head(feat) for feat in processed_features]
        cls_preds = [self.cls_head(feat) for feat in processed_features]  
        reg_preds = [self.reg_head(feat) for feat in processed_features]
        
        # Target size (P3)
        target_size = depth_preds[0].shape[2:]
        
        # Weighted fusion
        final_depth = self.weighted_fusion(depth_preds, self.depth_weights, target_size)
        final_cls = self.weighted_fusion(cls_preds, self.cls_weights, target_size)
        final_reg = self.weighted_fusion(reg_preds, self.reg_weights, target_size)
        
        return {
            'depth': final_depth,
            'classification': final_cls,
            'regression': final_reg
        }

class BackwardWarping(nn.Module):
    """
    Backward warping using predicted depth and pose
    Makaledeki yaklaşıma göre: önceki frame'den current frame'i synthesize et
    """
    def __init__(self):
        super(BackwardWarping, self).__init__()
        
    def forward(self, img_prev, depth_curr, pose, K):
        """
        Args:
            img_prev: Previous frame [B, 3, H, W]
            depth_curr: Current depth [B, 1, H, W] 
            pose: Camera pose [B, 6] (tx, ty, tz, rx, ry, rz)
            K: Camera intrinsic matrix [B, 3, 3]
        Returns:
            warped_img: Synthesized current frame [B, 3, H, W]
            valid_mask: Valid pixel mask [B, 1, H, W]
        """
        B, _, H, W = img_prev.shape
        device = img_prev.device
        
        # Depth'i original image size'a resize et
        if depth_curr.shape[2:] != (H, W):
            depth_curr = F.interpolate(depth_curr, size=(H, W), mode='bilinear', align_corners=False)
        
        # Create pixel coordinates
        i, j = torch.meshgrid(torch.arange(H), torch.arange(W), indexing='ij')
        ones = torch.ones_like(i)
        pixel_coords = torch.stack([j, i, ones], dim=0).float().to(device)
        pixel_coords = pixel_coords.unsqueeze(0).repeat(B, 1, 1, 1)
        
        # Convert pose to transformation matrix
        T = self.pose_to_matrix(pose)
        
        # Inverse camera intrinsics
        K_inv = torch.inverse(K)
        
        # Back-project to 3D (current frame coordinates)
        cam_coords = torch.matmul(K_inv.unsqueeze(-1).unsqueeze(-1), 
                                pixel_coords.unsqueeze(-1))
        cam_coords = cam_coords.squeeze(-1) * depth_curr
        
        # Add homogeneous coordinate
        ones = torch.ones(B, 1, H, W).to(device)
        cam_coords_hom = torch.cat([cam_coords, ones], dim=1)
        
        # Transform to previous camera frame coordinates
        cam_coords_prev = torch.matmul(T.unsqueeze(-1).unsqueeze(-1), 
                                     cam_coords_hom.unsqueeze(-1))
        cam_coords_prev = cam_coords_prev.squeeze(-1)[:, :3]
        
        # Project back to 2D (previous frame pixel coordinates)
        pixel_coords_prev = torch.matmul(K.unsqueeze(-1).unsqueeze(-1), 
                                       cam_coords_prev.unsqueeze(-1))
        pixel_coords_prev = pixel_coords_prev.squeeze(-1)
        
        # Normalize by depth (perspective division)
        Z = pixel_coords_prev[:, 2:3]
        pixel_coords_prev = pixel_coords_prev[:, :2] / (Z + 1e-7)
        
        # Create valid mask (pixels that are within image bounds)
        valid_mask = ((pixel_coords_prev[:, 0:1] >= 0) & 
                     (pixel_coords_prev[:, 0:1] < W) &
                     (pixel_coords_prev[:, 1:2] >= 0) & 
                     (pixel_coords_prev[:, 1:2] < H) &
                     (Z > 0)).float()
        
        # Normalize coordinates to [-1, 1] for grid_sample
        pixel_coords_prev[:, 0] = 2 * pixel_coords_prev[:, 0] / (W - 1) - 1
        pixel_coords_prev[:, 1] = 2 * pixel_coords_prev[:, 1] / (H - 1) - 1
        
        # Rearrange for grid_sample [B, H, W, 2]
        grid = pixel_coords_prev.permute(0, 2, 3, 1)
        
        # Warp the previous image to synthesize current view
        warped_img = F.grid_sample(img_prev, grid, mode='bilinear', 
                                 padding_mode='zeros', align_corners=True)
        
        # Apply valid mask
        warped_img = warped_img * valid_mask
        
        return warped_img, valid_mask
    
    def pose_to_matrix(self, pose):
        """Convert 6DOF pose to 4x4 transformation matrix"""
        B = pose.shape[0]
        device = pose.device
        
        # Translation
        t = pose[:, :3]
        
        # Rotation (axis-angle)
        r = pose[:, 3:]
        
        # Convert axis-angle to rotation matrix
        angle = torch.norm(r, dim=1, keepdim=True)
        axis = r / (angle + 1e-7)
        
        cos_angle = torch.cos(angle)
        sin_angle = torch.sin(angle)
        
        # Rodrigues' formula
        K = torch.zeros(B, 3, 3).to(device)
        K[:, 0, 1] = -axis[:, 2]
        K[:, 0, 2] = axis[:, 1]
        K[:, 1, 0] = axis[:, 2]
        K[:, 1, 2] = -axis[:, 0]
        K[:, 2, 0] = -axis[:, 1]
        K[:, 2, 1] = axis[:, 0]
        
        I = torch.eye(3).unsqueeze(0).repeat(B, 1, 1).to(device)
        R = I + sin_angle.unsqueeze(-1) * K + (1 - cos_angle).unsqueeze(-1) * torch.matmul(K, K)
        
        # Create 4x4 transformation matrix
        T = torch.zeros(B, 4, 4).to(device)
        T[:, :3, :3] = R
        T[:, :3, 3] = t
        T[:, 3, 3] = 1
        
        return T
    
    def pose_to_matrix(self, pose):
        """Convert 6DOF pose to 4x4 transformation matrix"""
        B = pose.shape[0]
        device = pose.device
        
        # Translation
        t = pose[:, :3]
        
        # Rotation (axis-angle)
        r = pose[:, 3:]
        
        # Convert axis-angle to rotation matrix
        angle = torch.norm(r, dim=1, keepdim=True)
        axis = r / (angle + 1e-7)
        
        cos_angle = torch.cos(angle)
        sin_angle = torch.sin(angle)
        
        # Rodrigues' formula
        K = torch.zeros(B, 3, 3).to(device)
        K[:, 0, 1] = -axis[:, 2]
        K[:, 0, 2] = axis[:, 1]
        K[:, 1, 0] = axis[:, 2]
        K[:, 1, 2] = -axis[:, 0]
        K[:, 2, 0] = -axis[:, 1]
        K[:, 2, 1] = axis[:, 0]
        
        I = torch.eye(3).unsqueeze(0).repeat(B, 1, 1).to(device)
        R = I + sin_angle.unsqueeze(-1) * K + (1 - cos_angle).unsqueeze(-1) * torch.matmul(K, K)
        
        # Create 4x4 transformation matrix
        T = torch.zeros(B, 4, 4).to(device)
        T[:, :3, :3] = R
        T[:, :3, 3] = t
        T[:, 3, 3] = 1
        
        return T

class CompleteMultiTaskModel(nn.Module):
    """Complete model following your specified architecture"""
    def __init__(self, num_classes=10, bifpn_channels=256, bifpn_blocks=3):
        super(CompleteMultiTaskModel, self).__init__()
        
        # Backbone encoder
        self.encoder = EncoderBackBone()
        
        # EfficientNet-B3 channel dimensions (approximate values)
        # Block 3: ~48, Block 5: ~136, Block 7: ~384
        in_channels_list = [48, 136, 384]
        
        # BiFPN for multi-scale features
        self.bifpn = BiFPN(in_channels_list, bifpn_channels, bifpn_blocks)
        
        # BiFPN downsampling for pose network
        self.bifpn_downsampling = BiFPNDepthMap(bifpn_channels, 128)
        
        # Pose network (uses backbone + downsampled BiFPN features)
        self.pose_network = PoseNetwork(backbone_channels=384, bifpn_channels=128)
        
        # Detection and Depth heads
        self.detection_head = SharedMultiScaleModel(num_classes, bifpn_channels)
        
        # Backward warping
        self.backward_warping = BackwardWarping()
        
    def forward(self, img_curr, img_prev=None, K=None):
        
        backbone_features = self.encoder(img_curr, forPose=False)  # [C3, C4, C5]
        bifpn_features = self.bifpn(backbone_features)  # [P3, P4, P5, P6, P7]
        

        depth_map0 = self.bifpn_downsampling(bifpn_features)  # [B, 128]
        

        pose = self.pose_network(backbone_features, depth_map0)



        detection_outputs = self.detection_head(bifpn_features)
        
        # Outputs
        outputs = {
            'depth': detection_outputs['depth'],
            'classification': detection_outputs['classification'],
            'regression': detection_outputs['regression'],
            'pose': pose
        }
        
        # 6. Backward warping (if previous frame provided)
        if img_prev is not None and K is not None:
            warped_img = self.backward_warping(img_prev, detection_outputs['depth'], pose, K)
            outputs['warped_img'] = warped_img
        
        return outputs


   

In [None]:
class SelfSupervisedLoss(nn.Module):
    """
    Makaledeki self-supervised loss functions
    """
    def __init__(self, alpha=0.15, gamma=0.001):
        super(SelfSupervisedLoss, self).__init__()
        self.alpha = alpha  # SSIM loss weight (τ in paper)
        self.gamma = gamma  # smoothness loss weight (γ in paper)
        
    def photometric_loss(self, img_curr, img_warped, valid_mask=None):
        """L1 photometric loss between current and warped image"""
        diff = torch.abs(img_curr - img_warped)
        if valid_mask is not None:
            diff = diff * valid_mask
            loss = diff.sum() / (valid_mask.sum() + 1e-7)
        else:
            loss = diff.mean()
        return loss
    
    def ssim_loss(self, img_curr, img_warped, valid_mask=None):
        """SSIM loss between current and warped image"""
        def ssim(x, y):
            C1 = 0.01**2
            C2 = 0.03**2
            
            mu_x = F.avg_pool2d(x, 3, 1, 1)
            mu_y = F.avg_pool2d(y, 3, 1, 1)
            
            sigma_x = F.avg_pool2d(x**2, 3, 1, 1) - mu_x**2
            sigma_y = F.avg_pool2d(y**2, 3, 1, 1) - mu_y**2
            sigma_xy = F.avg_pool2d(x*y, 3, 1, 1) - mu_x*mu_y
            
            num = (2*mu_x*mu_y + C1) * (2*sigma_xy + C2)
            den = (mu_x**2 + mu_y**2 + C1) * (sigma_x + sigma_y + C2)
            
            return num / den
        
        ssim_map = ssim(img_curr, img_warped)
        if valid_mask is not None:
            # Valid mask'i SSIM boyutuna resize et
            valid_resized = F.interpolate(valid_mask, size=ssim_map.shape[2:], 
                                        mode='bilinear', align_corners=False)
            ssim_map = ssim_map * valid_resized
            loss = (1 - ssim_map).sum() / (valid_resized.sum() + 1e-7)
        else:
            loss = (1 - ssim_map).mean()
        return loss
    
    def smoothness_loss(self, depth, img):
        """Edge-aware smoothness loss for depth"""
        def gradient(x):
            h_grad = torch.abs(x[:, :, :, :-1] - x[:, :, :, 1:])
            v_grad = torch.abs(x[:, :, :-1, :] - x[:, :, 1:, :])
            return h_grad, v_grad
        
        # Normalize depth
        depth_mean = depth.mean(dim=[1, 2, 3], keepdim=True)
        depth_norm = depth / (depth_mean + 1e-7)
        
        # Depth gradients
        depth_grad_h, depth_grad_v = gradient(depth_norm)
        
        # Image gradients (for edge-aware weighting)
        img_grad_h, img_grad_v = gradient(img)
        img_grad_h = img_grad_h.mean(dim=1, keepdim=True)
        img_grad_v = img_grad_v.mean(dim=1, keepdim=True)
        
        # Edge-aware smoothness
        smooth_h = depth_grad_h * torch.exp(-img_grad_h)
        smooth_v = depth_grad_v * torch.exp(-img_grad_v)
        
        return smooth_h.mean() + smooth_v.mean()
    
    def forward(self, img_curr, img_warped, depth, valid_mask=None):
        """
        Combined self-supervised loss as in the paper
        """
        # Photometric loss
        L_ph = self.photometric_loss(img_curr, img_warped, valid_mask)
        
        # SSIM loss
        L_ssim = self.ssim_loss(img_curr, img_warped, valid_mask)
        
        # Smoothness loss
        L_smooth = self.smoothness_loss(depth, img_curr)
        
        # Combined loss (equation 6 in paper)
        # μ is auto-mask, simplified here as valid_mask consideration
        L_total = self.alpha * L_ph + (1 - self.alpha) * L_ssim + self.gamma * L_smooth
        
        return {
            'total': L_total,
            'photometric': L_ph,
            'ssim': L_ssim,
            'smoothness': L_smooth
        }
    
def create_scaled_camera_matrix(original_height, original_width, 
                              target_height, target_width, batch_size=1):
    """
    BDD100K orijinal boyuttan hedef boyuta ölçeklenmiş K matrisi
    """
    # Orijinal BDD100K için automotive kamera parametreleri
    fov_rad = np.radians(70)  # Automotive kameralar için tipik FOV
    focal_length = original_width / (2.0 * np.tan(fov_rad / 2.0))
    
    # Orijinal K matrisi
    K_original = torch.zeros(batch_size, 3, 3)
    K_original[:, 0, 0] = focal_length
    K_original[:, 1, 1] = focal_length
    K_original[:, 0, 2] = original_width / 2
    K_original[:, 1, 2] = original_height / 2
    K_original[:, 2, 2] = 1.0
    
    # Ölçekleme faktörleri
    scale_x = target_width / original_width
    scale_y = target_height / original_height
    
    # K matrisini ölçekle
    K_scaled = K_original.clone()
    K_scaled[:, 0, 0] *= scale_x  # fx
    K_scaled[:, 1, 1] *= scale_y  # fy
    K_scaled[:, 0, 2] *= scale_x  # cx
    K_scaled[:, 1, 2] *= scale_y  # cy
    
    return K_scaled

def get_bdd100k_camera_matrix(target_height=300, target_width=300, batch_size=1):
    """BDD100K için önerilen K matrisi"""
    return create_scaled_camera_matrix(720, 1280,  # BDD100K orijinal boyut
                                     target_height, target_width, 
                                     batch_size)

In [None]:
efficient_backbone = EncoderBackBone()
bifpn = BiFPN([48,136,384],256,3)
bifpn_depthmap=BiFPNDepthMap()
pose_network = PoseNetwork(backbone_channels=3072,bifpn_channels=256)
backward_warping = BackwardWarping()
pred_labels=[]
K = get_bdd100k_camera_matrix(300, 300, 1).cuda()
for images, (labels_1, labels_2) in val_loader:
    
    x = efficient_backbone(images)
    bifpn_out=bifpn(x[1])
    depthmap_out=bifpn_depthmap(bifpn_out)
    pose_network_out = pose_network(x[0],depthmap_out)
    print("Poz bilgisi boyutu : "+str(pose_network_out.shape))
    print("Depth map boyutu : "+str(depthmap_out.shape))
    print("1.Frame boyutu : "+str(images[:,:1:,:].shape))
    print("K matris boyutu : "+str(K.shape()))
    syhntez_depth_map = backward_warping(images[:,:1:,:],depthmap_out,pose_network_out,K)
    print(syhntez_depth_map.shape)
    pred_labels.append(pose_network_out)
    break

Poz bilgisi boyutu : torch.Size([1, 6])
Depth map boyutu : torch.Size([1, 1, 256, 256])
1.Frame boyutu : torch.Size([1, 1, 3, 300, 300])


In [None]:
#            img_prev: Previous frame [B, 3, H, W]
#            depth_curr: Current depth [B, 1, H, W]
#            pose: Camera pose [B, 6] (tx, ty, tz, rx, ry, rz)
#            K: Camera intrinsic matrix [B, 3, 3]