In [75]:
from torchvision import models
from torch import nn
import torch
from torchsummary import summary
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class SpatialAttention(nn.Module):
    """Spatial attention gate applied to a feature map.

    The input is pooled along the channel axis with both a mean and a max,
    the two pooled maps are stacked and passed through a single bias-free
    convolution, and the sigmoid of that result is used to rescale every
    channel of the input.

    Args:
        kernel_size: Side length of the square convolution kernel. The
            padding is derived as ``(kernel_size - 1) // 2``.
    """

    def __init__(self, kernel_size=7):
        super().__init__()

        # Two input channels: one for the channel-wise mean, one for the max.
        self.conv1 = nn.Conv2d(
            in_channels=2,
            out_channels=1,
            kernel_size=kernel_size,
            padding=(kernel_size - 1) // 2,
            bias=False,
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` multiplied element-wise by its spatial attention map."""
        channel_mean = x.mean(dim=1, keepdim=True)  # [B, 1, H, W]
        channel_max = x.max(dim=1, keepdim=True).values  # [B, 1, H, W]

        pooled = torch.cat((channel_mean, channel_max), dim=1)  # [B, 2, H, W]

        # The single-channel gate broadcasts over the channel axis of x.
        gate = self.sigmoid(self.conv1(pooled))  # [B, 1, H, W]
        return x * gate
    

class EncoderBackBone(nn.Module):
    """EfficientNet-B3 feature extractor with spatial attention on later stages.

    The backbone output is consumed in two ways: a BiFPN receives the
    feature maps of the later stages, while pose estimation only needs the
    final feature map.

    A single ``SpatialAttention`` instance is shared by every stage it is
    applied to.
    """

    def __init__(self):
        # nn.Module must be initialised before any sub-module is assigned;
        # without this call the assignments below raise AttributeError.
        super().__init__()
        efficient = models.efficientnet_b3()
        self.features = efficient.features
        self.SAttention = SpatialAttention()

    def forward(self, x, forPose):
        """Run the backbone and return the output matching the consumer.

        Args:
            x: Input tensor fed to the first stage of ``self.features``.
            forPose: When truthy, return only the final feature map (used
                for pose estimation). Otherwise return the list of
                per-stage feature maps destined for the BiFPN.

        Returns:
            The final feature map if ``forPose`` is truthy, else a list of
            the outputs of every stage whose 0-based index is greater
            than 4, in stage order.
        """
        outs = []
        for i, block in enumerate(self.features):
            x = block(x)
            if i > 3:
                x = self.SAttention(x)
            if i > 4:
                outs.append(x)  # These feature maps are sent to the BiFPN.

        # Pose estimation only needs the last output; the original branch
        # was inverted and handed the BiFPN list to the pose path.
        if forPose:
            return x
        return outs
                


        
        