In [2]:
import numpy as np

import torch
import torchvision
import torchvision.transforms as transforms

import torch.nn as nn
import torch.nn.functional as F

from torchsummary import summary

torch.__version__

'1.8.2+cu111'

In [5]:
class VggBackbone(nn.Module):
    def __init__(self, init_weight):    
        super(VggBackbone, self).__init__()
        vgg_dict = self._make_vgg_dict(init_weight)
        self.conv1 = nn.Sequential(*vgg_dict['conv1'])
        self.conv2 = nn.Sequential(*vgg_dict['conv2'])
        self.conv3 = nn.Sequential(*vgg_dict['conv3'])
        self.conv4 = nn.Sequential(*vgg_dict['conv4'])
        self.conv5 = nn.Sequential(*vgg_dict['conv5'])
        if init_weight:
            self.conv1.apply(self._init_weight)
            self.conv2.apply(self._init_weight)
            self.conv3.apply(self._init_weight)
            self.conv4.apply(self._init_weight)
            self.conv5.apply(self._init_weight)
        else :
            self.conv5.apply(self._init_weight)
    
    def forward(self, x):
        out = self.conv1(x)      
        out = self.conv2(out)
        out = self.conv3(out)
        out = self.conv4(out)
        out = self.conv5(out)
        return out
    
    def _make_vgg_dict(self, shape_stream):
        vgg = torch.hub.load('pytorch/vision:v0.10.0', 'vgg16', pretrained=True)
        tmp_list = []
        vgg_dict = {}
        for name, child in vgg._modules['features'].named_children():
            if not isinstance(child, nn.MaxPool2d):
                tmp_list.append(child)
            else :
                num = len(vgg_dict)+1
                if num == 5 and shape_stream:
                    pass
                else:
                    tmp_list.append(child)
                vgg_dict[f"conv{num}"] = tmp_list
                tmp_list = []
        return vgg_dict
    
    def _init_weight(self, layer):
        if isinstance(layer, nn.Conv2d):
            torch.nn.init.kaiming_normal_(layer.weight)
            torch.nn.init.zeros_(layer.bias)

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vgg_texture = VggBackbone(init_weight=True).to(device)

Using cache found in /home/jaeho/.cache/torch/hub/pytorch_vision_v0.10.0


In [9]:
summary(vgg_texture, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 224, 224]           1,792
              ReLU-2         [-1, 64, 224, 224]               0
            Conv2d-3         [-1, 64, 224, 224]          36,928
              ReLU-4         [-1, 64, 224, 224]               0
         MaxPool2d-5         [-1, 64, 112, 112]               0
            Conv2d-6        [-1, 128, 112, 112]          73,856
              ReLU-7        [-1, 128, 112, 112]               0
            Conv2d-8        [-1, 128, 112, 112]         147,584
              ReLU-9        [-1, 128, 112, 112]               0
        MaxPool2d-10          [-1, 128, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]         295,168
             ReLU-12          [-1, 256, 56, 56]               0
           Conv2d-13          [-1, 256, 56, 56]         590,080
             ReLU-14          [-1, 256,

---

In [10]:
dummy_tensor = torch.rand((3, 224, 224))
dummy_tensor = torch.unsqueeze(dummy_tensor, 0)
dummy_tensor.shape

torch.Size([1, 3, 224, 224])

In [15]:
dummy_tensor = dummy_tensor.to(device)

In [16]:
output = vgg_texture(dummy_tensor)

In [17]:
output.shape

torch.Size([1, 512, 14, 14])

In [8]:
shape_dummy = torch.rand(output.shape)

In [9]:
shape_dummy.shape

torch.Size([1, 512, 7, 7])

In [10]:
concated_tensor = torch.cat((output, shape_dummy), dim=1)

In [11]:
concated_tensor.shape

torch.Size([1, 1024, 7, 7])

In [12]:
conv1 = nn.Conv2d(1024, 2048, 3, padding=0)

In [13]:
out_tensor_1 = conv1(concated_tensor)
out_tensor_1.shape

torch.Size([1, 2048, 5, 5])

In [14]:
conv2 = nn.Conv2d(2048, 4096, 3, padding=1)
out_tensor_2 = conv2(out_tensor_1)
out_tensor_2.shape

torch.Size([1, 4096, 5, 5])

In [15]:
gap = nn.AdaptiveAvgPool2d((1, 1))

In [16]:
after_gap = gap(out_tensor_2)
after_gap.squeeze_()
after_gap.shape

torch.Size([4096])

In [17]:
clothes_fc = nn.Linear(4096, 46)
clothes_activation = nn.Softmax(dim=0)
clotehs_out = clothes_activation(clothes_fc(after_gap))
clotehs_out

tensor([0.0205, 0.0201, 0.0180, 0.0244, 0.0185, 0.0255, 0.0228, 0.0220, 0.0217,
        0.0212, 0.0227, 0.0205, 0.0204, 0.0252, 0.0221, 0.0230, 0.0220, 0.0227,
        0.0197, 0.0208, 0.0207, 0.0185, 0.0183, 0.0215, 0.0234, 0.0216, 0.0222,
        0.0227, 0.0228, 0.0225, 0.0229, 0.0203, 0.0228, 0.0242, 0.0205, 0.0221,
        0.0248, 0.0236, 0.0224, 0.0231, 0.0225, 0.0215, 0.0213, 0.0198, 0.0209,
        0.0192], grad_fn=<SoftmaxBackward>)

In [18]:
att_fc = nn.Linear(4096, 1000)
att_activation = nn.Sigmoid()
att_out = att_activation(att_fc(after_gap))
att_out

tensor([0.5032, 0.5157, 0.4925, 0.5019, 0.4940, 0.4842, 0.5019, 0.4811, 0.4824,
        0.4944, 0.4684, 0.4843, 0.4719, 0.4791, 0.5011, 0.4741, 0.5113, 0.5166,
        0.5119, 0.5104, 0.4751, 0.5072, 0.5118, 0.4898, 0.5053, 0.4921, 0.4925,
        0.4767, 0.5164, 0.5013, 0.4779, 0.5083, 0.4975, 0.4861, 0.4722, 0.4975,
        0.4870, 0.4942, 0.5044, 0.4969, 0.4822, 0.5265, 0.5067, 0.4924, 0.5046,
        0.4583, 0.4873, 0.4996, 0.5122, 0.5231, 0.4794, 0.4688, 0.5499, 0.4700,
        0.4797, 0.4974, 0.4650, 0.5051, 0.4929, 0.4978, 0.4637, 0.4863, 0.5215,
        0.5324, 0.4920, 0.5035, 0.5001, 0.4788, 0.5065, 0.4948, 0.4791, 0.5069,
        0.4895, 0.4767, 0.4660, 0.5072, 0.4874, 0.4903, 0.5048, 0.5190, 0.4886,
        0.5159, 0.4962, 0.5064, 0.5223, 0.4637, 0.5135, 0.5140, 0.5034, 0.5080,
        0.5043, 0.5072, 0.5244, 0.4984, 0.5057, 0.4758, 0.5261, 0.5256, 0.4936,
        0.5163, 0.5253, 0.5090, 0.4965, 0.5090, 0.5232, 0.5076, 0.5021, 0.4749,
        0.4939, 0.5005, 0.5100, 0.5141, 

---

앞에서 다 tensor를 넣으면서 다 확인..
위에서 만든애들을 이제 묶어서 사용

In [43]:
class TextureBiasedStream(nn.Module):
    def __init__(self):
        super(TextureBiasedStream, self).__init__()
        self.vgg_texture = VggBackbone(init_weight=False)
        # concat
        self.texture_stream = nn.Sequential(
            nn.Conv2d(1024, 2048, 3, padding=0),
            nn.Conv2d(2048, 4096, 3, padding=1),
            nn.Dropout(0.5),
            nn.AdaptiveAvgPool2d((1, 1))
        )

        self.clothes_classification = nn.Sequential(
            nn.Linear(4096, 46),
            # nn.Softmax(dim=0)
        )
        self.attributes_recognition = nn.Sequential(
            nn.Linear(4096, 1000),
            # nn.Sigmoid()
        )
    
    def forward(self, x, shape_feature):
        out = self.vgg_texture(x)
        out = torch.cat((out, shape_feature), dim=1)
        out = self.texture_stream(out)
        out = torch.squeeze(out)
        
        clothes_out = self.clothes_classification(out)
        clothes_out = torch.softmax(clothes_out, dim=0)
        
        attr_out = self.attributes_recognition(out)
        attr_out = torch.sigmoid(attr_out)
        return clothes_out, attr_out

In [37]:
# device

In [38]:
texture_stream = TextureBiasedStream().to(device)

Using cache found in /home/jaeho/.cache/torch/hub/pytorch_vision_v0.10.0


In [39]:
texture_stream

TextureBiasedStream(
  (vgg_texture): VggBackbone(
    (conv1): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (conv2): Sequential(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (conv3): Sequential(
      (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): Conv2d(256, 256, kernel_size=(3

In [40]:
summary(texture_stream, [(3, 224, 224), (512, 7, 7)])

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 224, 224]           1,792
              ReLU-2         [-1, 64, 224, 224]               0
            Conv2d-3         [-1, 64, 224, 224]          36,928
              ReLU-4         [-1, 64, 224, 224]               0
         MaxPool2d-5         [-1, 64, 112, 112]               0
            Conv2d-6        [-1, 128, 112, 112]          73,856
              ReLU-7        [-1, 128, 112, 112]               0
            Conv2d-8        [-1, 128, 112, 112]         147,584
              ReLU-9        [-1, 128, 112, 112]               0
        MaxPool2d-10          [-1, 128, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]         295,168
             ReLU-12          [-1, 256, 56, 56]               0
           Conv2d-13          [-1, 256, 56, 56]         590,080
             ReLU-14          [-1, 256,

### shape biased stream


In [41]:
dummy_input = torch.rand((1, 512, 7, 7))

In [42]:
conv1 = nn.Conv2d(512, 256, 1)
conv2 = nn.Conv2d(256, 128, 3)
conv3 = nn.Conv2d(128, 256, 1)

In [44]:
out = conv1(dummy_input)
out.shape

torch.Size([1, 256, 7, 7])

In [45]:
out = conv2(out)
out.shape

torch.Size([1, 128, 5, 5])

In [46]:
out = conv3(out)
out.shape

torch.Size([1, 256, 5, 5])

In [55]:
# landmark
# flatten? GAP?
landmark_flatten = nn.AdaptiveAvgPool2d((1, 1))
# landmark_fc = nn.Linear()

In [56]:
landmark_out = landmark_flatten(out)
landmark_out.shape

torch.Size([1, 256, 1, 1])

In [57]:
landmark_out.squeeze_()
landmark_out.shape

torch.Size([256])

In [58]:
landmark_fc = nn.Linear(256, 8)
landmark_activation = nn.Sigmoid()

In [59]:
landmark_out = landmark_fc(landmark_out)
landmark_out = landmark_activation(landmark_out)
landmark_out.shape

torch.Size([8])

In [60]:
landmark_out

tensor([0.4945, 0.5449, 0.4847, 0.4928, 0.4681, 0.4661, 0.5401, 0.5032],
       grad_fn=<SigmoidBackward>)

In [61]:
# landmark location
out.shape

torch.Size([1, 256, 5, 5])

In [62]:
location_conv1 = nn.ConvTranspose2d(256, 256, 4, stride=2)

In [63]:
location_out = location_conv1(out)
location_out.shape

torch.Size([1, 256, 12, 12])

In [64]:
location_conv2 = nn.Conv2d(256, 8, 3)
location_out = location_conv2(location_out)
location_out.shape

torch.Size([1, 8, 10, 10])

In [90]:
class ShapeBiasedStream(nn.Module):
    def __init__(self):
        super(ShapeBiasedStream, self).__init__()
        self.vgg_texture = VggBackbone(init_weight=True)
        
        self.shape_stream = nn.Sequential(
            nn.Conv2d(512, 256, 1),
            nn.Conv2d(256, 128, 3),
            nn.Conv2d(128, 256, 1)
        )
        
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.vis_lin = nn.Linear(256, 8)
        
        self.location = nn.Sequential(
            nn.ConvTranspose2d(256, 256, 4, stride=2),
            nn.Conv2d(256, 8, 3)
        )
    
    def forward(self, x):
        # activation은 여기서
        print(x.shape)
        out = self.vgg_texture(x)
        # print(out.shape)
        out = self.shape_stream(out)
        # print(out.shape)
        # return shape_out
        
        vis_out = self.avg_pool(out)
        vis_out = vis_out.squeeze()
        vis_out = torch.sigmoid(vis_out)
        
        loc_out = self.location(out)
        return vis_out, loc_out

In [91]:
shape_model = ShapeBiasedStream().to(device)

Using cache found in /home/jaeho/.cache/torch/hub/pytorch_vision_v0.10.0


In [92]:
# shape_out = shape_model(torch.rand(1, 3, 224, 224).to(device))
summary(shape_model, (3, 224, 224))

torch.Size([2, 3, 224, 224])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 224, 224]           1,792
              ReLU-2         [-1, 64, 224, 224]               0
            Conv2d-3         [-1, 64, 224, 224]          36,928
              ReLU-4         [-1, 64, 224, 224]               0
         MaxPool2d-5         [-1, 64, 112, 112]               0
            Conv2d-6        [-1, 128, 112, 112]          73,856
              ReLU-7        [-1, 128, 112, 112]               0
            Conv2d-8        [-1, 128, 112, 112]         147,584
              ReLU-9        [-1, 128, 112, 112]               0
        MaxPool2d-10          [-1, 128, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]         295,168
             ReLU-12          [-1, 256, 56, 56]               0
           Conv2d-13          [-1, 256, 56, 56]         590,080
          

In [74]:
shape_out.shape

torch.Size([1, 256, 5, 5])

In [75]:
avg_pooled = nn.AdaptiveAvgPool2d((1, 1))(shape_out)

In [76]:
avg_pooled.shape

torch.Size([1, 256, 1, 1])

## 다 합쳐보면
---

크게 두개의 클래스로..
1. Vgg feature backbone
2. TSFashionNet
    - 여기서 근데 Stream이 두개이지만... 공유하는 구조는 단순히 vgg 백본 뿐
    - 근데 학습 자체적으로 shape따로, 이후에 joint learning을 하는 것이기 때문에 각각의 클래스를 따로 두는 것이 나을지도 모른다.
        - 그게 아니면 막 클래스안에서 이프문으로 나눠야 하는데..
    - 하지만 concat한 feature를 역전파 시키기 위헤서는 그 뒤에서부터 오는 ........

In [None]:
class TSFashionNet(nn.Module):
    def __init__(self):
        pass
    
    def forward(self, x):
        pass
    
    def _make_texture_stream(self):
        pass
    
    def _make_shape_stream(self):
        pass

In [18]:
class TSFashionNet(nn.Module):
    def __init__(self):
        super(TSFashionNet, self).__init__()
        # texture
        self.texture_backbone = VggBackbone(init_weight=False)
        self.texture_stream = nn.Sequential(
            nn.Conv2d(1024, 2048, 3, padding=0),
            nn.Conv2d(2048, 4096, 3, padding=1),
            nn.Dropout(0.5),
            nn.AdaptiveAvgPool2d((1, 1))
        )
        self.clothes_cls_fc = nn.Linear(4096, 48)
        self.attr_recog_fc = nn.Linear(4096, 1000)
        
        
        # shape
        self.shape_backbone = VggBackbone(init_weight=True)
        self.conv5_maxpool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        
        self.shape_stream = nn.Sequential(
            nn.Conv2d(512, 256, 1),
            nn.Conv2d(256, 128, 3, padding=1),
            nn.Conv2d(128, 256, 1)
        )
        
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.vis_fc = nn.Linear(256, 8)
        
        self.location = nn.Sequential(
            nn.ConvTranspose2d(256, 256, 4, stride=2),
            nn.Conv2d(256, 8, 3)
        )
    
    def forward(self, x, shape=False):
        # shape
        shape_feature = self.shape_backbone(x)
        shape_out = self.shape_stream(shape_feature)
        vis_out = self.avg_pool(shape_out)
        vis_out = torch.squeeze(vis_out)
        vis_out = self.vis_fc(vis_out)
        vis_out = torch.sigmoid(vis_out)
        
        loc_out = self.location(shape_out)
        
        if shape:
            return vis_out, loc_out
        
        # texture
        texture_out = self.texture_backbone(x)
        cat_shape = self.conv5_maxpool(shape_feature)
        texture_out = torch.cat((texture_out, cat_shape), dim=1)
        texture_out = self.texture_stream(texture_out)
        texture_out = torch.squeeze(texture_out)
        
        clothes_out = self.clothes_cls_fc(texture_out)
        clothes_out = torch.softmax(clothes_out, dim=0)
        
        attr_out = self.attr_recog_fc(texture_out)
        attr_out = torch.sigmoid(attr_out)
        
        return vis_out, loc_out, clothes_out, attr_out

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [20]:
model = TSFashionNet().to(device)

Using cache found in /home/jaeho/.cache/torch/hub/pytorch_vision_v0.10.0
Using cache found in /home/jaeho/.cache/torch/hub/pytorch_vision_v0.10.0


In [22]:
model

TSFashionNet(
  (texture_backbone): VggBackbone(
    (conv1): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (conv2): Sequential(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (conv3): Sequential(
      (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): Conv2d(256, 256, kernel_size=(3, 

In [21]:
summary(model, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 224, 224]           1,792
              ReLU-2         [-1, 64, 224, 224]               0
            Conv2d-3         [-1, 64, 224, 224]          36,928
              ReLU-4         [-1, 64, 224, 224]               0
         MaxPool2d-5         [-1, 64, 112, 112]               0
            Conv2d-6        [-1, 128, 112, 112]          73,856
              ReLU-7        [-1, 128, 112, 112]               0
            Conv2d-8        [-1, 128, 112, 112]         147,584
              ReLU-9        [-1, 128, 112, 112]               0
        MaxPool2d-10          [-1, 128, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]         295,168
             ReLU-12          [-1, 256, 56, 56]               0
           Conv2d-13          [-1, 256, 56, 56]         590,080
             ReLU-14          [-1, 256,