In [1]:
!pip install torch torchvision timm

Collecting torch
  Downloading torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchvision
  Downloading torchvision-0.19.1-cp310-cp310-manylinux1_x86_64.whl.metadata (6.0 kB)
Collecting timm
  Downloading timm-1.0.9-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting filelock (from torch)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting sympy (from torch)
  Downloading sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (f

In [None]:
import torch
import torch.nn as nn
import timm  # Pretrained models for ConvNeXt, EfficientNet, etc.

# Frontend: ConvNeXt or EfficientNet-L2 feature extractor
class Frontend(nn.Module):
    """timm image encoder with its classification head removed.

    Args:
        model_type: ``"convnext"`` selects ``convnext_base``;
            ``"efficientnet"`` selects ``tf_efficientnet_l2``.
        pretrained: Whether to load pretrained weights. Defaults to ``True``,
            which matches the previously hard-coded behaviour.

    Raises:
        ValueError: If ``model_type`` is not ``"convnext"`` or
            ``"efficientnet"``.
    """

    # Public ``model_type`` key -> timm architecture name.
    # ``tf_efficientnet_l2`` replaces ``efficientnet_l2``: the latter is
    # registered in timm (1.0.x) without pretrained weights, so asking for it
    # with ``pretrained=True`` fails instead of returning a model.
    _ARCHITECTURES = {
        "convnext": "convnext_base",
        "efficientnet": "tf_efficientnet_l2",
    }

    def __init__(self, model_type="convnext", pretrained=True):
        super(Frontend, self).__init__()
        # Non-string keys are rejected the same way as unknown strings.
        arch = self._ARCHITECTURES.get(model_type) if isinstance(model_type, str) else None
        if arch is None:
            raise ValueError("Invalid model type. Choose 'convnext' or 'efficientnet'.")

        self.model = timm.create_model(arch, pretrained=pretrained)

        # Remove final classification head
        self.model.reset_classifier(0)

    def forward(self, x):
        """Return whatever the head-less timm model produces for ``x``."""
        return self.model(x)

# Backend: Swin Transformer or Perceiver
class Backbone(nn.Module):
    """timm encoder used as the backend, with its classification head removed.

    Args:
        model_type: ``"swin"`` selects ``swin_base_patch4_window7_224``;
            ``"perceiver"`` selects ``perceiver_io_base``.
        pretrained: Whether to load pretrained weights. Defaults to ``True``,
            which matches the previously hard-coded behaviour.

    Raises:
        ValueError: If ``model_type`` is not ``"swin"`` or ``"perceiver"``.
        RuntimeError: If the selected architecture is not registered with the
            installed timm.
    """

    # Public ``model_type`` key -> timm architecture name.
    _ARCHITECTURES = {
        "swin": "swin_base_patch4_window7_224",
        # NOTE(review): stock timm does not appear to register a Perceiver
        # model under this name; the explicit check in __init__ turns that
        # into an actionable error. Confirm which package should provide it.
        "perceiver": "perceiver_io_base",
    }

    def __init__(self, model_type="swin", pretrained=True):
        super(Backbone, self).__init__()
        # Non-string keys are rejected the same way as unknown strings.
        arch = self._ARCHITECTURES.get(model_type) if isinstance(model_type, str) else None
        if arch is None:
            raise ValueError("Invalid model type. Choose 'swin' or 'perceiver'.")

        # Same exception type timm raises for an unknown name, but with a
        # message that says which option triggered it and what to do.
        if not timm.is_model(arch):
            raise RuntimeError(
                f"timm has no model named '{arch}' (requested via model_type='{model_type}'); "
                "install a package that registers it or choose another backend."
            )

        self.model = timm.create_model(arch, pretrained=pretrained)

        # Again, remove the classification head for flexibility
        self.model.reset_classifier(0)

    def forward(self, x):
        """Return whatever the head-less timm model produces for ``x``."""
        return self.model(x)

# Full model: frontend and backend encoders fused by a linear projection
class LipReadingModel(nn.Module):
    """Two-encoder image embedding model that outputs 512-dimensional vectors.

    The frontend and backend are both head-less timm *image* encoders. The
    frontend's pooled ``(batch, features)`` output is not an image, so it
    cannot be passed on to the backend; chaining them fails inside the
    backend's patch embedding. Each encoder therefore receives the same input
    batch, and their pooled features are concatenated before ``fc``.

    NOTE(review): parallel fusion is one way to make the two encoders
    compatible; confirm it matches the intended architecture.

    Args:
        frontend_type: Forwarded to ``Frontend(model_type=...)``.
        backend_type: Forwarded to ``Backbone(model_type=...)``.
    """

    def __init__(self, frontend_type="convnext", backend_type="swin"):
        super(LipReadingModel, self).__init__()
        self.frontend = Frontend(model_type=frontend_type)
        self.backend = Backbone(model_type=backend_type)

        # Size the projection from the encoders' reported feature widths
        # rather than hard-coding 1024, so other encoder choices still fit.
        fused_dim = self.frontend.model.num_features + self.backend.model.num_features
        self.fc = nn.Linear(fused_dim, 512)  # example output size; adjust as needed

    def forward(self, x):
        """Embed ``x`` with both encoders and project the fused features.

        Args:
            x: Image batch accepted by both encoders.

        Returns:
            Output of ``fc`` applied to the concatenated encoder features.
        """
        frontend_features = self.frontend(x)
        backend_features = self.backend(x)
        fused = torch.cat([frontend_features, backend_features], dim=1)
        return self.fc(fused)

# Example model: ConvNeXt frontend, Swin Transformer backend
model = LipReadingModel(frontend_type="convnext", backend_type="swin")
print(model)

# Forward-pass smoke test on random data (batch size 8, 224x224 RGB images).
# Nothing is trained here, so gradient tracking is switched off to avoid
# building an autograd graph just to inspect the output shape.
input_data = torch.randn(8, 3, 224, 224)
with torch.no_grad():
    output = model(input_data)
print(output.shape)  # output size


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ContrastiveLoss(nn.Module):
    """Cross-entropy contrastive objective over a batch of paired embeddings.

    Row ``k`` of ``z_i`` is matched against every row of ``z_j``; row ``k`` of
    ``z_j`` is the target class and the remaining rows act as negatives.

    Args:
        temperature: Divisor applied to the cosine-similarity matrix.
    """

    def __init__(self, temperature=0.5):
        super().__init__()
        self.temperature = temperature

    def forward(self, z_i, z_j):
        """Return the mean cross-entropy between similarities and the diagonal.

        Args:
            z_i: 2-D tensor of embeddings, one row per sample.
            z_j: 2-D tensor of embeddings paired row-for-row with ``z_i``.

        Returns:
            Scalar loss tensor.
        """
        # Unit-length rows turn the dot products below into cosine similarities.
        anchors = F.normalize(z_i, dim=1)
        candidates = F.normalize(z_j, dim=1)

        # Pairwise similarity matrix, scaled by the temperature.
        logits = torch.mm(anchors, candidates.transpose(0, 1)) / self.temperature

        # The positive for row k is column k, so the targets are 0..batch-1.
        targets = torch.arange(z_i.size(0), dtype=torch.long, device=z_i.device)

        return F.cross_entropy(logits, targets)

# Example: feed two views through the model and optimise a contrastive loss
def train_step(model, data_loader, optimizer, contrastive_loss_fn):
    """Run one optimisation pass over ``data_loader`` with a contrastive loss.

    Args:
        model: Module mapping a batch to embeddings; switched to training mode.
        data_loader: Iterable yielding ``((x_i, x_j), _)`` items; the second
            element of each item is ignored.
        optimizer: Optimizer whose gradients are zeroed before, and stepped
            after, every batch.
        contrastive_loss_fn: Callable ``(z_i, z_j) -> scalar tensor``.

    Returns:
        The mean of the per-batch loss values as a ``float``, or ``None`` when
        the loader produced no batches.
    """
    model.train()
    loss_total = 0.0
    batch_count = 0
    for (x_i, x_j), _ in data_loader:
        optimizer.zero_grad()

        # Embed both views with the same model
        z_i = model(x_i)
        z_j = model(x_j)

        # Contrastive loss between the two sets of embeddings
        loss = contrastive_loss_fn(z_i, z_j)
        loss.backward()
        optimizer.step()

        # Keep a running total so the caller can monitor training progress.
        loss_total += loss.item()
        batch_count += 1

    if batch_count == 0:
        return None
    return loss_total / batch_count

# Example instantiation of the contrastive loss (temperature stated explicitly)
contrastive_loss_fn = ContrastiveLoss(temperature=0.5)


In [None]:
class SimSiam(nn.Module):
    """Siamese wrapper: shared encoder, projection head and prediction head.

    Args:
        base_encoder: Module applied to each input view before projection.
        feature_dim: Input width of the projection head.
        proj_dim: Output width of both heads (and the projector's hidden width).
        pred_dim: Hidden width of the prediction head.
    """

    def __init__(self, base_encoder, feature_dim=2048, proj_dim=256, pred_dim=512):
        super().__init__()

        # Shared feature extractor (ConvNeXt, EfficientNet, ...)
        self.encoder = base_encoder

        # Projection head: feature_dim -> proj_dim -> proj_dim
        self.projector = self._two_layer_mlp(feature_dim, proj_dim, proj_dim)

        # Prediction head: proj_dim -> pred_dim -> proj_dim
        self.predictor = self._two_layer_mlp(proj_dim, pred_dim, proj_dim)

    @staticmethod
    def _two_layer_mlp(in_dim, hidden_dim, out_dim):
        """Build Linear -> BatchNorm1d -> ReLU -> Linear."""
        return nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        )

    def forward(self, x1, x2):
        """Return ``(p1, p2, z1, z2)`` with both projections detached.

        Args:
            x1: First augmented view.
            x2: Second augmented view.
        """
        # Project both views first, then predict from each projection.
        z1, z2 = [self.projector(self.encoder(view)) for view in (x1, x2)]
        p1, p2 = [self.predictor(projection) for projection in (z1, z2)]

        # Projections leave the graph so gradients only flow via predictions.
        return p1, p2, z1.detach(), z2.detach()

def loss_fn(p1, p2, z1, z2):
    """Average the negative cosine similarity of the two cross-view pairings.

    Args:
        p1: Predictions for the first view.
        p2: Predictions for the second view.
        z1: Projections for the first view.
        z2: Projections for the second view.

    Returns:
        Scalar tensor: half of ``-cos(p1, z2)`` plus half of ``-cos(p2, z1)``,
        each averaged over the batch.
    """
    # Each prediction is compared with the projection of the *other* view.
    first_pairing = -F.cosine_similarity(p1, z2).mean()
    second_pairing = -F.cosine_similarity(p2, z1).mean()
    return first_pairing / 2 + second_pairing / 2

# Self-supervised training loop
def train_step_simsiam(model, data_loader, optimizer):
    """Run one optimisation pass over ``data_loader`` using ``loss_fn``.

    Args:
        model: Module called as ``model(x1, x2)`` and returning
            ``(p1, p2, z1, z2)``; switched to training mode.
        data_loader: Iterable yielding ``((x1, x2), _)`` items; the second
            element of each item is ignored.
        optimizer: Optimizer whose gradients are zeroed before, and stepped
            after, every batch.

    Returns:
        The mean of the per-batch loss values as a ``float``, or ``None`` when
        the loader produced no batches.
    """
    model.train()
    loss_total = 0.0
    batch_count = 0
    for (x1, x2), _ in data_loader:
        optimizer.zero_grad()

        # Forward both views through the model
        p1, p2, z1, z2 = model(x1, x2)

        # SimSiam loss
        loss = loss_fn(p1, p2, z1, z2)
        loss.backward()
        optimizer.step()

        # Keep a running total so the caller can monitor training progress.
        loss_total += loss.item()
        batch_count += 1

    if batch_count == 0:
        return None
    return loss_total / batch_count


In [None]:
# Fail fast: `data_loader` is not created in this cell, so check for it before
# paying for model construction (which builds two pretrained networks).
if "data_loader" not in globals():
    raise NameError(
        "name 'data_loader' is not defined; create a loader that yields "
        "((view_1, view_2), label) batches before running this cell"
    )

# Model definition: frontend + backend
model = LipReadingModel(frontend_type="convnext", backend_type="swin")

# Learning scheme: contrastive loss here (SimSiam is the self-supervised alternative)
contrastive_loss_fn = ContrastiveLoss(temperature=0.5)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train on pairs of augmented images drawn from the data loader
train_step(model, data_loader, optimizer, contrastive_loss_fn)
