In [1]:
# Mount Google Drive so the dataset and output folder under /content/drive are reachable
from google.colab import drive
drive.mount('/content/drive')

# 필요한 라이브러리 로드
import json
import os
import torch
import torch.nn as nn
from transformers import BigBirdModel, BigBirdTokenizer, ViTModel, DistilBertModel, DistilBertConfig
from torchvision import transforms
from PIL import Image
from torch.cuda.amp import autocast

# Load the pretrained text encoder, its tokenizer and the image encoder.
# Gradient checkpointing is switched on for each encoder right after it is loaded.
BIGBIRD_CHECKPOINT = "google/bigbird-roberta-base"
VIT_CHECKPOINT = "google/vit-base-patch16-224-in21k"

# BigBird is loaded with full (non-sparse) attention
text_model = BigBirdModel.from_pretrained(
    BIGBIRD_CHECKPOINT,
    attention_type="original_full",
)
text_model.gradient_checkpointing_enable()
tokenizer = BigBirdTokenizer.from_pretrained(BIGBIRD_CHECKPOINT)

image_model = ViTModel.from_pretrained(VIT_CHECKPOINT)
image_model.gradient_checkpointing_enable()

# Image preprocessing pipeline: resize to 224x224, convert to a tensor, normalize.
# The google/vit-base-patch16-224-in21k checkpoint was trained on inputs normalized
# with mean 0.5 / std 0.5 per channel (not the ImageNet statistics), so the same
# values are used here to match what the pretrained weights expect.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Location of the merged dataset (JSON) on the mounted Drive
json_file_path = '/content/drive/MyDrive/[캡스톤2] 허위조작 정보 경로 분석/data/final_merged_data_1108.json'

# Read the file and parse the whole JSON document into memory
with open(json_file_path, mode='r', encoding='utf-8') as json_file:
    data = json.loads(json_file.read())

# Number of records processed (and saved) together
batch_size = 20

# Output directory for the per-batch embedding files; created if it does not exist
save_folder = '/content/drive/MyDrive/Text_Image_Embeddings'
os.makedirs(save_folder, exist_ok=True)

# Lightweight DistilBERT-style encoder used to fuse the modality embeddings
class FusionDistilBERT(nn.Module):
    """Fuse a sequence of embeddings with a small DistilBERT encoder.

    The encoder is constructed from a fresh ``DistilBertConfig``; no pretrained
    weights are loaded by this class.

    Args:
        embed_dim: Size of each input embedding (passed as ``dim`` to the config).
        hidden_dim: Feed-forward hidden size of the encoder layers.
        n_layers: Number of encoder layers.
        n_heads: Number of attention heads.
    """

    def __init__(self, embed_dim, hidden_dim=512, n_layers=4, n_heads=4):
        super().__init__()
        config = DistilBertConfig(
            dim=embed_dim,
            hidden_dim=hidden_dim,
            n_layers=n_layers,
            n_heads=n_heads,
        )
        self.transformer = DistilBertModel(config)

    def forward(self, combined_embeddings):
        """Return the encoder output averaged over dim 1.

        ``combined_embeddings`` is fed to the encoder as ``inputs_embeds``
        (presumably shaped (batch, seq_len, embed_dim) — confirm against caller).
        """
        outputs = self.transformer(inputs_embeds=combined_embeddings)
        # Mean-pool the encoded sequence into a single fused embedding
        return outputs.last_hidden_state.mean(dim=1)

# Instantiate the fusion encoder that combines the text and image embeddings.
embed_dim = 768  # embedding size used for both the BigBird and ViT outputs
# A directly constructed nn.Module starts in training mode, which leaves dropout
# active. This script only extracts embeddings, so switch to eval mode.
fusion_transformer = FusionDistilBERT(embed_dim).eval()

# Process the whole dataset in batches and save one embeddings file per batch.

# The fusion encoder is only used for inference here; eval mode disables dropout
# so the saved embeddings are deterministic for given weights.
fusion_transformer.eval()

# torch.cuda.amp.autocast() is deprecated in favour of torch.autocast(device_type=...).
# Autocast is only enabled when CUDA is actually available.
use_amp = torch.cuda.is_available()

# Derive the batch boundaries from the dataset size instead of a hard-coded batch
# count, so no record is dropped and no empty batch file is written.
for batch_num, start_idx in enumerate(range(0, len(data), batch_size)):
    batch_data = data[start_idx:start_idx + batch_size]

    batch_embeddings = []
    for example_data in batch_data:
        script_text = example_data["script_text"]
        image_paths = example_data["image_frames"]

        # Text embedding: mean over the token dimension of the last hidden state
        inputs = tokenizer(script_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad(), torch.autocast(device_type="cuda", enabled=use_amp):
            text_embedding = text_model(**inputs).last_hidden_state.mean(dim=1)

        # Image embeddings: one vector per frame
        image_embeddings = []
        for image_path in image_paths:
            # Close the file handle as soon as the pixel data has been converted
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")
            image = preprocess(image).unsqueeze(0)

            with torch.no_grad(), torch.autocast(device_type="cuda", enabled=use_amp):
                image_embedding = image_model(image).last_hidden_state.mean(dim=1)
            image_embeddings.append(image_embedding)

        # Average the frame embeddings; fall back to zeros when there are no frames
        if image_embeddings:
            image_embeddings = torch.stack(image_embeddings).mean(dim=0)
        else:
            image_embeddings = torch.zeros_like(text_embedding)

        # Stack text and image vectors into a 2-step sequence: (batch, 2, embed_dim)
        combined_embedding = torch.stack([text_embedding, image_embeddings], dim=1)

        # Fuse with the DistilBERT-based transformer. no_grad keeps the saved
        # tensors free of an autograd graph (smaller files, no retained memory).
        with torch.no_grad():
            final_embedding = fusion_transformer(combined_embedding)
        batch_embeddings.append(final_embedding)

    # Save this batch's embeddings (a list with one tensor per example)
    torch.save(batch_embeddings, f'{save_folder}/embeddings_batch_{batch_num}.pt')

    # Release cached GPU memory between batches
    torch.cuda.empty_cache()

print("모든 배치의 처리가 완료되었습니다.")

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/846k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/775 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

  with torch.no_grad(), autocast():
  with torch.no_grad(), autocast():


모든 배치의 처리가 완료되었습니다.
