## Import

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
import os
import numpy as np
import random
import warnings
import dataset as d
warnings.filterwarnings(action='ignore') 

## Hyperparameter Settings

In [2]:
CFG = {
    'IMG_SIZE':224,
    'EPOCHS':10, #Your Epochs,
    'LR':1e-5, #Your Learning Rate,
    'BATCH_SIZE': 128, #Your Batch Size,
    'SEED':41
}

## Fixed Random-Seed

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

## Custom Dataset

In [4]:
# dataset.py 의 CustomDataset 클래스 사용
import dataset as d
from util.preprocessing import  *


In [5]:
train_mean = (0.42008194, 0.3838274, 0.34902292)
train_Std = (0.23926373, 0.22593886, 0.22363442)

test_mean = (0.4216005, 0.38125762, 0.34539804)
test_Std = (0.23252015, 0.21890979, 0.21627444)

In [6]:
train_data = pd.read_csv('./data/open/train.csv')
test_data = pd.read_csv('./data/open/test.csv')
train_transform = d.ImageTransForm(CFG['IMG_SIZE'], train_mean, train_Std)
test_transform = d.ImageTransForm(CFG['IMG_SIZE'], test_mean, test_Std)


In [7]:
train_dataset = d.CustomDataset(train_data, 'train', transform=train_transform)
test_dataset = d.CustomDataset(test_data, 'test', transform=test_transform)

In [8]:
train_loader = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)


In [9]:
imgs, moss, comments = next(iter(train_loader))

In [10]:
# 가장 긴 단어 탐색 (패딩을 위함)
max_len_train = max(len(c) for c in train_data['comments'])
max_len_train

454

In [11]:

# 단어 사전 생성
all_comments = ' '.join(train_data['comments']).split()
vocab = set(all_comments)
vocab = ['<PAD>', '<SOS>', '<EOS>'] + list(vocab)
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}

In [12]:
for i, m, c in train_loader:
    print(i.size(), m.size(), len(c))
    break

torch.Size([128, 3, 224, 224]) torch.Size([128]) 128


## Define Model

In [13]:
# 데이터 로드
from train.models.encoder_resnet import EncoderResnet
encoder = EncoderResnet(512)
out, mos = encoder.forward(imgs)
print(mos.shape)
print(out.shape)
print(type(out))

torch.Size([128, 1])
torch.Size([128, 512])
<class 'torch.Tensor'>


In [14]:
# 모델 / 손실함수 / 옵티마이저 생성
from train.models.seq2seq import Seq2seq
from common.optimizer import Adam

vocab_size = len(vocab)
wordvec_size = 456
hidden_size = 512

model = Seq2seq(len(vocab), 456, 512)
criterion = nn.CrossEntropyLoss(ignore_index=word2idx['<PAD>'])
optimizer = Adam()

In [15]:
print(model)

<train.models.seq2seq.Seq2seq object at 0x00000261EAD49B80>


## Train

In [17]:
from tensorflow.keras.preprocessing.sequence import pad_sequences


# 학습
total_loss = 0

for epoch in range(CFG['EPOCHS']):
    total_loss = 0
    loop = tqdm(train_loader, leave=True)
    for imgs, moss, comments in loop:
        imgs = imgs.float()
        
        # Batch Preprocessing
        comments_tensor = torch.zeros((len(comments), len(max(comments, key=len)))).long()        
        
        for i, comment in enumerate(comments):
            tokenized = ['<SOS>'] + comment.split() + ['<EOS>']
            comments_tensor[i, :len(tokenized)] = torch.tensor([word2idx[word] for word in tokenized])
        
        comments_tensor = pad_sequences(comments_tensor, 456, padding='post')
        comments_tensor = torch.tensor(comments_tensor)
        print(comments_tensor.size())
        break
        loss = model.forward(imgs, comments_tensor)
        model.backward()
        optimizer.update(model.params, model.grads)
        total_loss += loss


        
        start_id = comments_tensor[0]
        correct = comments_tensor[1:]
        predicted_comments = model.generate(imgs, start_id, len(correct))

        predicted_comments = ''.join([idx2word[int(c)] for c in predicted_comments])

        



  0%|          | 0/583 [00:00<?, ?it/s]


torch.Size([128, 456])


  0%|          | 0/583 [00:00<?, ?it/s]


torch.Size([128, 456])


  0%|          | 0/583 [00:00<?, ?it/s]


torch.Size([128, 456])


  0%|          | 0/583 [00:00<?, ?it/s]


torch.Size([128, 456])


  0%|          | 0/583 [00:00<?, ?it/s]


KeyboardInterrupt: 

## Inference & Submit

In [32]:
test_data = pd.read_csv('./data/open/test.csv')
test_dataset = d.CustomDataset(test_data, transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

model.eval()
predicted_mos_list = []
predicted_comments_list = []

def greedy_decode(model, image, max_length=50):
    image = image.unsqueeze(0)
    mos, _ = model(image)
    output_sentence = []
    
    # 시작 토큰 설정
    current_token = torch.tensor([word2idx['<SOS>']])
    hidden = None
    features = model.cnn(image).view(image.size(0), -1)

    for _ in range(max_length):
        embeddings = model.embedding(current_token).unsqueeze(0)
        combined = torch.cat([features.unsqueeze(1), embeddings], dim=2)
        out, hidden = model.lstm(combined, hidden)
        
        output = model.fc(out.squeeze(0))
        _, current_token = torch.max(output, dim=1)

        # <EOS> 토큰에 도달하면 멈춤
        if current_token.item() == word2idx['<EOS>']:
            break

        # <SOS> 또는 <PAD> 토큰은 생성한 캡션에 추가하지 않음
        if current_token.item() not in [word2idx['<SOS>'], word2idx['<PAD>']]:
            output_sentence.append(idx2word[current_token.item()])
     
    return mos.item(), ' '.join(output_sentence)

# 추론 과정
with torch.no_grad():
    for imgs, _, _ in tqdm(test_loader):
        for img in imgs:
            img = img.float()
            mos, caption = greedy_decode(model, img)
            predicted_mos_list.append(mos)
            predicted_comments_list.append(caption)

# 결과 저장
result_df = pd.DataFrame({
    'img_name': test_data['img_name'],
    'mos': predicted_mos_list,
    'comments': predicted_comments_list  # 캡션 부분은 위에서 생성한 것을 사용
})

# 예측 결과에 NaN이 있다면, 제출 시 오류가 발생하므로 후처리 진행 (sample_submission.csv과 동일하게)
result_df['comments'] = result_df['comments'].fillna('Nice Image.')
result_df.to_csv('submit.csv', index=False)

print("Inference completed and results saved to submit.csv.")

NameError: name 'transform' is not defined