In [12]:
import os
import pandas as pd
import torch
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm

import re
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
# Download the NLTK "punkt" resource (a no-op when it is already up to date).
# Presumably this is the tokenizer data used by word_tokenize — TODO confirm
# for the installed NLTK version.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\minja\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
# Root folder of the VQA dataset; every other path is derived from it.
dataset_dirs = "datasets/VQA/"

# Pass the components to os.path.join separately so the separator is handled
# by the join itself instead of relying on the trailing slash of dataset_dirs.
image_root = os.path.join(dataset_dirs, "images")
train_data_path = os.path.join(dataset_dirs, "data_train.csv")
test_data_path = os.path.join(dataset_dirs, "data_test.csv")
answer_space_path = os.path.join(dataset_dirs, "answer_space.txt")

In [None]:
# Read the question/answer tables for both splits.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path))

# The answer space holds one candidate answer per line; the position of a
# line is used as that answer's class index.
with open(answer_space_path, "r") as answer_file:
    answer_space = [raw_line.strip() for raw_line in answer_file]
answer2idx = dict(zip(answer_space, range(len(answer_space))))

# Encode each answer as its class index; answers that do not appear in the
# answer space become -1.
for split_df in (train_df, test_df):
    split_df["answer_idx"] = split_df["answer"].map(answer2idx).fillna(-1).astype(int)

# Summary of what was loaded.
print(f"학습 질문 수: {len(train_df)}")
print(f"테스트 질문 수: {len(test_df)}")
print(f"정답 후보 수: {len(answer_space)}")
print(f"정답 인덱스 분포:\n{train_df['answer_idx'].value_counts().head()}")

# Sample output
print("\n[질문-정답-이미지ID] 예시:")
print(train_df[['question', 'answer', 'image_id', 'answer_idx']].head())

                                          question        answer   image_id  \
0                what is the object on the shelves           cup   image100   
1                         how man chairs are there             6   image888   
2      what is hanged to the right side of the bed       curtain  image1174   
3                 how many picture are on the wall             2   image942   
4  what is the object on the floor behind the rack  room_divider  image1220   

   answer_idx  
0         149  
1          16  
2         150  
3          10  
4         419  
학습 질문 수: 9974
테스트 질문 수: 2494
정답 후보 수: 582
정답 인덱스 분포:
-1      990
 10     442
 483    346
 106    293
 13     257
Name: answer_idx, dtype: int64

[질문-정답-이미지ID] 예시:
                                          question        answer   image_id  \
0                what is the object on the shelves           cup   image100   
1                         how man chairs are there             6   image888   
2      what is hanged to the 

In [None]:
# train_df.to_csv("testest.csv")

In [None]:
def preprocess_question(q):
    """Normalize a question string and split it into tokens.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is removed, and the remainder is passed to NLTK's
    ``word_tokenize``.

    Args:
        q: The raw question text (must support ``.lower()``).

    Returns:
        Whatever ``word_tokenize`` returns for the cleaned text.
    """
    cleaned = re.sub(r"[^a-z0-9\s]", "", q.lower())
    return word_tokenize(cleaned)

# Tokenize the questions of both splits.
for split_df in (train_df, test_df):
    split_df["tokens"] = split_df["question"].apply(preprocess_question)

# Flatten every token of the train split, then of the test split, into one list.
# Note that the vocabulary therefore also contains words that only occur in
# the test questions.
all_tokens = [
    token
    for split_df in (train_df, test_df)
    for row in split_df["tokens"]
    for token in row
]

# Index 0 is reserved for padding and index 1 for unknown words; the distinct
# tokens follow in sorted order.
vocab = ["<PAD>", "<UNK>"] + sorted(set(all_tokens))
word2idx = dict(zip(vocab, range(len(vocab))))

# Fixed length every indexed question is cut or padded to.
MAX_LEN = 20

def tokens_to_indices(tokens, word2idx):
    """Map each token to its vocabulary index.

    Args:
        tokens: Iterable of tokens to look up.
        word2idx: Mapping from token to index; must contain ``"<UNK>"``
            whenever ``tokens`` is non-empty.

    Returns:
        A list with one index per token, where tokens missing from
        ``word2idx`` get the index stored under ``"<UNK>"``.
    """
    indices = []
    for token in tokens:
        indices.append(word2idx.get(token, word2idx["<UNK>"]))
    return indices

def pad_sequence(seq, max_len, pad_value=0):
    """Cut or right-pad a list so that it has ``max_len`` items.

    Args:
        seq: The list to resize.
        max_len: Target length.
        pad_value: Value appended when ``seq`` is shorter than ``max_len``.

    Returns:
        A new list: the first ``max_len`` items of ``seq`` followed by as many
        copies of ``pad_value`` as are needed to reach ``max_len``.
    """
    clipped = seq[:max_len]
    # A negative count yields an empty list, so long inputs get no padding.
    filler = [pad_value] * (max_len - len(seq))
    return clipped + filler


# Turn each token list into a vocabulary-index list of exactly MAX_LEN items,
# for the train split and the test split alike.
for split_df in (train_df, test_df):
    split_df["indexed"] = split_df["tokens"].apply(
        lambda toks: pad_sequence(tokens_to_indices(toks, word2idx), MAX_LEN)
    )

In [41]:
# Preview the first rows to check the added "tokens" and "indexed" columns.
train_df.head()

Unnamed: 0,question,answer,image_id,answer_idx,tokens,indexed
0,what is the object on the shelves,cup,image100,149,"[what, is, the, object, on, the, shelves]","[1060, 510, 966, 639, 646, 966, 844, 0, 0, 0, ..."
1,how man chairs are there,6,image888,16,"[how, man, chairs, are, there]","[474, 585, 178, 26, 969, 0, 0, 0, 0, 0, 0, 0, ..."
2,what is hanged to the right side of the bed,curtain,image1174,150,"[what, is, hanged, to, the, right, side, of, t...","[1060, 510, 441, 984, 966, 785, 854, 643, 966,..."
3,how many picture are on the wall,2,image942,10,"[how, many, picture, are, on, the, wall]","[474, 588, 703, 26, 646, 966, 1049, 0, 0, 0, 0..."
4,what is the object on the floor behind the rack,room_divider,image1220,419,"[what, is, the, object, on, the, floor, behind...","[1060, 510, 966, 639, 646, 966, 377, 70, 966, ..."


In [None]:
# def load_images(root_dir, size=(224, 224)):
#     images = []
    
#     for filename in os.listdir(root_dir):
#         if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp')):
#             img_path = os.path.join(root_dir, filename)
#             img = Image.open(img_path).convert("RGB")
#             img = img.resize(size)
#             img_array = np.array(img).transpose(2, 0, 1)  # (C, H, W)
#             images.append(img_array)
            
#     return np.array(images)

def load_images(image_ids, root_dir, size=(224, 224)):
    """Load the PNG image of every id into one channel-first array.

    Args:
        image_ids: Iterable of image ids; each file is looked up as
            ``<root_dir>/<image_id>.png``.
        root_dir: Directory containing the image files.
        size: Target size handed to ``Image.resize`` for every image.

    Returns:
        ``np.array`` built from the per-image arrays, each transposed from
        channel-last to channel-first (C, H, W) order.
    """
    def _read_channel_first(img_id):
        # Open, force three RGB channels, resize, then move channels to the front.
        png_path = os.path.join(root_dir, img_id + ".png")
        rgb_image = Image.open(png_path).convert("RGB").resize(size)
        return np.array(rgb_image).transpose(2, 0, 1)  # (C, H, W)

    return np.array([_read_channel_first(img_id) for img_id in tqdm(image_ids)])

# Custom dataset definition
class Custom_Dataset(Dataset):
    """Dataset that pairs pre-loaded image arrays with integer labels.

    Each sample is passed through a fixed augmentation and normalization
    pipeline every time it is fetched.
    """

    def __init__(self, X, Y):
        """Store the data and build the per-sample transform pipeline.

        Args:
            X: Images indexed by sample position, each in (3, H, W) layout.
            Y: Labels indexed by sample position.
        """
        self.X = X  # numpy array (N, 3, H, W)
        self.Y = Y  # labels (integer indices)

        # Steps are applied in list order: PIL-based augmentation first, then
        # conversion to a tensor, normalization, and random erasing.
        pipeline_steps = [
            transforms.ToPILImage(),
            transforms.Resize((224, 224)),
            transforms.RandomRotation(degrees=2),
            transforms.RandomAffine(degrees=0, translate=(0.02, 0.02)),
            transforms.ColorJitter(brightness=0.2, contrast=0.2),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
            transforms.RandomErasing(p=0.1, scale=(0.02, 0.05)),
        ]
        self.transform = transforms.Compose(pipeline_steps)

    def __len__(self):
        """Return the number of samples, taken from ``X``."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the transformed image and the label at ``index``."""
        sample, label = self.X[index], self.Y[index]
        # The pipeline starts with ToPILImage, so switch (3, H, W) -> (H, W, 3).
        channel_last = sample.transpose(1, 2, 0)
        return self.transform(channel_last), label

In [54]:
# Size every image is resized to while loading.
image_size = (224, 224)

# Unique image ids referenced by the training questions.
image_ids = train_df["image_id"].unique()

# Arguments follow the signature load_images(image_ids, root_dir, size):
# the id list comes first, then the directory that holds the image files.
image_arrays = load_images(image_ids, image_root, size=image_size)

# One label per image. dict(zip(...)) keeps the last answer_idx seen for an
# image_id, so an image with several questions gets the label of its last row.
# NOTE(review): answer_idx is -1 for answers missing from the answer space;
# confirm the training loop filters or ignores those labels.
image_id_to_label = dict(zip(train_df["image_id"], train_df["answer_idx"]))
image_labels = np.array([image_id_to_label.get(img_id, -1) for img_id in image_ids])

# Example DataLoader
train_dataset = Custom_Dataset(image_arrays, image_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)