In [3]:
import os
import pandas as pd
import torch
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

import re
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

In [10]:
dataset_dirs = "datasets/VQA/"
image_dir = os.path.join((dataset_dirs + "images"))
train_data_path = os.path.join((dataset_dirs + "data_train.csv"))
test_data_path = os.path.join((dataset_dirs + "data_test.csv"))
answer_space_path = os.path.join((dataset_dirs + "answer_space.txt"))

In [19]:
train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

with open(answer_space_path, "r") as f:
    answer_space = [line.strip() for line in f.readlines()]
answer2idx = {ans : idx for ans, idx in enumerate(answer_space)}

train_df["answer_idx"] = train_df["answer"].map(answer2idx).fillna(-1).astype(int)
test_df["answer_idx"] = test_df["answer"].map(answer2idx).fillna(-1).astype(int)

print(f"학습 질문 수: {len(train_df)}")
print(f"테스트 질문 수: {len(test_df)}")
print(f"정답 후보 수: {len(answer_space)}")
print(f"정답 인덱스 분포:\n{train_df['answer_idx'].value_counts().head()}")

# 예시 출력
print("\n[질문-정답-이미지ID] 예시:")
print(train_df[['question', 'answer', 'image_id', 'answer_idx']].head())

학습 질문 수: 9974
테스트 질문 수: 2494
정답 후보 수: 582
정답 인덱스 분포:
answer_idx
-1    9974
Name: count, dtype: int64

[질문-정답-이미지ID] 예시:
                                          question        answer   image_id  \
0                what is the object on the shelves           cup   image100   
1                         how man chairs are there             6   image888   
2      what is hanged to the right side of the bed       curtain  image1174   
3                 how many picture are on the wall             2   image942   
4  what is the object on the floor behind the rack  room_divider  image1220   

   answer_idx  
0          -1  
1          -1  
2          -1  
3          -1  
4          -1  


  train_df["answer_idx"] = train_df["answer"].map(answer2idx).fillna(-1).astype(int)
  test_df["answer_idx"] = test_df["answer"].map(answer2idx).fillna(-1).astype(int)


In [45]:
import nltk
nltk.download()



showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [46]:
def preprocess_question(q):
    q = q.lower()
    q = re.sub(r"[^a-z0-9\s]", "", q)
    tokens = word_tokenize(q)
    return tokens

train_df["tokens"] = train_df["question"].apply(preprocess_question)
test_df["tokens"] = test_df["question"].apply(preprocess_question)

all_tokens = [token for row in train_df["tokens"] for token in row] + \
             [token for row in test_df["tokens"] for token in row]
vocab = ["<PAD>", "<UNK>"] + sorted(set(all_tokens))
word2idx = {word: idx for idx, word in enumerate(vocab)}

MAX_LEN = 20

def tokens_to_indices(tokens, word2idx):
    return [word2idx.get(token, word2idx["<UNK>"]) for token in tokens]

def pad_sequence(seq, max_len, pad_value=0):
    return seq[:max_len] + [pad_value] * (max_len - len(seq))


train_df["indexed"] = train_df["tokens"].apply(lambda t: pad_sequence(tokens_to_indices(t, word2idx), MAX_LEN))
test_df["indexed"] = test_df["tokens"].apply(lambda t: pad_sequence(tokens_to_indices(t, word2idx), MAX_LEN))

import ace_tools as tools; tools.display_dataframe_to_user(name="전처리된 질문 데이터", dataframe=train_df[["question", "tokens", "indexed", "answer", "answer_idx"]].head(10))

# 추가 정보 요약
{
    "vocab_size": len(vocab),
    "max_sequence_length": MAX_LEN,
    "sample_vocab": list(word2idx.items())[:10],
    "sample_question": train_df["question"].iloc[0],
    "tokenized": train_df["tokens"].iloc[0],
    "indexed": train_df["indexed"].iloc[0],
    "answer_index": train_df["answer_idx"].iloc[0]
}

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/aepeul/nltk_data'
    - '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/nltk_data'
    - '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/share/nltk_data'
    - '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - 'datasets/VQA'
**********************************************************************
