In [1]:
import sys
import torch
# custom
from AudioCaps.AudioCaps_Dataset import * # 데이터셋
from transformers import GPT2Tokenizer
from ClipCap_forAAC.CLIPCAP_forAAC import * # network
from Train import *
import warnings
import string
warnings.filterwarnings("ignore", category=DeprecationWarning)


# Prefix sizes were modified so that PANNs can be used.
audio_prefix_size = 15
semantic_prefix_size = 11  # when using the original semantic mapping network
# semantic_prefix_size = 10  # when using the new semantic mapping network
prefix_size = audio_prefix_size + semantic_prefix_size
transformer_num_layers = {"audio_num_layers": 4, "semantic_num_layers": 4}
prefix_size_dict = {"audio_prefix_size": audio_prefix_size, "semantic_prefix_size": semantic_prefix_size}

# Run on the first GPU when CUDA is available, otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda:0' if USE_CUDA else 'cpu')

# Vocabulary sizes of the available tokenizers:
# 4992 : Clotho Tokenizer
# 5069 : ACT Tokenizer
# 7911 : Custom Tokenizer1
# 5084 : Custom Tokenizer2

vocab_size = 7911

# vocab_size = None selects the pretrained GPT2 tokenizer; any other value
# selects the project's custom AudioCaps tokenizer built for that vocabulary size.
if vocab_size is None:
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer_type = 'GPT2'
else:
    tokenizer = tokenizer_AudioCaps(vocab_size)
    tokenizer_type = 'Custom'

data_dir = './AudioCaps'

TEST_BATCH_SIZE = 5
# Pass the tokenizer type that was actually selected above instead of a
# hard-coded 'Custom', so the dataloader stays consistent with the tokenizer.
test_dataloader = dataloader_AudioCapsDataset(tokenizer, data_dir, TEST_BATCH_SIZE, split = 'test',
                                              prefix_size = prefix_size, is_TrainDataset = False,
                                              tokenizer_type = tokenizer_type)

model = get_ClipCap_AAC(tokenizer, vocab_size = vocab_size, Dataset = 'AudioCaps',
                        prefix_size_dict = prefix_size_dict, transformer_num_layers = transformer_num_layers,
                        encoder_freeze = False, decoder_freeze = True,
                        pretrain_fromAudioCaps = False, device = device)


# model_name = 'model_AudioCaps_GPT2_header_freezing.pt'
# NOTE(review): the file name below embeds vocab_size, so it is only meaningful
# for the custom tokenizer; switch to the name above when using GPT2 — confirm.
model_name = 'model_AudioCaps_Custom_' + str(vocab_size) + '.pt'
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved on a GPU can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load("./Trained_model_params/" + model_name, map_location = device))

get dataset...:   0%|                                   | 0/960 [00:00<?, ?it/s]

audio feature's mapping network : num_head = 8 num_layers = 4
semantic feature ver1's mapping network : num_head = 8 num_layers = 4
GPT2 freezing


<All keys matched successfully>

In [2]:
def get_pred_captions(model, test_dataloader, device) :
    """Generate one beam-search caption per batch of ``test_dataloader``.

    The model is put into eval mode and moved to ``device``. Every batch is
    unpacked as ``(audio, _, f_names)`` and only the first audio item and the
    first file name are used: per the original notes, the test set pairs each
    raw audio clip with five captions and the batch size is set to five, so
    index 0 is enough to identify the clip.

    Args:
        model: Callable invoked as ``model(audio, None, beam_search=True)``;
            element ``[0][0]`` of its result is taken as the predicted caption.
        test_dataloader: Iterable yielding ``(audio, _, f_names)`` batches.
        device: Device the model and the selected audio sample are moved to.

    Returns:
        A list with one dict per batch, in iteration order, holding the keys
        ``'file_name'`` and ``'caption_predicted'``.
    """
    model.eval()
    model.to(device)

    # Collect every prediction so they can be evaluated in one go afterwards.
    captions_pred = []

    # Inference only: disable gradient tracking once for the whole loop.
    with torch.no_grad():
        for audio, _, f_names in tqdm(test_dataloader, desc="Get Caption..."):
            # Keep only sample 0 (with a leading batch dimension of 1) and move
            # just that sample to the device instead of the whole batch.
            first_audio = audio[0, :].unsqueeze(0).to(device)

            pred_caption = model(first_audio, None, beam_search = True)[0][0]

            captions_pred.append({
                            'file_name': f_names[0],
                            'caption_predicted': pred_caption})

    return captions_pred

In [3]:
# Run the model over the test dataloader; the result holds one
# {'file_name', 'caption_predicted'} entry per batch.
captions_pred = get_pred_captions(model, test_dataloader, device)

Get Caption...: 100%|█████████████████████████| 957/957 [01:41<00:00,  9.44it/s]


In [4]:
# Frequency of every word that appears in the predicted captions.
word_dictionary = {}

# Translation table that deletes all ASCII punctuation; built once, reused per caption.
punctuation_remover = str.maketrans('', '', string.punctuation)

for values in captions_pred :
    caption = values['caption_predicted'].translate(punctuation_remover)
    # split() without an argument splits on runs of whitespace, so the double
    # spaces left behind by removed punctuation (or an empty caption) do not
    # add an empty-string "word" to the vocabulary.
    for word in caption.split() :
        word_dictionary[word] = word_dictionary.get(word, 0) + 1

In [5]:
# Number of distinct entries (dictionary keys) collected from the predicted captions.
len(word_dictionary)

354