### 임포트

In [1]:
import torch
import os
import sys
import numpy as np
import random
import copy

# custom
from util import *
from transformers import GPT2Tokenizer
from AAC_Prefix.AAC_Prefix import * # network
from Train import *

### 기타 값들 설정

In [2]:
# Prefix vector sizes. Per the original notes each segment is either switched
# off (0) or set to its working length: 15 for temporal, 11 for global.
temporal_prefix_size = 15
global_prefix_size = 11

# Total prefix length (temporal + global); later given to the dataloaders.
prefix_size = sum((temporal_prefix_size, global_prefix_size))

# Spec of the transformers used by the mapping networks, plus the per-segment
# prefix sizes; both dicts are later passed to the model factory.
transformer_num_layers = dict(temporal_num_layers = 4, global_num_layers = 4)
prefix_size_dict = dict(
    temporal_prefix_size = temporal_prefix_size,
    global_prefix_size = global_prefix_size,
)


# Run identifier (also used to name the training-record folder) and the
# root folder of the training dataset.
MODEL_NAME = 'add_exp_train_clotho_test_audiocaps'
data_dir = './Clotho'

# Optimisation schedule.
LR = 5e-5
epochs = 60

# Batch sizes for the training and evaluation dataloaders.
TRAIN_BATCH_SIZE = 55
TEST_BATCH_SIZE = 5

# Single seed shared by every random number generator used in this notebook.
random_seed = 2766

# Seed Python's and NumPy's global generators.
random.seed(random_seed)
np.random.seed(random_seed)

# Seed PyTorch on the CPU, on the current CUDA device and on all CUDA devices.
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)

# Ask cuDNN for deterministic kernels and disable its auto-tuner, whose
# benchmark-driven algorithm choice can differ between runs.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

### Tokenizer, Dataloader 불러오기

In [3]:
# Tokenizer family label; it is forwarded to the dataloader factory below.
tokenizer_type = 'GPT2'

# Left unset here and handed to the model factory as-is.
# NOTE(review): presumably only needed for a non-GPT2 tokenizer - confirm.
vocab_size = None

# Pretrained GPT-2 tokenizer, loaded by its "gpt2" identifier.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [4]:
# Training loader: the 'development' split found under data_dir.
train_dataloader = CreateDataloader(
    tokenizer,
    data_dir,
    TRAIN_BATCH_SIZE,
    'development',
    prefix_size,
    is_TrainDataset = True,
    tokenizer_type = tokenizer_type,
)

# Evaluation loader: the 'test' split of a different dataset (AudioCaps),
# so the model is scored on data it was not trained on.
test_dataloader_audiocaps = CreateDataloader(
    tokenizer,
    './AudioCaps',
    TEST_BATCH_SIZE,
    'test',
    prefix_size,
    is_TrainDataset = False,
    tokenizer_type = tokenizer_type,
)

get dataset...: 100%|███████████████████████| 2893/2893 [00:56<00:00, 51.39it/s]
get dataset...: 100%|███████████████████████| 960/960 [00:00<00:00, 1495.88it/s]


### 학습결과 정리하는 폴더 생성하기

In [5]:
# Folder that collects this run's training records, named after the run.
directory = "./Train_record/params_" + MODEL_NAME
try:
    # exist_ok=True creates the folder (and missing parents) only when needed.
    # Unlike a separate os.path.exists() check, this cannot race with another
    # process, and it still raises if the path exists but is not a directory.
    os.makedirs(directory, exist_ok=True)
except OSError as err:
    # Best effort, as before: report the failure and let the notebook go on,
    # but include the underlying reason so the problem can be diagnosed.
    print("Error: Failed to create the directory.")
    print(f"  {directory}: {err}")


### 모델 초기화

In [6]:
# Build the captioning network with both the audio encoder and the language
# decoder frozen, and without loading AudioCaps-pretrained weights.
model = get_AAC_Prefix(
    tokenizer,
    vocab_size = vocab_size,
    Dataset = 'Clotho',
    prefix_size_dict = prefix_size_dict,
    transformer_num_layers = transformer_num_layers,
    encoder_freeze = True,
    decoder_freeze = True,
    pretrain_fromAudioCaps = False,
    device = device,
)

# The model is evaluated on a different dataset, and that experimental setting
# requires the language header to be frozen as well, so freeze it by hand here.
# (The factory's own log still prints "header trainable!" because this happens
# after construction.)
for header_param in model.language_header.parameters():
    header_param.requires_grad = False

  fft_window = librosa.util.pad_center(fft_window, n_fft)
  return f(*args, **kwargs)


use GPT2 Tokenizer
temporal feature's mapping network : num_head = 8 num_layers = 4
global feature ver's mapping network : num_head = 8 num_layers = 4
Encoder freezing
GPT2 freezing
header trainable!


### 학습 & 평가

In [7]:
# Train on Clotho and periodically score captions on the AudioCaps test loader.
# In the recorded output an evaluation pass follows every fifth epoch.
Train(
    model,
    LR,
    train_dataloader,
    test_dataloader_audiocaps,
    epochs,
    model_name = MODEL_NAME,
    beam_search = True,
    device = device,
    Dataset = 'Clotho',
    # The AudioCaps loader is already given above as the main test loader.
    test_dataloader_other_dataset = None,
)

Training Epoch 0, Loss = 5.62156: 100%|███████| 263/263 [03:53<00:00,  1.13it/s]
Training Epoch 1, Loss = 3.9628: 100%|████████| 263/263 [03:52<00:00,  1.13it/s]
Training Epoch 2, Loss = 3.69099: 100%|███████| 263/263 [03:51<00:00,  1.14it/s]
Training Epoch 3, Loss = 3.56709: 100%|███████| 263/263 [03:51<00:00,  1.14it/s]
Training Epoch 4, Loss = 3.46687: 100%|███████| 263/263 [03:51<00:00,  1.13it/s]
Training Epoch 5, Loss = 3.396: 100%|█████████| 263/263 [03:52<00:00,  1.13it/s]
Training Epoch 6, Loss = 3.34051: 100%|███████| 263/263 [03:49<00:00,  1.15it/s]
Training Epoch 7, Loss = 3.29782: 100%|███████| 263/263 [03:51<00:00,  1.14it/s]
Training Epoch 8, Loss = 3.25949: 100%|███████| 263/263 [03:52<00:00,  1.13it/s]
Training Epoch 9, Loss = 3.22336: 100%|███████| 263/263 [03:53<00:00,  1.13it/s]
Training Epoch 10, Loss = 3.18943: 100%|██████| 263/263 [03:51<00:00,  1.14it/s]
Training Epoch 11, Loss = 3.15415: 100%|██████| 263/263 [03:50<00:00,  1.14it/s]
Training Epoch 12, Loss = 3.

loading annotations into memory...
0:00:00.006744
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 59528 tokens at 394543.44 tokens per second.
PTBTokenizer tokenized 11257 tokens at 123190.14 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 9331, 'reflen': 8422, 'guess': [9331, 8374, 7417, 6460], 'correct': [4032, 1281, 411, 93]}
ratio: 1.107931607694003
Bleu_1: 0.432
Bleu_2: 0.257
Bleu_3: 0.154
Bleu_4: 0.085
computing METEOR score...
METEOR: 0.141
computing Rouge score...
ROUGE_L: 0.333
computing CIDEr score...
CIDEr: 0.211
computing SPICE score...


Parsing reference captions
Parsing test captions


SPICE evaluation took: 2.240 s
SPICE: 0.080
computing SPIDEr score...
SPIDEr: 0.146


Training Epoch 15, Loss = 3.04196: 100%|██████| 263/263 [06:36<00:00,  1.51s/it]
Training Epoch 16, Loss = 3.01616: 100%|██████| 263/263 [06:01<00:00,  1.38s/it]
Training Epoch 17, Loss = 2.99205: 100%|██████| 263/263 [06:40<00:00,  1.52s/it]
Training Epoch 18, Loss = 2.96767: 100%|██████| 263/263 [06:42<00:00,  1.53s/it]
Training Epoch 19, Loss = 2.94846: 100%|██████| 263/263 [06:42<00:00,  1.53s/it]
Eval using dataset...: 100%|██████████████████| 957/957 [18:47<00:00,  1.18s/it]


loading annotations into memory...
0:00:00.006024
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 59528 tokens at 413542.13 tokens per second.
PTBTokenizer tokenized 11405 tokens at 107407.27 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 9396, 'reflen': 8624, 'guess': [9396, 8439, 7482, 6525], 'correct': [4038, 1299, 421, 100]}
ratio: 1.0895176252317846
Bleu_1: 0.430
Bleu_2: 0.257
Bleu_3: 0.155
Bleu_4: 0.087
computing METEOR score...
METEOR: 0.142
computing Rouge score...
ROUGE_L: 0.321
computing CIDEr score...
CIDEr: 0.201
computing SPICE score...


Parsing reference captions
Parsing test captions


SPICE evaluation took: 2.410 s
SPICE: 0.082
computing SPIDEr score...
SPIDEr: 0.141


Training Epoch 20, Loss = 2.92326: 100%|██████| 263/263 [06:40<00:00,  1.52s/it]
Training Epoch 21, Loss = 2.90163: 100%|██████| 263/263 [06:43<00:00,  1.53s/it]
Training Epoch 22, Loss = 2.87872: 100%|██████| 263/263 [06:41<00:00,  1.53s/it]
Training Epoch 23, Loss = 2.86066: 100%|██████| 263/263 [06:43<00:00,  1.54s/it]
Training Epoch 24, Loss = 2.83874: 100%|██████| 263/263 [04:00<00:00,  1.09it/s]
Eval using dataset...: 100%|██████████████████| 957/957 [13:13<00:00,  1.21it/s]


loading annotations into memory...
0:00:00.006668
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 59528 tokens at 382633.80 tokens per second.
PTBTokenizer tokenized 11429 tokens at 116907.15 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 9459, 'reflen': 8691, 'guess': [9459, 8502, 7545, 6588], 'correct': [4098, 1348, 434, 90]}
ratio: 1.0883672764927985
Bleu_1: 0.433
Bleu_2: 0.262
Bleu_3: 0.158
Bleu_4: 0.086
computing METEOR score...
METEOR: 0.143
computing Rouge score...
ROUGE_L: 0.324
computing CIDEr score...
CIDEr: 0.198
computing SPICE score...


Parsing reference captions
Parsing test captions


SPICE evaluation took: 2.301 s
SPICE: 0.087
computing SPIDEr score...
SPIDEr: 0.143


Training Epoch 25, Loss = 2.81651: 100%|██████| 263/263 [05:07<00:00,  1.17s/it]
Training Epoch 26, Loss = 2.79972: 100%|██████| 263/263 [06:43<00:00,  1.53s/it]
Training Epoch 27, Loss = 2.78117: 100%|██████| 263/263 [06:43<00:00,  1.53s/it]
Training Epoch 28, Loss = 2.76233: 100%|██████| 263/263 [06:40<00:00,  1.52s/it]
Training Epoch 29, Loss = 2.74412: 100%|██████| 263/263 [06:43<00:00,  1.53s/it]
Eval using dataset...: 100%|██████████████████| 957/957 [17:56<00:00,  1.12s/it]


loading annotations into memory...
0:00:00.010208
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 59528 tokens at 256940.76 tokens per second.
PTBTokenizer tokenized 11049 tokens at 73230.93 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 9114, 'reflen': 8508, 'guess': [9114, 8157, 7200, 6243], 'correct': [4151, 1380, 477, 96]}
ratio: 1.0712270803947965
Bleu_1: 0.455
Bleu_2: 0.278
Bleu_3: 0.172
Bleu_4: 0.094
computing METEOR score...
METEOR: 0.147
computing Rouge score...
ROUGE_L: 0.337
computing CIDEr score...
CIDEr: 0.228
computing SPICE score...


Parsing reference captions
Parsing test captions


SPICE evaluation took: 2.338 s
SPICE: 0.090
computing SPIDEr score...
SPIDEr: 0.159


Training Epoch 30, Loss = 2.73109: 100%|██████| 263/263 [06:41<00:00,  1.53s/it]
Training Epoch 31, Loss = 2.71394: 100%|██████| 263/263 [06:44<00:00,  1.54s/it]
Training Epoch 32, Loss = 2.697: 100%|████████| 263/263 [06:42<00:00,  1.53s/it]
Training Epoch 33, Loss = 2.68685: 100%|██████| 263/263 [06:42<00:00,  1.53s/it]
Training Epoch 34, Loss = 2.67036: 100%|██████| 263/263 [06:40<00:00,  1.52s/it]
Eval using dataset...: 100%|██████████████████| 957/957 [08:28<00:00,  1.88it/s]


loading annotations into memory...
0:00:00.006488
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 59528 tokens at 274567.99 tokens per second.
PTBTokenizer tokenized 11712 tokens at 96557.42 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 9712, 'reflen': 8921, 'guess': [9712, 8755, 7798, 6841], 'correct': [4257, 1366, 436, 87]}
ratio: 1.0886671897768088
Bleu_1: 0.438
Bleu_2: 0.262
Bleu_3: 0.156
Bleu_4: 0.084
computing METEOR score...
METEOR: 0.146
computing Rouge score...
ROUGE_L: 0.327
computing CIDEr score...
CIDEr: 0.213
computing SPICE score...


Parsing reference captions
Parsing test captions


SPICE evaluation took: 2.281 s
SPICE: 0.090
computing SPIDEr score...
SPIDEr: 0.151


Training Epoch 35, Loss = 2.658: 100%|████████| 263/263 [04:56<00:00,  1.13s/it]
Training Epoch 36, Loss = 2.64485: 100%|██████| 263/263 [06:29<00:00,  1.48s/it]
Training Epoch 37, Loss = 2.62873: 100%|██████| 263/263 [06:30<00:00,  1.48s/it]
Training Epoch 38, Loss = 2.62087: 100%|██████| 263/263 [06:30<00:00,  1.48s/it]
Training Epoch 39, Loss = 2.60604: 100%|██████| 263/263 [06:29<00:00,  1.48s/it]
Eval using dataset...: 100%|██████████████████| 957/957 [16:18<00:00,  1.02s/it]


loading annotations into memory...
0:00:00.006713
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 59528 tokens at 305853.07 tokens per second.
PTBTokenizer tokenized 11664 tokens at 100525.63 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 9643, 'reflen': 8872, 'guess': [9643, 8686, 7729, 6772], 'correct': [4285, 1387, 449, 81]}
ratio: 1.0869026149683174
Bleu_1: 0.444
Bleu_2: 0.266
Bleu_3: 0.160
Bleu_4: 0.084
computing METEOR score...
METEOR: 0.146
computing Rouge score...
ROUGE_L: 0.328
computing CIDEr score...
CIDEr: 0.207
computing SPICE score...


Parsing reference captions
Parsing test captions


SPICE evaluation took: 2.840 s
SPICE: 0.091
computing SPIDEr score...
SPIDEr: 0.149


Training Epoch 40, Loss = 2.59819: 100%|██████| 263/263 [06:32<00:00,  1.49s/it]
Training Epoch 41, Loss = 2.58976: 100%|██████| 263/263 [06:29<00:00,  1.48s/it]
Training Epoch 42, Loss = 2.57858: 100%|██████| 263/263 [06:27<00:00,  1.47s/it]
Training Epoch 43, Loss = 2.57208: 100%|██████| 263/263 [06:26<00:00,  1.47s/it]
Training Epoch 44, Loss = 2.56094: 100%|██████| 263/263 [06:27<00:00,  1.47s/it]
Eval using dataset...: 100%|██████████████████| 957/957 [09:55<00:00,  1.61it/s]


loading annotations into memory...
0:00:00.006858
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 59528 tokens at 392647.20 tokens per second.
PTBTokenizer tokenized 12009 tokens at 119476.78 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 10003, 'reflen': 9199, 'guess': [10003, 9046, 8089, 7132], 'correct': [4293, 1360, 427, 72]}
ratio: 1.0874008044351464
Bleu_1: 0.429
Bleu_2: 0.254
Bleu_3: 0.150
Bleu_4: 0.077
computing METEOR score...
METEOR: 0.145
computing Rouge score...
ROUGE_L: 0.324
computing CIDEr score...
CIDEr: 0.203
computing SPICE score...


Parsing reference captions
Parsing test captions


SPICE evaluation took: 1.977 s
SPICE: 0.089
computing SPIDEr score...
SPIDEr: 0.146


Training Epoch 45, Loss = 2.5532: 100%|███████| 263/263 [03:49<00:00,  1.15it/s]
Training Epoch 46, Loss = 2.54996: 100%|██████| 263/263 [03:50<00:00,  1.14it/s]
Training Epoch 47, Loss = 2.53911: 100%|██████| 263/263 [03:51<00:00,  1.14it/s]
Training Epoch 48, Loss = 2.53706: 100%|██████| 263/263 [03:52<00:00,  1.13it/s]
Training Epoch 49, Loss = 2.53075: 100%|██████| 263/263 [03:50<00:00,  1.14it/s]
Eval using dataset...: 100%|██████████████████| 957/957 [05:57<00:00,  2.67it/s]


loading annotations into memory...
0:00:00.006490
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 59528 tokens at 319482.57 tokens per second.
PTBTokenizer tokenized 11611 tokens at 99204.72 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 9587, 'reflen': 8852, 'guess': [9587, 8630, 7673, 6716], 'correct': [4079, 1289, 419, 69]}
ratio: 1.0830320831449296
Bleu_1: 0.425
Bleu_2: 0.252
Bleu_3: 0.151
Bleu_4: 0.077
computing METEOR score...
METEOR: 0.140
computing Rouge score...
ROUGE_L: 0.317
computing CIDEr score...
CIDEr: 0.191
computing SPICE score...


Parsing reference captions
Parsing test captions


SPICE evaluation took: 2.115 s
SPICE: 0.085
computing SPIDEr score...
SPIDEr: 0.138


Training Epoch 50, Loss = 2.52582: 100%|██████| 263/263 [03:49<00:00,  1.15it/s]
Training Epoch 51, Loss = 2.52208: 100%|██████| 263/263 [03:51<00:00,  1.14it/s]
Training Epoch 52, Loss = 2.51782: 100%|██████| 263/263 [03:49<00:00,  1.14it/s]
Training Epoch 53, Loss = 2.51512: 100%|██████| 263/263 [03:50<00:00,  1.14it/s]
Training Epoch 54, Loss = 2.50988: 100%|██████| 263/263 [03:49<00:00,  1.15it/s]
Eval using dataset...: 100%|██████████████████| 957/957 [05:56<00:00,  2.68it/s]


loading annotations into memory...
0:00:00.006575
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 59528 tokens at 387461.65 tokens per second.
PTBTokenizer tokenized 11720 tokens at 123232.32 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 9682, 'reflen': 8940, 'guess': [9682, 8725, 7768, 6811], 'correct': [4231, 1358, 449, 84]}
ratio: 1.0829977628634135
Bleu_1: 0.437
Bleu_2: 0.261
Bleu_3: 0.158
Bleu_4: 0.083
computing METEOR score...
METEOR: 0.143
computing Rouge score...
ROUGE_L: 0.323
computing CIDEr score...
CIDEr: 0.200
computing SPICE score...


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.5 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.6

SPICE evaluation took: 13.34 s
SPICE: 0.086
computing SPIDEr score...
SPIDEr: 0.143


Training Epoch 55, Loss = 2.51344: 100%|██████| 263/263 [03:51<00:00,  1.14it/s]
Training Epoch 56, Loss = 2.509: 100%|████████| 263/263 [03:48<00:00,  1.15it/s]
Training Epoch 57, Loss = 2.5107: 100%|███████| 263/263 [03:50<00:00,  1.14it/s]
Training Epoch 58, Loss = 2.5093: 100%|███████| 263/263 [03:50<00:00,  1.14it/s]
Training Epoch 59, Loss = 2.50891: 100%|██████| 263/263 [03:50<00:00,  1.14it/s]
Eval using dataset...: 100%|██████████████████| 957/957 [05:58<00:00,  2.67it/s]


loading annotations into memory...
0:00:00.006645
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 59528 tokens at 421589.26 tokens per second.
PTBTokenizer tokenized 11690 tokens at 123862.71 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 9701, 'reflen': 8896, 'guess': [9701, 8744, 7787, 6830], 'correct': [4218, 1342, 435, 82]}
ratio: 1.0904901079135465
Bleu_1: 0.435
Bleu_2: 0.258
Bleu_3: 0.155
Bleu_4: 0.082
computing METEOR score...
METEOR: 0.142
computing Rouge score...
ROUGE_L: 0.321
computing CIDEr score...
CIDEr: 0.195
computing SPICE score...


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.5 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.5 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.6

SPICE evaluation took: 12.85 s
SPICE: 0.086
computing SPIDEr score...
SPIDEr: 0.141

Training time : 5:10:08
