# Bart-DMLM(train-sciq-passage-level) Text2Text Generation on Sciq
使用 Sciq dataset訓練 Bart Distractor Generation<br>
直接使用 trainer 訓練 <br>

### GPU

In [1]:
!nvidia-smi

Fri Jun 23 01:51:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.182.03   Driver Version: 470.182.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA TITAN RTX    Off  | 00000000:09:00.0 Off |                  N/A |
| 78%   86C    P2   262W / 280W |  24087MiB / 24217MiB |     99%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA TITAN RTX    Off  | 00000000:0A:00.0 Off |                  N/A |
| 41%   44C    P8    30W / 280W |      3MiB / 24220MiB |      0%      Default |
|       

In [2]:
project_name = "test on sciq with Bart"
import os

os.environ["WANDB_PROJECT"] = project_name
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

### import

In [3]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

  from .autonotebook import tqdm as notebook_tqdm


### Loading the dataset

In [4]:
import json

In [5]:
def read_data(item):
    path = '../../../../data/Sciq/sciq_{}.json'.format(item)
    with open(path) as f:
        data = json.load(f)
    return data

In [6]:
train = read_data('train')
valid = read_data('valid')
test = read_data('test')

In [7]:
train = list(train)
test = list(test)
valid = list(valid)

In [8]:
len(train), len(valid), len(test)

(11679, 1000, 1000)

In [9]:
train[0]

{'question': 'What type of organism is commonly used in preparation of foods such as cheese and yogurt?',
 'distractor3': 'viruses',
 'distractor1': 'protozoa',
 'distractor2': 'gymnosperms',
 'correct_answer': 'mesophilic organisms',
 'support': 'Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.'}

### Prepare data

In [10]:
def processData(data):
    
    sentences = []
    labels = []
    answers = []
    for d in data:
        sentence = d['question']
        distractors = [d['distractor1'], d['distractor2'], d['distractor3']]
        answer = d['correct_answer']
        
        # 避免dataset的label有空白
        distractors = [dis.strip() for dis in distractors]
        
        sentences.append(sentence)
        labels.append('_ of distractors are ' + ', '.join(distractors))
        answers.append(answer)
        
    return sentences, answers, labels

In [11]:
train_sent, train_answer, train_label = processData(train)
valid_sent, valid_answer, valid_label = processData(valid)
test_sent, test_answer, test_label = processData(test)

In [12]:
print(test_label[0])

_ of distractors are antioxidants, Oxygen, residues


In [13]:
for l in test_label:
    if 'ultraviolet light' in l:
        print(l)

_ of distractors are invisible light, sunlight, ultraviolet light


In [14]:
len(train_sent), len(train_answer), len(train_label)

(11679, 11679, 11679)

In [15]:
for idx in range(2):
    print(train_sent[idx])
    print(train_answer[idx])
    print(train_label[idx])
    print()

What type of organism is commonly used in preparation of foods such as cheese and yogurt?
mesophilic organisms
_ of distractors are protozoa, gymnosperms, viruses

What phenomenon makes global winds blow northeast to southwest or the reverse in the northern hemisphere and northwest to southeast or the reverse in the southern hemisphere?
coriolis effect
_ of distractors are muon effect, centrifugal effect, tropical effect



In [16]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

In [17]:
train_encodings = tokenizer(train_sent, train_answer,truncation=True, padding=True)
valid_encodings = tokenizer(valid_sent, valid_answer,truncation=True, padding=True)
test_encodings = tokenizer(test_sent, test_answer,truncation=True, padding=True)

In [18]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [19]:
print(train_encodings.input_ids[0])

[0, 2264, 1907, 9, 33993, 16, 10266, 341, 11, 7094, 9, 6592, 215, 25, 7134, 8, 24351, 116, 2, 2, 12579, 6673, 22586, 28340, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [20]:
tokenizer.decode(train_encodings.input_ids[0])

'<s>What type of organism is commonly used in preparation of foods such as cheese and yogurt?</s></s>mesophilic organisms</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [21]:
def add_labels(encodings, distractors):
    
    distractors_encodings = tokenizer(distractors, padding=True)
    labels = []
    for i in range(len(distractors_encodings.input_ids)):
        labels.append(distractors_encodings.input_ids[i])
    
    encodings["labels"] = labels
    return encodings

In [22]:
train_encodings = add_labels(train_encodings, train_label)
valid_encodings = add_labels(valid_encodings, valid_label)
test_encodings = add_labels(test_encodings, test_label)

In [23]:
len(train_encodings.input_ids)

11679

In [24]:
class SciqDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SciqDataset(train_encodings)
valid_dataset = SciqDataset(valid_encodings)
test_dataset = SciqDataset(test_encodings)

In [25]:
len(train_dataset), len(valid_dataset), len(test_dataset)

(11679, 1000, 1000)

### Fine-tuning

In [26]:
from transformers import BartForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
model.resize_token_embeddings(len(tokenizer))

Embedding(50265, 768, padding_idx=1)

In [27]:
model_dict = torch.nn.ModuleDict({
    'model': model,
})
checkpoint = torch.load('/user_data/Cloze/dtt_mask_lm_model/bart/mcq_all_3dtt_passage_level_12/checkpoints/epoch=01-dev_loss=0.14.ckpt')
model_dict.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

In [28]:
batch_size = 16
args = Seq2SeqTrainingArguments(
    output_dir = "./results-1",
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="P@1",
    weight_decay=0.01,
    predict_with_generate=True,
    eval_accumulation_steps = 1,
    report_to="wandb" if os.getenv("WANDB_PROJECT") else "none"
)

In [29]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [30]:
import numpy as np
def compute_metrics(p):
    predictions, labels = p
    
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # store all article
    predicted = []
    true_label = []
    
    for k in range(len(decoded_labels)):
        pred = decoded_preds[k]
        label = decoded_labels[k]

        pred_list = pred.split(', ')
        label_list = label.split(', ')
        
        pred_list[0] = pred_list[0].split('are ')[-1]
        label_list[0] = label_list[0].split('are ')[-1]

        predicted.append(pred_list)
        true_label.append(label_list)

    # evaluation metrics
    p1 = 0
    p3 = 0
    r3 = 0
    f3 = 0
    for idx in range(len(true_label)):
        distractors = predicted[idx]
        labels = true_label[idx]

        act_set = set(labels)
        pred1_set = set(distractors[:1])
        pred3_set = set(distractors[:3])

        p_1 = len(act_set & pred1_set) / float(1)
        p_3 = len(act_set & pred3_set) / float(3)
        r_3 = len(act_set & pred3_set) / float(len(act_set))

        if p_3 == 0 and r_3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p_3 * r_3 / (p_3 + r_3))

        p1+=p_1
        p3+=p_3
        r3+=r_3
        f3+=f1_3

    avg_p1 = p1 / len(true_label)
    avg_p3 = p3 / len(true_label)
    avg_r3 = r3 / len(true_label)
    avg_f3 = f3 / len(true_label)

    result = {'P@1': avg_p1,
              'P@3': avg_p3,
              'R@3': avg_r3,
              'F1@3': avg_f3}
    
    return result

In [31]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [32]:
trainer.train()

***** Running training *****
  Num examples = 11679
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 36500
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mms0004284[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,P@1,P@3,R@3,F1@3
1,0.5825,0.683668,0.125,0.092333,0.091952,0.091978
2,0.4996,0.671764,0.159,0.098333,0.09831,0.098178
3,0.439,0.664118,0.157,0.109,0.108619,0.108644
4,0.4212,0.663127,0.176,0.118667,0.1185,0.118511
5,0.3874,0.66572,0.175,0.119333,0.1195,0.119311
6,0.3701,0.674257,0.206,0.132333,0.132333,0.132244
7,0.3459,0.680004,0.204,0.13,0.130167,0.129978
8,0.33,0.689671,0.19,0.131667,0.131452,0.131378
9,0.3108,0.702306,0.189,0.137,0.136786,0.136711
10,0.2943,0.709576,0.192,0.142,0.1425,0.142156


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to ./results-1/checkpoint-730
Configuration saved in ./results-1/checkpoint-730/config.json
Model weights saved in ./results-1/checkpoint-730/pytorch_model.bin
tokenizer config file saved in ./results-1/checkpoint-730/tokenizer_config.json
Special tokens file saved in ./results-1/checkpoint-730/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to ./results-1/checkpoint-1460
Configuration saved in ./results-1/checkpoint-1460/config.json
Model weights saved in ./results-1/checkpoint-1460/pytorch_model.bin
tokenizer config file saved in ./results-1/checkpoint-1460/tokenizer_config.json
Special tokens file saved in ./results-1/checkpoint-1460/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to ./results-1/checkpoint-2190
Configuration saved in ./results-1/chec

In [None]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16


{'eval_loss': 0.727323591709137,
 'eval_P@1': 0.205,
 'eval_P@3': 0.1409999999999999,
 'eval_R@3': 0.1407857142857142,
 'eval_F1@3': 0.14071111111111106,
 'eval_runtime': 59.3494,
 'eval_samples_per_second': 16.849,
 'eval_steps_per_second': 1.062,
 'epoch': 50.0}

In [None]:
trainer.save_model('/user_data/CTG/train/DG/Sciq/Bart_mcq_all_passage_level/sciq/bart-base-text2text-sciq-pretrain-on-mcq-all-passage-level-e1')

Saving model checkpoint to /user_data/CTG/train/DG/Sciq/Bart_sciq_train/sciq/bart-base-text2text-sciq-pretrain-on-sciq-train-passage-e1
Configuration saved in /user_data/CTG/train/DG/Sciq/Bart_sciq_train/sciq/bart-base-text2text-sciq-pretrain-on-sciq-train-passage-e1/config.json
Model weights saved in /user_data/CTG/train/DG/Sciq/Bart_sciq_train/sciq/bart-base-text2text-sciq-pretrain-on-sciq-train-passage-e1/pytorch_model.bin
tokenizer config file saved in /user_data/CTG/train/DG/Sciq/Bart_sciq_train/sciq/bart-base-text2text-sciq-pretrain-on-sciq-train-passage-e1/tokenizer_config.json
Special tokens file saved in /user_data/CTG/train/DG/Sciq/Bart_sciq_train/sciq/bart-base-text2text-sciq-pretrain-on-sciq-train-passage-e1/special_tokens_map.json


In [None]:
predictions, labels, metrics = trainer.predict(valid_dataset)
print('valid: ')
metrics

***** Running Prediction *****
  Num examples = 1000
  Batch size = 16


valid: 


{'test_loss': 0.727323591709137,
 'test_P@1': 0.205,
 'test_P@3': 0.1409999999999999,
 'test_R@3': 0.1407857142857142,
 'test_F1@3': 0.14071111111111106,
 'test_runtime': 59.4148,
 'test_samples_per_second': 16.831,
 'test_steps_per_second': 1.06}

In [None]:
predictions, labels, metrics = trainer.predict(test_dataset)
print('test: ')
metrics

***** Running Prediction *****
  Num examples = 1000
  Batch size = 16


test: 


{'test_loss': 0.8055908679962158,
 'test_P@1': 0.195,
 'test_P@3': 0.14800000000000005,
 'test_R@3': 0.14806349206349215,
 'test_F1@3': 0.1478333333333334,
 'test_runtime': 57.7573,
 'test_samples_per_second': 17.314,
 'test_steps_per_second': 1.091}

In [None]:
stop

NameError: name 'stop' is not defined

In [None]:
import json
def write_json(data, path):
    
    jsonString = json.dumps(data)
    jsonFile = open(path, "w")
    jsonFile.write(jsonString)
    jsonFile.close()

In [None]:
def save_data(data, predictions, labels, file_name):
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # store all article
    predicted = []
    true_label = []
    
    for k in range(len(decoded_labels)):
        pred = decoded_preds[k]
        label = decoded_labels[k]

        pred_list = pred.split(', ')
        label_list = label.split(', ')
        
        pred_list[0] = pred_list[0].split('are ')[-1]
        label_list[0] = label_list[0].split('are ')[-1]

        predicted.append(pred_list)
        true_label.append(label_list)
    
    
    # evaluation metrics
    for idx in range(len(true_label)):
        distractors = predicted[idx]
        labels = true_label[idx]
        
        data[idx]['pred_distractors'] = distractors

        act_set = set(labels)
        pred1_set = set(distractors[:1])
        pred3_set = set(distractors[:3])

        p_1 = len(act_set & pred1_set) / float(1)
        p_3 = len(act_set & pred3_set) / float(3)
        r_3 = len(act_set & pred3_set) / float(len(act_set))

        if p_3 == 0 and r_3 == 0:
            f1_3 = 0
        else:
            f1_3 = 2 * (p_3 * r_3 / (p_3 + r_3))
            
        data[idx]['metric'] = {'P@1': p_1, 'P@3': p_3, 'R@3': r_3, 'F1@3': f1_3}
        
    write_json(data, file_name)
    print(file_name + ' is saved :)')

In [None]:
save_data(test, test_predictions, test_labels, '/user_data/CTG/train/DG/Sciq/Bart_sciq_train_passage_level/sciq_test_retsult/sciq_test_t5_text2text_pretrain_on_sciq_train_passage_level.json')

/user_data/CTG/train/DG/Sciq/Bart_sciq_train_passage_level/sciq_test_retsult/sciq_test_t5_text2text_pretrain_on_sciq_train_passage_level.json is saved :)


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

tokenizer = BartTokenizer.from_pretrained("/user_data/CTG/train/DG/Sciq/Bart_sciq_train_passage_level/sciq/bart-base-text2text-sciq-pretrain-on-sciq-train-passage-level-e1")
model = BartForConditionalGeneration.from_pretrained("/user_data/CTG/train/DG/Sciq/Bart_sciq_train_passage_level/sciq/bart-base-text2text-sciq-pretrain-on-sciq-train-passage-level-e1")

In [None]:
batch_size = 64
args = Seq2SeqTrainingArguments(
    output_dir = "./results",
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="P@1",
    weight_decay=0.01,
    predict_with_generate=True,
    eval_accumulation_steps = 1,
)

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
test_predictions, test_labels, test_metrics = trainer.predict(test_dataset)
test_metrics

***** Running Prediction *****
  Num examples = 1000
  Batch size = 64


{'test_loss': 0.8552417755126953,
 'test_P@1': 0.202,
 'test_P@3': 0.1636666666666668,
 'test_R@3': 0.16392063492063505,
 'test_F1@3': 0.16363333333333344,
 'test_runtime': 14.9751,
 'test_samples_per_second': 66.778,
 'test_steps_per_second': 0.534}

Result

In [None]:
import json
def read_data(path):
    with open(path) as f:
        data = json.load(f)
    return data

In [None]:
test = read_data('/user_data/CTG/test_result/sciq_test_t5_text2text_pretrain_on_sciq_training_set_passage_level_600000.json')

In [None]:
for i in range(0, 100, 7):
    example = test[i]
    sentence = example['question']
    answer = example['correct_answer']
    distractors = [example['distractor1'], example['distractor2'], example['distractor3']]
    pred_distractors = example['pred_distractors']
    metric = example['metric']
    
    print('question:', sentence.replace('**blank**', '_'))
    print('answer:', answer)
    print('distractors:', distractors)
    print('predict:', pred_distractors)
    print('metric:', metric)
    print()

question: Compounds that are capable of accepting electrons, such as o 2 or f2, are called what?
answer: oxidants
distractors: ['antioxidants', 'Oxygen', 'residues']
predict: ['oxides', 'carbonates', 'soils']
metric: {'P@1': 0.0, 'P@3': 0.0, 'R@3': 0.0, 'F1@3': 0}

question: Which type of tree is dominant in temperate forests?
answer: deciduous
distractors: ['vines', 'fungus', 'shrubs']
predict: ['perennial', 'conifer', 'annual']
metric: {'P@1': 0.0, 'P@3': 0.0, 'R@3': 0.0, 'F1@3': 0}

question: Only about one percent of plants have lost what ability, turning them into consumers and even predators, instead of producers?
answer: photosynthesis
distractors: ['flowering', 'rooting', 'growth']
predict: ['germination', 'death', 'reproduction']
metric: {'P@1': 0.0, 'P@3': 0.0, 'R@3': 0.0, 'F1@3': 0}

question: Presence of a cell wall, large central vacuole, and organelles called plastids distinguish what type of cell?
answer: plant
distractors: ['animal', 'reproductive', 'heterotroph']
predi