In [1]:
import pandas as pd

In [2]:
train_df = pd.read_json('./train.json').reset_index()

In [3]:
val_df = pd.read_json('./val.json').reset_index()

In [4]:
test_df = pd.read_json('./test.json').reset_index()

In [5]:
# !pip install transformers==4.2.1
from transformers import BertTokenizer, GPT2Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# !pip install datasets==1.9
from datasets import Dataset
train_data = Dataset.from_pandas(train_df)
val_data = Dataset.from_pandas(val_df)
test_data = Dataset.from_pandas(test_df)

In [7]:
tokenizer_src = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
tokenizer_src.bos_token = tokenizer_src.cls_token
tokenizer_src.eos_token = tokenizer_src.sep_token

In [8]:
# make sure GPT2 appends EOS in begin and end
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
    return outputs

In [9]:
GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
tokenizer_tgt = GPT2Tokenizer.from_pretrained("cahya/gpt2-small-indonesian-522M")
tokenizer_tgt.pad_token = tokenizer_tgt.unk_token

In [10]:
alabs = pd.concat([train_df, val_df])

In [11]:
alabs

Unnamed: 0.1,index,Unnamed: 0,review,abstractive,kategori,review_cln,ext_review
0,232,232,Sekumpul adalah nama air terjun yang berada d...,Air terjun Sekumpul adalah salah satu destinas...,labeled-attraction,sekumpul adalah nama air terjun yang berada di...,dan selanjutnya anda cukup berjalan 10 menit a...
1,59,59,Kami menginap selama 3 hari yaitu mulai tangg...,Harris Resort Barelang sangat menyenangkan kar...,labeled-hotel,kami menginap selama 3 hari yaitu mulai tangga...,breakfast nya lumayan variatif pelayanan yang ...
2,6,6,Hampir seminggu menginap di Pop Gubeng suraba...,"Hotel ini terletak di Gubeng, Surabaya dengan ...",labeled-hotel,hampir seminggu menginap di pop gubeng surabay...,staff hotel nya ramah ramah makanannya juga en...
3,185,185,"disambut dengan ramah pada saat kedatangan, m...",Rumah makan Ondo dengan menu khas Babi Panggan...,labeled-restaurant,disambut dengan ramah pada saat kedatangan men...,excellent makanan sangat enak suasana sangat t...
4,173,173,Nasi tempong adalah menu makanan yang bukan f...,Nasi Tempong Mbok Nah di Banyuwangi merupakan ...,labeled-restaurant,nasi tempong adalah menu makanan yang bukan fa...,porsi basic nya nasi tahu tempe sayur sambal d...
...,...,...,...,...,...,...,...
25,286,286,Sari ater hot spring merupakan objek wisata y...,Sari Ater Hot Spring adalah tempat wisata alam...,labeled-attraction,sari ater hot spring merupakan objek wisata ya...,tempatnya sejuk dan bisa berendam di kolam air...
26,182,182,Ayam Goreng Kalasan (Kalasan) adalah resto ma...,Kalasan adalah restoran masakan Indonesia di M...,labeled-restaurant,ayam goreng kalasan kalasan adalah resto masak...,mie gorengnya terasa enak dan mantap. [SEP] ay...
27,157,157,"seperti judul di atas, pelayanan staf di rest...",Cwie Mie Gang Jangkrik adalah restoran yang te...,labeled-restaurant,seperti judul di atas pelayanan staf di resto ...,tapi paling terkenal cwei mienya. [SEP] cwie m...
28,144,144,Sesuai dengan reputasi Le Meridien sebagai ho...,Restoran ini menyediakan berbagai macam makana...,labeled-restaurant,sesuai dengan reputasi boleh meridien sebagai ...,rasanya pas banget sama lidah staffnya juga pa...


In [12]:
# import matplotlib.pyplot as plt

# text_word_count = []
# summary_word_count = []

# # populate the lists with sentence lengths
# for i in alabs['ctext']:
#       text_word_count.append(len(i.split()))

# for i in alabs['text']:
#       summary_word_count.append(len(i.split()))

# length_df = pd.DataFrame({'ulasan':text_word_count, 'ringkasan':summary_word_count})

# length_df.hist(bins = 30)
# plt.show()

In [12]:
max_ulasan = 304
print(max_ulasan)
max_ringkasan = 116
print(max_ringkasan)

304
116


In [13]:
encoder_max_length = 512
decoder_max_length = 80

def process_data_to_model_inputs(batch):
  # tokenize the inputs and labels
  inputs = tokenizer_src(batch["review_cln"], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer_tgt(batch["abstractive"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["labels"] = outputs.input_ids.copy()
  batch["decoder_attention_mask"] = outputs.attention_mask

  batch["labels"] = [
        [-100 if mask == 0 else token for mask, token in mask_and_tokens] for mask_and_tokens in [zip(masks, labels) for masks, labels in zip(batch["decoder_attention_mask"], batch["labels"])]
    ]
  
  assert all([len(x) == encoder_max_length for x in inputs.input_ids])
  assert all([len(x) == decoder_max_length for x in outputs.input_ids])
  return batch

In [14]:
batch_size=32

train_data = train_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    # remove_columns=["review", "extractive", "review_cln", "extractive_cln", "sent_tok", "review_key"]
)
val_data = val_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    # remove_columns=["review", "extractive", "review_cln", "extractive_cln", "sent_tok", "review_key"]
)

100%|██████████| 8/8 [00:02<00:00,  2.74ba/s]
100%|██████████| 1/1 [00:00<00:00,  2.91ba/s]


In [15]:
train_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

val_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

In [16]:
from transformers import EncoderDecoderModel

bert2gpt = EncoderDecoderModel.from_encoder_decoder_pretrained("indobenchmark/indobert-base-p1","cahya/gpt2-small-indonesian-522M")

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at cahya/gpt2-small-indonesian-522M and are newly initialized: ['transformer.h.0.crossattention.bias', 'transformer.h.0.crossattention.masked_bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.1.crossattention.bias', 'transformer.h.1.crossattention.masked_bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.ln_cross_attn.weight', 'tran

In [17]:
# for layer in bert2gpt.encoder.encoder.layer[1:]:
#   for param in layer.parameters():
#     param.requires_grad = False

In [18]:
# for layer in bert2gpt.encoder.encoder.layer:
#   for param in layer.parameters():
#     print(param.requires_grad)

In [19]:
# bert2gpt

In [20]:
# bert2gpt.config.decoder.eos_token_id

In [21]:
bert2gpt.config.decoder_start_token_id = tokenizer_tgt.bos_token_id
bert2gpt.config.eos_token_id = tokenizer_tgt.eos_token_id
bert2gpt.config.pad_token_id = tokenizer_tgt.pad_token_id
bert2gpt.config.max_length = 80
bert2gpt.config.min_length = 3
bert2gpt.config.no_repeat_ngram_size = 3
bert2gpt.early_stopping = True
bert2gpt.length_penalty = 2.0
bert2gpt.num_beams = 8

In [22]:
# !pip install rouge_score
from datasets import load_metric

rouge = load_metric("rouge")

In [23]:
def rouge_matrix(pred, ref):
  result = rouge.compute(predictions=pred, references=ref)
  rouge1 = result['rouge1'].mid
  rouge2 = result['rouge2'].mid
  rougeL = result['rougeL'].mid

  return {
      "rouge1_fmeasure": round(rouge1.fmeasure, 4),
      "rouge2_fmeasure": round(rouge2.fmeasure, 4),
      "rougeL_fmeasure": round(rougeL.fmeasure, 4),
  }

In [24]:
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer_tgt.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer_tgt.pad_token_id
    label_str = tokenizer_tgt.batch_decode(labels_ids, skip_special_tokens=True)

    result = rouge_matrix(pred_str, label_str)

    return result

In [25]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True, 
    output_dir="./",
    do_train=True,
    do_eval=True,
    logging_steps=50,  # set to 1000 for full training
    save_steps=100,  # set to 500 for full training
    eval_steps=10, 
    warmup_steps=15,  
    num_train_epochs = 128, #TRAIN_EPOCHS
    overwrite_output_dir=True,
    save_total_limit=3
)

In [26]:
# instantiate trainer
trainer = Seq2SeqTrainer(
    model=bert2gpt,
    tokenizer=tokenizer_tgt,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
)
trainer.train()

Step,Training Loss
50,4.085
100,2.4616
150,1.4793
200,0.7873
250,0.4859
300,0.3261
350,0.2257
400,0.1711
450,0.126
500,0.0819


TrainOutput(global_step=3840, training_loss=0.1406769027923777, metrics={'train_runtime': 1017.5634, 'train_samples_per_second': 3.774, 'total_flos': 30252592005120000, 'epoch': 128.0})

In [28]:
trainer.save_model('./bert2gpt')

In [33]:
import os
import zipfile

zf = zipfile.ZipFile("bert2gpt_128_2.zip", "w")
for dirname, subdirs, files in os.walk('checkpoint-3800'):
    zf.write(dirname)
    for filename in files:
        zf.write(os.path.join(dirname, filename))
zf.close()

In [27]:
model = EncoderDecoderModel.from_pretrained('checkpoint-3800')

In [28]:
batch_size = 16  # change to 64 for full evaluation

# map data correctly
def generate_summary(batch):
    # Tokenizer will automatically set [BOS] <text> [EOS]
    # cut off at BERT max length 512
    inputs = tokenizer_src(batch["review_cln"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask

    outputs = model.generate(input_ids, max_length=80, attention_mask=attention_mask, num_beams=10,
                                repetition_penalty=5.0, num_return_sequences=1)

    # all special tokens including will be removed
    output_str = tokenizer_tgt.batch_decode(outputs, skip_special_tokens=True)

    batch["pred"] = output_str

    return batch

In [29]:
resultsb = test_data.map(generate_summary, batched=True, batch_size=batch_size)
pred_strb = resultsb["pred"]
label_strb = resultsb["abstractive"]

100%|██████████| 2/2 [22:17<00:00, 668.68s/ba]


In [30]:
rouge_output = rouge_matrix(pred_strb, label_strb)
print(rouge_output)

INFO:absl:Using default tokenizer.


{'rouge1_fmeasure': 0.2484, 'rouge2_fmeasure': 0.0303, 'rougeL_fmeasure': 0.1625}


In [31]:
test_df['prediksi'] = pred_strb

In [32]:
test_df.to_csv('bert2gpt-128.csv')