<a href="https://colab.research.google.com/github/namwootree/Portfolio/blob/main/Alphaco_(Deep_Learning_Boot_Camp)/Long-Term%20Program/Text_Summarization/%5BMain%5D_Modeling_%26_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvidia-smi

Sat Jun 18 18:29:41 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   61C    P0    36W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q datasets transformers rouge-score nltk sentencepiece wandb

In [None]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, EarlyStoppingCallback
from datasets import load_dataset, load_metric, Dataset
import wandb
import pandas as pd
import numpy as np
import re
import nltk
import torch
import os

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load Model & Tokenizer & Metric

In [None]:
model_name = 'ainize/kobart-news'
metric_name = 'rouge'

In [None]:
config = AutoConfig.from_pretrained(model_name)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [None]:
# Remove the label maps and task-specific parameters from the loaded config
# before it is used to build the seq2seq model.
# NOTE(review): presumably this drops the classification metadata behind the
# "incompatible id to label map" warning printed when the config was loaded — confirm.
del config.label2id
del config.id2label
del config.task_specific_params

# config.max_position_embeddings=2050

In [None]:
config

BartConfig {
  "_name_or_path": "ainize/kobart-news",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.1,
  "d_model": 768,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 1,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": false,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_position_embeddings": 1026,
  "model_type": "bart",
  "normalize_before": false,
  "normalize_embedding": true,
  "n

In [None]:
# Build the seq2seq model from the pretrained checkpoint using the edited config,
# then load the tokenizer from the same checkpoint name.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Metric object consumed by compute_metrics (metric_name is 'rouge').
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the `evaluate` package — confirm against the installed version.
metric = load_metric(metric_name)

### Declare Functions

In [None]:
# Text prepended to every source document in preprocess_function; empty here.
prefix = ""
# prefix = "summarize: "

# Truncation limits (in tokens) passed as `max_length` when tokenizing the
# source documents and the reference summaries, respectively.
max_input_length = 1026
max_target_length = 514

def preprocess_function(examples):
    """Tokenize a batch of documents together with their reference summaries.

    Args:
        examples: batch mapping with a ``"context"`` list (source documents)
            and a ``"summary"`` list (reference summaries).

    Returns:
        The tokenizer output for the prefixed documents, extended with a
        ``"labels"`` entry holding the token ids of the summaries.
    """
    # Prepend the module-level prefix to every source document.
    sources = list(map(lambda text: prefix + text, examples["context"]))
    encoded = tokenizer(sources, truncation=True, max_length=max_input_length)

    # Summaries are tokenized in the tokenizer's target mode.
    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(
            examples["summary"], truncation=True, max_length=max_target_length
        )["input_ids"]

    encoded["labels"] = target_ids
    return encoded

In [None]:
def compute_metrics(eval_pred):
    """Compute ROUGE F-measures and the mean generated length for one evaluation pass.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays handed over
            by the trainer. ``predictions`` may also be a tuple whose first
            element holds the generated ids.

    Returns:
        dict mapping every key returned by the ROUGE metric to its mid
        F-measure scaled to 0-100, plus ``"gen_len"`` (mean number of non-pad
        tokens per prediction). All values are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the generated ids; keep the ids only.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a fill value that cannot be decoded. Labels always carry it, and
    # predictions can too when batches of different lengths are concatenated,
    # so normalise both to the pad token before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # NOTE(review): the default `rouge_score` tokenizer/stemmer appears to be
    # English-oriented (ASCII alphanumerics only), so scores on Korean text may
    # reflect only digits/Latin tokens — confirm before trusting these numbers.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each ROUGE variant, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Mean generated length, counted on the normalised ids so that fill values
    # are not mistaken for real tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

### Load Data

In [None]:
train_path = '/content/drive/MyDrive/장기 프로젝트/문서 요약/augmented_data_set_247756.csv'
eval_path = '/content/drive/MyDrive/장기 프로젝트/문서 요약/train_df.csv'

In [None]:
# Training data: only the first 150,000 rows of the CSV at train_path are kept.
train_df = pd.read_csv(train_path).iloc[:150000]

# Source frame for the validation split created in a later cell.
eval_df = pd.read_csv(eval_path)

In [None]:
train_df.shape, eval_df.shape

((150000, 3), (2994, 3))

In [None]:
# Fraction of eval_df that is held out and used as the validation set.
split_ratio = 0.1

# Training set: every row of train_df, shuffled with a fixed seed.
train_dataset = Dataset.from_pandas(train_df)
train_dataset = train_dataset.shuffle(seed=100)

# Validation set: only the 'test' part of a seeded split of eval_df is kept.
# NOTE(review): eval_df is read from 'train_df.csv' and the run name says the
# training data was augmented from 2994 samples, so validation rows may have
# augmented counterparts in train_dataset (possible leakage) — confirm.
eval_dataset = Dataset.from_pandas(eval_df)
eval_dataset = eval_dataset.train_test_split(test_size=split_ratio, seed=100)['test']

In [None]:
print(train_dataset)
print(eval_dataset)

Dataset({
    features: ['Unnamed: 0', 'context', 'summary'],
    num_rows: 150000
})
Dataset({
    features: ['context', 'evidence', 'summary'],
    num_rows: 300
})


In [None]:
# Tokenize the training set in batches across 4 worker processes; the original
# columns are dropped so only the outputs of preprocess_function remain.
train_dataset = train_dataset.map(preprocess_function, 
                                  batched=True, 
                                  num_proc=4, 
                                  remove_columns=train_dataset.column_names)

# Same preprocessing for the validation set.
eval_dataset = eval_dataset.map(preprocess_function, 
                                batched=True, 
                                num_proc=4, 
                                remove_columns=eval_dataset.column_names)



     

#0:   0%|          | 0/38 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/38 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/38 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/38 [00:00<?, ?ba/s]

In [None]:
print(train_dataset)
print(eval_dataset)

### Logging and WandB Configs

In [None]:
wandb.login()

In [None]:
output_dir = "./log"

In [None]:
%env WANDB_PROJECT=BART-Generative-Summarization
report_to="wandb"
run_name="2994-Samples-Augmented-to-150000-Samples"

### Training

In [None]:
# Hyperparameters consumed by the Seq2SeqTrainingArguments / Seq2SeqTrainer cells below.
num_train_epochs = 3

# One example per device per step, for both training and evaluation.
per_device_train_batch_size = 1
per_device_eval_batch_size = 1

# No gradient accumulation: every batch triggers an optimizer step.
gradient_accumulation_steps = 1

# Early-stopping callback with a patience of 8; passed to the trainer via `callbacks`.
es = EarlyStoppingCallback(early_stopping_patience=8)
# Value passed as `save_total_limit` to the training arguments.
save_total_limit = 10
# Collator that pads each seq2seq batch; it is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

In [None]:
# Training configuration; keyword arguments are grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # --- run bookkeeping ---
    output_dir=output_dir,
    run_name=run_name,
    report_to=report_to,
    seed=100,

    # --- optimisation ---
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    # fp16=True,

    # --- logging ---
    logging_strategy='steps',
    logging_steps=500,
    logging_first_step=True,

    # --- evaluation and checkpointing share the same 7500-step cadence ---
    evaluation_strategy="steps",
    eval_steps=7500,
    save_strategy='steps',
    save_steps=7500,
    save_total_limit=save_total_limit,

    # --- model selection: best checkpoint is chosen by validation ROUGE-1 ---
    load_best_model_at_end=True,
    metric_for_best_model='eval_rouge1',

    # --- evaluation runs generation so compute_metrics receives generated ids ---
    predict_with_generate=True,
    generation_max_length=512,
    generation_num_beams=5,
)

In [None]:
# Assemble the trainer from the model, the training arguments, the tokenized
# train/validation datasets, the padding collator, the ROUGE metric function
# and the early-stopping callback defined above.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[es],
)

In [None]:
trainer.train()
# wandb.finish()

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 150000
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 450000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
7500,0.8466,0.564626,60.6352,42.2628,60.2617,60.3759,37.0667
15000,0.5651,0.357319,63.8005,45.0482,63.4355,63.5838,36.86
22500,0.3657,0.201218,66.3295,47.7389,66.03,66.0444,37.7067
30000,0.1979,0.104445,68.1969,48.887,67.7668,67.9532,38.7433
37500,0.1521,0.074487,69.9577,50.3036,69.7987,69.7602,38.9967
45000,0.1168,0.06162,70.1521,50.6869,70.108,69.8253,39.2633
52500,0.098,0.050459,69.9657,49.9236,69.7351,69.9619,38.4533
60000,0.0712,0.044529,70.3319,51.9099,70.2088,70.3626,39.7567
67500,0.0626,0.028301,71.9495,53.2863,71.9149,71.8513,39.7067
75000,0.0512,0.029703,71.5821,53.2674,71.6287,71.5193,40.3467


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 300
  Batch size = 1
Saving model checkpoint to ./log/checkpoint-7500
Configuration saved in ./log/checkpoint-7500/config.json
Model weights saved in ./log/checkpoint-7500/pytorch_model.bin
tokenizer config file saved in ./log/checkpoint-7500/tokenizer_config.json
Special tokens file saved in ./log/checkpoint-7500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 300
  Batch 

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/gen_len,▁▁▃▅▅▆▄▇▇█▇▇█▇▇▇▇▇
eval/loss,█▅▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
eval/rouge1,▁▃▄▅▆▆▆▇▇▇████████
eval/rouge2,▁▃▄▅▅▆▅▆▇▇▇███████
eval/rougeL,▁▃▄▅▆▆▆▆▇▇████████
eval/rougeLsum,▁▃▄▅▆▆▆▇▇▇████████
eval/runtime,▄▃▂█▄▇▁▇▄▇▆▁▃▂▂▅▂▁
eval/samples_per_second,▅▆▇▁▅▂█▂▅▂▃█▆▇▇▄▇█
eval/steps_per_second,▅▆▇▁▅▂█▂▅▂▃█▆▇▇▄▇█
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/gen_len,39.8267
eval/loss,0.00452
eval/rouge1,72.8889
eval/rouge2,55.0
eval/rougeL,72.8889
eval/rougeLsum,72.7778
eval/runtime,138.7824
eval/samples_per_second,2.162
eval/steps_per_second,2.162
train/epoch,0.9


### Resume Training from CKPT

In [None]:
# Continue the run from the checkpoint saved at global step 135000, then close
# the Weights & Biases run once training returns.
trainer.train(resume_from_checkpoint='/content/drive/MyDrive/checkpoint-135000')
wandb.finish()

Loading model from /content/drive/MyDrive/checkpoint-135000.
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 150000
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 450000
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 135000
  Will skip the first 0 epochs then the first 135000 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/135000 [00:00<?, ?it/s]

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
142500,0.0236,0.005599,73.0639,54.6699,72.8944,72.9694,39.92
150000,0.0221,0.00513,73.1111,55.0,73.0,73.0,39.89
157500,0.0134,0.004135,73.1111,54.7333,72.8889,73.0,39.8233
165000,0.0126,0.004049,73.0833,54.9091,72.9167,72.9444,39.7033
172500,0.0149,0.007563,73.0444,54.6667,72.8222,72.95,39.83
180000,0.0142,0.009425,72.9241,54.545,72.7296,72.7778,39.8933
187500,0.0087,0.007579,72.8799,54.4095,72.7438,72.811,40.0267


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 300
  Batch size = 1
Saving model checkpoint to ./log/checkpoint-142500
Configuration saved in ./log/checkpoint-142500/config.json
Model weights saved in ./log/checkpoint-142500/pytorch_model.bin
tokenizer config file saved in ./log/checkpoint-142500/tokenizer_config.json
Special tokens file saved in ./log/checkpoint-142500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 30

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/gen_len,▆▅▄▁▄▅█
eval/loss,▃▂▁▁▆█▆
eval/rouge1,▇██▇▆▂▁
eval/rouge2,▄█▅▇▄▃▁
eval/rougeL,▅█▅▆▃▁▁
eval/rougeLsum,▇██▆▆▁▂
eval/runtime,▇█▁▅▂▇▂
eval/samples_per_second,▂▁█▄▇▂▇
eval/steps_per_second,▂▁█▄▇▂▇
train/epoch,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/gen_len,40.0267
eval/loss,0.00758
eval/rouge1,72.8799
eval/rouge2,54.4095
eval/rougeL,72.7438
eval/rougeLsum,72.811
eval/runtime,88.897
eval/samples_per_second,3.375
eval/steps_per_second,3.375
train/epoch,1.25


In [None]:
# Continue the run from the local checkpoint saved at global step 187500, then
# close the Weights & Biases run once training returns.
trainer.train(resume_from_checkpoint='/content/log/checkpoint-187500')
wandb.finish()

### Save Model

In [None]:
gdrive_path = '/content/drive/MyDrive/Data Science/알파코 딥러닝 부트캠프/프로젝트/AI 기반 회의 녹취록 요약 경진대회'
save_name = 'con2sum ainize-kobart-news 2994-Samples-Augmented-to-150000-Samples run 3 (from ckpt 187500)'

In [None]:
# Write the model weights/config and the tokenizer files into the same
# directory, <gdrive_path>/save/<save_name>.
model.save_pretrained(f'{gdrive_path}/save/{save_name}')
tokenizer.save_pretrained(f'{gdrive_path}/save/{save_name}')

### Load Model

In [None]:
gdrive_path = '/content/drive/MyDrive/문서 요약'

In [None]:
# Reload a saved model and tokenizer from <gdrive_path>/save for inference.
# NOTE(review): this directory differs from the one written in the "Save Model"
# section (other gdrive_path, no save_name sub-folder), and the prediction log
# further down mentions MT5ForConditionalGeneration — confirm that this path
# holds the KoBART model trained in this notebook.
model = AutoModelForSeq2SeqLM.from_pretrained(f'{gdrive_path}/save')
tokenizer = AutoTokenizer.from_pretrained(f'{gdrive_path}/save')

In [None]:
# Settings for the inference-time trainer built in the following cells.
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
num_train_epochs = 10
# NOTE(review): this callback is created here but is not passed to the
# Seq2SeqTrainer constructed below — it looks unused for inference.
es = EarlyStoppingCallback(early_stopping_patience=3)
# Padding collator bound to the reloaded tokenizer and model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

In [None]:
# Arguments for the inference-time trainer (only `trainer.predict` is called on it).
training_args = Seq2SeqTrainingArguments(
    output_dir="./log",

    num_train_epochs=num_train_epochs,
    learning_rate=2e-5,

    per_device_train_batch_size=per_device_train_batch_size,
    # gradient_accumulation_steps=16,

    per_device_eval_batch_size=per_device_eval_batch_size,
    evaluation_strategy="epoch",

    save_strategy='epoch',
    save_total_limit=3,

    # fp16=True,

    weight_decay=0.01,
    # lr_scheduler_type='linear',
    # warmup_ratio=0.1,

    metric_for_best_model='eval_loss',
    load_best_model_at_end=True,

    # Decode test summaries with the same generation settings that produced the
    # validation metrics during training (max length 512, 5 beams). Leaving
    # these unset falls back to the library/model defaults.
    # NOTE(review): those defaults may cap generation well below the ~40-token
    # summaries seen during validation — confirm against the saved model config.
    predict_with_generate=True,
    generation_max_length=512,
    generation_num_beams=5,
)

In [None]:
# Trainer used for prediction: it receives no train/eval datasets, metric
# function or callbacks — only the reloaded model, the arguments above, the
# tokenizer and the padding collator.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [None]:
test_path = '/content/drive/MyDrive/장기 프로젝트/문서 요약/test_df.csv'

# Read the test CSV and discard the metadata columns that preprocessing does
# not use, without mutating an intermediate frame in place.
test_df = pd.read_csv(test_path, index_col=False).drop(
    columns=['Unnamed: 0', 'id', 'title', 'region', 'agenda', 'total']
)

# Convert to a Dataset and tokenize it with the same function used for training.
dataset = Dataset.from_pandas(test_df).map(preprocess_function, batched=True)

In [None]:
preds = trainer.predict(dataset)

The following columns in the test set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: summary, context. If summary, context are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 300
  Batch size = 1


In [None]:
# Turn every row of generated token ids back into text, dropping special tokens.
summary = [
    tokenizer.decode(token_ids, skip_special_tokens=True)
    for token_ids in preds.predictions
]

In [None]:
sample_path = '/content/drive/MyDrive/장기 프로젝트/문서 요약/sample_submission.csv'

# Start from the sample submission and set its 'summary' column to the decoded
# predictions, then write the result without the index column.
result = pd.read_csv(sample_path).assign(summary=summary)
result.to_csv('result_kobart-news_24만개_6.csv', index=False)