<a href="https://colab.research.google.com/github/namwootree/Portfolio/blob/main/Alphaco_(Deep_Learning_Boot_Camp)/Long-Term%20Program/Text_Summarization/%5BMain%5D_Modeling_%26_Prediction_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting

In [2]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory usage).
!nvidia-smi

Fri Jun 17 05:52:00 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    38W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Mount Google Drive so the dataset and checkpoints under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!pip install -q datasets transformers rouge-score nltk sentencepiece

In [5]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer, DataCollatorForSeq2Seq, EarlyStoppingCallback
from datasets import load_dataset, load_metric, Dataset
import pandas as pd
import numpy as np
import re
import nltk
import torch

In [6]:
# Fetch the Punkt sentence-tokenizer data needed by `nltk.sent_tokenize`,
# which compute_metrics uses to split summaries into sentences for ROUGE.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load Model & Tokenizer & Metric

In [7]:
# Hugging Face Hub id of the pretrained seq2seq checkpoint to fine-tune.
model_name = 'csebuetnlp/mT5_multilingual_XLSum'
# Name of the evaluation metric passed to `load_metric`.
metric_name = 'rouge'

In [8]:
# Load the pretrained seq2seq model and its matching tokenizer, then the
# ROUGE metric object that compute_metrics calls.
# NOTE(review): `load_metric` is deprecated in `datasets` in favour of the
# separate `evaluate` package - confirm the installed version still provides it.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
metric = load_metric(metric_name)

  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"


# Declare Functions

In [9]:
# Text prepended to every source document before tokenization. Left empty here;
# switch to "summarize: " for checkpoints that expect a task prefix.
prefix = ""

# Token budgets: longer documents / summaries are truncated to these lengths.
max_input_length = 512
max_target_length = 512


def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: batch mapping with a "context" column (source documents)
            and a "summary" column (target summaries).

    Returns:
        The tokenizer output for the documents, with the summaries' token ids
        stored under "labels".
    """
    documents = [prefix + text for text in examples["context"]]
    encoded = tokenizer(documents, max_length=max_input_length, truncation=True)

    # Summaries are tokenized in the tokenizer's target mode.
    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(
            examples["summary"], max_length=max_target_length, truncation=True
        )["input_ids"]

    encoded["labels"] = target_ids
    return encoded

In [10]:
def compute_metrics(eval_pred):
    """Compute ROUGE F-measures (in percent) and the mean generated length.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays supplied
            by the trainer. ``predictions`` may also arrive as a tuple, in
            which case its first element holds the generated ids.

    Returns:
        dict mapping every key returned by ``metric.compute`` to its mid
        F-measure * 100, plus ``"gen_len"`` (mean number of non-pad tokens per
        prediction); all values are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a real token id and cannot be decoded. It is used to mask
    # labels and may also show up as padding in gathered predictions, so both
    # arrays are mapped back to the pad token before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each ROUGE variant, expressed in percent.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Mean generated length, counted on the sanitized ids so that -100 padding
    # is not mistaken for generated tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Load Data

In [11]:
# Training CSV on the mounted Google Drive.
train_path = '/content/drive/MyDrive/장기 프로젝트/문서 요약/augmented_data_set_141508.csv'
# index_col=False stops pandas from using the first column as the index, so the
# saved index arrives as a regular 'Unnamed: 0' column (dropped in a later cell).
train_df = pd.read_csv(train_path, index_col=False)

In [12]:
# Fraction of rows held out as the evaluation split (passed to train_test_split).
split_ratio = 0.005

In [13]:
# Drop the leftover index column without mutating in place: re-running this
# cell (or dropping again later) must not raise once the column is gone.
train_df = train_df.drop(columns=['Unnamed: 0'], errors='ignore')
# Fixed seed keeps the train/test partition reproducible across runs.
dataset = Dataset.from_pandas(train_df).train_test_split(split_ratio, seed=100)

In [14]:
# Tokenize both splits batch-wise with preprocess_function (adds the model
# inputs and the "labels" column).
dataset = dataset.map(preprocess_function, batched=True)



  0%|          | 0/141 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# Training

In [15]:
# Per-device batch size; gradient accumulation in the training arguments
# raises the effective batch size.
batch_size = 1
num_train_epochs = 2
# NOTE(review): evaluation runs once per epoch and there are only 2 epochs, so
# a patience of 3 can never be exhausted - confirm this is intended.
es = EarlyStoppingCallback(early_stopping_patience=3)
# Collator that pads each batch at collation time (padding=True).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

In [16]:
# Training configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # --- run length and batching ---
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=16,
    fp16=True,
    # --- optimisation schedule ---
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    # --- evaluation and generation ---
    evaluation_strategy="epoch",
    predict_with_generate=True,
    generation_max_length=512,
    # generation_num_beams=,
    # --- checkpointing and best-model selection ---
    output_dir="./log",
    save_strategy='epoch',
    save_total_limit=3,
    metric_for_best_model='eval_loss',
    load_best_model_at_end=True,
)

# Trainer wired to the tokenized splits, the ROUGE metric function and the
# early-stopping callback.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
    callbacks=[es],
)

Using cuda_amp half precision backend


In [None]:
# Fine-tune the model; per training_args the run evaluates and checkpoints
# once per epoch.
trainer.train()

The following columns in the training set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: summary, context. If summary, context are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 140800
  Num Epochs = 2
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 17600


Epoch,Training Loss,Validation Loss


In [None]:
# Google Drive folder under which the fine-tuned model and tokenizer are saved.
gdrive_path = '/content/drive/MyDrive/문서 요약'

In [None]:
# Save model weights and tokenizer files into the same directory so both can
# be reloaded from it with `from_pretrained`.
model.save_pretrained(f'{gdrive_path}/save')
tokenizer.save_pretrained(f'{gdrive_path}/save')

# Re Pre-Training

In [None]:
# Text prepended to every source document before tokenization (none here).
prefix = ""

# Token budgets for this stage: longer documents / summaries are truncated.
max_input_length = 200
max_target_length = 200

def preprocess_function(examples):
    """Tokenize a batch of documents and, when available, their summaries.

    Args:
        examples: batch mapping with a "context" column (source documents)
            and optionally a "summary" column (target summaries).

    Returns:
        The tokenizer output for the documents. When the batch has a "summary"
        column, the summaries' token ids are stored under "labels"; without
        them the trainer has no targets to compute a loss from.
    """
    inputs = [prefix + doc for doc in examples["context"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Labels are mandatory for training/evaluation. Unlabelled data (no
    # "summary" column) is passed through with inputs only.
    if "summary" in examples:
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)
        model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [None]:
# Google Drive folder that holds the checkpoint saved by the first training stage.
gdrive_path = '/content/drive/MyDrive/문서 요약'

# Reload the saved model and tokenizer so training continues from that checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(f'{gdrive_path}/save')
tokenizer = AutoTokenizer.from_pretrained(f'{gdrive_path}/save')

In [None]:
# Fraction of rows held out as the evaluation split.
split_ratio = 0.005

# The 'Unnamed: 0' column has already been removed from train_df earlier in the
# notebook; errors='ignore' makes this drop a no-op instead of a KeyError, and
# it still removes the column if train_df was freshly reloaded.
train_df = train_df.drop(columns=['Unnamed: 0'], errors='ignore')
# Same seed as the first stage, so the train/test partition is identical.
dataset = Dataset.from_pandas(train_df).train_test_split(split_ratio, seed=100)

# Tokenize both splits with the preprocess_function currently in scope.
dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Per-device batch size; gradient accumulation in the training arguments
# raises the effective batch size.
batch_size = 1
num_train_epochs = 2
# NOTE(review): evaluation runs once per epoch and there are only 2 epochs, so
# a patience of 3 can never be exhausted - confirm this is intended.
es = EarlyStoppingCallback(early_stopping_patience=3)
# Rebuilt so the collator references the reloaded model and tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

In [None]:
# Training configuration for the second stage, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # --- run length and batching ---
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=16,
    fp16=True,
    # --- optimisation schedule ---
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    # --- evaluation and generation ---
    evaluation_strategy="epoch",
    predict_with_generate=True,
    generation_max_length=512,
    # generation_num_beams=,
    # --- checkpointing and best-model selection ---
    output_dir="./log",
    save_strategy='epoch',
    save_total_limit=3,
    metric_for_best_model='eval_loss',
    load_best_model_at_end=True,
)

# Trainer wired to the re-tokenized splits, the ROUGE metric function and the
# early-stopping callback.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
    callbacks=[es],
)

In [None]:
# Run the second training stage on the reloaded checkpoint.
trainer.train()

In [None]:
# Google Drive folder under which the model and tokenizer are saved.
gdrive_path = '/content/drive/MyDrive/문서 요약'

In [None]:
# NOTE: this is the same `save` directory the first training stage wrote to,
# so that checkpoint is overwritten by this one.
model.save_pretrained(f'{gdrive_path}/save')
tokenizer.save_pretrained(f'{gdrive_path}/save')

# Prediction

In [None]:
# Reload a fresh copy of the training CSV from Google Drive.
# NOTE(review): train_df is not referenced by the remaining cells of this
# section as far as visible here - confirm whether this reload is still needed.
train_path = '/content/drive/MyDrive/장기 프로젝트/문서 요약/augmented_data_set_141508.csv'
train_df = pd.read_csv(train_path, index_col=False)

In [None]:
# Text prepended to every source document before tokenization (none here).
prefix = ""

# Token budgets; only max_input_length is used by the function below.
max_input_length = 200
max_target_length = 200


def preprocess_function(examples):
    """Tokenize the "context" column of a batch for inference.

    Args:
        examples: batch mapping with a "context" column of source documents.

    Returns:
        The tokenizer output for the (prefixed) documents, truncated to
        ``max_input_length`` tokens. No labels are produced.
    """
    prefixed_docs = [prefix + text for text in examples["context"]]
    return tokenizer(prefixed_docs, max_length=max_input_length, truncation=True)

In [None]:
# Google Drive folder that holds the saved fine-tuned checkpoint.
gdrive_path = '/content/drive/MyDrive/문서 요약'

# Load the fine-tuned model and its tokenizer for inference.
model = AutoModelForSeq2SeqLM.from_pretrained(f'{gdrive_path}/save')
tokenizer = AutoTokenizer.from_pretrained(f'{gdrive_path}/save')

In [None]:
# Per-device batch size used by trainer.predict.
batch_size = 1
# Collator that pads each batch at collation time (padding=True).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

# Inference-only configuration. This trainer is used solely for `predict`, so
# the optimiser, scheduler, checkpointing and per-epoch evaluation settings are
# omitted: they have no effect on generation, and an evaluation strategy plus
# `load_best_model_at_end` without an `eval_dataset` is an inconsistent setup
# that recent `Trainer` versions reject at construction time.
training_args = Seq2SeqTrainingArguments(
    output_dir="./log",
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # Generate sequences (beam search, 5 beams, at most 30 tokens) instead of
    # returning logits.
    predict_with_generate=True,
    generation_max_length=30,
    generation_num_beams=5,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [None]:
test_path = '/content/drive/MyDrive/장기 프로젝트/문서 요약/test_df.csv'

# Read the test CSV and discard the columns that are not model inputs.
test_df = pd.read_csv(test_path, index_col=False).drop(
    columns=['Unnamed: 0', 'id', 'title', 'region', 'agenda', 'total']
)

# Wrap the frame in a Dataset and tokenize it batch-wise in one chain.
dataset = Dataset.from_pandas(test_df).map(preprocess_function, batched=True)

In [None]:
# Run generation over the tokenized test set; with predict_with_generate=True
# `preds.predictions` holds the generated token ids.
preds = trainer.predict(dataset)

In [None]:
# Decode every generated id sequence back to text, dropping special tokens.
summary = [
    tokenizer.decode(token_ids, skip_special_tokens=True)
    for token_ids in preds.predictions
]

In [None]:
sample_path = '/content/drive/MyDrive/장기 프로젝트/문서 요약/sample_submission.csv'

# Fill the sample submission's 'summary' column with the decoded predictions
# (lengths must match row-for-row), then write it out without the index.
result = pd.read_csv(sample_path).assign(summary=summary)
result.to_csv('result_bart_r3f_14만개_3.csv', index=False)