In [1]:
import os
import re
import yaml
import json
import torch
import pandas as pd
import torch.nn as nn
import pytorch_lightning as pl

from glob import glob
from tqdm import tqdm
from rouge import Rouge
from transformers import EarlyStoppingCallback
from torch.utils.data import Dataset , DataLoader
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer, BartForConditionalGeneration, BartConfig

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Run on the GPU when torch reports one available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Settings later passed to both the training and validation DataLoader.
batch_size = 8
num_workers = 32

In [None]:
class CustomDataset(Dataset):
    """Dataset that tokenizes a dialogue/summary DataFrame once, up front.

    The 'dialogue' column is always tokenized; the 'summary' column is
    tokenized only when ``is_train`` is true. Only the resulting ``input_ids``
    tensors are kept (attention masks are discarded).

    Args:
        df: DataFrame with a 'dialogue' column and, when ``is_train`` is
            true, a 'summary' column.
        tokenizer: Callable invoked as ``tokenizer(list_of_str, **kwargs)``
            that returns an object exposing ``.input_ids``.
        input_len: ``max_length`` used when tokenizing 'dialogue'.
        summ_len: ``max_length`` used when tokenizing 'summary'.
        is_train: When true, items are ``(input_ids, labels)`` pairs;
            otherwise items are ``input_ids`` only.

    Note:
        Earlier revisions stored ``input_len`` / ``summ_len`` but tokenized
        with hard-coded limits (512 for dialogues, 100 for summaries). The
        limits passed by the caller are now the ones actually applied.
    """

    def __init__(self, df, tokenizer, input_len, summ_len, is_train=True):
        self.tokenizer = tokenizer
        self.df = df
        self.source_len = input_len
        self.summ_len = summ_len
        self.is_train = is_train

        # Dialogues are needed in both modes, so encode them unconditionally.
        self.input_ids = self._encode('dialogue', self.source_len)
        if self.is_train:
            # Labels exist only in training mode; inference frames may not
            # even have a 'summary' column.
            self.labels = self._encode('summary', self.summ_len)

    def _encode(self, column, max_length):
        """Tokenize one text column of ``self.df`` and return its ``input_ids``."""
        return self.tokenizer(
            self.df[column].tolist(),
            return_tensors="pt",
            padding=True,
            add_special_tokens=True,
            truncation=True,
            max_length=max_length,
            return_token_type_ids=False,
        ).input_ids

    def __len__(self):
        """Return the number of tokenized dialogues."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``(input_ids, labels)`` in training mode, else ``input_ids``."""
        if self.is_train:
            return self.input_ids[idx], self.labels[idx]
        return self.input_ids[idx]

In [None]:
# Load the tokenizer published with the pretrained summarization checkpoint.
tokenizer = AutoTokenizer.from_pretrained("psyche/KoT5-summarization")

# Register the placeholder tags found in the data as special tokens.
# NOTE(review): a model loaded later presumably needs its embedding matrix
# resized to the new vocabulary size -- confirm where the model is built.
special_tokens_dict = {
    'additional_special_tokens': [
        '#Person1#',
        '#Person2#',
        '#Person3#',
        '#Person4#',
        '#Person5#',
        '#Person6#',
        '#Person7#',
        '#PhoneNumber#',
        '#Address#',
        '#PassportNumber#',
        '#CardNumber#',
        '#Email#',
        '#DateOfBirth#',
    ],
}

# Kept as the cell's last expression so its return value is still displayed.
tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
# Read the training and validation splits from the CSV files kept one
# directory above the notebook.
train_df = pd.read_csv(filepath_or_buffer='../dataset/new_train.csv')
val_df = pd.read_csv(filepath_or_buffer='../dataset/new_dev.csv')

In [None]:
# Build one dataset per split from the dialogue/summary columns; 400 and 256
# are passed as CustomDataset's input_len and summ_len.
train_dataset = CustomDataset(
    df=train_df[['dialogue', 'summary']],
    tokenizer=tokenizer,
    input_len=400,
    summ_len=256,
)
val_dataset = CustomDataset(
    df=val_df[['dialogue', 'summary']],
    tokenizer=tokenizer,
    input_len=400,
    summ_len=256,
)

# Loader settings: the validation loader matches the training one except
# that it does not shuffle.
train_params = dict(batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_params = {**train_params, 'shuffle': False}

train_loader = DataLoader(train_dataset, **train_params)
val_loader = DataLoader(val_dataset, **val_params)