In [1]:
!pip install easydict



In [2]:
import easydict

In [22]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from itertools import chain
from sklearn.model_selection import train_test_split

from pprint import pprint


class CustomDataset(Dataset):
    def __init__(self, args, data, mode):
        self.data = data
        self.data_dir = args.data_dir
        self.mode = mode
        self.tokenizer = AutoTokenizer.from_pretrained(args.model_name)
        self.inputs, self.labels = self.data_loader()

    def data_loader(self):
        print('Loading ' + self.mode + ' dataset..')
        if os.path.isfile(os.path.join(self.data_dir, self.mode, self.mode + '_X.pt')):
            inputs = torch.load(os.path.join(self.data_dir, self.mode, self.mode + '_X.pt'))
            labels = torch.load(os.path.join(self.data_dir, self.mode, self.mode + '_Y.pt'))
        
        else:
            df = self.data
            inputs = pd.DataFrame(columns=['src'])
            labels = pd.DataFrame(columns=['trg'])
            inputs['src'] =  df['article_original']
#             print(inputs['src'])

            if self.mode != "test":
                labels['trg'] =  df['extractive']
                
#             print(labels)

            # Preprocessing
            inputs, labels = self.preprocessing(inputs, labels)
            print("preprocessing")

            # Save data
            torch.save(inputs, os.path.join(self.data_dir, self.mode, self.mode + '_X.pt'))
            torch.save(labels, os.path.join(self.data_dir, self.mode, self.mode + '_Y.pt'))

        inputs = inputs.values
        labels = labels.values

        return inputs, labels

    def pad(self, data, pad_id, max_len):
        padded_data = data.map(lambda x : torch.cat([x, torch.tensor([pad_id] * (max_len - len(x)), dtype=torch.int64)]))
        print(padded_data[0])
        return padded_data

    def preprocessing(self, inputs, labels):
        print('Preprocessing ' + self.mode + ' dataset..')

        # Encoding original text
        inputs['src'] = inputs['src'].map(lambda x: torch.tensor(list(chain.from_iterable([self.tokenizer.encode(x[i], max_length = int(512 / len(x)), add_special_tokens=True) for i in range(len(x))]))))
        inputs['clss'] = inputs.src.map(lambda x : torch.cat([torch.where(x == 2)[0], torch.tensor([len(x)])]))
        inputs['segs'] = inputs.clss.map(lambda x : torch.tensor(list(chain.from_iterable([[0] * (x[i+1] - x[i]) if i % 2 == 0 else [1] * (x[i+1] - x[i]) for i, val in enumerate(x[:-1])]))))
        inputs['clss'] = inputs.clss.map(lambda x : x[:-1])
        
        # Padding
        max_encoding_len = max(inputs.src.map(lambda x: len(x)))
        max_label_len = max(inputs.clss.map(lambda x: len(x)))
        inputs['src'] = self.pad(inputs.src, 0, max_encoding_len)
        inputs['segs'] = self.pad(inputs.segs, 0, max_encoding_len)
        inputs['clss'] = self.pad(inputs.clss, -1, max_label_len)
        inputs['mask'] = inputs.src.map(lambda x: ~ (x == 0))
        inputs['mask_clss'] = inputs.clss.map(lambda x: ~ (x == -1))

        # Binarize label {Extracted sentence : 1, Not Extracted sentence : 0}

        if self.mode != 'test':
            labels = labels['trg'].map(lambda  x: torch.tensor([1 if i in x else 0 for i in range(max_label_len)]))

        return inputs, labels


    def __len__(self):
        return len(self.inputs)


    def __getitem__(self, index):
        if self.mode == 'test':
            return [self.inputs[index][i] for i in range(5)]
        else:
            return [self.inputs[index][i] for i in range(5)], self.labels[index]


def get_train_loaders(args):
    """
        define train/validation pytorch dataset & loader

        Returns:
            train_loader: pytorch data loader for train data
            val_loader: pytorch data loader for validation data
    """
    # Get data from json
#     with open(os.path.join(args.data_dir, "train.json"), "r", encoding="utf-8-sig") as f:
#         data = pd.read_json(f) 
#     train_df = pd.DataFrame(data)
    path = os.path.join(args.data_dir, "train.json")
    train_df = pd.read_json(path, orient='records', encoding='utf-8-sig')
    
    # Split train & test data
    train_data, val_data = train_test_split(train_df, test_size=0.1, random_state=args.seed)
    
    # Get train & valid dataset from dataset.py
    train_dataset = CustomDataset(args, train_data, mode='train')
    
    val_dataset = CustomDataset(args, val_data, mode='valid')

    # Define data loader based on each dataset
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers,
                                  pin_memory=True,
                                  drop_last=False,
                                  shuffle=True)
    val_dataloader = DataLoader(dataset=val_dataset,
                                batch_size=args.batch_size,
                                num_workers=args.num_workers,
                                pin_memory=True,
                                drop_last=False,
                                shuffle=False)

    return train_dataloader, val_dataloader
    
    
def get_test_loader(args):
    # Get data from json
    with open(os.path.join(args.data_dir, "test.json"), "r", encoding="utf-8-sig") as f:
        data = pd.read_json(f) 
    test_df = pd.DataFrame(data)
    
    # Load dataset & dataloader
    test_dataset = CustomDataset(args, test_df, mode='test')
    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=args.batch_size,
                                 num_workers=args.num_workers,
                                 pin_memory=True,
                                 drop_last=False,
                                 shuffle=False)
    
    return test_dataloader

In [5]:
config = {}

# 설정
config['seed'] = 981201
config['device'] = "cuda" if torch.cuda.is_available() else "cpu"
config['data_dir'] = '/opt/ml/Legal-Documen t-Summarization/data'
config['model_name'] = 'beomi/KcELECTRA-base'
config['batch_size'] = 64
config['num_workers']=1


args = easydict.EasyDict(config)

In [23]:
train_loader, val_loader = get_train_loaders(args)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Loading train dataset..
Preprocessing train dataset..
tensor([    2,    61,    21,    63,  2730,  4374,  8463,  4202,  4036,  4190,
        16977,  4578,  4180, 12876,  4651,    16,     3,     2,  2843, 15153,
         4180, 12876,  4651,  4192, 30407,  4023, 12539, 17565,  4082, 30407,
         4195,  9576, 11003,  7974, 30087,  4053, 13884, 24794,  8048,    16,
            3,     2, 30407, 12159,  4050, 30407,  4023, 27821,  4069,  1074,
         8180, 30407,  4195,  9576, 11003,  7974, 30087,     3,     2, 11707,
        32682,  1776,  4065,  4063, 30407, 15466,    16,     3,     2, 32682,
         2153,  4232,    12, 25453, 26762,    13,  2544, 49026,    16,     3,
            2,  2153,  4232, 15466,    12, 25453, 26762, 26225, 26507,    13,
           16,     3,     2, 23873, 15466,  1616, 30407,  4067,    12, 26239,
        26553, 26147,    13,  2730, 14628, 18368,  7977,    16,     3,     2,
        30407,  4194,  4041,  2744,  4053,  2445,  4517,  8457, 12047, 30407,
         4

FileNotFoundError: [Errno 2] No such file or directory: '/opt/ml/Legal-Document-Summarization/data/train/train_X.pt'