In [1]:
import joblib
import os

from torchtext.data import Field, BucketIterator
from torchtext.data import TabularDataset
import torch

import pandas as pd
from tqdm import tqdm_notebook as tqdm
import re
from torchtext.data import Iterator, BucketIterator

# Config

In [2]:
# Run flags. `use_cuda` is read when the torch device is selected below;
# `development` is not referenced in the visible cells.
development = True
use_cuda = True
# NOTE(review): points at the same folder as `d_path` (defined in a later cell)
# but is not referenced in the visible cells -- confirm which one is canonical.
sample_data_folder = r'cnndm-pj/'

# Word-count bounds enforced by read_cont (X side) and read_head (Y side).
# Both functions accept counts from MIN_LEN_* up to MAX_LEN_* + 1 inclusive.
MIN_LEN_X = 10
MIN_LEN_Y = 10
MAX_LEN_X = 400
MAX_LEN_Y = 100
# Not referenced in the visible cells -- presumably per-example count limits;
# TODO confirm against the rest of the notebook.
MIN_NUM_X = 1
MAX_NUM_X = 1
MAX_NUM_Y = None
# Opening / closing tags that delimit each sentence scanned by abstract2sents.
W_LS = "<s>"
W_RS = "</s>"
# Regex removed from the X text by read_cont. The `.*` is greedy, so it spans
# from the first "-lrb- " to the last " -rrb-" on a line.
SUMM_BEGIN_TOKEN = r"-lrb- .* -rrb-"
# Not referenced in the visible cells -- presumably the name of the pretrained
# embedding to load later; TODO confirm.
PRETRAINED_VECTOR = 'glove.6B.200d' 

In [3]:
# Use the GPU only when it is both requested (`use_cuda`) and actually available;
# otherwise fall back to the CPU.
device = torch.device(
    "cuda" if use_cuda and torch.cuda.is_available() else "cpu"
)

# Data Preprocessing

In [4]:
def load_lines(d_path, f_name):
    """Read one raw split file and return its usable (x, y) tuples.

    Each line is stripped of newline characters, lower-cased and split on the
    literal "<summ-content>" marker. The part before the marker is passed to
    get_xy_tuple as the head and the part after it as the content.

    Args:
        d_path: directory containing the file.
        f_name: file name inside ``d_path``.

    Returns:
        list of (x, y) tuples, one per line accepted by get_xy_tuple. Lines
        that do not split into exactly two parts are printed with an
        "ERROR:" prefix and skipped; lines for which get_xy_tuple returns
        None are skipped silently.
    """
    pairs = []
    source_path = os.path.join(d_path, f_name)
    with open(source_path, 'r', encoding='utf-8') as handle:
        for raw in tqdm(handle, desc='Processing file...'):
            record = raw.strip("\n").lower()
            parts = record.split("<summ-content>")
            # Guard clause: anything other than exactly one marker is malformed.
            if len(parts) != 2:
                print("ERROR:" + record)
                continue
            # parts[0] is the head, parts[1] the content.
            pair = get_xy_tuple(parts[1], parts[0])
            if pair is not None:
                pairs.append(pair)
    return pairs

def get_xy_tuple(cont, head):
    """Build an (x, y) tuple from a raw content string and a raw head string.

    Args:
        cont: raw content text, cleaned and length-checked by read_cont.
        head: raw head text, parsed and length-checked by read_head.

    Returns:
        ``(x, y)`` when both read_cont and read_head accept their input,
        otherwise None.
    """
    x, y = read_cont(cont), read_head(head)
    if x is None or y is None:
        return None
    return (x, y)

def read_cont(f_cont):
    """Clean a content string and keep it only if its word count is in range.

    Cleaning removes, in order: every match of SUMM_BEGIN_TOKEN, every "--",
    and any remaining "-lrb-" / "-rrb-" tokens.

    Args:
        f_cont: raw content text.

    Returns:
        The cleaned text when its whitespace-separated word count lies in
        [MIN_LEN_X, MAX_LEN_X + 1] (both ends inclusive), otherwise None.

    NOTE(review): the upper bound is MAX_LEN_X + 1, not MAX_LEN_X -- confirm
    that the extra word is intentional.
    NOTE(review): SUMM_BEGIN_TOKEN uses a greedy ``.*``, so on a line with
    several bracketed spans everything between the first "-lrb- " and the
    last " -rrb-" is removed.
    """
    f_cont = re.sub(SUMM_BEGIN_TOKEN, '', f_cont)
    # Same removal order as before: dashes first, then the bracket tokens.
    for marker in ("--", "-lrb-", "-rrb-"):
        f_cont = f_cont.replace(marker, "")
    word_count = len(f_cont.split())
    if MIN_LEN_X <= word_count <= MAX_LEN_X + 1:
        return f_cont
    return None

def read_head(f_head):
    """Extract the tagged sentences of a head string and length-check them.

    The sentences returned by abstract2sents are joined with single spaces.

    Args:
        f_head: raw head text containing W_LS ... W_RS delimited sentences.

    Returns:
        The joined sentence text when its whitespace-separated word count
        lies in [MIN_LEN_Y, MAX_LEN_Y + 1] (both ends inclusive), otherwise
        None.
    """
    summary = ' '.join(abstract2sents(f_head))
    word_count = len(summary.split())
    within_bounds = MIN_LEN_Y <= word_count <= MAX_LEN_Y + 1
    return summary if within_bounds else None

def abstract2sents(abstract):
    """Return the text found between each W_LS / W_RS tag pair, in order.

    Scanning proceeds left to right: find the next W_LS, then the next W_RS
    after it, record what lies between them, and resume after that W_RS.

    Args:
        abstract: string containing zero or more W_LS ... W_RS spans.

    Returns:
        list of the enclosed substrings (tags excluded, surrounding
        whitespace preserved). Scanning stops at the first missing opening
        or closing tag, so an unterminated trailing span is dropped.
    """
    sentences = []
    pos = 0
    while True:
        open_at = abstract.find(W_LS, pos)
        if open_at < 0:  # no further opening tag
            return sentences
        close_at = abstract.find(W_RS, open_at + 1)
        if close_at < 0:  # opening tag without a matching closing tag
            return sentences
        sentences.append(abstract[open_at + len(W_LS):close_at])
        pos = close_at + len(W_RS)

In [5]:
# Directory the raw split files (train.txt / test.txt / val.txt) are read from
# and the *_processed.txt files are written to.
d_path = 'cnndm-pj'

In [6]:
%%time
# Parse the three raw split files under `d_path` into lists of (x, y) tuples.
# load_lines skips lines that are malformed or fail the length checks, so each
# list may be shorter than its source file.
print ("train set...")
train_xy_list = load_lines(d_path, "train.txt")

print ("test set...")
test_xy_list = load_lines(d_path, "test.txt")
print ("validation set...")
valid_xy_list = load_lines(d_path, "val.txt")

train set...


HBox(children=(IntProgress(value=1, bar_style='info', description='Processing file...', max=1, style=ProgressS…


test set...


HBox(children=(IntProgress(value=1, bar_style='info', description='Processing file...', max=1, style=ProgressS…


validation set...


HBox(children=(IntProgress(value=1, bar_style='info', description='Processing file...', max=1, style=ProgressS…


Wall time: 23.5 s


In [7]:
def write_file(filename, l):
    """Write an iterable of tuples to ``filename``, one '|'-joined line each.

    Every element of every tuple is converted with ``str`` and the elements
    are joined with '|'; each record is terminated with a newline. An
    existing file is overwritten.

    Args:
        filename: path of the output file.
        l: iterable of tuples (or other iterables) to serialise.

    Returns:
        None.

    NOTE(review): '|' characters inside the elements are not escaped, so a
    field that contains '|' yields a line with extra columns.
    """
    # 'utf' is a registered alias of the UTF-8 codec. The `with` block makes
    # sure the handle is closed even if str() or write() raises part-way.
    with open(filename, 'w', encoding='utf') as f:
        for t in l:
            f.write('|'.join(str(x) for x in t) + '\n')

In [8]:
# Persist each cleaned split next to its raw source, in the same order as before:
# train, then validation, then test.
for out_name, xy_list in (
    ("train_processed.txt", train_xy_list),
    ("val_processed.txt", valid_xy_list),
    ("test_processed.txt", test_xy_list),
):
    write_file(os.path.join(d_path, out_name), xy_list)