In [1]:
# !wget https://huggingface.co/datasets/cnn_dailymail/resolve/11343c3752184397d56efc19a8a7cceb68089318/data/cnn_stories.tgz
# !wget https://huggingface.co/datasets/cnn_dailymail/resolve/11343c3752184397d56efc19a8a7cceb68089318/data/dailymail_stories.tgz

In [2]:
# !wget https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_train.txt

In [3]:
# !tar zxf cnn_stories.tgz
# !tar zxf dailymail_stories.tgz
# !rm cnn_stories.tgz dailymail_stories.tgz

In [4]:
import hashlib
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [5]:
def _get_url_hashes(path):
    """Get hashes of urls in file."""
    urls = _read_text_file_path(path)

    def url_hash(u):
        h = hashlib.sha1()
        try:
            u = u.encode("utf-8")
        except UnicodeDecodeError:
            logger.error("Cannot hash url: %s", u)
        h.update(u)
        return h.hexdigest()

    return {url_hash(u) for u in urls}


def _get_hash_from_path(p):
    """Extract hash from path."""
    return os.path.splitext(os.path.basename(p))[0]


DM_SINGLE_CLOSE_QUOTE = "\u2019"  # unicode
DM_DOUBLE_CLOSE_QUOTE = "\u201d"
# acceptable ways to end a sentence
END_TOKENS = [".", "!", "?", "...", "'", "`", '"', DM_SINGLE_CLOSE_QUOTE, DM_DOUBLE_CLOSE_QUOTE, ")"]

In [6]:
def _read_text_file_path(path):
    with open(path, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f]
    return lines


def _read_text_file(file):
    return [line.strip() for line in file]


def _get_art_abs(story_file, tfds_version = "1.0"):
    """Get abstract (highlights) and article from a story file path."""
    # Based on https://github.com/abisee/cnn-dailymail/blob/master/
    #     make_datafiles.py

    lines = _read_text_file(story_file)

    # The github code lowercase the text and we removed it in 3.0.0.

    # Put periods on the ends of lines that are missing them
    # (this is a problem in the dataset because many image captions don't end in
    # periods; consequently they end up in the body of the article as run-on
    # sentences)
    def fix_missing_period(line):
        """Adds a period to a line that is missing a period."""
        if "@highlight" in line:
            return line
        if not line:
            return line
        if line[-1] in END_TOKENS:
            return line
        return line + " ."

    lines = [fix_missing_period(line) for line in lines]

    # Separate out article and abstract sentences
    article_lines = []
    highlights = []
    next_is_highlight = False
    for line in lines:
        if not line:
            continue  # empty line
        elif line.startswith("@highlight"):
            next_is_highlight = True
        elif next_is_highlight:
            highlights.append(line)
        else:
            article_lines.append(line)

    # Make article into a single string
    article = " ".join(article_lines)

    if tfds_version >= "2.0.0":
        abstract = "\n".join(highlights)
    else:
        abstract = " ".join(highlights)

    return article, abstract

In [7]:
urls = _get_url_hashes('all_train.txt')

In [8]:
from glob import glob

In [9]:
cnns = glob('cnn/*/*')
dailymails = glob('dailymail/*/*')
len(cnns), len(dailymails)

(92579, 219506)

In [10]:
files = cnns + dailymails

In [11]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained('mesolitica/finetune-translation-t5-small-standard-bahasa-cased')
model = T5ForConditionalGeneration.from_pretrained('mesolitica/finetune-translation-t5-small-standard-bahasa-cased')

In [12]:
_ = model.cuda()

In [13]:
input_ids = tokenizer.encode(f'terjemah Inggeris ke Melayu: i like', return_tensors = 'pt')
outputs = model.generate(input_ids.cuda(), max_length = 10000)
tokenizer.decode(outputs[0],skip_special_tokens=True)

'saya suka'

In [14]:
import torch

In [15]:
torch.cuda.empty_cache()

In [16]:
directory = 'translated-train'
!mkdir {directory}

mkdir: cannot create directory ‘translated-train’: File exists


In [17]:
from tqdm import tqdm

selected_hash = []
for f in tqdm(files):
    path = f
    hash_from_path = _get_hash_from_path(path)
    if hash_from_path in urls:
        selected_hash.append(f)
        
selected_hash = sorted(selected_hash)
len(selected_hash)

100%|███████████████████████████████| 312085/312085 [00:00<00:00, 981251.26it/s]


287227

In [18]:
import malaya
import json

maxlen = 128

for f in tqdm(selected_hash):
    path = f
    hash_from_path = _get_hash_from_path(path)
    new_path = os.path.join(directory, hash_from_path)
    if os.path.exists(new_path):
        continue
        
    torch.cuda.empty_cache()
    
    file = _read_text_file_path(path)
    article, highlights = _get_art_abs(file)
    splitted = malaya.text.function.split_into_sentences(article, minimum_length = 2)

    r, temp = [], []
    for s in splitted:
        temp.append(s)
        if len(' '.join(temp).split()) > maxlen:
            r.append(temp)
            temp = []

    if len(temp):
        r.append(temp)

    articles = []
    for r_ in r:
        s = ' '.join(r_)
        input_ids = tokenizer.encode(f'terjemah Inggeris ke Melayu: {s}', return_tensors = 'pt').cuda()
        outputs = model.generate(input_ids, max_length = 10000)
        articles.append(tokenizer.decode(outputs[0],skip_special_tokens=True))

    input_ids = tokenizer.encode(f'terjemah Inggeris ke Melayu: {highlights}', return_tensors = 'pt').cuda()
    outputs = model.generate(input_ids, max_length = 10000)
    t_highlights = tokenizer.decode(outputs[0],skip_special_tokens=True)

    trans = {'article': articles, 'highlight': t_highlights}
    with open(new_path, 'w') as fopen:
        json.dump(trans, fopen)

  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
 92%|██████████████████████████▋  | 264908/287227 [27:17:23<10:49:18,  1.75s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|████████████████████████████████| 287227/287227 [41:37:54<00:00,  1.92it/s]
