In [5]:
# !wget https://huggingface.co/datasets/cnn_dailymail/resolve/11343c3752184397d56efc19a8a7cceb68089318/data/cnn_stories.tgz
# !wget https://huggingface.co/datasets/cnn_dailymail/resolve/11343c3752184397d56efc19a8a7cceb68089318/data/dailymail_stories.tgz

In [8]:
# !wget https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_train.txt
# !wget https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_val.txt
# !wget https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txt

In [9]:
!ls all_*.txt

all_test.txt  all_train.txt  all_val.txt


In [14]:
import hashlib
import os
import json

os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
def _get_url_hashes(path):
    """Get hashes of urls in file."""
    urls = _read_text_file_path(path)

    def url_hash(u):
        h = hashlib.sha1()
        try:
            u = u.encode("utf-8")
        except UnicodeDecodeError:
            logger.error("Cannot hash url: %s", u)
        h.update(u)
        return h.hexdigest()

    return {url_hash(u) for u in urls}


def _get_hash_from_path(p):
    """Extract hash from path."""
    return os.path.splitext(os.path.basename(p))[0]


DM_SINGLE_CLOSE_QUOTE = "\u2019"  # unicode
DM_DOUBLE_CLOSE_QUOTE = "\u201d"
# acceptable ways to end a sentence
END_TOKENS = [".", "!", "?", "...", "'", "`", '"', DM_SINGLE_CLOSE_QUOTE, DM_DOUBLE_CLOSE_QUOTE, ")"]

In [3]:
def _read_text_file_path(path):
    with open(path, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f]
    return lines


def _read_text_file(file):
    return [line.strip() for line in file]


def _get_art_abs(story_file, tfds_version = "1.0"):
    """Get abstract (highlights) and article from a story file path."""
    # Based on https://github.com/abisee/cnn-dailymail/blob/master/
    #     make_datafiles.py

    lines = _read_text_file(story_file)

    # The github code lowercase the text and we removed it in 3.0.0.

    # Put periods on the ends of lines that are missing them
    # (this is a problem in the dataset because many image captions don't end in
    # periods; consequently they end up in the body of the article as run-on
    # sentences)
    def fix_missing_period(line):
        """Adds a period to a line that is missing a period."""
        if "@highlight" in line:
            return line
        if not line:
            return line
        if line[-1] in END_TOKENS:
            return line
        return line + " ."

    lines = [fix_missing_period(line) for line in lines]

    # Separate out article and abstract sentences
    article_lines = []
    highlights = []
    next_is_highlight = False
    for line in lines:
        if not line:
            continue  # empty line
        elif line.startswith("@highlight"):
            next_is_highlight = True
        elif next_is_highlight:
            highlights.append(line)
        else:
            article_lines.append(line)

    # Make article into a single string
    article = " ".join(article_lines)

    if tfds_version >= "2.0.0":
        abstract = "\n".join(highlights)
    else:
        abstract = " ".join(highlights)

    return article, abstract

In [4]:
urls = _get_url_hashes('all_train.txt')

In [10]:
from glob import glob

In [11]:
cnns = glob('cnn/*/*')
dailymails = glob('dailymail/*/*')
len(cnns), len(dailymails)

(92579, 219506)

In [12]:
files = cnns + dailymails

In [13]:
from tqdm import tqdm

selected_hash = []
for f in tqdm(files):
    path = f
    hash_from_path = _get_hash_from_path(path)
    if hash_from_path in urls:
        selected_hash.append(f)
        
selected_hash = sorted(selected_hash)
len(selected_hash)

100%|██████████████████████████████| 312085/312085 [00:00<00:00, 1053696.18it/s]


287227

In [30]:
directory = 'translated-train'

with open('train-translated-cnn-daily.jsonl', 'w') as fopenl:
    for f in tqdm(selected_hash):
        try:
            path = f
            hash_from_path = _get_hash_from_path(path)
            new_path = os.path.join(directory, hash_from_path)
            file = _read_text_file_path(path)
            article, highlights = _get_art_abs(file)
            with open(new_path) as fopen:
                t = json.load(fopen)

            t_article = ' '.join(t['article'])
            t_highlight = t['highlight']
            d = {
                'article': article,
                'highlights': highlights,
                'translated_article': t_article,
                'translated_highlight': t_highlight,
            }
            fopenl.write(f'{json.dumps(d)}\n')
        except Exception as e:
            print(e)

 50%|████████████████                | 144248/287227 [00:09<00:10, 13196.60it/s]

Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)


100%|█████████████████████████████████| 287227/287227 [00:52<00:00, 5497.36it/s]


In [31]:
urls = _get_url_hashes('all_val.txt')

selected_hash = []
for f in tqdm(files):
    path = f
    hash_from_path = _get_hash_from_path(path)
    if hash_from_path in urls:
        selected_hash.append(f)
        
selected_hash = sorted(selected_hash)

100%|██████████████████████████████| 312085/312085 [00:00<00:00, 1286155.25it/s]


In [32]:
directory = 'translated-val'

with open('val-translated-cnn-daily.jsonl', 'w') as fopenl:
    for f in tqdm(selected_hash):
        try:
            path = f
            hash_from_path = _get_hash_from_path(path)
            new_path = os.path.join(directory, hash_from_path)
            file = _read_text_file_path(path)
            article, highlights = _get_art_abs(file)
            with open(new_path) as fopen:
                t = json.load(fopen)

            t_article = ' '.join(t['article'])
            t_highlight = t['highlight']
            d = {
                'article': article,
                'highlights': highlights,
                'translated_article': t_article,
                'translated_highlight': t_highlight,
            }
            fopenl.write(f'{json.dumps(d)}\n')
        except Exception as e:
            print(e)

100%|███████████████████████████████████| 13368/13368 [00:02<00:00, 4887.75it/s]


In [33]:
urls = _get_url_hashes('all_test.txt')

selected_hash = []
for f in tqdm(files):
    path = f
    hash_from_path = _get_hash_from_path(path)
    if hash_from_path in urls:
        selected_hash.append(f)
        
selected_hash = sorted(selected_hash)

100%|██████████████████████████████| 312085/312085 [00:00<00:00, 1299690.48it/s]


In [34]:
directory = 'translated-test'

with open('test-translated-cnn-daily.jsonl', 'w') as fopenl:
    for f in tqdm(selected_hash):
        try:
            path = f
            hash_from_path = _get_hash_from_path(path)
            new_path = os.path.join(directory, hash_from_path)
            file = _read_text_file_path(path)
            article, highlights = _get_art_abs(file)
            with open(new_path) as fopen:
                t = json.load(fopen)

            t_article = ' '.join(t['article'])
            t_highlight = t['highlight']
            d = {
                'article': article,
                'highlights': highlights,
                'translated_article': t_article,
                'translated_highlight': t_highlight,
            }
            fopenl.write(f'{json.dumps(d)}\n')
        except Exception as e:
            print(e)

100%|███████████████████████████████████| 11490/11490 [00:02<00:00, 4097.59it/s]


In [1]:
!head -n 1 test-translated-cnn-daily.jsonl

{"article": "(CNN)James Best, best known for his portrayal of bumbling sheriff Rosco P. Coltrane on TV's \"The Dukes of Hazzard,\" died Monday after a brief illness. He was 88. Best died in hospice in Hickory, North Carolina, of complications from pneumonia, said Steve Latshaw, a longtime friend and Hollywood colleague. Although he'd been a busy actor for decades in theater and in Hollywood, Best didn't become famous until 1979, when \"The Dukes of Hazzard's\" cornpone charms began beaming into millions of American homes almost every Friday night. For seven seasons, Best's Rosco P. Coltrane chased the moonshine-running Duke boys back and forth across the back roads of fictitious Hazzard County, Georgia, although his \"hot pursuit\" usually ended with him crashing his patrol car. Although Rosco was slow-witted and corrupt, Best gave him a childlike enthusiasm that got laughs and made him endearing. His character became known for his distinctive \"kew-kew-kew\" chuckle and for goofy catc