In [1]:
# !wget http://groups.di.unipi.it/~gulli/newsspace200.xml.bz
# !bzip2 -d newsspace200.xml.bz

In [2]:
# !wget http://lil.nlp.cornell.edu/resources/newsroom/r8625bda324/newsroom-release.tar
# !gzip -d release/train.jsonl.gz
# !gzip -d release/test.jsonl.gz
# !gzip -d release/dev.jsonl.gz

In [3]:
from glob import glob

files = glob('release/*.jsonl')
files

['release/train.jsonl', 'release/dev.jsonl', 'release/test.jsonl']

In [4]:
with open('release/dev.jsonl') as fopen:
    data = fopen.read().split('\n')
    
len(data)

108838

In [5]:
import json

d = json.loads(data[2])
d

{'url': 'http://www.nydailynews.com/archives/news/1995/10/17/1995-10-17_new_yorkers__only_regret_was.html',
 'archive': 'http://web.archive.org/web/20110210093603id_/http://www.nydailynews.com:80/archives/news/1995/10/17/1995-10-17_new_yorkers__only_regret_was.html',
 'title': "NEW YORKERS' ONLY REGRET WAS STAYING HOME",
 'date': '20110210093603',
 'text': 'This story was reported by: NICK CHARLES, AUSTIN EVANS FENNER AND SAMSON MULUGETA It was written by: KAREN HUNTER\n\nTuesday, October 17th 1995, 4:20AM\n\nAs many black men marched on Washington yesterday, some New Yorkers spoke of their pride in the event and their disappointment in not being there, too.\n\n"I felt like the only black person working," said Roderick Vinson, 38, of Harlem. "That feeling made me sick to my stomach. I couldn\'t believe I missed one of the important events of my life."\n\nWinston Ford, 50, had to work, too. He makes his living selling incense and body oils in Brooklyn.\n\n"I didn\'t have the finances to

In [6]:
# !python3 -m spacy download en_core_web_sm
# import spacy
# nlp = spacy.load('en_core_web_sm')

In [16]:
import re
from unidecode import unidecode

alphabets = '([A-Za-z])'
prefixes = (
    '(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt|Puan|puan|Tuan|tuan|sir|Sir)[.]'
)
suffixes = '(Inc|Ltd|Jr|Sr|Co|Mo)'
starters = '(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever|Dia|Mereka|Tetapi|Kita|Itu|Ini|Dan|Kami|Beliau|Seri|Datuk|Dato|Datin|Tuan|Puan)'
acronyms = '([A-Z][.][A-Z][.](?:[A-Z][.])?)'
websites = '[.](com|net|org|io|gov|me|edu|my)'
another_websites = '(www|http|https)[.]'
digits = '([0-9])'
before_digits = '([Nn]o|[Nn]ombor|[Nn]umber)'
month = '([Jj]an(?:uari)?|[Ff]eb(?:ruari)?|[Mm]a(?:c)?|[Aa]pr(?:il)?|Mei|[Jj]u(?:n)?|[Jj]ula(?:i)?|[Aa]ug(?:ust)?|[Ss]ept?(?:ember)?|[Oo]kt(?:ober)?|[Nn]ov(?:ember)?|[Dd]is(?:ember)?)'


def split_into_sentences(text, minimum_length = 10):
    text = text.replace('\x97', '\n')
    text = '. '.join([s for s in text.split('\n') if len(s)])
    text = text + '.'
    text = unidecode(text)
    text = ' ' + text + '  '
    text = text.replace('\n', ' ')
    text = re.sub(prefixes, '\\1<prd>', text)
    text = re.sub(websites, '<prd>\\1', text)
    text = re.sub(another_websites, '\\1<prd>', text)
    text = re.sub('[,][.]+', '<prd>', text)
    if '...' in text:
        text = text.replace('...', '<prd><prd><prd>')
    if 'Ph.D' in text:
        text = text.replace('Ph.D.', 'Ph<prd>D<prd>')
    text = re.sub('[.]\s*[,]', '<prd>,', text)
    text = re.sub(before_digits + '[.]\s*' + digits, '\\1<prd>\\2', text)
    text = re.sub(month + '[.]\s*' + digits, '\\1<prd>\\2', text)
    text = re.sub('\s' + alphabets + '[.][ ]+', ' \\1<prd> ', text)
    text = re.sub(acronyms + ' ' + starters, '\\1<stop> \\2', text)
    text = re.sub(
        alphabets + '[.]' + alphabets + '[.]' + alphabets + '[.]',
        '\\1<prd>\\2<prd>\\3<prd>',
        text,
    )
    text = re.sub(
        alphabets + '[.]' + alphabets + '[.]', '\\1<prd>\\2<prd>', text
    )
    text = re.sub(' ' + suffixes + '[.][ ]+' + starters, ' \\1<stop> \\2', text)
    text = re.sub(' ' + suffixes + '[.]', ' \\1<prd>', text)
    text = re.sub(' ' + alphabets + '[.]', ' \\1<prd>', text)
    text = re.sub(digits + '[.]' + digits, '\\1<prd>\\2', text)
    if '”' in text:
        text = text.replace('.”', '”.')
    if '"' in text:
        text = text.replace('."', '".')
    if '!' in text:
        text = text.replace('!"', '"!')
    if '?' in text:
        text = text.replace('?"', '"?')
    text = text.replace('.', '.<stop>')
    text = text.replace('?', '?<stop>')
    text = text.replace('!', '!<stop>')
    text = text.replace('<prd>', '.')
    sentences = text.split('<stop>')
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences if len(s) > minimum_length]
    return sentences

In [8]:
def split(story, highlights, cap = 1000):
    a, s_ = [], ''
    for s in story:
        if len(s_ + ' ' + s) >= cap:
            a.append(s_.strip())
            s_ = ''
        else:
            s_ = s_ + ' ' + s
    if len(s_):
        a.append(s_.strip())
    a.append('[SUMMARY]: ' + highlights)
    return a

In [9]:
%%time
d = json.loads(data[7])
r = split(split_into_sentences(d['text']), d['summary'])

CPU times: user 4.69 ms, sys: 586 µs, total: 5.28 ms
Wall time: 5.23 ms


In [10]:
r, len(r[0].split())

([" Reporter: Brice stimon is now serving a seven-year sentence after he terrorized soraida, posting fake profiles of her on escort websites and worse, posting a sex tape of the two of them. To this day. Reporter: Brice stimon is now serving a seven-year sentence after he terrorized soraida, posting fake profiles of her on escort websites and worse, posting a sex tape of the two of them. To this day, she says she has no idea how he recorded them. She never saw a camera or a phone to capture them. I have no idea he had ever taken this. And he forwarded this to over 3,000 people. Reporter: And this was video of the two of you -- of me and him having sex, yes. Reporter: And that video spread faster than she could have imagined. Multiple porn sites. But what was worse was the family and the friends who authorities say were sent links to it by the manss soraida met on that plane. Could you believe this was happening to you? No, I didn't believe.",
  ' Reporter: But it was. But it was. Repor

In [11]:
from tqdm import tqdm

x = []

for file in files:
    with open(file) as fopen:
        data = fopen.read().split('\n')
        
    for i in tqdm(range(len(data))):
        try:
            d = json.loads(data[i])
            r = split(split_into_sentences(d['text']), d['summary'])
            x.append(r)
        except:
            pass

100%|██████████| 995042/995042 [39:11<00:00, 423.10it/s]  
100%|██████████| 108838/108838 [04:09<00:00, 436.57it/s]
100%|██████████| 108863/108863 [04:16<00:00, 423.62it/s]


In [12]:
with open('newsroom.json', 'w') as fopen:
    json.dump(x, fopen)

In [14]:
x[-1]

[" Digital money pioneer PayPal is on the prowl. Since eBay Inc. bought the Internet payment upstart for $1.5 billion in 2002, PayPal has grown at a heady clip, reeling in 71.6 million account holders who pumped $6.2 billion through its money-transfer system during the first quarter this year. The company claims twice as many accounts as Bank of America and recently surpassed Discover, too. Yet PayPal's identity remains virtually inseparable from eBay, the auction market where it greases the wheels of commerce between strangers by letting them zap money to any e-mail address. Three-quarters of the goods and services swapped on eBay's U.S. site are paid for with PayPal, and auctions account for a large share of PayPal's $233 million in quarterly revenue. EBay hopes that will change this year as PayPal rolls out a major marketing campaign to tout new payment services to larger merchants selling from sites independent of eBay.",
 ' "We feel confident that PayPal can become the online wall

In [23]:
texts, index = [], 0
while len(texts) < 1500000:
    texts.extend(x[index][:-1])
    index += 1

In [24]:
batch_size = 100000

for i in range(0, len(texts), batch_size):
    b = texts[i: i + batch_size]
    with open(f'dataset-{i}.json', 'w') as fopen:
        json.dump(b, fopen)