# Preprocessing the data 

In [6]:
import os, os.path, random
import subprocess
import hashlib
import struct
import collections
import tensorflow as tf
from tensorflow.core.example import example_pb2
from stanfordcorenlp import StanfordCoreNLP


# Construct a StanfordCoreNLP wrapper from the local CoreNLP distribution directory.
# NOTE(review): `nlp` is not referenced by any other cell visible in this notebook —
# confirm it is still needed before keeping this (hard-coded, machine-specific) path.
nlp = StanfordCoreNLP(r'/home/ubuntu/stanford-corenlp-full-2018-02-27')

In [7]:
def split_data(stories_dir, train=0.5, test=0.3, seed=None):
    """Randomly partition the entries of ``stories_dir`` into train/test/validation lists.

    Args:
        stories_dir: Directory whose entries (as returned by ``os.listdir``) are split.
        train: Fraction of the entries assigned to the training split.
        test: Fraction of the entries assigned to the test split.
        seed: Optional seed. When given, the shuffle is reproducible and does not
            touch the global ``random`` state; ``None`` shuffles with the global
            generator, as before.

    Returns:
        ``(train_data, test_data, val_data)`` — three lists of file names. The
        validation split is everything not taken by train/test, so every entry
        lands in exactly one split.

    Raises:
        ValueError: if a fraction is negative or ``train + test`` exceeds 1.
    """
    if train < 0 or test < 0 or train + test > 1 + 1e-9:
        raise ValueError(
            "train and test must be non-negative fractions with train + test <= 1 "
            "(got train=%r, test=%r)" % (train, test)
        )

    # os.listdir order is arbitrary; sorting gives a seeded shuffle a stable starting point.
    shuffled = sorted(os.listdir(stories_dir))
    total = len(shuffled)

    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    train_end = min(int(round(train * total)), total)
    test_end = min(train_end + int(round(test * total)), total)

    train_data = shuffled[:train_end]
    test_data = shuffled[train_end:test_end]
    # Validation takes the remainder. Rounding a third count independently can make
    # the three sizes add up to less than `total` and silently drop files.
    val_data = shuffled[test_end:]
    return train_data, test_data, val_data

In [8]:
def tokenize_stories(stories_dir, tokenized_stories_dir):
    """Tokenize every file in ``stories_dir`` with the Stanford CoreNLP PTBTokenizer.

    A temporary ``mapping.txt`` (input path, tab, output path per line) is written to
    the current working directory and handed to the tokenizer through ``-ioFileList``.
    Each output file keeps the name of its input file and is placed in
    ``tokenized_stories_dir``.

    Args:
        stories_dir: Directory containing the raw story files.
        tokenized_stories_dir: Directory that receives the tokenized files. It is
            created if missing and may already contain output from earlier calls.

    Raises:
        RuntimeError: if, after the tokenizer has run, any input file has no
            same-named counterpart in ``tokenized_stories_dir``.
    """
    print(f"Preparing to tokenize {stories_dir} to {tokenized_stories_dir}...")
    stories = os.listdir(stories_dir)

    # Make sure the destination exists before the tokenizer is asked to write into it.
    os.makedirs(tokenized_stories_dir, exist_ok=True)

    mapping_file = "mapping.txt"
    try:
        # make IO list file
        print("Making list of files to tokenize...")
        with open(mapping_file, "w") as f:
            for s in stories:
                f.write("%s \t %s\n" % (os.path.join(stories_dir, s), os.path.join(tokenized_stories_dir, s)))

        command = ['java', 'edu.stanford.nlp.process.PTBTokenizer', '-ioFileList', '-preserveLines', mapping_file]
        print(f"Tokenizing {len(stories)} files in {stories_dir} and saving in {tokenized_stories_dir}...")
        return_code = subprocess.call(command)
    finally:
        # Always clean up the temporary list, even if writing it or launching java failed.
        if os.path.exists(mapping_file):
            os.remove(mapping_file)

    print("Stanford CoreNLP Tokenizer has finished.")
    if return_code != 0:
        print(f"Warning: tokenizer exited with return code {return_code}")

    # Verify per file rather than by directory size: the output directory can be
    # shared by several input directories, so the raw counts need not match.
    tokenized = set(os.listdir(tokenized_stories_dir))
    missing = [s for s in stories if s not in tokenized]
    if missing:
        raise RuntimeError(
            "Tokenization incomplete: %d of %d files from %s are missing in %s (e.g. %s)"
            % (len(missing), len(stories), stories_dir, tokenized_stories_dir, missing[0])
        )
    print (f"Successfully finished tokenizing {stories_dir} to {tokenized_stories_dir}.\n")


In [9]:
# Typographic closing quotes (right single / right double quotation mark).
dm_single_close_quote = '\u2019'
dm_double_close_quote = '\u201d'

# Line endings that fix_missing_period accepts as already terminating a sentence.
END_TOKENS = [
    '.', '!', '?', '...',
    "'", "`", '"',
    dm_single_close_quote, dm_double_close_quote,
    ")",
]

# Tags wrapped around each abstract sentence.
SENTENCE_START, SENTENCE_END = '<s>', '</s>'

VOCAB_SIZE = 200_000  # upper bound on the number of words written to the vocab file
CHUNK_SIZE = 1_000    # number of examples written per chunk file


def read_text_file(text_file):
    """Return the lines of ``text_file`` with leading/trailing whitespace stripped.

    The file is decoded as UTF-8 explicitly, so the result does not depend on the
    machine's locale. This matches the UTF-8 encoding applied when the text is
    later serialized.

    Args:
        text_file: Path of the text file to read.

    Returns:
        A list with one stripped string per line (blank lines become ``""``).
    """
    with open(text_file, "r", encoding="utf-8") as f:
        return [line.strip() for line in f]

def fix_missing_period(line):
    """Return ``line`` with " ." appended unless it already ends acceptably.

    A line is returned unchanged when it contains "@highlight", is empty, or its
    last character is one of END_TOKENS.
    """
    needs_period = (
        "@highlight" not in line
        and line != ""
        and line[-1] not in END_TOKENS
    )
    return line + " ." if needs_period else line


def get_art_abs(story_file):
    """Split a story file into its article text and its tagged abstract.

    Every line is lowercased and passed through fix_missing_period. Blank lines are
    skipped. Lines before the first "@highlight" marker form the article; once a
    marker has been seen, every following non-blank, non-marker line is treated as
    a highlight sentence.

    Args:
        story_file: Path of the story file to read.

    Returns:
        ``(article, abstract)``: the article lines joined by single spaces, and the
        highlight sentences each wrapped in SENTENCE_START / SENTENCE_END and joined
        by single spaces. Both are empty strings for a file with no content.
    """
    lines = read_text_file(story_file)

    # Lowercase everything, then put periods on the ends of lines that are missing
    # them (many image captions don't end in periods and would otherwise run into
    # the body of the article as run-on sentences).
    lines = [fix_missing_period(line.lower()) for line in lines]

    # Separate out article and abstract sentences
    article_lines = []
    highlights = []
    next_is_highlight = False

    for line in lines:
        if line == "":
            continue  # empty line
        elif line.startswith("@highlight"):
            next_is_highlight = True
        elif next_is_highlight:
            highlights.append(line)
        else:
            article_lines.append(line)

    # Build the strings once, after the loop: doing it per iteration is quadratic
    # and leaves both names unbound when the file has no non-blank lines.

    # Make article into a single string
    article = ' '.join(article_lines)

    # Make abstract into a single string, putting <s> and </s> tags around the sentences
    abstract = ' '.join(["%s %s %s" % (SENTENCE_START, sent, SENTENCE_END) for sent in highlights])

    return article, abstract

In [10]:
def write_to_bin(file_names,story_dir,out_file,finished_files_dir, makevocab=False):
    """Serialize tokenized story files into one length-prefixed binary file.

    For each name in ``file_names`` the story is read from ``story_dir`` via
    get_art_abs and stored as a ``tf.Example`` with two bytes features, 'article'
    and 'abstract' (UTF-8 encoded). Each record in ``out_file`` is an 8-byte
    ``struct`` 'q' length followed by the serialized example.

    Args:
        file_names: Story file names to process.
        story_dir: Directory containing those tokenized story files.
        out_file: Path of the binary file to write (overwritten if present).
        finished_files_dir: Directory that receives the "vocab" file when
            ``makevocab`` is true.
        makevocab: When true, count whitespace-separated tokens of all articles and
            abstracts (excluding the sentence tags) and write the VOCAB_SIZE most
            common ones, as "word count" lines, to ``finished_files_dir/vocab``.
    """
    story_fnames = file_names
    num_stories = len(story_fnames)

    if makevocab:
        vocab_counter = collections.Counter()

    with open(out_file, 'wb') as writer:
        for idx,s in enumerate(story_fnames):
            if idx % 1000 == 0:
                print (f"Writing story {idx} of {num_stories}; {float(idx)*100.0/float(num_stories)} percent done")

            story_file = os.path.join(story_dir, s)

            # Get the strings to write to .bin file
            article, abstract = get_art_abs(story_file)
            article_bytes = bytes(article, 'utf-8')
            abstract_bytes = bytes(abstract, 'utf-8')

            # Flag empty articles. Comparing the bytes object itself to 0 is never
            # true, so the length is tested instead. The example is still written.
            if len(article_bytes) == 0:
                print(f'error! empty article in {story_file}')

            # Write to tf.Example
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend([article_bytes])
            tf_example.features.feature['abstract'].bytes_list.value.extend([abstract_bytes])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))

            # Update the vocab counts, if applicable
            if makevocab:
                art_tokens = article.split(' ')
                abs_tokens = abstract.split(' ')
                abs_tokens = [t for t in abs_tokens if t not in [SENTENCE_START, SENTENCE_END]] # remove these tags from vocab
                tokens = art_tokens + abs_tokens
                tokens = [t.strip() for t in tokens] # strip
                tokens = [t for t in tokens if t!=""] # remove empty
                vocab_counter.update(tokens)

    print(f"Finished writing file {out_file}\n")
    # write vocab to file
    if makevocab:
        print ("Writing vocab file...")
        # Explicit UTF-8: tokens may contain non-ASCII characters (e.g. curly quotes).
        with open(os.path.join(finished_files_dir, "vocab"), 'w', encoding='utf-8') as writer:
            for word, count in vocab_counter.most_common(VOCAB_SIZE):
                writer.write(word + ' ' + str(count) + '\n')
        print ("Finished writing vocab file" )
    

In [11]:
# Directories for the trial run: raw stories, tokenizer output, and serialized output.
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
stories_dir = "/home/ubuntu/W266/final_0/W266_Final/data/test"
tokenized_stories_dir = "/home/ubuntu/W266/final_0/W266_Final/data/test_tokenized"
processed_dir = "/home/ubuntu/W266/final_0/W266_Final/data/test_processed"

In [8]:
# Shuffle the trial stories into train/test/validation file-name lists (default fractions).
train,test,val = split_data(stories_dir)

In [9]:
# Display the file names assigned to the test split.
test

['7f3317b1e717c0efb427552c486ea0dad471cc32.story',
 'ffff2dc1cc4888253a4733f808959f0b4eab26a6.story',
 'fffd170a9d15b1f9751e969e6f5b0ce5b9f7d027.story']

In [10]:
# Display the file names assigned to the training split.
train

['7f33bc95458ae729aab0eed1c0b6a18223176a83.story',
 '7f340bb7216273f9b59d26516e3460a94c2c8125.story',
 'ffff522cebe5ad9dcfb6dfc476b8f423f3f8dd34.story',
 'ffff11a2f44d731cd80c86819a89b7e227581415.story',
 '7f35b0a3a1a60e35e560b0d9ae17c84d848d1637.story']

In [11]:
# Display the file names assigned to the validation split.
val

['7f35cb654a1627dfa9b3458a5e0356256e694941.story',
 'fffe0c4eb70bde9733b858adfd5b4eeeae631f28.story']

In [12]:
# Tokenize the trial stories; output files keep their names in tokenized_stories_dir.
tokenize_stories(stories_dir, tokenized_stories_dir)

Preparing to tokenize /home/ubuntu/W266/final_0/W266_Final/data/test to /home/ubuntu/W266/final_0/W266_Final/data/test_tokenized...
Making list of files to tokenize...
Tokenizing 10 files in /home/ubuntu/W266/final_0/W266_Final/data/test and saving in /home/ubuntu/W266/final_0/W266_Final/data/test_tokenized...
Stanford CoreNLP Tokenizer has finished.
Successfully finished tokenizing /home/ubuntu/W266/final_0/W266_Final/data/test to /home/ubuntu/W266/final_0/W266_Final/data/test_tokenized.



In [13]:
# Output paths of the serialized splits inside processed_dir.
train_out_file = os.path.join(processed_dir, "train.bin")
test_out_file = os.path.join(processed_dir, "test.bin")
validation_out_file = os.path.join(processed_dir, "validation.bin")

In [14]:
# Serialize the training split; makevocab=True also writes the vocab file to processed_dir.
write_to_bin(train,tokenized_stories_dir,train_out_file,processed_dir, makevocab=True)

Making bin file for stories listed in ['7f33bc95458ae729aab0eed1c0b6a18223176a83.story', '7f340bb7216273f9b59d26516e3460a94c2c8125.story', 'ffff522cebe5ad9dcfb6dfc476b8f423f3f8dd34.story', 'ffff11a2f44d731cd80c86819a89b7e227581415.story', '7f35b0a3a1a60e35e560b0d9ae17c84d848d1637.story']...
Writing story 0 of 5; 0.0 percent done
Finished writing file /home/ubuntu/W266/final_0/W266_Final/data/test_processed/train.bin

Writing vocab file...
Finished writing vocab file


In [15]:
# Serialize the test split (no vocab: only the training split contributes to it).
write_to_bin(test,tokenized_stories_dir,test_out_file,processed_dir, makevocab=False)

Making bin file for stories listed in ['7f3317b1e717c0efb427552c486ea0dad471cc32.story', 'ffff2dc1cc4888253a4733f808959f0b4eab26a6.story', 'fffd170a9d15b1f9751e969e6f5b0ce5b9f7d027.story']...
Writing story 0 of 3; 0.0 percent done
Finished writing file /home/ubuntu/W266/final_0/W266_Final/data/test_processed/test.bin



In [16]:
# Serialize the validation split (no vocab).
write_to_bin(val,tokenized_stories_dir,validation_out_file,processed_dir, makevocab=False)

Making bin file for stories listed in ['7f35cb654a1627dfa9b3458a5e0356256e694941.story', 'fffe0c4eb70bde9733b858adfd5b4eeeae631f28.story']...
Writing story 0 of 2; 0.0 percent done
Finished writing file /home/ubuntu/W266/final_0/W266_Final/data/test_processed/validation.bin



# Preprocessing the full CNN / Daily Mail data

In [17]:
def get_file_count(DIR):
    """Return how many entries ``os.listdir`` reports for the directory ``DIR``."""
    entries = os.listdir(DIR)
    return len(entries)

In [27]:
# Raw story directories of the two sources.
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
cnn_stories_dir = "/home/ubuntu/W266/final_0/W266_Final/data/cnn/stories"
dm_stories_dir = "/home/ubuntu/W266/final_0/W266_Final/data/dailymail/stories"

# Shared tokenizer output directory and serialized-output directory for the full run.
# These rebind the names used for the trial run above.
tokenized_stories_dir = "/home/ubuntu/W266/final_0/W266_Final/data/final_tokenized"
processed_dir = "/home/ubuntu/W266/final_0/W266_Final/data/final_processed"

In [19]:
# Number of entries in the CNN stories directory.
get_file_count(cnn_stories_dir)

92579

In [20]:
# Number of entries in the Daily Mail stories directory.
get_file_count(dm_stories_dir)

219506

In [21]:
# Tokenize the CNN stories into the shared tokenized directory.
tokenize_stories(cnn_stories_dir, tokenized_stories_dir)

Preparing to tokenize /home/ubuntu/W266/final_0/W266_Final/data/cnn/stories to /home/ubuntu/W266/final_0/W266_Final/data/final_tokenized...
Making list of files to tokenize...
Tokenizing 92579 files in /home/ubuntu/W266/final_0/W266_Final/data/cnn/stories and saving in /home/ubuntu/W266/final_0/W266_Final/data/final_tokenized...
Stanford CoreNLP Tokenizer has finished.
Successfully finished tokenizing /home/ubuntu/W266/final_0/W266_Final/data/cnn/stories to /home/ubuntu/W266/final_0/W266_Final/data/final_tokenized.



In [22]:
# Sanity check: entry count of the tokenized directory after the CNN pass.
get_file_count(tokenized_stories_dir)

92579

In [23]:
# Tokenize the Daily Mail stories into the same directory as the CNN output.
tokenize_stories(dm_stories_dir, tokenized_stories_dir)

Preparing to tokenize /home/ubuntu/W266/final_0/W266_Final/data/dailymail/stories to /home/ubuntu/W266/final_0/W266_Final/data/final_tokenized...
Making list of files to tokenize...
Tokenizing 219506 files in /home/ubuntu/W266/final_0/W266_Final/data/dailymail/stories and saving in /home/ubuntu/W266/final_0/W266_Final/data/final_tokenized...
Stanford CoreNLP Tokenizer has finished.
Successfully finished tokenizing /home/ubuntu/W266/final_0/W266_Final/data/dailymail/stories to /home/ubuntu/W266/final_0/W266_Final/data/final_tokenized.



In [24]:
# Sanity check: entry count of the tokenized directory after both passes.
get_file_count(tokenized_stories_dir)

312085

In [31]:
# Split the combined tokenized files: 60% train, 30% test, the rest validation.
train,test,val = split_data(tokenized_stories_dir,train=0.6,test=0.3)

In [32]:
# Report the size of each split.
print(len(train),len(test),len(val))

187251 93626 31208


In [33]:
# Serialize the full training split; makevocab=True also writes the vocab file to processed_dir.
train_out_file = os.path.join(processed_dir, "train.bin")
write_to_bin(train,tokenized_stories_dir,train_out_file,processed_dir, makevocab=True)

Writing story 0 of 187251; 0.0 percent done
Writing story 1000 of 187251; 0.5340425418288821 percent done
Writing story 2000 of 187251; 1.0680850836577642 percent done
Writing story 3000 of 187251; 1.6021276254866463 percent done
Writing story 4000 of 187251; 2.1361701673155284 percent done
Writing story 5000 of 187251; 2.6702127091444106 percent done
Writing story 6000 of 187251; 3.2042552509732927 percent done
Writing story 7000 of 187251; 3.738297792802175 percent done
Writing story 8000 of 187251; 4.272340334631057 percent done
Writing story 9000 of 187251; 4.806382876459939 percent done
Writing story 10000 of 187251; 5.340425418288821 percent done
Writing story 11000 of 187251; 5.874467960117703 percent done
Writing story 12000 of 187251; 6.408510501946585 percent done
Writing story 13000 of 187251; 6.942553043775467 percent done
Writing story 14000 of 187251; 7.47659558560435 percent done
Writing story 15000 of 187251; 8.010638127433232 percent done
Writing story 16000 of 187251;

Writing story 132000 of 187251; 70.49361552141244 percent done
Writing story 133000 of 187251; 71.02765806324132 percent done
Writing story 134000 of 187251; 71.5617006050702 percent done
Writing story 135000 of 187251; 72.09574314689908 percent done
Writing story 136000 of 187251; 72.62978568872796 percent done
Writing story 137000 of 187251; 73.16382823055685 percent done
Writing story 138000 of 187251; 73.69787077238573 percent done
Writing story 139000 of 187251; 74.23191331421461 percent done
Writing story 140000 of 187251; 74.76595585604349 percent done
Writing story 141000 of 187251; 75.29999839787237 percent done
Writing story 142000 of 187251; 75.83404093970125 percent done
Writing story 143000 of 187251; 76.36808348153014 percent done
Writing story 144000 of 187251; 76.90212602335902 percent done
Writing story 145000 of 187251; 77.4361685651879 percent done
Writing story 146000 of 187251; 77.97021110701678 percent done
Writing story 147000 of 187251; 78.50425364884566 percent

In [34]:
# Serialize the full test split (no vocab).
test_out_file = os.path.join(processed_dir, "test.bin")
write_to_bin(test,tokenized_stories_dir,test_out_file,processed_dir, makevocab=False)

Writing story 0 of 93626; 0.0 percent done
Writing story 1000 of 93626; 1.0680793796594963 percent done
Writing story 2000 of 93626; 2.1361587593189926 percent done
Writing story 3000 of 93626; 3.204238138978489 percent done
Writing story 4000 of 93626; 4.272317518637985 percent done
Writing story 5000 of 93626; 5.340396898297482 percent done
Writing story 6000 of 93626; 6.408476277956978 percent done
Writing story 7000 of 93626; 7.4765556576164744 percent done
Writing story 8000 of 93626; 8.54463503727597 percent done
Writing story 9000 of 93626; 9.612714416935466 percent done
Writing story 10000 of 93626; 10.680793796594964 percent done
Writing story 11000 of 93626; 11.74887317625446 percent done
Writing story 12000 of 93626; 12.816952555913955 percent done
Writing story 13000 of 93626; 13.885031935573451 percent done
Writing story 14000 of 93626; 14.953111315232949 percent done
Writing story 15000 of 93626; 16.021190694892443 percent done
Writing story 16000 of 93626; 17.08927007455

In [35]:
# Serialize the full validation split (no vocab).
validation_out_file = os.path.join(processed_dir, "validation.bin")
write_to_bin(val,tokenized_stories_dir,validation_out_file,processed_dir, makevocab=False)

Writing story 0 of 31208; 0.0 percent done
Writing story 1000 of 31208; 3.204306588054345 percent done
Writing story 2000 of 31208; 6.40861317610869 percent done
Writing story 3000 of 31208; 9.612919764163035 percent done
Writing story 4000 of 31208; 12.81722635221738 percent done
Writing story 5000 of 31208; 16.021532940271726 percent done
Writing story 6000 of 31208; 19.22583952832607 percent done
Writing story 7000 of 31208; 22.430146116380417 percent done
Writing story 8000 of 31208; 25.63445270443476 percent done
Writing story 9000 of 31208; 28.838759292489105 percent done
Writing story 10000 of 31208; 32.04306588054345 percent done
Writing story 11000 of 31208; 35.247372468597796 percent done
Writing story 12000 of 31208; 38.45167905665214 percent done
Writing story 13000 of 31208; 41.65598564470648 percent done
Writing story 14000 of 31208; 44.860292232760834 percent done
Writing story 15000 of 31208; 48.06459882081518 percent done
Writing story 16000 of 31208; 51.26890540886952

In [23]:
def chunk_file(set_name, in_dir=None, out_dir=None, chunk_size=None):
    """Split ``<in_dir>/<set_name>.bin`` into numbered chunk files.

    The input is a sequence of records, each an 8-byte ``struct`` 'q' length
    followed by that many payload bytes. Records are copied unchanged into files
    named ``<set_name>_NNN.bin`` in ``out_dir``, at most ``chunk_size`` records per
    file. A chunk file is only created once there is a record to put in it, so no
    empty trailing chunk is produced.

    Args:
        set_name: Base name of the input file (without ".bin") and chunk prefix.
        in_dir: Directory holding the input file; defaults to the global ``processed_dir``.
        out_dir: Directory receiving the chunks; defaults to the global ``chunks_dir``.
        chunk_size: Maximum records per chunk; defaults to the global ``CHUNK_SIZE``.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
        struct.error: if the input ends in the middle of a record.
    """
    # Resolve defaults at call time so the notebook's globals can be (re)assigned
    # after this function has been defined.
    in_dir = processed_dir if in_dir is None else in_dir
    out_dir = chunks_dir if out_dir is None else out_dir
    chunk_size = CHUNK_SIZE if chunk_size is None else chunk_size
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1 (got %r)" % (chunk_size,))

    in_file = f"{in_dir}/{set_name}.bin"
    print(in_file)

    chunk = 0
    # The context manager closes the reader even if a record turns out to be malformed.
    with open(in_file, "rb") as reader:
        len_bytes = reader.read(8)
        while len_bytes:
            chunk_fname = os.path.join(out_dir, '%s_%03d.bin' % (set_name, chunk)) # new chunk
            with open(chunk_fname, 'wb') as writer:
                for _ in range(chunk_size):
                    if not len_bytes:
                        break  # input exhausted part-way through this chunk
                    str_len = struct.unpack('q', len_bytes)[0]
                    example_str = reader.read(str_len)
                    if len(example_str) != str_len:
                        raise struct.error(
                            "truncated record in %s: expected %d bytes, got %d"
                            % (in_file, str_len, len(example_str))
                        )
                    writer.write(struct.pack('q', str_len))
                    writer.write(example_str)
                    # Read ahead so the next chunk is only opened if a record exists.
                    len_bytes = reader.read(8)
            chunk += 1

In [24]:
def chunk_all():
    """Chunk the train, validation and test .bin files into ``chunks_dir``.

    Creates ``chunks_dir`` when it is not already a directory, then runs
    chunk_file for each of the three sets in turn.
    """
    # Make a dir to hold the chunks
    chunk_dir_exists = os.path.isdir(chunks_dir)
    if not chunk_dir_exists:
        os.mkdir(chunks_dir)

    # Chunk the data, one set at a time
    set_names = ('train', 'validation', 'test')
    for name in set_names:
        print(f"Splitting {name} data into chunks...")
        chunk_file(name)

    print(f"Saved chunked data in {chunks_dir}")

In [28]:
# Destination directory for the chunk files; chunk_file and chunk_all read this global.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
chunks_dir = "/home/ubuntu/W266/final_0/W266_Final/data/final_chunked"

In [29]:
# Split the train/validation/test .bin files in processed_dir into chunk files.
chunk_all()

Splitting train data into chunks...
/home/ubuntu/W266/final_0/W266_Final/data/final_processed/train.bin
Splitting validation data into chunks...
/home/ubuntu/W266/final_0/W266_Final/data/final_processed/validation.bin
Splitting test data into chunks...
/home/ubuntu/W266/final_0/W266_Final/data/final_processed/test.bin
Saved chunked data in /home/ubuntu/W266/final_0/W266_Final/data/final_chunked
