# Imports
Centralized module imports, kept in one place so they are easy to change.

In [1]:
import os
# Filesystem root; used as the first component when the project path is
# assembled with os.path.join further down.
root_dir = "/"

import pickle

# Getting Started
Make sure we're on the right path.

In [2]:
# Work from the project directory: <root_dir>/home/pablo/meap.
project_dir = os.path.join(root_dir, 'home', 'pablo', 'meap')
os.chdir(project_dir)

# The story files are looked up under ./cnn/stories of the working directory.
stories_path = os.path.join(os.getcwd(), 'cnn', 'stories')

Now, I'm initializing a dictionary with one key per story file (every value starts as `None`) to hold the CNN stories.
The `assert` statement makes sure a dictionary key was created for each and every story in the `./cnn/stories/` directory, i.e. that the directory holds the expected 92579 files.

In [3]:
# Dictionary variant: one key per entry of the stories directory; every value
# starts out as None and is filled in later.
stories = {story_name: None for story_name in os.listdir(stories_path)}
# Sanity check: the number of keys must be exactly 92579.
assert len(stories) == 92579

In [4]:
# if using a list
#stories = [None] * 92579

## Functions
Create some handy story manipulation functions.

In [5]:
# Split story from summaries
def split_story_from_summaries(filepath, encoding="utf-8"):
    """Read a story file and split it into its body and its highlights.

    The file content is cut at every "@highlight" marker. The first piece is
    the story body and each following piece is one highlight. Surrounding
    whitespace is stripped from every piece.

    Parameters
    ----------
    filepath : str or path-like
        Path of the story file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to "utf-8" so the
        result does not depend on the platform's locale encoding.

    Returns
    -------
    list of str
        ``[body, highlight_1, highlight_2, ...]``. The list always has at
        least one element: a file without any marker yields ``[body]``.
    """
    with open(filepath, 'r', encoding=encoding) as f:
        text = f.read()

    # str.split always returns at least one piece, so index 0 (the body) exists
    # even for an empty file.
    return [piece.strip() for piece in text.split("@highlight")]


# Populate the stories dictionary
def populate_stories_dictionary(story_id, split_list, target=None):
    """Store one story in a stories dictionary as a ``(body, highlights)`` tuple.

    Parameters
    ----------
    story_id : hashable
        Key to store the story under (the notebook passes the story's
        filename).
    split_list : list of str
        The story body followed by its highlights: element 0 is the body,
        the remaining elements are the highlights.
    target : dict, optional
        Dictionary to write into. When omitted, the notebook-level ``stories``
        dictionary is used, which is the original behaviour.

    Raises
    ------
    IndexError
        If ``split_list`` is empty.
    """
    if target is None:
        # Fall back to the notebook-level dictionary so existing two-argument
        # calls keep working. `is None` (not truthiness) lets callers pass an
        # empty dict of their own.
        target = stories

    target[story_id] = (split_list[0],    # [0] is the story
                        split_list[1:])   # [1:] are the highlights


# Method to populate the stories list
# TODO


The `populate_stories_dictionary` function fills the `stories` dictionary so that each key is a story's filename and each value is a tuple of (a) the story's body, and (b) a list containing all its highlights.

```python
{
    '0a0a4c90d59df9e36ffec4ba306b4f20f3ba4acb.story':
    # (a)
        ( 'Can a movie actually convince you to support torture? ...', 
    # (b)
        [ 'Dean Obeidallah: A movie or TV show can educate or (mis)educate you', 
          'Obeidallah: Two new films about hot issues are firing up both the left and right', 
          'Senators slammed "Zero Dark Thirty," and energy industry attacked "Promised Land"',
          'Obeidallah: What does Hollywood want? To make money, of course' ]
        ),
     
     'other-story-key.story': ...
}
```




# Main
Where main processing happens.

In [9]:
# Preview only the first few filenames: the directory holds tens of thousands
# of entries, and displaying all of them floods the notebook output.
# os.listdir returns entries in arbitrary order, so this is just a sample.
os.listdir(stories_path)[:20]


['9b738675c6ef5ed43683f7aa44289b9129290814.story',
 '35402c8b38d2e28a748d1d6fe81086fcf0cdd1ee.story',
 'e32ffe5bc1f2547c5edbf6218d549c8fdc845cf7.story',
 'e2d5d21e37d1253d489658154bfab4c51c6c2e3e.story',
 '3d94c8c042b35665b737f9218922993c1c5f4964.story',
 '4d86119e62274ffc6eb48d12040313d7405168b9.story',
 '0c455ee3242ecbf9eb5223032e4a3c250266e07f.story',
 '6cd773f6cd0c25b4d05c23ba661d720e6553d055.story',
 '7970c49189618d0e0ba3f59e93625c5faa7153f3.story',
 '1d9d8ef401e994c2e0390ebcf83eaa4d76544aac.story',
 '5fc2774be1101f1e3bed6e76a3c2bb12c914e990.story',
 'afe96a23b44478fef578733b9171c19c73cfa2a5.story',
 'dacacc3f0e1a9c631a7769d81573369527a34f2c.story',
 '1376ed9a497b127ac84a790e089ae8d61c0a3cb6.story',
 '97615b61e8c63744f466ed0c50fe8775ef1f700c.story',
 '6348a8b734e4fa85fdfafc22153f2a19d667490b.story',
 '902152042b683ffb039db0f63681fdc0f7fbe901.story',
 'ce9e1e9f9584278ce16469cf7251df6b03d953e4.story',
 'b95d7ee7ac4201a8044e05b4523736922f3d864f.story',
 '34a7332c551d5d72b1ddb999795de

In [10]:
# Read every file in the stories directory and store its body and highlights
# in the stories dictionary under the file's name.
for story_file in os.listdir(stories_path):
    story_path = os.path.join(stories_path, story_file)
    populate_stories_dictionary(story_file, split_story_from_summaries(story_path))

Export the stories as a Pickle file.

In [11]:
# Serialize the whole stories dictionary to stories.pkl in the current
# working directory (binary mode is required by pickle).
with open('stories.pkl', mode='wb') as pickle_file:
    pickle.dump(stories, pickle_file)

## Test
Check that everything worked by looking up one known story.

In [None]:
# Spot-check a single known story by printing the value stored under its key.
sample_story_id = '0a0a4c90d59df9e36ffec4ba306b4f20f3ba4acb.story'
print(stories[sample_story_id])