In [1]:
import csv
import json
import random
import collections
import string

In [2]:
printable = set(string.printable)
csv_reader = csv.DictReader(open('stories.csv'))
list_stories = []

topics = collections.Counter()
themes = collections.Counter()

for row in csv_reader:
    for col in ['Events', 'Regions', 'Persons / Groups', 'Topics', 'Places', 'Themes']:
        row[col] = row[col].split('|')
    row['Content'] = row['Content no HTML']
    row['Persons'] = row['Persons / Groups']
    del row['Content no HTML']
    del row['Persons / Groups']
    if 'The Facts' in row['Topics']:
        row['Topics'].remove('The Facts')
    
    for topic in row['Topics']: topics[topic] += 1
    for theme in row['Themes']: themes[theme] += 1
    
    for key in row:
        if isinstance(row[key], str):
            row[key] = ''.join(filter(lambda x: x in printable, row[key]))
        elif isinstance(row[key], list):
            row[key] = list(filter(None, row[key]))
    
    if row['Topics']:
        list_stories.append(dict(row))
    
random.shuffle(list_stories)

num_articles = len(list_stories)
train_rate, dev_rate, test_rate = 0.6, 0.2, 0.2
train_start, dev_start, test_start = 0, int(train_rate * num_articles), int((train_rate + dev_rate) * num_articles)

train = list_stories[train_start : dev_start]
dev = list_stories[dev_start : test_start]
test = list_stories[test_start :]

json.dump({'train': train, 'dev': dev, 'test': test}, open('stories.json', 'w'), indent=2)

In [3]:
topics

Counter({'Politics': 292,
         'Trump': 239,
         'Law &amp; Justice': 280,
         'Russia': 105,
         'Elections': 81,
         'World': 444,
         'Context': 57,
         'War and Conflict': 208,
         'Business': 88,
         'Economy': 74,
         'Technology': 53,
         'Immigration': 79,
         'Climate and Environment': 26,
         'Accidents and Natural Disasters': 65,
         'Science': 8,
         '': 6,
         'Religion': 11,
         'Islamic State': 29,
         'Syria': 37,
         'UK': 32,
         'France': 8,
         'Health and Medicine': 46,
         'North Korea': 39,
         'Sports': 13,
         'Arts and Entertainment': 8})

In [4]:
themes

Counter({'communications': 2,
         'media relations': 1,
         'investigation': 17,
         '': 178,
         'immigration': 55,
         'diplomacy': 27,
         'hurricanes': 6,
         'justice': 29,
         'law': 23,
         'natural disasters': 17,
         'sanctions': 52,
         'tropical storms': 4,
         'weather': 47,
         'journalism': 20,
         'press freedom': 7,
         'climate change': 16,
         'civil rights': 32,
         'LGBTQ': 17,
         'military': 53,
         'terrorism': 42,
         'tourism': 3,
         'transgender': 7,
         'travel': 20,
         'international relations': 125,
         'migrants': 5,
         'flooding': 4,
         'trade': 51,
         'Affordable Care Act': 4,
         'healthcare': 28,
         'drugs': 17,
         'suicide bombing': 4,
         'violence': 28,
         'war': 49,
         'Confederacy': 1,
         'memorial': 1,
         'protest': 8,
         'astronomy': 7,
         'asylum': 1

In [5]:
len(list_stories)

936

In [6]:
len(themes)

584

In [7]:
len(topics)

25