In [None]:
import os
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse
import sklearn.datasets
import tensorflow as tf
%matplotlib inline

In [None]:
# Notebook configuration through TensorFlow's flags module.
# NOTE(review): tf.app.flags looks like the TF 1.x location of this API --
# confirm against the TensorFlow version this notebook is run with.
flags = tf.app.flags
FLAGS = flags.FLAGS

# Registers a string flag named 'dir_data' with default 'data_rcv1'.
# NOTE(review): FLAGS.dir_data is not read by any cell visible here; the
# cells below hard-code the 'data_rcv1' path instead.
flags.DEFINE_string('dir_data', 'data_rcv1', 'Directory to store data.')

**From Dropout (Bruna did the same)**

We took the dataset and split it into 63 classes based on the 63 categories at the second level of the category tree. We removed 11 categories that did not have any data and one category that had only 4 training examples. We also removed one category that covered a huge chunk (25%) of the examples. This left us with 50 classes and 402,738 documents. We divided the documents into equal-sized training and test sets randomly. Each document was represented
using the 2000 most frequent non-stopwords in the dataset.

In [None]:
# Get dataset.
# `data_home` is passed by keyword: recent scikit-learn releases declare the
# parameters of fetch_rcv1 keyword-only, and the keyword form is accepted by
# older releases as well.
rcv1 = sklearn.datasets.fetch_rcv1(data_home='data_rcv1')

# rcv1.target is the (documents x classes) label indicator matrix.
N, C = rcv1.target.shape
print('N={} documents, C={} classes'.format(N, C))

#def select_classes

# Every category code, written as a whitespace-separated table and split
# into a list of strings.
class_names = """
    C11 C12 C13 C14 C15 C151 C1511 C152 C16 C17
    C171 C172 C173 C174 C18 C181 C182 C183 C21 C22
    C23 C24 C31 C311 C312 C313 C32 C33 C331 C34
    C41 C411 C42 CCAT E11 E12 E121 E13 E131 E132
    E14 E141 E142 E143 E21 E211 E212 E31 E311 E312
    E313 E41 E411 E51 E511 E512 E513 E61 E71 ECAT
    G15 G151 G152 G153 G154 G155 G156 G157 G158 G159
    GCAT GCRIM GDEF GDIP GDIS GENT GENV GFAS GHEA
    GJOB GMIL GOBIT GODD GPOL GPRO GREL GSCI GSPO
    GTOUR GVIO GVOTE GWEA GWELF M11 M12 M13 M131
    M132 M14 M141 M142 M143 MCAT
""".split()
assert len(class_names) == 103  # LYRL2004 lists 103 categories.

# Second-level categories only.
keep = """
    C11 C12 C13 C14 C15 C16 C17 C18 C21 C22 C23 C24
    C31 C32 C33 C34 C41 C42 E11 E12 E13 E14 E21 E31
    E41 E51 E61 E71 G15 GCRIM GDEF GDIP GDIS GENT GENV
    GFAS GHEA GJOB GMIL GOBIT GODD GPOL GPRO GREL GSCI
    GSPO GTOUR GVIO GVOTE GWEA GWELF M11 M12 M13 M14
""".split()
assert len(keep) == 55  # LYRL2004 lists 55 second-level categories.

# Drop the two outlier categories:
#   C15  -- 151785 documents
#   GMIL -- 5 documents only
keep = [code for code in keep if code not in ('C15', 'GMIL')]
# Construct a lookup table for labels: category code -> position in
# class_names. labels_row / labels_col are only initialised here; the XML
# parsing cell further down fills them.
labels_row = []
labels_col = []
class_lookup = {name: i for i, name in enumerate(class_names)}

# Index of classes to keep.
# The index array must have an integer dtype: np.empty(len(keep)) would
# default to float64, which is not a valid index type.
# NOTE(review): this assumes the order of class_names matches the column
# order of rcv1.target -- confirm against rcv1.target_names.
idx_keep = np.array([class_lookup[cat] for cat in keep], dtype=np.intp)
target = rcv1.target[:, idx_keep]

# Number of documents per class.
def show_doc_per_class(names, target, print_=False):
    """Print and plot the number of documents assigned to each class.

    Args:
        names: sequence of class names, indexed like the columns of `target`.
        target: (documents x classes) indicator matrix supporting
            `astype` and `sum(axis=0)`.
        print_: when True, additionally print one line per entry of `names`.
    """
    # ravel() instead of squeeze(): a target with a single class must still
    # give a 1-D array, otherwise indexing and sorting below would fail.
    docs_per_class = np.array(target.astype(np.uint64).sum(axis=0)).ravel()
    print('categories ({} assignments in total)'.format(docs_per_class.sum()))
    if print_:
        for i, cat in enumerate(names):
            print('  {:5s}: {:6d} documents'.format(cat, int(docs_per_class[i])))
    # Sorted class sizes, smallest first.
    plt.figure(figsize=(17,5))
    plt.plot(np.sort(docs_per_class), '.')
    plt.xlabel('classes (sorted by size)')
    plt.ylabel('number of documents')
# Class sizes over all classes of the dataset (plot only).
show_doc_per_class(rcv1.target_names, rcv1.target)
# Class sizes over the kept classes, also printed one line per class.
show_doc_per_class(keep, target, True)

#def select_documents

# Number of classes per document.
def show_classes_per_doc(target):
    """Plot and return the number of classes assigned to each document.

    Args:
        target: (documents x classes) indicator matrix supporting
            `sum(axis=1)`.

    Returns:
        1-D array with one count per row of `target`.
    """
    # ravel() instead of squeeze(): a single-document target must still give
    # a 1-D array so the result can be sorted and used as a row mask.
    classes_per_doc = np.array(target.sum(axis=1)).ravel()
    # Sorted label counts, smallest first.
    plt.figure(figsize=(17,5))
    plt.plot(np.sort(classes_per_doc), '.')
    plt.xlabel('documents (sorted by number of classes)')
    plt.ylabel('number of classes')
    return classes_per_doc
# Plot the label multiplicity for the full label matrix first (figure only),
# then for the reduced one, whose counts drive the selection below.
show_classes_per_doc(rcv1.target)
classes_per_doc = show_classes_per_doc(target)

# Restrict labels and features to documents carrying exactly one kept class.
single_label = classes_per_doc == 1
target = target[single_label]
data = rcv1.data[single_label, :]

# Turn the indicator matrix into one class index per document: in COO form
# the column of each stored entry is the class index.
N, C = target.shape
assert C == len(keep)
target = target.tocoo().col
assert target.min() == 0
assert target.max() == C - 1

# Bruna and Dropout report 2 * 201369 = 402738 documents; the gap is probably
# the difference between v1 and v2 of the dataset.
print('N = {} documents and C = {} classes left'.format(N, C))

In [None]:
# Collect the `date` attribute of the root element of every newsML file
# found below the raw RCV1 directory.
dates = []
for path, subdirs, files in os.walk('data_rcv1/rcv1/'):
    for file in files:
        # Match on the suffix so that only '*newsML.xml' files are parsed.
        if file.endswith('newsML.xml'):
            root = ET.parse(os.path.join(path, file)).getroot()
            dates.append(root.attrib['date'])

# Number of parsed documents; derived from the list instead of a separate
# hand-maintained counter.
n = len(dates)
print(n)

In [None]:
import xml.etree.ElementTree as ET

# Parse a single example document.
root = ET.parse('data_rcv1/rcv1/19960820/2286newsML.xml').getroot()
date = root.attrib['date']

# Fetch textual content: the title followed by every paragraph, separated by
# single spaces. Elements without text (text is None) are skipped, which
# would otherwise make join() raise a TypeError.
parts = [root.find('title').text]
parts.extend(p.text for p in root.find('text').findall('p'))
text = ' '.join(part for part in parts if part is not None)
print(text)

# Find the labels of a document.
# The coordinate lists are reset here so that re-running this cell does not
# append the same entries a second time.
labels_row = []
labels_col = []
classes = []
doc = 0
for codes in root.find('metadata').findall('codes'):
    # Only the <codes> group with this class attribute is read as topic labels.
    if codes.attrib['class'] == 'bip:topics:1.0':
        for code in codes.findall('code'):
            labels_row.append(doc)
            labels_col.append(class_lookup[code.attrib['code']])
            classes.append(code.attrib['code'])

assert len(labels_row) == len(labels_col)
# Builtin bool: the np.bool alias has been removed from NumPy.
labels_val = np.ones(len(labels_row), dtype=bool)
# Explicit shape: one row per document, one column per known class. Without
# it the shape is inferred from the largest index and cannot be inferred at
# all for a document that has no labels.
labels = scipy.sparse.csr_matrix((labels_val, (labels_row, labels_col)),
                                 shape=(doc + 1, len(class_lookup)))

print(labels)
labels.sum()