# Create Balanced Sample from the current training, validation and test list
based on the number of documents in them

In [1]:
import json
import math
import os
import time
from collections import namedtuple
import cPickle as pickle

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import random

from multiprocessing.dummy import Pool as ThreadPool
import itertools


import logging
from logging import info
from functools import partial

from thesis.utils.metrics import *

In [2]:
root = logging.getLogger()
for handler in root.handlers[:]:
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHanlder
#root.addHandler(logging.StreamHandler())

In [3]:
#training_file = "/home/local/shalaby/docs_output_sample_100.json"

root_location = "/big/s/shalaby/"
exports_location = root_location + "exported_data/"

doc2vec_model_save_location = os.path.join(root_location, "parameter_search_doc2vec_models_new", "full")

training_file = root_location + "docs_output.json"

doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
classification_index_file = exports_location + "classification_index.pkl"
sections_file = exports_location + "sections.pkl"
classes_file = exports_location + "classes.pkl"
subclasses_file = exports_location + "subclasses.pkl"
valid_classes_file = exports_location + "valid_classes.pkl"
valid_subclasses_file = exports_location + "valid_subclasses.pkl"
classifications_output = exports_location + "classifications.pkl"
training_docs_list_file = exports_location + "training_docs_list.pkl"
validation_docs_list_file = exports_location + "validation_docs_list.pkl"
test_docs_list_file = exports_location + "test_docs_list.pkl"

preprocessed_location = root_location + "preprocessed_data/"

In [4]:
class OneHotEncoder():
    
    def __init__(self, classifications):
        self.classifications = classifications
        self.one_hot_indices = {}

        # convert character classifications to bit vectors
        for i, clssf in enumerate(classifications):
            bits = [0] * len(classifications)
            bits[i] = 1
            self.one_hot_indices[clssf] = i
    
    def get_label_vector(self, labels):
        """
        classes: array of string with the classes assigned to the instance
        """
        output_vector = [0] * len(self.classifications)
        for label in labels:
            index = self.one_hot_indices[label]
            output_vector[index] = 1
            
        return output_vector

def get_label_data(classifications, doc_ids, doc_classification_map):
    one_hot_encoder = OneHotEncoder(classifications)
    data_labels = []
    for i, doc_id in enumerate(doc_ids):
        eligible_classifications = [clssf for clssf in doc_classification_map[doc_id] if clssf in classifications]
        data_labels.append(one_hot_encoder.get_label_vector(eligible_classifications))
        #if i % 1000 == 0: info(i)
    data_labels = np.array(data_labels)
    return data_labels

In [5]:
%%time
doc_classification_map = pickle.load(open(doc_classifications_map_file))
sections = pickle.load(open(sections_file))
classes = pickle.load(open(classes_file))
subclasses = pickle.load(open(subclasses_file))
valid_classes = pickle.load(open(valid_classes_file))
valid_subclasses = pickle.load(open(valid_subclasses_file))
training_docs_list = pickle.load(open(training_docs_list_file))
validation_docs_list = pickle.load(open(validation_docs_list_file))
test_docs_list = pickle.load(open(test_docs_list_file))
classifications_index = pickle.load(open(classification_index_file))

CPU times: user 44.5 s, sys: 3.75 s, total: 48.2 s
Wall time: 48.9 s


In [6]:
training_docs_set = set(training_docs_list)
validation_docs_set = set(validation_docs_list)
test_docs_set = set(test_docs_list)

In [37]:
SAMPLE_RATIO = 0.155
MIN_TRAINING_NUMBER_PER_SUBCLASS = 10
MIN_VALIDATION_NUMBER_PER_SUBCLASS = 5
MIN_TEST_NUMBER_PER_SUBCLASS = 5

In [38]:
SAMPLE_SEED = 1234

In [39]:
random.seed(SAMPLE_SEED)

In [40]:
# %%time

sample_training_docs_set = set()
sample_validation_docs_set = set()
sample_test_docs_set = set()

for subclass in subclasses:
    subclass_docs = set(classifications_index[subclass])
    training_subclass_docs = training_docs_set & subclass_docs
    validation_subclass_docs = validation_docs_set & subclass_docs
    test_subclass_docs = test_docs_set & subclass_docs
    
    # info("{} => Training: {}, Validation: {}".format(subclass, len(training_subclass_docs), len(validation_subclass_docs)))
    if len(training_subclass_docs) > MIN_TRAINING_NUMBER_PER_SUBCLASS:
        num_of_samples = max(MIN_TRAINING_NUMBER_PER_SUBCLASS, int(len(training_subclass_docs) * SAMPLE_RATIO))
        train_subclass_sample = random.sample(training_subclass_docs, num_of_samples)
        sample_training_docs_set.update(train_subclass_sample)
    else:
        sample_training_docs_set.update(training_subclass_docs)
    
    if len(validation_subclass_docs) > MIN_VALIDATION_NUMBER_PER_SUBCLASS:
        num_of_samples = max(MIN_VALIDATION_NUMBER_PER_SUBCLASS, int(len(validation_subclass_docs) * SAMPLE_RATIO))
        val_subclass_sample = random.sample(validation_docs_set, num_of_samples)
        sample_validation_docs_set.update(val_subclass_sample)
    else:
        sample_validation_docs_set.update(validation_subclass_docs)
    
    if len(test_subclass_docs) > MIN_VALIDATION_NUMBER_PER_SUBCLASS:
        num_of_samples = max(MIN_TEST_NUMBER_PER_SUBCLASS, int(len(test_subclass_docs) * SAMPLE_RATIO))
        test_subclass_sample = random.sample(test_subclass_docs, num_of_samples)
        sample_test_docs_set.update(test_subclass_sample)
    else:
        sample_test_docs_set.update(test_subclass_docs)

In [41]:
len(sample_training_docs_set)

254767

In [42]:
len(sample_validation_docs_set)

60957

In [43]:
len(sample_test_docs_set)

79785

In [44]:
sample_training_docs_list = sorted(sample_training_docs_set)
sample_validation_docs_list = sorted(sample_validation_docs_set)
sample_test_docs_list = sorted(sample_test_docs_set)

In [45]:
sample_training_docs_list[:10]

[u'06981294',
 u'06981298',
 u'06981301',
 u'06981303',
 u'06981307',
 u'06981309',
 u'06981312',
 u'06981314',
 u'06981317',
 u'06981321']

In [47]:
pickle.dump(sample_training_docs_list, open(exports_location + "extended_pv_training_docs_list_0.15.pkl", "w"))
pickle.dump(sample_validation_docs_list, open(exports_location + "extended_pv_validation_docs_list_0.15.pkl", "w"))
pickle.dump(sample_test_docs_list, open(exports_location + "extended_pv_test_docs_list_0.15.pkl", "w"))

## Make sure that the valid classes and subclasses are the same

In [18]:
all_sample_docs_set = sample_training_docs_set | sample_validation_docs_set | sample_test_docs_set

In [19]:
INVALID_CLASSIFICATION_LIMIT = 3
invalid_classes = set()
invalid_subclasses = set()
for clsf in classifications_index.keys():
    clsf_docs = set(classifications_index[clsf]) & all_sample_docs_set
    if len(clsf_docs) < INVALID_CLASSIFICATION_LIMIT:
        if clsf in classes:
            invalid_classes.add(clsf)
        if clsf in subclasses:
            invalid_subclasses.add(clsf)
valid_classes = sorted(list(set(classes) - invalid_classes))
valid_subclasses = sorted(list(set(subclasses) - invalid_subclasses))

In [20]:
len(valid_classes)

244

In [22]:
len(valid_subclasses)

940