# Create Valid Classes and Subclasses list
based on the number of documents in them

In [1]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import os
import time
from collections import namedtuple
import cPickle as pickle

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import random

from multiprocessing.dummy import Pool as ThreadPool
import itertools


import logging
from logging import info
from functools import partial

from thesis.utils.metrics import *

In [2]:
#training_file = "/home/local/shalaby/docs_output_sample_100.json"

root_location = "/big/s/shalaby/"
exports_location = root_location + "exported_data/"

doc2vec_model_save_location = os.path.join(root_location, "parameter_search_doc2vec_models_new", "full")

training_file = root_location + "docs_output.json"

doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
classification_index_file = exports_location + "classification_index.pkl"
sections_file = exports_location + "sections.pkl"
classes_file = exports_location + "classes.pkl"
subclasses_file = exports_location + "subclasses.pkl"
valid_classes_file = exports_location + "valid_classes.pkl"
valid_subclasses_file = exports_location + "valid_subclasses.pkl"
classifications_output = exports_location + "classifications.pkl"
training_docs_list_file = exports_location + "training_docs_list.pkl"
validation_docs_list_file = exports_location + "validation_docs_list.pkl"
test_docs_list_file = exports_location + "test_docs_list.pkl"

preprocessed_location = root_location + "preprocessed_data/"

In [3]:
class OneHotEncoder():
    
    def __init__(self, classifications):
        self.classifications = classifications
        self.one_hot_indices = {}

        # convert character classifications to bit vectors
        for i, clssf in enumerate(classifications):
            bits = [0] * len(classifications)
            bits[i] = 1
            self.one_hot_indices[clssf] = i
    
    def get_label_vector(self, labels):
        """
        classes: array of string with the classes assigned to the instance
        """
        output_vector = [0] * len(self.classifications)
        for label in labels:
            index = self.one_hot_indices[label]
            output_vector[index] = 1
            
        return output_vector

def get_label_data(classifications, doc_ids, doc_classification_map):
    one_hot_encoder = OneHotEncoder(classifications)
    data_labels = []
    for i, doc_id in enumerate(doc_ids):
        eligible_classifications = [clssf for clssf in doc_classification_map[doc_id] if clssf in classifications]
        data_labels.append(one_hot_encoder.get_label_vector(eligible_classifications))
        #if i % 1000 == 0: info(i)
    data_labels = np.array(data_labels)
    return data_labels

In [4]:
%%time
doc_classification_map = pickle.load(open(doc_classifications_map_file))
sections = pickle.load(open(sections_file))
classes = pickle.load(open(classes_file))
subclasses = pickle.load(open(subclasses_file))
training_docs_list = pickle.load(open(training_docs_list_file))
validation_docs_list = pickle.load(open(validation_docs_list_file))
test_docs_list = pickle.load(open(test_docs_list_file))
classifications_index = pickle.load(open(classification_index_file))
#test_docs_list = pickle.load(open(test_docs_list_file))

CPU times: user 45.5 s, sys: 6.82 s, total: 52.3 s
Wall time: 52.4 s


In [6]:
INVALID_CLASSIFICATION_LIMIT = 3
invalid_classes = set()
invalid_subclasses = set()
for clsf in classifications_index.keys():
    if len(classifications_index[clsf]) < INVALID_CLASSIFICATION_LIMIT:
        if clsf in classes:
            invalid_classes.add(clsf)
        if clsf in subclasses:
            invalid_subclasses.add(clsf)
valid_classes = sorted(list(set(classes) - invalid_classes))
valid_subclasses = sorted(list(set(subclasses) - invalid_subclasses))

In [7]:
print len(valid_classes)

244


In [8]:
print len(valid_subclasses)

940


In [11]:
pickle.dump(valid_classes, open(valid_classes_file, "w"))
pickle.dump(valid_subclasses, open(valid_subclasses_file, "w"))

### Making sure no training instance has no classes assigned to it

In [16]:
classifications = valid_classes

data_type = "sublinear_tf"
data_training_location = exports_location + "{}_training_sparse_data.pkl".format(data_type)
data_training_docids_location = exports_location + "{}_training_sparse_docids.pkl".format(data_type)
data_validation_location = exports_location + "{}_validation_sparse_data.pkl".format(data_type)
data_validation_docids_location = exports_location + "{}_validation_sparse_docids.pkl".format(data_type)

# Get the training data
training_data_docids = pickle.load(open(data_training_docids_location, "r"))
info('Getting y')
%time y = get_label_data(classifications, training_data_docids, doc_classification_map)


CPU times: user 1min 27s, sys: 3.41 s, total: 1min 31s
Wall time: 1min 31s


In [23]:
training_documents_set = set(training_docs_list)
validation_documents_set = set(validation_docs_list)
test_documents_set = set(test_docs_list)

In [17]:
y.shape

(1286325, 244)

In [26]:
y.shape[0] - len(y.sum(axis=1).nonzero()[0])

67

In [27]:
for val_class in valid_classes:
    ll = len(classifications_index[val_class])
    train_num = len(set(classifications_index[val_class]) & training_documents_set)
    print "{} -> {}, of them training: {}".format(val_class, ll, train_num)

A-00 -> 5, of them training: 4
A-01 -> 47218, of them training: 30214
A-02 -> 5, of them training: 1
A-03 -> 5, of them training: 3
A-04 -> 18, of them training: 9
A-05 -> 9, of them training: 6
A-06 -> 28, of them training: 15
A-07 -> 20, of them training: 14
A-10 -> 21, of them training: 15
A-11 -> 10, of them training: 8
A-12 -> 8, of them training: 6
A-16 -> 67, of them training: 45
A-21 -> 1222, of them training: 808
A-22 -> 1034, of them training: 657
A-23 -> 6144, of them training: 3954
A-24 -> 1075, of them training: 713
A-26 -> 4, of them training: 0
A-27 -> 6, of them training: 2
A-31 -> 35, of them training: 17
A-36 -> 4, of them training: 3
A-37 -> 4, of them training: 3
A-41 -> 3714, of them training: 2449
A-42 -> 898, of them training: 575
A-43 -> 2680, of them training: 1727
A-44 -> 2316, of them training: 1485
A-45 -> 4341, of them training: 2803
A-46 -> 1762, of them training: 1133
A-47 -> 26233, of them training: 16778
A-48 -> 3, of them training: 3
A-51 -> 15, of the

In [34]:
subclasses_set = set(subclasses)
classes_set = set(classes)

In [113]:
TRAIN_MIN = 0.3
VAL_MIN = 0.15
TEST_MIN = 0.15

In [119]:
def resolve_problematic(subclass):
    pass
def is_train_problematic(subclass, add=0):
    subclass_num = len(classifications_index[subclass])
    train_docs_num = len(set(classifications_index[subclass]) & training_documents_set)
    return (float(train_docs_num) - add) / subclass_num < TRAIN_MIN
def is_val_problematic(subclass, add=0):
    subclass_num = len(classifications_index[subclass])
    val_docs_num = len(set(classifications_index[subclass]) & validation_documents_set)
    return (float(val_docs_num) - add) / subclass_num < VAL_MIN
def is_test_problematic(subclass, add=0):
    subclass_num = len(classifications_index[subclass])
    test_docs_num = len(set(classifications_index[subclass]) & test_documents_set)
    return (float(test_docs_num) - add) / subclass_num < TEST_MIN
def is_problematic(subclass):
    if is_train_problematic(subclass) or is_val_problematic(subclass) or is_test_problematic(subclass):
        return True
    return False

## Showing the problematic classes and subclasses

#### Resetting the training, validation and test sets

In [207]:
training_documents_set = set(training_docs_list)
validation_documents_set = set(validation_docs_list)
test_documents_set = set(test_docs_list)

#### Classes

In [208]:
num_problematic = 0
for val_class in valid_classes:
    if is_problematic(val_class):
        
        num_problematic += 1
        
        ll = len(classifications_index[val_class])
        train_docs = set(classifications_index[val_class]) & training_documents_set
        val_docs = set(classifications_index[val_class]) & validation_documents_set
        test_docs = set(classifications_index[val_class]) & test_documents_set
        train_docs_with_one_subclass = len([tr for tr in train_docs if len(set(doc_classification_map[tr]) & classes_set) == 1 ])
        val_docs_with_one_subclass = len([tr for tr in val_docs if len(set(doc_classification_map[tr]) & classes_set) == 1 ])
        test_docs_with_one_subclass = len([tr for tr in test_docs if len(set(doc_classification_map[tr]) & classes_set) == 1 ])
        print ("{} -> {:5} | Train: {:5}, one sub: {:5} | " + \
              "Validation: {:4}, one sub: {:4} | " + \
              "Test: {:4}, one sub: {:4}").format(val_class, ll, len(train_docs), train_docs_with_one_subclass, 
                                                      len(val_docs), val_docs_with_one_subclass,
                                                      len(test_docs), test_docs_with_one_subclass)
        
print "Problematic: {}, All: {}".format(num_problematic, len(valid_classes))

A-00 ->     5 | Train:     4, one sub:     1 | Validation:    0, one sub:    0 | Test:    1, one sub:    1
A-02 ->     5 | Train:     1, one sub:     1 | Validation:    2, one sub:    2 | Test:    2, one sub:    0
A-03 ->     5 | Train:     3, one sub:     3 | Validation:    0, one sub:    0 | Test:    2, one sub:    0
A-04 ->    18 | Train:     9, one sub:     6 | Validation:    2, one sub:    2 | Test:    7, one sub:    4
A-05 ->     9 | Train:     6, one sub:     3 | Validation:    1, one sub:    1 | Test:    2, one sub:    0
A-06 ->    28 | Train:    15, one sub:     8 | Validation:    9, one sub:    1 | Test:    4, one sub:    2
A-07 ->    20 | Train:    14, one sub:     3 | Validation:    1, one sub:    0 | Test:    5, one sub:    0
A-10 ->    21 | Train:    15, one sub:     8 | Validation:    1, one sub:    0 | Test:    5, one sub:    3
A-11 ->    10 | Train:     8, one sub:     2 | Validation:    1, one sub:    1 | Test:    1, one sub:    0
A-12 ->     8 | Train:     6, one sub

#### Subclasses

In [209]:
num_problematic = 0
for val_subclass in valid_subclasses:
    if is_problematic(val_subclass):
        
        num_problematic += 1
        
        ll = len(classifications_index[val_subclass])
        train_docs = set(classifications_index[val_subclass]) & training_documents_set
        val_docs = set(classifications_index[val_subclass]) & validation_documents_set
        test_docs = set(classifications_index[val_subclass]) & test_documents_set
        train_docs_with_one_subclass = len([tr for tr in train_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        val_docs_with_one_subclass = len([tr for tr in val_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        test_docs_with_one_subclass = len([tr for tr in test_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        print ("{} -> {:5} | Train: {:5}, one sub: {:5} | " + \
              "Validation: {:4}, one sub: {:4} | " + \
              "Test: {:4}, one sub: {:4}").format(val_subclass, ll, len(train_docs), train_docs_with_one_subclass, 
                                                      len(val_docs), val_docs_with_one_subclass,
                                                      len(test_docs), test_docs_with_one_subclass)
        
print "Problematic: {}, All: {}".format(num_problematic, len(valid_subclasses))

A-00-N ->     3 | Train:     3, one sub:     0 | Validation:    0, one sub:    0 | Test:    0, one sub:    0
A-03-K ->     4 | Train:     3, one sub:     3 | Validation:    0, one sub:    0 | Test:    1, one sub:    0
A-04-B ->     6 | Train:     1, one sub:     0 | Validation:    1, one sub:    1 | Test:    4, one sub:    3
A-04-D ->     3 | Train:     2, one sub:     1 | Validation:    0, one sub:    0 | Test:    1, one sub:    0
A-06-F ->     9 | Train:     5, one sub:     3 | Validation:    3, one sub:    1 | Test:    1, one sub:    0
A-06-N ->     3 | Train:     3, one sub:     1 | Validation:    0, one sub:    0 | Test:    0, one sub:    0
A-07-K ->    12 | Train:     9, one sub:     1 | Validation:    0, one sub:    0 | Test:    3, one sub:    0
A-10-G ->     5 | Train:     3, one sub:     2 | Validation:    0, one sub:    0 | Test:    2, one sub:    1
A-10-H ->     5 | Train:     4, one sub:     2 | Validation:    0, one sub:    0 | Test:    1, one sub:    1
A-10-K ->     3 | T

### Fixing with documents that only have one subclass, so we dont mess up any other subclasses

In [187]:
num_problematic = 0
num_fixed = 0
num_val_fixed = 0
num_test_fixed = 0
total_moved = 0
do_test_moving = True
do_val_moving = True
for val_subclass in valid_subclasses:
    if is_problematic(val_subclass):
        is_fixed = False
        num_problematic += 1
        
        ll = len(classifications_index[val_subclass])
        train_docs = set(classifications_index[val_subclass]) & training_documents_set
        val_docs = set(classifications_index[val_subclass]) & validation_documents_set
        test_docs = set(classifications_index[val_subclass]) & test_documents_set
        train_docs_with_one_subclass_list = [tr for tr in train_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ]
        val_docs_with_one_subclass_list = [tr for tr in val_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ]
        test_docs_with_one_subclass_list = [tr for tr in test_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ]
        print ("{} -> {:5} | Train: {:5}, one sub: {:5} | " + \
              "Validation: {:4}, one sub: {:4} | " + \
              "Test: {:4}, one sub: {:4}").format(val_subclass, ll, len(train_docs), len(train_docs_with_one_subclass_list), 
                                                      len(val_docs), len(val_docs_with_one_subclass_list),
                                                      len(test_docs), len(test_docs_with_one_subclass_list))

        for train_doc in train_docs_with_one_subclass_list:
            # make sure that the training set doesnt become problematic due to our actions
            if not is_train_problematic(val_subclass, add=1):
                if is_val_problematic(val_subclass):
                    training_documents_set.remove(train_doc)
                    validation_documents_set.add(train_doc)
                    total_moved += 1
                    continue
                if is_test_problematic(val_subclass):
                    training_documents_set.remove(train_doc)
                    test_documents_set.add(train_doc)
                    total_moved += 1
                    continue
        if not is_problematic(val_subclass):
            num_fixed += 1
            is_fixed = True
            
        
        ll = len(classifications_index[val_subclass])
        train_docs = set(classifications_index[val_subclass]) & training_documents_set
        val_docs = set(classifications_index[val_subclass]) & validation_documents_set
        test_docs = set(classifications_index[val_subclass]) & test_documents_set
        train_docs_with_one_subclass = len([tr for tr in train_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        val_docs_with_one_subclass = len([tr for tr in val_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        test_docs_with_one_subclass = len([tr for tr in test_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        print ("{} -> {:5} | Train: {:5}, one sub: {:5} | " + \
              "Validation: {:4}, one sub: {:4} | " + \
              "Test: {:4}, one sub: {:4}").format(val_subclass, ll, len(train_docs), train_docs_with_one_subclass, 
                                                      len(val_docs), val_docs_with_one_subclass,
                                                      len(test_docs), test_docs_with_one_subclass)
        
            
        if do_test_moving and is_problematic(val_subclass):
            for test_doc in test_docs_with_one_subclass_list:
                # make sure that the training set doesnt become problematic due to our actions
                if not is_test_problematic(val_subclass, add=1):
                    if is_val_problematic(val_subclass):
                        test_documents_set.remove(test_doc)
                        validation_documents_set.add(test_doc)
                        total_moved += 1
                        continue
                    if is_train_problematic(val_subclass):
                        test_documents_set.remove(test_doc)
                        training_documents_set.add(test_doc)
                        total_moved += 1
                        continue
            if not is_problematic(val_subclass):
                num_fixed += 1
                num_test_fixed += 1
                is_fixed = True
        
        
        if do_val_moving and is_problematic(val_subclass):
            for val_doc in val_docs_with_one_subclass_list:
                # make sure that the training set doesnt become problematic due to our actions
                if not is_val_problematic(val_subclass, add=1):
                    if is_test_problematic(val_subclass):
                        validation_documents_set.remove(val_doc)
                        test_documents_set.add(val_doc)
                        total_moved += 1
                        continue
                    if is_train_problematic(val_subclass):
                        validation_documents_set.remove(val_doc)
                        training_documents_set.add(val_doc)
                        total_moved += 1
                        continue
            if not is_problematic(val_subclass):
                num_fixed += 1
                num_val_fixed += 1
                is_fixed = True
        
        ll = len(classifications_index[val_subclass])
        train_docs = set(classifications_index[val_subclass]) & training_documents_set
        val_docs = set(classifications_index[val_subclass]) & validation_documents_set
        test_docs = set(classifications_index[val_subclass]) & test_documents_set
        train_docs_with_one_subclass = len([tr for tr in train_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        val_docs_with_one_subclass = len([tr for tr in val_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        test_docs_with_one_subclass = len([tr for tr in test_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        print ("{} -> {:5} | Train: {:5}, one sub: {:5} | " + \
              "Validation: {:4}, one sub: {:4} | " + \
              "Test: {:4}, one sub: {:4}").format(val_subclass, ll, len(train_docs), train_docs_with_one_subclass, 
                                                      len(val_docs), val_docs_with_one_subclass,
                                                      len(test_docs), test_docs_with_one_subclass)
        
        print "================= Fixed: {} ==============".format(is_fixed)
print "Problematic: {}, All: {}, Fixed: {}, Test Fixed: {}, Val Fixed: {}, Total Moved: {}".format(num_problematic, len(valid_subclasses), num_fixed, num_test_fixed, num_val_fixed, total_moved)

A-00-N ->     3 | Train:     3, one sub:     0 | Validation:    0, one sub:    0 | Test:    0, one sub:    0
A-00-N ->     3 | Train:     3, one sub:     0 | Validation:    0, one sub:    0 | Test:    0, one sub:    0
A-00-N ->     3 | Train:     3, one sub:     0 | Validation:    0, one sub:    0 | Test:    0, one sub:    0
A-03-K ->     4 | Train:     3, one sub:     3 | Validation:    0, one sub:    0 | Test:    1, one sub:    0
A-03-K ->     4 | Train:     2, one sub:     2 | Validation:    1, one sub:    1 | Test:    1, one sub:    0
A-03-K ->     4 | Train:     2, one sub:     2 | Validation:    1, one sub:    1 | Test:    1, one sub:    0
A-04-B ->     6 | Train:     1, one sub:     0 | Validation:    1, one sub:    1 | Test:    4, one sub:    3
A-04-B ->     6 | Train:     1, one sub:     0 | Validation:    1, one sub:    1 | Test:    4, one sub:    3
A-04-B ->     6 | Train:     2, one sub:     1 | Validation:    1, one sub:    1 | Test:    3, one sub:    2
A-04-D ->     3 | T

### Still Problematic after fixing

In [188]:
num_problematic = 0
for val_subclass in valid_subclasses:
    if is_problematic(val_subclass):
        
        num_problematic += 1
        
        ll = len(classifications_index[val_subclass])
        train_docs = set(classifications_index[val_subclass]) & training_documents_set
        val_docs = set(classifications_index[val_subclass]) & validation_documents_set
        test_docs = set(classifications_index[val_subclass]) & test_documents_set
        train_docs_with_one_subclass = len([tr for tr in train_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        val_docs_with_one_subclass = len([tr for tr in val_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        test_docs_with_one_subclass = len([tr for tr in test_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        print ("{} -> {:5} | Train: {:5}, one sub: {:5} | " + \
              "Validation: {:4}, one sub: {:4} | " + \
              "Test: {:4}, one sub: {:4}").format(val_subclass, ll, len(train_docs), train_docs_with_one_subclass, 
                                                      len(val_docs), val_docs_with_one_subclass,
                                                      len(test_docs), test_docs_with_one_subclass)
        other_subclasses = set()
        for tr in train_docs:
            for other_subclass in set(doc_classification_map[tr]) & subclasses_set:
                other_subclasses.add(other_subclass)
            
        # print [ "{}({}, {})".format(other_subclass, len(classifications_index[other_subclass]), 'T' if is_problematic(other_subclass) else 'F') for other_subclass in other_subclasses] 
        
        num_non_problematic = 0
        for other_subclass in other_subclasses:
            print other_subclass, len(classifications_index[other_subclass]), is_problematic(other_subclass)
            if not is_test_problematic(other_subclass):
                num_non_problematic += 1
        print "Non Problematic Others: {}".format(num_non_problematic)
        print "*********"
        
print "Problematic: {}, All: {}".format(num_problematic, len(valid_subclasses))

A-00-N ->     3 | Train:     3, one sub:     0 | Validation:    0, one sub:    0 | Test:    0, one sub:    0
A-01-N 15825 False
C-07-H 16608 False
A-61-K 77887 False
A-00-N 3 True
B-01-J 11247 False
Non Problematic Others: 4
*********
A-06-N ->     3 | Train:     2, one sub:     0 | Validation:    1, one sub:    1 | Test:    0, one sub:    0
A-01-N 15825 False
A-63-B 12542 False
A-06-N 3 True
Non Problematic Others: 2
*********
A-07-K ->    12 | Train:     8, one sub:     0 | Validation:    1, one sub:    1 | Test:    3, one sub:    0
A-01-N 15825 False
A-61-K 77887 False
C-12-N 29839 False
A-07-K 12 True
Non Problematic Others: 4
*********
A-11-N ->     3 | Train:     2, one sub:     0 | Validation:    0, one sub:    0 | Test:    1, one sub:    0
A-01-N 15825 False
C-07-H 16608 False
A-61-K 77887 False
A-11-N 3 True
Non Problematic Others: 4
*********
A-12-Q ->     3 | Train:     2, one sub:     0 | Validation:    1, one sub:    1 | Test:    0, one sub:    0
C-07-H 16608 False
A-61-K 

### Using docs that have other subclasses that are not problematic

In [189]:
num_problematic = 0
num_fixed = 0
num_val_fixed = 0
num_test_fixed = 0
total_moved = 0
do_test_moving = True
do_val_moving = True
for val_subclass in valid_subclasses:
    if is_problematic(val_subclass):
        is_fixed = False
        num_problematic += 1
        
        ll = len(classifications_index[val_subclass])
        train_docs = set(classifications_index[val_subclass]) & training_documents_set
        val_docs = set(classifications_index[val_subclass]) & validation_documents_set
        test_docs = set(classifications_index[val_subclass]) & test_documents_set
        train_docs_with_one_subclass_list = [tr for tr in train_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ]
        val_docs_with_one_subclass_list = [tr for tr in val_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ]
        test_docs_with_one_subclass_list = [tr for tr in test_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ]
        print ("{} -> {:5} | Train: {:5}, one sub: {:5} | " + \
              "Validation: {:4}, one sub: {:4} | " + \
              "Test: {:4}, one sub: {:4}").format(val_subclass, ll, len(train_docs), len(train_docs_with_one_subclass_list), 
                                                      len(val_docs), len(val_docs_with_one_subclass_list),
                                                      len(test_docs), len(test_docs_with_one_subclass_list))

        other_train_subclasses = set()
        for tr in train_docs:
            for other_subclass in set(doc_classification_map[tr]) & subclasses_set:
                if other_subclass != val_subclass and not is_problematic(other_subclass):
                    other_train_subclasses.add(other_subclass)
        if len(other_train_subclasses) > 0:
            print "Other Valid Train Subclasses: {}".format(len(other_train_subclasses))
        else:
            print "**** No Other Train Subclass"
        
        if len(other_train_subclasses) > 0:
            # here we are using all the train docs, but we filter to only use the largest other subclass
            for train_doc in train_docs:
                would_be_problematic = False
                for oth_sub in other_train_subclasses:
                    if oth_sub in doc_classification_map[train_doc]:
                        if is_train_problematic(oth_sub, add=1):
                            would_be_problematic = True
                            print "**** Would be Problematic for {}->{}".format(oth_sub, len(classifications_index[oth_sub]))
                # we make sure that moving this doc would not be problematic for its other subclasses
                if not would_be_problematic:
                    # make sure that the training set doesnt become problematic due to our actions
                    if not is_train_problematic(val_subclass, add=1):
                        if is_val_problematic(val_subclass):
                            training_documents_set.remove(train_doc)
                            validation_documents_set.add(train_doc)
                            total_moved += 1
                            continue
                        if is_test_problematic(val_subclass):
                            training_documents_set.remove(train_doc)
                            test_documents_set.add(train_doc)
                            total_moved += 1
                            continue
                            
            if not is_problematic(val_subclass):
                num_fixed += 1
                is_fixed = True
        
                   
        if do_val_moving and is_problematic(val_subclass): 
            other_val_subclasses = set()
            for vd in val_docs:
                for other_subclass in set(doc_classification_map[vd]) & subclasses_set:
                    if other_subclass != val_subclass and not is_problematic(other_subclass):
                        other_val_subclasses.add(other_subclass)
            if len(other_val_subclasses) > 0:
                print "Other Valid Valdidation Subclasses: {}".format(len(other_val_subclasses))
            else:
                print "**** No Other Validation Subclass"

            if len(other_val_subclasses) > 0:
                # here we are using all the validation docs, but we filter to only use the largest other subclass
                for val_doc in val_docs:
                    would_be_problematic = False
                    for oth_sub in other_val_subclasses:
                        if oth_sub in doc_classification_map[val_doc]:
                            if is_val_problematic(oth_sub, add=1):
                                would_be_problematic = True
                                print "**** Would be Problematic for {}->{}".format(oth_sub, len(classifications_index[oth_sub]))
                    # we make sure that moving this doc would not be problematic for its other subclasses
                    if not would_be_problematic:
                        # make sure that the training set doesnt become problematic due to our actions
                        if not is_val_problematic(val_subclass, add=1):
                            if is_train_problematic(val_subclass):
                                validation_documents_set.remove(val_doc)
                                training_documents_set.add(val_doc)
                                total_moved += 1
                                continue
                            if is_test_problematic(val_subclass):
                                validation_documents_set.remove(val_doc)
                                test_documents_set.add(val_doc)
                                total_moved += 1
                                continue

                if not is_problematic(val_subclass):
                    num_fixed += 1
                    num_val_fixed += 1
                    is_fixed = True

        
                       
        if do_test_moving and is_problematic(val_subclass): 
            other_test_subclasses = set()
            for test_doc in test_docs:
                for other_subclass in set(doc_classification_map[test_doc]) & subclasses_set:
                    if other_subclass != val_subclass and not is_problematic(other_subclass):
                        other_test_subclasses.add(other_subclass)
            if len(other_test_subclasses) > 0:
                print "Other Valid Test Subclasses: {}".format(len(other_test_subclasses))
            else:
                print "**** No Other Test Subclass"

            if len(other_test_subclasses) > 0:
                # here we are using all the train docs, but we filter to only use the largest other subclass
                for test_doc in test_docs:
                    would_be_problematic = False
                    for oth_sub in other_test_subclasses:
                        if oth_sub in doc_classification_map[test_doc]:
                            if is_test_problematic(oth_sub, add=1):
                                would_be_problematic = True
                                print "**** Would be Problematic for {}->{}".format(oth_sub, len(classifications_index[oth_sub]))
                    # we make sure that moving this doc would not be problematic for its other subclasses
                    if not would_be_problematic:
                        # make sure that the training set doesnt become problematic due to our actions
                        if not is_test_problematic(val_subclass, add=1):
                            if is_train_problematic(val_subclass):
                                test_documents_set.remove(test_doc)
                                training_documents_set.add(test_doc)
                                total_moved += 1
                                continue
                            if is_val_problematic(val_subclass):
                                test_documents_set.remove(test_doc)
                                validation_documents_set.add(test_doc)
                                total_moved += 1
                                continue

                if not is_problematic(val_subclass):
                    num_fixed += 1
                    num_test_fixed += 1
                    is_fixed = True

        
        ll = len(classifications_index[val_subclass])
        train_docs = set(classifications_index[val_subclass]) & training_documents_set
        val_docs = set(classifications_index[val_subclass]) & validation_documents_set
        test_docs = set(classifications_index[val_subclass]) & test_documents_set
        train_docs_with_one_subclass = len([tr for tr in train_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        val_docs_with_one_subclass = len([tr for tr in val_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        test_docs_with_one_subclass = len([tr for tr in test_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        print ("{} -> {:5} | Train: {:5}, one sub: {:5} | " + \
              "Validation: {:4}, one sub: {:4} | " + \
              "Test: {:4}, one sub: {:4}").format(val_subclass, ll, len(train_docs), train_docs_with_one_subclass, 
                                                      len(val_docs), val_docs_with_one_subclass,
                                                      len(test_docs), test_docs_with_one_subclass)
        
            
#         if do_test_moving and is_problematic(val_subclass):
#             for test_doc in test_docs_with_one_subclass_list:
#                 # make sure that the training set doesnt become problematic due to our actions
#                 if not is_test_problematic(val_subclass, add=1):
#                     if is_val_problematic(val_subclass):
#                         test_documents_set.remove(test_doc)
#                         validation_documents_set.add(test_doc)
#                         total_moved += 1
#                         continue
#                     if is_train_problematic(val_subclass):
#                         test_documents_set.remove(test_doc)
#                         training_documents_set.add(test_doc)
#                         total_moved += 1
#                         continue
#             if not is_problematic(val_subclass):
#                 num_fixed += 1
#                 num_test_fixed += 1
#                 is_fixed = True
        
        
#         if do_val_moving and is_problematic(val_subclass):
#             for val_doc in val_docs_with_one_subclass_list:
#                 # make sure that the training set doesnt become problematic due to our actions
#                 if not is_val_problematic(val_subclass, add=1):
#                     if is_test_problematic(val_subclass):
#                         validation_documents_set.remove(val_doc)
#                         test_documents_set.add(val_doc)
#                         total_moved += 1
#                         continue
#                     if is_train_problematic(val_subclass):
#                         validation_documents_set.remove(val_doc)
#                         training_documents_set.add(val_doc)
#                         total_moved += 1
#                         continue
#             if not is_problematic(val_subclass):
#                 num_fixed += 1
#                 num_val_fixed += 1
#                 is_fixed = True
        
#         ll = len(classifications_index[val_subclass])
#         train_docs = set(classifications_index[val_subclass]) & training_documents_set
#         val_docs = set(classifications_index[val_subclass]) & validation_documents_set
#         test_docs = set(classifications_index[val_subclass]) & test_documents_set
#         train_docs_with_one_subclass = len([tr for tr in train_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
#         val_docs_with_one_subclass = len([tr for tr in val_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
#         test_docs_with_one_subclass = len([tr for tr in test_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
#         print ("{} -> {:5} | Train: {:5}, one sub: {:5} | " + \
#               "Validation: {:4}, one sub: {:4} | " + \
#               "Test: {:4}, one sub: {:4}").format(val_subclass, ll, len(train_docs), train_docs_with_one_subclass, 
#                                                       len(val_docs), val_docs_with_one_subclass,
#                                                       len(test_docs), test_docs_with_one_subclass)
        
        print "================= Fixed: {} ==============".format(is_fixed)
print "Problematic: {}, All: {}, Fixed: {}, Test Fixed: {}, Val Fixed: {}, Total Moved: {}".format(num_problematic, len(valid_subclasses), num_fixed, num_test_fixed, num_val_fixed, total_moved)

A-00-N ->     3 | Train:     3, one sub:     0 | Validation:    0, one sub:    0 | Test:    0, one sub:    0
Other Valid Train Subclasses: 4
A-00-N ->     3 | Train:     1, one sub:     0 | Validation:    1, one sub:    0 | Test:    1, one sub:    0
A-06-N ->     3 | Train:     2, one sub:     0 | Validation:    1, one sub:    1 | Test:    0, one sub:    0
Other Valid Train Subclasses: 2
A-06-N ->     3 | Train:     1, one sub:     0 | Validation:    1, one sub:    1 | Test:    1, one sub:    0
A-07-K ->    12 | Train:     8, one sub:     0 | Validation:    1, one sub:    1 | Test:    3, one sub:    0
Other Valid Train Subclasses: 3
A-07-K ->    12 | Train:     7, one sub:     0 | Validation:    2, one sub:    1 | Test:    3, one sub:    0
A-11-N ->     3 | Train:     2, one sub:     0 | Validation:    0, one sub:    0 | Test:    1, one sub:    0
Other Valid Train Subclasses: 3
A-11-N ->     3 | Train:     1, one sub:     0 | Validation:    1, one sub:    0 | Test:    1, one sub:    0


#### Train Docs

In [190]:
print "Train Old: {:,d}".format(len(training_docs_list))
print "Train New: {:,d}".format(len(training_documents_set))
print "Difference: {:,d}".format(len(training_docs_list) - len(training_documents_set))
balanced_training_documents_list = sorted(list(training_documents_set))

Train Old: 1,286,325
Train New: 1,285,558
Difference: 767


In [161]:
pickle.dump(balanced_training_documents_list, open(exports_location + "balanced_training_docs_list.pkl", "w"))

In [177]:
new_train_docs = sorted(list(set(training_docs_list) - training_documents_set))
len(new_train_docs)

783

In [178]:
pickle.dump(new_train_docs, open(exports_location + "balanced_additional_training_docs_list.pkl", "w"))

#### Validation Docs

In [173]:
print "Validation Old: {:,d}".format(len(validation_documents_set))
print "Validation New: {:,d}".format(len(validation_docs_list))
print "Difference: {:,d}".format(len(validation_docs_list) - len(validation_documents_set))
balanced_validation_documents_list = sorted(list(validation_documents_set))

Validation Old: 322,128
Validation New: 321,473
Difference: -655


In [163]:
pickle.dump(balanced_validation_documents_list, open(exports_location + "balanced_validation_docs_list.pkl", "w"))

In [179]:
new_validation_docs = sorted(list(set(validation_docs_list) - validation_documents_set))
len(new_validation_docs)

23

In [180]:
pickle.dump(new_validation_docs, open(exports_location + "balanced_additional_validation_docs_list.pkl", "w"))

#### Test Docs

In [174]:
print "Test Old: {:,d}".format(len(test_documents_set))
print "Test New: {:,d}".format(len(test_docs_list))
print "Difference: {:,d}".format(len(test_docs_list) - len(test_documents_set))
balanced_test_documents_list = sorted(list(test_documents_set))

Test Old: 401,989
Test New: 401,877
Difference: -112


In [165]:
pickle.dump(balanced_test_documents_list, open(exports_location + "balanced_test_docs_list.pkl", "w"))

In [181]:
new_test_docs = sorted(list(set(test_docs_list) - test_documents_set))
len(new_test_docs)

26

In [182]:
pickle.dump(new_test_docs, open(exports_location + "balanced_additional_test_docs_list.pkl", "w"))

### Still Problematic SubClasses

In [195]:
num_problematic = 0
for val_subclass in valid_subclasses:
    if is_problematic(val_subclass):
        
        num_problematic += 1
        
        ll = len(classifications_index[val_subclass])
        train_docs = set(classifications_index[val_subclass]) & training_documents_set
        val_docs = set(classifications_index[val_subclass]) & validation_documents_set
        test_docs = set(classifications_index[val_subclass]) & test_documents_set
        train_docs_with_one_subclass = len([tr for tr in train_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        val_docs_with_one_subclass = len([tr for tr in val_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        test_docs_with_one_subclass = len([tr for tr in test_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
        print ("{} -> {:5} | Train: {:5}, one sub: {:5} | " + \
              "Validation: {:4}, one sub: {:4} | " + \
              "Test: {:4}, one sub: {:4}").format(val_subclass, ll, len(train_docs), train_docs_with_one_subclass, 
                                                      len(val_docs), val_docs_with_one_subclass,
                                                      len(test_docs), test_docs_with_one_subclass)
        other_subclasses = set()
        for tr in train_docs:
            for other_subclass in set(doc_classification_map[tr]) & subclasses_set:
                other_subclasses.add(other_subclass)
            
        # print [ "{}({}, {})".format(other_subclass, len(classifications_index[other_subclass]), 'T' if is_problematic(other_subclass) else 'F') for other_subclass in other_subclasses] 
        
        num_non_problematic = 0
        for other_subclass in other_subclasses:
            print other_subclass, len(classifications_index[other_subclass]), is_problematic(other_subclass)
            if not is_problematic(other_subclass):
                num_non_problematic += 1
        print "Non Problematic Others: {}".format(num_non_problematic)
        print "*********"
        
print "Problematic: {}, All: {}".format(num_problematic, len(valid_subclasses))

B-38-B ->     3 | Train:     2, one sub:     0 | Validation:    1, one sub:    0 | Test:    0, one sub:    0
B-38-B 3 True
B-37-B 4 False
Non Problematic Others: 1
*********
Problematic: 1, All: 940


## Still Problematic classes

In [197]:
num_problematic = 0
for val_class in valid_classes:
    if is_problematic(val_class):
        
        num_problematic += 1
        
        ll = len(classifications_index[val_class])
        train_docs = set(classifications_index[val_class]) & training_documents_set
        val_docs = set(classifications_index[val_class]) & validation_documents_set
        test_docs = set(classifications_index[val_class]) & test_documents_set
        train_docs_with_one_subclass = len([tr for tr in train_docs if len(set(doc_classification_map[tr]) & classes_set) == 1 ])
        val_docs_with_one_subclass = len([tr for tr in val_docs if len(set(doc_classification_map[tr]) & classes_set) == 1 ])
        test_docs_with_one_subclass = len([tr for tr in test_docs if len(set(doc_classification_map[tr]) & classes_set) == 1 ])
        print ("{} -> {:5} | Train: {:5}, one sub: {:5} | " + \
              "Validation: {:4}, one sub: {:4} | " + \
              "Test: {:4}, one sub: {:4}").format(val_class, ll, len(train_docs), train_docs_with_one_subclass, 
                                                      len(val_docs), val_docs_with_one_subclass,
                                                      len(test_docs), test_docs_with_one_subclass)
        other_classes = set()
        for tr in train_docs:
            for other_class in set(doc_classification_map[tr]) & classes_set:
                other_classes.add(other_class)
            
        # print [ "{}({}, {})".format(other_subclass, len(classifications_index[other_subclass]), 'T' if is_problematic(other_subclass) else 'F') for other_subclass in other_subclasses] 
        
        num_non_problematic = 0
        for other_class in other_classes:
            #print other_subclass, len(classifications_index[other_class]), is_problematic(other_class)
            if not is_problematic(other_class):
                num_non_problematic += 1
        print "Non Problematic Others: {}".format(num_non_problematic)
        print "*********"
        
print "Problematic: {}, All: {}".format(num_problematic, len(valid_classes))

A-02 ->     5 | Train:     1, one sub:     1 | Validation:    2, one sub:    2 | Test:    2, one sub:    0
Non Problematic Others: 0
*********
A-05 ->     9 | Train:     6, one sub:     3 | Validation:    1, one sub:    1 | Test:    2, one sub:    0
Non Problematic Others: 5
*********
A-11 ->    10 | Train:     7, one sub:     2 | Validation:    2, one sub:    1 | Test:    1, one sub:    0
Non Problematic Others: 4
*********
A-12 ->     8 | Train:     5, one sub:     0 | Validation:    1, one sub:    1 | Test:    2, one sub:    0
Non Problematic Others: 5
*********
A-26 ->     4 | Train:     0, one sub:     0 | Validation:    2, one sub:    2 | Test:    2, one sub:    1
Non Problematic Others: 0
*********
A-36 ->     4 | Train:     1, one sub:     1 | Validation:    1, one sub:    1 | Test:    2, one sub:    1
Non Problematic Others: 0
*********
A-37 ->     4 | Train:     3, one sub:     2 | Validation:    1, one sub:    1 | Test:    0, one sub:    0
Non Problematic Others: 1
*********

## Initial Output for understanding

In [42]:
for val_subclass in valid_subclasses:
    ll = len(classifications_index[val_subclass])
    train_docs = set(classifications_index[val_subclass]) & training_documents_set
    val_docs = set(classifications_index[val_subclass]) & validation_documents_set
    test_docs = set(classifications_index[val_subclass]) & test_documents_set
    train_docs_with_one_subclass = len([tr for tr in train_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
    val_docs_with_one_subclass = len([tr for tr in val_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
    test_docs_with_one_subclass = len([tr for tr in test_docs if len(set(doc_classification_map[tr]) & subclasses_set) == 1 ])
    print ("{} -> {:5} | Train: {:5}, one sub: {:5} | " + \
          "Validation: {:4}, one sub: {:4} | " + \
          "Test: {:4}, one sub: {:4}").format(val_subclass, ll, len(train_docs), train_docs_with_one_subclass, 
                                                  len(val_docs), val_docs_with_one_subclass,
                                                  len(test_docs), test_docs_with_one_subclass)

A-00-N ->     3 | Train:     3, one sub:     0 | Validation:    0, one sub:    0 | Test:    0, one sub:    0
A-01-A ->     3 | Train:     1, one sub:     0 | Validation:    1, one sub:    0 | Test:    1, one sub:    1
A-01-B ->  1442 | Train:   912, one sub:   524 | Validation:  237, one sub:  143 | Test:  293, one sub:  172
A-01-C ->   970 | Train:   623, one sub:   328 | Validation:  154, one sub:   84 | Test:  193, one sub:  103
A-01-D ->  1973 | Train:  1262, one sub:   980 | Validation:  317, one sub:  248 | Test:  394, one sub:  296
A-01-F ->   431 | Train:   279, one sub:   150 | Validation:   67, one sub:   36 | Test:   85, one sub:   45
A-01-G ->  2005 | Train:  1293, one sub:   697 | Validation:  317, one sub:  160 | Test:  395, one sub:  214
A-01-H -> 18137 | Train: 11614, one sub:  6473 | Validation: 2902, one sub: 1616 | Test: 3621, one sub: 2018
A-01-J ->   521 | Train:   332, one sub:   207 | Validation:   85, one sub:   52 | Test:  104, one sub:   69
A-01-K ->  5847 | T