# Generation of training and validation matrices for classifiers

In [1]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import os
import time
from collections import namedtuple, defaultdict
import cPickle as pickle
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
import gzip

from multiprocessing.dummy import Pool as ThreadPool
import itertools

from sklearn.metrics import coverage_error
import sklearn.metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model
from sklearn.preprocessing import MultiLabelBinarizer

from gensim.models.doc2vec import Doc2Vec, LabeledSentence

import logging
from logging import info
from functools import partial

import seaborn

from sklearn.model_selection import ParameterSampler

from thesis.utils.metrics import *
from thesis.utils.classification import *
from thesis.utils.file import *

# Global variables used throughout the script

In [2]:
# Start from a handler-free root logger so that re-running this cell does not
# stack up duplicate handlers (and duplicate log lines).
root = logging.getLogger()
for handler in list(root.handlers):  # iterate over a copy: removeHandler mutates root.handlers
    root.removeHandler(handler)

# basicConfig installs a default StreamHandler on the root logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s'
)

In [3]:
# Parallelism setting; not referenced by any cell visible in this notebook
# (presumably meant for the imported ThreadPool -- TODO confirm or remove).
NUM_CORES = 16

In [4]:
# NOTE: GLOBAL_VARS is never instantiated. Later cells assign attributes directly on this
# class (e.g. GLOBAL_VARS.MODEL_NAME = ...), i.e. it is used as a mutable global namespace.
GLOBAL_VARS = namedtuple('GLOBAL_VARS', ['MODEL_NAME', 'DOC2VEC_MODEL_NAME', 'DOC2VEC_MODEL', 'DOC2VEC_RAW_MODEL_NAME'])

In [5]:
# File names of artifacts that live inside a doc2vec model directory.
# Of these, only MODEL_PREFIX, VALIDATION_DICT and TEST_DICT are used by the cells visible here.
VOCAB_MODEL = "vocab_model"
MODEL_PREFIX = "model"
VALIDATION_MATRIX = "validation_matrix.pkl"
VALIDATION_DICT = "validation_dict.pkl"
TEST_MATRIX = "test_matrix.pkl"
TEST_DICT = "test_dict.pkl"
METRICS = "metrics.pkl"
CLASSIFIER = "classifier.pkl"
TYPE_CLASSIFIER= "{}_classifier.pkl"

# File-name templates of the numpy matrices written by this notebook.
# The X* templates are formatted with LEVEL_TO_GENERATE, the y* templates with the
# classification type name (e.g. "sections").
TRAINING_DATA_MATRIX = "X_level_{}.npy"
TRAINING_LABELS_MATRIX = "y_{}.npy"
VALIDATION_DATA_MATRIX = "Xv_level_{}.npy"
VALIDATION_LABELS_MATRIX = "yv_{}.npy"
TEST_DATA_MATRIX = "Xt_level_{}.npy"
TEST_LABELS_MATRIX = "yt_{}.npy"

In [6]:
# Suffix appended to VALIDATION_DICT when the validation dict is read through gzip.
GZIP_EXTENSION = ".gz"

In [7]:
# Base directories. Both currently point at the same location; root_location is where the
# doc2vec model directories and exported pickles are read from, big_data_location is where
# the generated matrices are written.
root_location = "/mnt/virtual-machines/data/"
big_data_location = "/mnt/virtual-machines/data/"

# Directory holding the pickled inputs listed below.
exports_location = root_location + "exported_data/"

# Parent directory of the generated data/label matrices (no trailing slash: joined with os.path.join).
matrices_save_location = big_data_location + "extended_pv_matrices"

# Pickled inputs. All of them except classifications_output are loaded in the
# "Load general data" cell.
doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
sections_file = exports_location + "sections.pkl"
classes_file = exports_location + "classes.pkl"
subclasses_file = exports_location + "subclasses.pkl"
valid_classes_file = exports_location + "valid_classes.pkl"
valid_subclasses_file = exports_location + "valid_subclasses.pkl"
classifications_output = exports_location + "classifications.pkl"
training_docs_list_file = exports_location + "training_docs_list.pkl"
validation_docs_list_file = exports_location + "validation_docs_list.pkl"
test_docs_list_file = exports_location + "test_docs_list.pkl"

# Load general data required for classification

In [8]:
%%time
doc_classification_map = pickle.load(open(doc_classifications_map_file))
sections = pickle.load(open(sections_file))
classes = pickle.load(open(classes_file))
subclasses = pickle.load(open(subclasses_file))
valid_classes = pickle.load(open(valid_classes_file))
valid_subclasses = pickle.load(open(valid_subclasses_file))
training_docs_list = pickle.load(open(training_docs_list_file))
validation_docs_list = pickle.load(open(validation_docs_list_file))
test_docs_list = pickle.load(open(test_docs_list_file))

CPU times: user 18.2 s, sys: 1.32 s, total: 19.6 s
Wall time: 19.6 s


In [9]:
# Number of training documents (= number of rows of the training matrices)
len(training_docs_list)

1286325

In [10]:
# Number of validation documents (= number of rows of the validation matrices)
len(validation_docs_list)

321473

In [11]:
# Number of test documents (= number of rows of the test matrices)
len(test_docs_list)

401877

# Global variables for generation process

In [12]:
# Number of level-3 chunks ("part-1" .. "part-N") looked up per document for each part.
# These values become the sizes of the "3_*" entries of DOCUMENT_PART_SIZES.
NUM_ABSTRACT_CHUNKS = 3
NUM_DESC_CHUNKS = 23
NUM_CLAIMS_CHUNKS = 4

In [13]:
# Templates of the ids under which vectors are looked up (see get_part_ids):
#   level 1: "<doc id>"
#   level 2: "<doc id>_<part name>"
#   level 3: "<doc id>_<part name>_part-<chunk number>", chunk numbers start at 1
LEVEL_1_ID = "{}"
LEVEL_2_ID = "{}_{}"
LEVEL_3_ID = "{}_{}_part-{}"

# Template of the DOCUMENT_PART_SIZES keys: "<part level>_<part name>"
PART_LEVEL_NAME = "{}_{}"

In [41]:
# Order in which the (part level, part name) blocks are laid out along the sequence axis
# of the generated matrices. Entries whose level is above LEVEL_TO_GENERATE are skipped.
DOCUMENT_ORDER = [
    (1, "document"), 
    (2, "abstract"), (3, "abstract"), 
    (2, "description"), (3, "description"), 
    (2, "claims"), (3, "claims")
]
# Number of sequence slots each block occupies, keyed by PART_LEVEL_NAME ("<level>_<name>").
DOCUMENT_PART_SIZES = {
    
    "1_document": 1,
    "2_abstract": 1,
    "2_description": 1,
    "2_claims": 1,
    "3_abstract": NUM_ABSTRACT_CHUNKS,
    "3_description": NUM_DESC_CHUNKS,
    "3_claims": NUM_CLAIMS_CHUNKS
}

In [42]:
# Doc2vec hyper-parameters. In the visible cells of this notebook no model is trained; most of
# these values are only used to rebuild the names of the saved model directories, so they must
# match the values the models were trained with. DOC2VEC_SIZE is also the embedding size of
# the generated matrices.
DOC2VEC_SIZE = 200
DOC2VEC_WINDOW = 2
DOC2VEC_MAX_VOCAB_SIZE = None
DOC2VEC_SAMPLE = 1e-3
DOC2VEC_TYPE = 1 # 1 is rendered as 'dm' in the model name, anything else as 'pv-dbow'
DOC2VEC_HIERARCHICAL_SAMPLE = 0
DOC2VEC_NEGATIVE_SAMPLE_SIZE = 10
DOC2VEC_CONCAT = 0
DOC2VEC_MEAN = 1
DOC2VEC_TRAIN_WORDS = 0
DOC2VEC_EPOCHS = 1 # we do our training manually one epoch at a time
DOC2VEC_MAX_EPOCHS = 8
REPORT_DELAY = 20 # report the progress every x seconds
REPORT_VOCAB_PROGRESS = 100000 # report vocab progress every x documents

# Passed as `mmap` to Doc2Vec.load
DOC2VEC_MMAP = 'r'

# Which saved epoch ("epoch_<n>" sub-directory) to use. Not to be confused with DOC2VEC_EPOCHS above.
DOC2VEC_EPOCH = 1


# Name of the output directory (relative to matrices_save_location) the matrices are saved to:
# "<hyper-parameter string>/epoch_<DOC2VEC_EPOCH>"
raw_model_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_trainwords_{}_hs_{}_neg_{}_vocabsize_{}'.format(DOC2VEC_SIZE,
                        DOC2VEC_WINDOW,
                        'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                        DOC2VEC_CONCAT, DOC2VEC_MEAN,
                        DOC2VEC_TRAIN_WORDS,
                        DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                        str(DOC2VEC_MAX_VOCAB_SIZE)
                        )
raw_model_name = os.path.join(raw_model_name, "epoch_{}")
# Stored as a class attribute: GLOBAL_VARS is used as a global namespace, not instantiated
GLOBAL_VARS.DOC2VEC_RAW_MODEL_NAME = raw_model_name.format(DOC2VEC_EPOCH)

### Utilities for data matrix filling

In [43]:
def get_part_ids(doc_id, part_level, part_name):
    """
    Returns the list of ids to look for, for a given document id, part level and part name
    ex get_part_ids(x, 3, "abstract") => ["x_abstract_part-1", "x_abstract_part-2", "x_abstract_part-3"]

    - part_name "document" (any level) => the single level 1 id
    - part_level 2                     => the single level 2 id
    - part_level 3                     => one id per chunk; the number of chunks comes from
                                          DOCUMENT_PART_SIZES (KeyError for an unknown part name)

    Raises ValueError for any other part level, instead of silently returning None (which
    would only fail later, when the caller iterates over the result).
    """
    if part_name == "document":
        return [LEVEL_1_ID.format(doc_id)]
    if part_level == 2:
        return [LEVEL_2_ID.format(doc_id, part_name)]
    if part_level == 3:
        num_chunks = DOCUMENT_PART_SIZES[PART_LEVEL_NAME.format(part_level, part_name)]
        # chunk numbers in the ids are 1-based
        return [LEVEL_3_ID.format(doc_id, part_name, chunk + 1) for chunk in range(num_chunks)]
    raise ValueError("Unsupported part: level={!r}, name={!r}".format(part_level, part_name))

In [44]:
def get_sequence_insert_location(my_part_level, my_part_name, max_level):
    """
    Computes the index along the sequence axis at which the block of the given part starts.

    The blocks are laid out in DOCUMENT_ORDER, skipping every entry whose level is above
    max_level; the result is the summed size of all blocks that come before the requested one.
    If the requested part is never reached (e.g. its level is above max_level), the summed
    size of all included blocks is returned.
    """
    own_key = PART_LEVEL_NAME.format(my_part_level, my_part_name)
    assert DOCUMENT_PART_SIZES.get(own_key) is not None

    offset = 0
    for level, name in DOCUMENT_ORDER:
        if level > max_level:
            # this block is not part of the generated sequence
            continue
        if (level, name) == (my_part_level, my_part_name):
            return offset
        offset += DOCUMENT_PART_SIZES[PART_LEVEL_NAME.format(level, name)]
    return offset

## Generating Training and Validation matrices

In [45]:
# Highest part level to include: every DOCUMENT_ORDER entry with level <= LEVEL_TO_GENERATE
# gets its own block of slots in the generated matrices.
LEVEL_TO_GENERATE = 2
# Length of a single vector in the matrices
EMBEDDING_SIZE = DOC2VEC_SIZE
# Written in place of a vector that cannot be found for an id
ZERO_VECTOR = [0] * DOC2VEC_SIZE

In [46]:
# Number of vector slots per document: the summed sizes of all parts whose level is included.
# The size keys are built with PART_LEVEL_NAME, the same template the lookup helpers use.
sequence_size = sum(
    DOCUMENT_PART_SIZES[PART_LEVEL_NAME.format(part_level, part_name)]
    for part_level, part_name in DOCUMENT_ORDER
    if part_level <= LEVEL_TO_GENERATE
)
print(sequence_size)

4


In [49]:
# One row per document, one slot per part (see DOCUMENT_ORDER), one vector per slot.
# Zero-initialised, so a slot that is never written matches ZERO_VECTOR instead of holding
# uninitialised memory.
X_data = np.zeros((len(training_docs_list), sequence_size, EMBEDDING_SIZE), dtype=np.float32)
Xv_data = np.zeros((len(validation_docs_list), sequence_size, EMBEDDING_SIZE), dtype=np.float32)

info("********** Generating Matrices for LEVEL:{} ************".format(LEVEL_TO_GENERATE))

for part_level, part_name in DOCUMENT_ORDER:
    if part_level <= LEVEL_TO_GENERATE:
        
        info("======== Working on Level: {} => {}".format(part_level, part_name))
        
        # first slot (along the sequence axis) that belongs to this part
        sequence_insert_location = get_sequence_insert_location(part_level, part_name, LEVEL_TO_GENERATE)
        
        # every part has its own doc2vec model directory
        doc2vec_model_save_location = os.path.join(root_location,
                                                   "parameter_search_doc2vec_models_" + str(part_level) + '_' + part_name,
                                                   "full")
        
        placeholder_model_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_trainwords_{}_hs_{}_neg_{}_vocabsize_{}_model_{}'.format(DOC2VEC_SIZE,
                                                                DOC2VEC_WINDOW,
                                                                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                                                                DOC2VEC_CONCAT, DOC2VEC_MEAN,
                                                                DOC2VEC_TRAIN_WORDS,
                                                                DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                                                                str(DOC2VEC_MAX_VOCAB_SIZE),
                                                                str(part_level) + '_' + part_name
                                                                )
        GLOBAL_VARS.DOC2VEC_MODEL_NAME = placeholder_model_name
        placeholder_model_name = os.path.join(placeholder_model_name, "epoch_{}")
        epoch = DOC2VEC_EPOCH
        GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)
        
        
        info("Loading Doc2vec model: {}".format(GLOBAL_VARS.MODEL_NAME))
        doc2vec_model = Doc2Vec.load(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX), mmap=DOC2VEC_MMAP)
        
        
        info("Loading Validation Dict")
        validation_dict_path = os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, VALIDATION_DICT + GZIP_EXTENSION)
        # NOTE: unpickling can execute arbitrary code; only load files produced by this project.
        with gzip.open(validation_dict_path) as validation_dict_file:
            validation_dict = dict(pickle.load(validation_dict_file))
        # not used below; kept because it is a notebook-level global
        part_level_name = PART_LEVEL_NAME.format(part_level, part_name)
        
        
        def fill_matrix(data_matrix, source_dict, docs_list, start_location, use_get=False):
            """
            Writes the vectors of the current part (part_level / part_name of the enclosing loop)
            into data_matrix: for document i of docs_list, the vectors of its part ids go to
            data_matrix[i][start_location], data_matrix[i][start_location + 1], ...

            use_get=True  : source_dict supports .get() (a plain dict); a missing id yields None.
            use_get=False : source_dict is indexed directly (doc2vec_model.docvecs, which doesnt
                            support .get()); a missing id raises and the lookup error is caught.
            In both cases a missing vector is logged and ZERO_VECTOR is written instead. This
            should really happen very rarely (if ever), so the exception handling should not be
            a drain on performance.
            """
            for i, doc_id in enumerate(docs_list):
                child_ids = get_part_ids(doc_id, part_level, part_name)

                for j, child_id in enumerate(child_ids, start_location):
                    # Only the lookup is guarded: an interrupt or a vector of the wrong shape
                    # must surface instead of being silently replaced by zeros.
                    try:
                        vector = source_dict.get(child_id) if use_get else source_dict[child_id]
                    except (KeyError, IndexError):
                        vector = None
                    if vector is None:
                        info("ZERO_VECTOR for {}".format(child_id))
                        vector = ZERO_VECTOR
                    data_matrix[i][j] = vector
        
        info("Filling training matrix")
        fill_matrix(X_data, doc2vec_model.docvecs, training_docs_list, sequence_insert_location, use_get=False)
        info("Filling validation matrix")
        fill_matrix(Xv_data, validation_dict, validation_docs_list, sequence_insert_location, use_get=True)
        
        
matrices_output_dir = os.path.join(matrices_save_location, GLOBAL_VARS.DOC2VEC_RAW_MODEL_NAME)
ensure_disk_location_exists(matrices_output_dir)
# .npy is a binary format: write in binary mode and close the handle so the data is flushed
info("Saving training matrix")
with open(os.path.join(matrices_output_dir, TRAINING_DATA_MATRIX.format(LEVEL_TO_GENERATE)), "wb") as matrix_file:
    np.save(matrix_file, X_data)
info("Saving validation matrix")
with open(os.path.join(matrices_output_dir, VALIDATION_DATA_MATRIX.format(LEVEL_TO_GENERATE)), "wb") as matrix_file:
    np.save(matrix_file, Xv_data)

2017-04-18 21:27:03,276 : INFO : ********** Generating Matrices for LEVEL:2 ************
2017-04-18 21:27:03,279 : INFO : Loading Doc2vec model: doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_1_document/epoch_1
2017-04-18 21:27:03,279 : INFO : loading Doc2Vec object from /mnt/virtual-machines/data/parameter_search_doc2vec_models_1_document/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_1_document/epoch_1/model
2017-04-18 21:27:09,467 : INFO : loading docvecs recursively from /mnt/virtual-machines/data/parameter_search_doc2vec_models_1_document/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_1_document/epoch_1/model.docvecs.* with mmap=r
2017-04-18 21:27:09,468 : INFO : loading doctag_syn0 from /mnt/virtual-machines/data/parameter_search_doc2vec_models_1_document/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_N

2017-04-18 21:29:07,966 : INFO : setting ignored attribute syn0norm to None
2017-04-18 21:29:07,967 : INFO : loading syn1neg from /mnt/virtual-machines/data/parameter_search_doc2vec_models_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_1/model.syn1neg.npy with mmap=r
2017-04-18 21:29:07,967 : INFO : setting ignored attribute cum_table to None
2017-04-18 21:29:07,968 : INFO : loaded /mnt/virtual-machines/data/parameter_search_doc2vec_models_2_claims/full/doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None_model_2_claims/epoch_1/model
2017-04-18 21:29:08,827 : INFO : Loading Validation Dict
2017-04-18 21:29:30,857 : INFO : Filling training matrix
2017-04-18 21:29:39,070 : INFO : Filling validation matrix
2017-04-18 21:29:40,193 : INFO : Saving training matrix
2017-04-18 21:29:42,940 : INFO : Saving validation matrix


## Generate test matrices

In [None]:
# Highest part level to include in the test matrix: every DOCUMENT_ORDER entry with
# level <= LEVEL_TO_GENERATE gets its own block of slots.
# NOTE: this overrides the value set for the training/validation matrices above.
LEVEL_TO_GENERATE = 3
# Length of a single vector in the matrices
EMBEDDING_SIZE = DOC2VEC_SIZE
# Written in place of a vector that cannot be found for an id
ZERO_VECTOR = [0] * DOC2VEC_SIZE

In [74]:
# Number of vector slots per document: the summed sizes of all parts whose level is included.
# The size keys are built with PART_LEVEL_NAME, the same template the lookup helpers use.
sequence_size = sum(
    DOCUMENT_PART_SIZES[PART_LEVEL_NAME.format(part_level, part_name)]
    for part_level, part_name in DOCUMENT_ORDER
    if part_level <= LEVEL_TO_GENERATE
)
print(sequence_size)

34


In [75]:
# One row per test document, one slot per part (see DOCUMENT_ORDER), one vector per slot.
# Zero-initialised, so a slot that is never written matches ZERO_VECTOR instead of holding
# uninitialised memory.
Xt_data = np.zeros((len(test_docs_list), sequence_size, EMBEDDING_SIZE), dtype=np.float32)

info("********** Generating Matrices for LEVEL:{} ************".format(LEVEL_TO_GENERATE))

for part_level, part_name in DOCUMENT_ORDER:
    if part_level <= LEVEL_TO_GENERATE:
        
        info("======== Working on Level: {} => {}".format(part_level, part_name))
        
        # first slot (along the sequence axis) that belongs to this part
        sequence_insert_location = get_sequence_insert_location(part_level, part_name, LEVEL_TO_GENERATE)
        
        # every part has its own doc2vec model directory
        doc2vec_model_save_location = os.path.join(root_location,
                                                   "parameter_search_doc2vec_models_" + str(part_level) + '_' + part_name,
                                                   "full")
        
        placeholder_model_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_trainwords_{}_hs_{}_neg_{}_vocabsize_{}_model_{}'.format(DOC2VEC_SIZE,
                                                                DOC2VEC_WINDOW,
                                                                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                                                                DOC2VEC_CONCAT, DOC2VEC_MEAN,
                                                                DOC2VEC_TRAIN_WORDS,
                                                                DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                                                                str(DOC2VEC_MAX_VOCAB_SIZE),
                                                                str(part_level) + '_' + part_name
                                                                )
        GLOBAL_VARS.DOC2VEC_MODEL_NAME = placeholder_model_name
        placeholder_model_name = os.path.join(placeholder_model_name, "epoch_{}")
        epoch = DOC2VEC_EPOCH
        GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)
        
        
        info("Loading Test Dict")
        # NOTE(review): the file is read through gzip although its name carries no GZIP_EXTENSION
        # (the validation dict is read from VALIDATION_DICT + GZIP_EXTENSION) -- confirm which
        # name the test dicts are actually stored under. Alternative:
        #   os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, TEST_DICT + GZIP_EXTENSION)
        test_dict_path = os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, TEST_DICT)
        # NOTE: unpickling can execute arbitrary code; only load files produced by this project.
        with gzip.open(test_dict_path) as test_dict_file:
            test_dict = dict(pickle.load(test_dict_file))
        # not used below; kept because it is a notebook-level global
        part_level_name = PART_LEVEL_NAME.format(part_level, part_name)
        
        
        def fill_matrix(data_matrix, source_dict, docs_list, start_location, use_get=False):
            """
            Writes the vectors of the current part (part_level / part_name of the enclosing loop)
            into data_matrix: for document i of docs_list, the vectors of its part ids go to
            data_matrix[i][start_location], data_matrix[i][start_location + 1], ...

            use_get=True  : source_dict supports .get() (a plain dict); a missing id yields None.
            use_get=False : source_dict is indexed directly (for sources that dont support
                            .get()); a missing id raises and the lookup error is caught.
            In both cases a missing vector is logged and ZERO_VECTOR is written instead.
            """
            for i, doc_id in enumerate(docs_list):
                child_ids = get_part_ids(doc_id, part_level, part_name)

                for j, child_id in enumerate(child_ids, start_location):
                    # Only the lookup is guarded: an interrupt or a vector of the wrong shape
                    # must surface instead of being silently replaced by zeros.
                    try:
                        vector = source_dict.get(child_id) if use_get else source_dict[child_id]
                    except (KeyError, IndexError):
                        vector = None
                    if vector is None:
                        info("ZERO_VECTOR for {}".format(child_id))
                        vector = ZERO_VECTOR
                    data_matrix[i][j] = vector
        
        info("Filling test matrix")
        fill_matrix(Xt_data, test_dict, test_docs_list, sequence_insert_location, use_get=True)
        
        
matrices_output_dir = os.path.join(matrices_save_location, GLOBAL_VARS.DOC2VEC_RAW_MODEL_NAME)
ensure_disk_location_exists(matrices_output_dir)
# .npy is a binary format: write in binary mode and close the handle so the data is flushed
info("Saving test matrix")
with open(os.path.join(matrices_output_dir, TEST_DATA_MATRIX.format(LEVEL_TO_GENERATE)), "wb") as matrix_file:
    np.save(matrix_file, Xt_data)

2017-04-13 01:00:31,855 : INFO : ********** Generating Matrices for LEVEL:3 ************
2017-04-13 01:00:31,857 : INFO : Loading Test Dict
2017-04-13 01:01:08,159 : INFO : Filling test matrix
2017-04-13 01:01:10,006 : INFO : Loading Test Dict
2017-04-13 01:01:45,889 : INFO : Filling test matrix
2017-04-13 01:01:47,583 : INFO : Loading Test Dict
2017-04-13 01:03:38,426 : INFO : Filling test matrix
2017-04-13 01:03:42,309 : INFO : Loading Test Dict
2017-04-13 01:04:19,430 : INFO : Filling test matrix
2017-04-13 01:04:21,065 : INFO : Loading Test Dict
2017-04-13 01:18:28,382 : INFO : Filling test matrix
2017-04-13 01:18:53,300 : INFO : Loading Test Dict
2017-04-13 01:19:43,208 : INFO : Filling test matrix
2017-04-13 01:19:44,831 : INFO : Loading Test Dict
2017-04-13 01:22:10,799 : INFO : Filling test matrix
2017-04-13 01:22:12,089 : INFO : ZERO_VECTOR for 07371868_claims_part-2
2017-04-13 01:22:12,094 : INFO : ZERO_VECTOR for 07371868_claims_part-3
2017-04-13 01:22:12,098 : INFO : ZERO_V

## Generate training, validation and test labels

In [50]:
def create_labels(classifications, docs_list):
    """
    Builds the label matrix for the given documents.

    Returns an int8 matrix of shape (len(docs_list), len(classifications)). Row i is the label
    vector that OneHotEncoder(classifications) produces for the classifications of docs_list[i]
    (taken from doc_classification_map) that also occur in `classifications`.
    """
    encoder = OneHotEncoder(classifications)
    known_classifications = set(classifications)
    labels_mat = np.zeros((len(docs_list), len(classifications)), dtype=np.int8)
    for row, doc_id in enumerate(docs_list):
        # ignore classifications of the document that are not part of the requested ones
        doc_labels = set(doc_classification_map[doc_id]).intersection(known_classifications)
        labels_mat[row, :] = encoder.get_label_vector(doc_labels)
    return labels_mat

In [51]:
# (name used in the output file names, classifications to encode)
classifications_to_create = [
    ("sections", sections),
#     ("classes", valid_classes),
#     ("subclasses", valid_subclasses)
]

for classifications_type, classifications in classifications_to_create:
    info("Creating Training Labels for {}".format(classifications_type))
    y = create_labels(classifications, training_docs_list)
    info("Creating Validation Labels for {}".format(classifications_type))
    yv = create_labels(classifications, validation_docs_list)
    info("Creating Test Labels for {}".format(classifications_type))
    yt = create_labels(classifications, test_docs_list)
    
    labels_output_dir = os.path.join(matrices_save_location, GLOBAL_VARS.DOC2VEC_RAW_MODEL_NAME)
    ensure_disk_location_exists(labels_output_dir)
    # .npy is a binary format: write in binary mode and close every handle so the data is flushed
    for labels_file_template, labels_matrix in ((TRAINING_LABELS_MATRIX, y),
                                                (VALIDATION_LABELS_MATRIX, yv),
                                                (TEST_LABELS_MATRIX, yt)):
        labels_path = os.path.join(labels_output_dir, labels_file_template.format(classifications_type))
        with open(labels_path, "wb") as labels_file:
            np.save(labels_file, labels_matrix)

2017-04-18 21:58:05,482 : INFO : Creating Training Labels for sections
2017-04-18 21:58:09,454 : INFO : Creating Validation Labels for sections
2017-04-18 21:58:10,486 : INFO : Creating Test Labels for sections
