In [1]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import os
import time
from collections import namedtuple, defaultdict
import cPickle as pickle
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
import gzip

from multiprocessing.dummy import Pool as ThreadPool
import itertools

from sklearn.metrics import coverage_error
import sklearn.metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model
from sklearn.preprocessing import MultiLabelBinarizer

from gensim.models.doc2vec import Doc2Vec, LabeledSentence

import logging
from logging import info
from functools import partial

import seaborn

from sklearn.model_selection import ParameterSampler

from thesis.utils.metrics import *
from thesis.utils.classification import *
from thesis.utils.file import *

In [2]:
# Reset the root logger so that re-running this cell does not stack up
# duplicate handlers, then let basicConfig install its default StreamHandler.
root = logging.getLogger()
for handler in list(root.handlers):
    root.removeHandler(handler)
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [3]:
# Holder for run-wide names. Later cells assign these attributes directly on
# the class (e.g. GLOBAL_VARS.MODEL_NAME = ...) rather than creating instances.
GLOBAL_VARS = namedtuple(
    'GLOBAL_VARS',
    'MODEL_NAME DOC2VEC_MODEL_NAME DOC2VEC_MODEL DOC2VEC_RAW_MODEL_NAME',
)

In [4]:
# ---- Names of model / evaluation artifacts ----
VOCAB_MODEL = "vocab_model"
MODEL_PREFIX = "model"
VALIDATION_MATRIX = "validation_matrix.pkl"
VALIDATION_DICT = "validation_dict.pkl"
TEST_MATRIX = "test_matrix.pkl"
TEST_DICT = "test_dict.pkl"
METRICS = "metrics.pkl"
CLASSIFIER = "classifier.pkl"
TYPE_CLASSIFIER = "{}_classifier.pkl"

# ---- File-name templates for the full matrices ----
# The data templates are formatted with the level and the label templates
# with the classifications type (see the get_*_data loaders).
TRAINING_DATA_MATRIX = "X_level_{}.npy"
TRAINING_LABELS_MATRIX = "y_{}.npy"
VALIDATION_DATA_MATRIX = "Xv_level_{}.npy"
VALIDATION_LABELS_MATRIX = "yv_{}.npy"
TEST_DATA_MATRIX = "Xt_level_{}.npy"
TEST_LABELS_MATRIX = "yt_{}.npy"

# ---- File-name templates for the split training matrices ----
# The second placeholder receives the starting row index of the part.
TRAINING_DATA_MATRIX_PART = "X_level_{}-{}.npy"
TRAINING_LABELS_MATRIX_PART = "y_{}-{}.npy"

In [5]:
# Filename suffix for gzip files (presumably appended to compressed exports;
# not referenced in the visible cells of this notebook).
GZIP_EXTENSION = ".gz"

In [6]:
# Base directories (machine-specific absolute paths).
root_location = "/mnt/data2/shalaby/"
big_data_location = "/mnt/data/shalaby/"

# Directory holding the exported pickles; the empty last component keeps the
# trailing slash on the resulting path.
exports_location = os.path.join(root_location, "exported_data", "")

# Directory under which the per-model matrices are stored.
matrices_save_location = os.path.join(big_data_location, "extended_pv_matrices")

# Individual exported files.
doc_classifications_map_file = os.path.join(exports_location, "doc_classification_map.pkl")
sections_file = os.path.join(exports_location, "sections.pkl")
classes_file = os.path.join(exports_location, "classes.pkl")
subclasses_file = os.path.join(exports_location, "subclasses.pkl")
valid_classes_file = os.path.join(exports_location, "valid_classes.pkl")
valid_subclasses_file = os.path.join(exports_location, "valid_subclasses.pkl")
classifications_output = os.path.join(exports_location, "classifications.pkl")
training_docs_list_file = os.path.join(exports_location, "training_docs_list.pkl")
validation_docs_list_file = os.path.join(exports_location, "validation_docs_list.pkl")
test_docs_list_file = os.path.join(exports_location, "test_docs_list.pkl")

In [7]:
%%time
def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``.

    The file is opened in binary mode and closed as soon as loading finishes,
    instead of leaving an open handle behind for the garbage collector.

    NOTE: unpickling can execute arbitrary code; only load files that were
    produced by this project's own export step.
    """
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


# Exported lookup structures and the train / validation / test document lists.
doc_classification_map = _load_pickle(doc_classifications_map_file)
sections = _load_pickle(sections_file)
classes = _load_pickle(classes_file)
subclasses = _load_pickle(subclasses_file)
valid_classes = _load_pickle(valid_classes_file)
valid_subclasses = _load_pickle(valid_subclasses_file)
training_docs_list = _load_pickle(training_docs_list_file)
validation_docs_list = _load_pickle(validation_docs_list_file)
test_docs_list = _load_pickle(test_docs_list_file)

CPU times: user 19.6 s, sys: 3.15 s, total: 22.8 s
Wall time: 22.9 s


In [8]:
# Doc2Vec hyperparameters; several of them are embedded in the model
# directory name that is built at the bottom of this cell.
DOC2VEC_SIZE = 200
DOC2VEC_WINDOW = 2
DOC2VEC_MAX_VOCAB_SIZE = None
DOC2VEC_SAMPLE = 1e-3
DOC2VEC_TYPE = 1  # 1 is rendered as 'dm' in the model name, anything else as 'pv-dbow'
DOC2VEC_HIERARCHICAL_SAMPLE = 0
DOC2VEC_NEGATIVE_SAMPLE_SIZE = 10
DOC2VEC_CONCAT = 0
DOC2VEC_MEAN = 1
DOC2VEC_TRAIN_WORDS = 0
DOC2VEC_EPOCHS = 1  # we do our training manually one epoch at a time
DOC2VEC_MAX_EPOCHS = 8

# Progress reporting intervals.
REPORT_DELAY = 20  # report the progress every x seconds
REPORT_VOCAB_PROGRESS = 100000  # report vocab progress every x documents

DOC2VEC_MMAP = 'r'

# Epoch number substituted into the "epoch_{}" path component below.
DOC2VEC_EPOCH = 8

# "<hyperparameter directory>/epoch_{}": the epoch placeholder stays unfilled
# in raw_model_name and is only resolved for GLOBAL_VARS.
raw_model_name = os.path.join(
    'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_trainwords_{}_hs_{}_neg_{}_vocabsize_{}'.format(
        DOC2VEC_SIZE,
        DOC2VEC_WINDOW,
        'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
        DOC2VEC_CONCAT,
        DOC2VEC_MEAN,
        DOC2VEC_TRAIN_WORDS,
        DOC2VEC_HIERARCHICAL_SAMPLE,
        DOC2VEC_NEGATIVE_SAMPLE_SIZE,
        str(DOC2VEC_MAX_VOCAB_SIZE),
    ),
    "epoch_{}",
)
GLOBAL_VARS.DOC2VEC_RAW_MODEL_NAME = raw_model_name.format(DOC2VEC_EPOCH)

In [9]:
def get_training_data(classifications_type, level):
    """Load the training data matrix and its label matrix from disk.

    Both files are read from
    ``<matrices_save_location>/<GLOBAL_VARS.MODEL_NAME>/``.

    Args:
        classifications_type: value substituted into the
            TRAINING_LABELS_MATRIX file-name template.
        level: value substituted into the TRAINING_DATA_MATRIX file-name
            template.

    Returns:
        A ``(training_data, training_labels)`` tuple of the loaded arrays.

    Raises:
        IOError: if either file cannot be opened.
    """
    info("Loading Training Data from file")
    matrices_dir = os.path.join(matrices_save_location, GLOBAL_VARS.MODEL_NAME)
    # Binary mode plus a context manager: .npy is a binary format, and the
    # handle is closed as soon as the array has been read.
    with open(os.path.join(matrices_dir, TRAINING_DATA_MATRIX.format(level)), "rb") as data_file:
        training_data = np.load(data_file)
    with open(os.path.join(matrices_dir, TRAINING_LABELS_MATRIX.format(classifications_type)), "rb") as labels_file:
        training_labels = np.load(labels_file)
    return training_data, training_labels

def get_validation_data(classifications_type, level):
    """Load the validation data matrix and its label matrix from disk.

    Both files are read from
    ``<matrices_save_location>/<GLOBAL_VARS.MODEL_NAME>/``.

    Args:
        classifications_type: value substituted into the
            VALIDATION_LABELS_MATRIX file-name template.
        level: value substituted into the VALIDATION_DATA_MATRIX file-name
            template.

    Returns:
        A ``(validation_data, validation_labels)`` tuple of the loaded arrays.

    Raises:
        IOError: if either file cannot be opened.
    """
    info("Loading Validation Data from file")
    matrices_dir = os.path.join(matrices_save_location, GLOBAL_VARS.MODEL_NAME)
    # Binary mode plus a context manager: .npy is a binary format, and the
    # handle is closed as soon as the array has been read.
    with open(os.path.join(matrices_dir, VALIDATION_DATA_MATRIX.format(level)), "rb") as data_file:
        validation_data = np.load(data_file)
    with open(os.path.join(matrices_dir, VALIDATION_LABELS_MATRIX.format(classifications_type)), "rb") as labels_file:
        validation_labels = np.load(labels_file)
    return validation_data, validation_labels

def get_test_data(classifications_type, level):
    """Load the test data matrix and its label matrix from disk.

    Both files are read from
    ``<matrices_save_location>/<GLOBAL_VARS.MODEL_NAME>/``.

    Args:
        classifications_type: value substituted into the TEST_LABELS_MATRIX
            file-name template.
        level: value substituted into the TEST_DATA_MATRIX file-name
            template.

    Returns:
        A ``(test_data, test_labels)`` tuple of the loaded arrays.

    Raises:
        IOError: if either file cannot be opened.
    """
    info("Loading Test Data from file")
    matrices_dir = os.path.join(matrices_save_location, GLOBAL_VARS.MODEL_NAME)
    # Binary mode plus a context manager: .npy is a binary format, and the
    # handle is closed as soon as the array has been read.
    with open(os.path.join(matrices_dir, TEST_DATA_MATRIX.format(level)), "rb") as data_file:
        test_data = np.load(data_file)
    with open(os.path.join(matrices_dir, TEST_LABELS_MATRIX.format(classifications_type)), "rb") as labels_file:
        test_labels = np.load(labels_file)
    return test_data, test_labels

In [10]:
# Hyperparameter-derived directory name of the doc2vec model.
GLOBAL_VARS.DOC2VEC_MODEL_NAME = (
    'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_trainwords_{}_hs_{}_neg_{}_vocabsize_{}'.format(
        DOC2VEC_SIZE,
        DOC2VEC_WINDOW,
        'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
        DOC2VEC_CONCAT,
        DOC2VEC_MEAN,
        DOC2VEC_TRAIN_WORDS,
        DOC2VEC_HIERARCHICAL_SAMPLE,
        DOC2VEC_NEGATIVE_SAMPLE_SIZE,
        str(DOC2VEC_MAX_VOCAB_SIZE),
    )
)

# Same name with a still-unfilled "epoch_{}" sub-directory appended.
placeholder_model_name = os.path.join(GLOBAL_VARS.DOC2VEC_MODEL_NAME, "epoch_{}")

# Epoch whose matrices this notebook works with.
epoch = 8

GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)
print(GLOBAL_VARS.MODEL_NAME)


doc2vec_size_200_w_2_type_dm_concat_0_mean_1_trainwords_0_hs_0_neg_10_vocabsize_None/epoch_8


In [11]:
# Level number substituted into the "X_level_{}" data-matrix file names,
# both when loading the full matrix and when naming the saved parts.
LEVEL_TO_GENERATE = 3

In [12]:
# Classification type substituted into the "y_{}" label-matrix file names.
classifications_type = "subclasses"

In [13]:
# Load the full training data matrix (X) and label matrix (y) for the
# selected classification type and level.
X,y = get_training_data(classifications_type, LEVEL_TO_GENERATE)

2017-04-13 20:23:33,122 : INFO : Loading Training Data from file


In [16]:
# Display the dimensions of the loaded data matrix; the first axis is the one
# that gets split into parts further down.
X.shape

(1286325, 34, 200)

In [19]:
# Maximum number of first-axis entries (rows) written to each part file.
batch_size = 100000

In [20]:
# Write X to disk as consecutive parts of at most batch_size rows each.
# Every part file is named after the index of its first row.
# Stepping the range by batch_size avoids the float/ceil arithmetic and
# yields the same start indices (0, batch_size, 2 * batch_size, ...).
for index in range(0, X.shape[0], batch_size):
    X_part = X[index:index + batch_size, :]
    print("{} {}".format(index, X_part.shape))
    part_path = os.path.join(matrices_save_location, GLOBAL_VARS.MODEL_NAME,
                             TRAINING_DATA_MATRIX_PART.format(LEVEL_TO_GENERATE, index))
    # .npy is a binary format, so open with "wb"; the context manager
    # guarantees the part is flushed and closed before the next one starts.
    with open(part_path, "wb") as part_file:
        np.save(part_file, X_part)

0 (100000, 34, 200)
100000 (100000, 34, 200)
200000 (100000, 34, 200)
300000 (100000, 34, 200)
400000 (100000, 34, 200)
500000 (100000, 34, 200)
600000 (100000, 34, 200)
700000 (100000, 34, 200)
800000 (100000, 34, 200)
900000 (100000, 34, 200)
1000000 (100000, 34, 200)
1100000 (100000, 34, 200)
1200000 (86325, 34, 200)


## For y labels

In [21]:
# Write y to disk as consecutive parts of at most batch_size rows each.
# Every part file is named after the index of its first row.
# Stepping the range by batch_size avoids the float/ceil arithmetic and
# yields the same start indices (0, batch_size, 2 * batch_size, ...).
for index in range(0, y.shape[0], batch_size):
    y_part = y[index:index + batch_size, :]
    print("{} {}".format(index, y_part.shape))
    part_path = os.path.join(matrices_save_location, GLOBAL_VARS.MODEL_NAME,
                             TRAINING_LABELS_MATRIX_PART.format(classifications_type, index))
    # .npy is a binary format, so open with "wb"; the context manager
    # guarantees the part is flushed and closed before the next one starts.
    with open(part_path, "wb") as part_file:
        np.save(part_file, y_part)

(100000, 940)
(100000, 940)
(100000, 940)
(100000, 940)
(100000, 940)
(100000, 940)
(100000, 940)
(100000, 940)
(100000, 940)
(100000, 940)
(100000, 940)
(100000, 940)
(86325, 940)
