## Creates the doc2vec vector embeddings for a specific configuration

In [1]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import os
import time
from collections import namedtuple
import cPickle as pickle
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
import gzip

from multiprocessing import Process, Queue

from multiprocessing.dummy import Pool as ThreadPool
import itertools

from sklearn.metrics import coverage_error
import sklearn.metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model
from sklearn.preprocessing import MultiLabelBinarizer

from gensim.models.doc2vec import Doc2Vec, LabeledSentence

import logging
from logging import info
from functools import partial

from thesis.utils.metrics import *
from thesis.utils.file import *

ERROR (theano.sandbox.cuda): nvcc compiler not found on $PATH. Check your nvcc installation and try again.


## Global variables used throughout the script

In [2]:
# Reset the root logger so the format configured below is the only one in effect.
root = logging.getLogger()
# Iterate over a snapshot: removeHandler() mutates root.handlers while we loop.
for attached_handler in list(root.handlers):
    root.removeHandler(attached_handler)
# With no handlers left, basicConfig installs a default StreamHandler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
#root.addHandler(logging.StreamHandler())

In [3]:
# Fixed random seeds.
SVM_SEED = 1234  # not referenced in the visible cells; presumably seeds the SVM classifier -- TODO confirm
DOC2VEC_SEED = 1234  # passed as `seed` to the Doc2Vec constructor

In [4]:
MIN_WORD_COUNT = 100  # passed as `min_count` to the Doc2Vec constructor
NUM_CORES = 16  # passed as `workers` to the Doc2Vec constructor

In [5]:
# Shared holder for the currently active model names / model object.
# Other cells assign attributes directly on this class
# (e.g. GLOBAL_VARS.DOC2VEC_MODEL = ...) instead of instantiating it.
GLOBAL_VARS = namedtuple(
    'GLOBAL_VARS',
    'MODEL_NAME DOC2VEC_MODEL_NAME DOC2VEC_MODEL SVM_MODEL_NAME NN_MODEL_NAME',
)

In [6]:
# File / directory names used when persisting models and results.
VOCAB_MODEL = "vocab_model"  # sub-directory (under the model save location) holding the vocabulary-only model
MODEL_PREFIX = "model"  # file name the Doc2Vec model is saved under
# The names below are not referenced in the visible cells.
VALIDATION_DICT = "validation_dict.pkl"
TEST_MATRIX = "test_matrix.pkl"
TEST_DICT = "test_dict.pkl"
METRICS = "metrics.pkl"
CLASSIFIER = "classifier.pkl"

In [7]:
# Base directory of the data set; every other path is derived from it.
# root_location = "/mnt/virtual-machines/data/"
root_location = "/home/local/shalaby/"

# Pickled document lists and the document -> classification map.
exports_location = os.path.join(root_location, "exported_data/")
doc_classifications_map_file = os.path.join(exports_location, "doc_classification_map.pkl")
training_docs_list_file = os.path.join(exports_location, "training_docs_list.pkl")
validation_docs_list_file = os.path.join(exports_location, "validation_docs_list.pkl")
test_docs_list_file = os.path.join(exports_location, "test_docs_list.pkl")

# Preprocessed text chunks; a numeric file index is appended to each prefix.
preprocessed_location = os.path.join(root_location,
                                     "preprocessed_data/extended_pv_abs_desc_claims_full_chunks/")
training_preprocessed_files_prefix = os.path.join(preprocessed_location,
                                                  "extended_pv_training_docs_data_preprocessed-")
validation_preprocessed_files_prefix = os.path.join(preprocessed_location,
                                                    "extended_pv_validation_docs_data_preprocessed-")
test_preprocessed_files_prefix = os.path.join(preprocessed_location,
                                              "extended_pv_test_docs_data_preprocessed-")

## Load general data required for classification

In [8]:
%%time
def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in binary mode (what pickle expects) and is closed as
    soon as loading finishes, instead of leaving the handle to the garbage
    collector.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


doc_classification_map = _load_pickle(doc_classifications_map_file)
training_docs_list = _load_pickle(training_docs_list_file)
validation_docs_list = _load_pickle(validation_docs_list_file)
test_docs_list = _load_pickle(test_docs_list_file)

CPU times: user 30 s, sys: 1.33 s, total: 31.3 s
Wall time: 31.6 s


In [9]:
# Number of entries in the training document list.
len(training_docs_list)

1286325

In [10]:
# Number of entries in the validation document list.
len(validation_docs_list)

321473

In [11]:
# Number of entries in the test document list.
len(test_docs_list)

401877

# Utility functions for data loading

In [12]:
class ExtendedPVDocumentBatchGenerator(Process):
    """Background process that streams the lines of chunked files into a queue.

    Files are named ``filename_prefix + str(index)``. The index starts at
    ``start_file`` and grows by ``offset`` after each file. Every line is put
    on ``queue`` exactly as read. As soon as a file cannot be opened (or an
    IOError occurs while reading), ``False`` is put on the queue as the
    end-of-data sentinel and the process returns.

    Parameters
    ----------
    filename_prefix : str
        Path prefix to which the numeric file index is appended.
    queue : object with a ``put`` method (e.g. ``multiprocessing.Queue``)
        Destination for the lines and the final ``False`` sentinel.
    batch_size : int
        Accepted for call compatibility only; it is not used by this class.
    start_file : int
        Index of the first file to read.
    offset : int
        Increment between the indices of consecutive files.
    """

    def __init__(self, filename_prefix, queue, batch_size=10000, start_file=0, offset=10000):
        super(ExtendedPVDocumentBatchGenerator, self).__init__()
        self.queue = queue
        self.offset = offset
        self.filename_prefix = filename_prefix
        # Index of the most recently opened file. It starts one step *before*
        # start_file so that the first "files_loaded + offset" is start_file.
        self.files_loaded = start_file - offset

    def run(self):
        """Read file after file, queueing every line, until a file is missing."""
        while True:
            next_index = self.files_loaded + self.offset
            try:
                info("Loading new file for index: {}".format(str(next_index)))
                # cur_file = gzip.open(self.filename_prefix + str(next_index) + '.gz')
                # `with` closes the handle even when reading or queue.put()
                # fails half-way through a file.
                with open(self.filename_prefix + str(next_index)) as cur_file:
                    self.files_loaded = next_index
                    for line in cur_file:
                        self.queue.put(line)
            except IOError:
                # Tell the consumer that no more data is coming.
                self.queue.put(False, block=True, timeout=None)
                # The reported index is the one following the last file that
                # was opened successfully.
                info("All files are loaded - last file: {}".format(str(self.files_loaded + self.offset)))
                return


class BatchWrapper(object):
    """Single-pass iterable of LabeledSentence objects fed by a reader process.

    On construction an ExtendedPVDocumentBatchGenerator process is started; it
    pushes raw lines into an internal queue. Iterating over the wrapper pops
    those lines, keeps only the documents whose id matches ``level`` /
    ``level_type`` and yields them as LabeledSentence chunks of at most
    ``batch_size`` words. Iteration ends when the ``False`` sentinel is read.

    Parameters
    ----------
    training_preprocessed_files_prefix : str
        File prefix handed to the reader process.
    buffer_size : int
        Maximum number of lines buffered in the queue.
    batch_size : int or None
        Maximum number of words per LabeledSentence (1..10000). ``None``
        keeps each document as a single sentence.
    level : int
        Required number of "_"-separated parts in a document id.
    level_type : str or None
        Only its first character is used; it must equal the first character
        of the second id part. ``None`` disables this filter.
    """

    def __init__(self, training_preprocessed_files_prefix, buffer_size=10000, batch_size=10000, level=1, level_type=None):
        # `is None` is tested first (None cannot be compared with an int on
        # Python 3); a batch size of 0 would make the chunking never advance.
        assert batch_size is None or 0 < batch_size <= 10000
        self.level = level
        # The default level_type=None must not be indexed.
        self.level_type = level_type[0] if level_type else None
        self.batch_size = batch_size
        self.q = Queue(maxsize=buffer_size)
        self.p = ExtendedPVDocumentBatchGenerator(training_preprocessed_files_prefix, queue=self.q,
                                                  batch_size=batch_size, start_file=0, offset=10000)
        self.p.start()
        self.cur_data = []

    def is_correct_type(self, doc_id):
        """Return True if `doc_id` has `level` parts and the wanted section type."""
        parts = doc_id.split("_")
        if len(parts) != self.level:
            return False
        if len(parts) == 1 or self.level_type is None:
            return True
        # [:1] instead of [0]: an empty second part is a mismatch, not an IndexError.
        return parts[1][:1] == self.level_type

    def return_sentences(self, line):
        """Turn one raw line into a tuple of LabeledSentence chunks.

        The first space-separated token is the document id, the rest are the
        words. Returns False when the id is filtered out by is_correct_type,
        and an empty tuple when the document has no words.
        """
        # NOTE(review): `line` is split without stripping, so a trailing
        # newline stays attached to the last token -- confirm this is intended.
        line_array = tuple(line.split(" "))
        doc_id = line_array[0]
        if not self.is_correct_type(doc_id):
            return False
        words = line_array[1:]
        if not words:
            return ()
        # Divide the document into chunks of at most batch_size words.
        chunk_len = len(words) if self.batch_size is None else self.batch_size
        return tuple(LabeledSentence(words=words[start: start + chunk_len], tags=[doc_id])
                     for start in range(0, len(words), chunk_len))

    def __iter__(self):
        """Yield sentences until the reader process signals the end of the data."""
        while True:
            item = self.q.get(block=True)
            if item is False:
                # A plain return ends the generator. Raising StopIteration
                # inside a generator is turned into RuntimeError by PEP 479.
                return
            sentences = self.return_sentences(item)
            if sentences:
                for sentence in sentences:
                    yield sentence

# Doc2vec and SVM Parameters

In [17]:
# Hyper-parameters forwarded to the Doc2Vec constructor (keyword in parentheses)
# and embedded in the model directory name.
DOC2VEC_SIZE = 200  # (size)
DOC2VEC_WINDOW = 2  # (window)
DOC2VEC_MAX_VOCAB_SIZE = None  # (max_vocab_size)
DOC2VEC_SAMPLE = 1e-3  # (sample)
DOC2VEC_TYPE = 1  # (dm) 1 is labelled 'dm' in the model name, anything else 'pv-dbow'
DOC2VEC_HIERARCHICAL_SAMPLE = 0  # (hs)
DOC2VEC_NEGATIVE_SAMPLE_SIZE = 10  # (negative)
DOC2VEC_CONCAT = 0  # (dm_concat)
DOC2VEC_MEAN = 1  # recorded in the model name as mean_<value>
DOC2VEC_TRAIN_WORDS = 0  # (dbow_words)
DOC2VEC_EPOCHS = 1 # (iter) we do our training manually one epoch at a time
DOC2VEC_MAX_EPOCHS = 8  # not referenced in the visible cells
REPORT_DELAY = 20 # report the progress every x seconds
REPORT_VOCAB_PROGRESS = 100000 # report vocab progress every x documents (progress_per of build_vocab)

## Create the vocabulary

In [18]:
# (level, model_name) pairs to build a vocabulary for. Both values are handed
# to BatchWrapper (as level / level_type) and embedded in the output directory name.
models = [
    (3, 'description')
]

In [19]:
# Display the file prefix the training chunks are read from.
training_preprocessed_files_prefix

'/home/local/shalaby/preprocessed_data/extended_pv_abs_desc_claims_full_chunks/extended_pv_training_docs_data_preprocessed-'

In [20]:
# For every (level, model_name) configuration: create the output directories,
# instantiate an untrained Doc2Vec model and either build its vocabulary from
# the training chunks or restore it from a previously saved vocabulary model.
for level, model_name in models:
    info("creating vocabulary for " + str(level) + ' ' + model_name + ' in ')
    doc2vec_model_save_location = os.path.join(root_location,
                                               "parameter_search_doc2vec_models_" + str(level) + '_' + model_name,
                                               "full")
    # The vocabulary-only model is stored at <save_location>/vocab_model/model.
    vocab_model_dir = os.path.join(doc2vec_model_save_location, VOCAB_MODEL)
    vocab_model_path = os.path.join(vocab_model_dir, MODEL_PREFIX)
    # makedirs also creates doc2vec_model_save_location when it is missing.
    if not os.path.exists(vocab_model_dir):
        os.makedirs(vocab_model_dir)

    placeholder_model_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_trainwords_{}_hs_{}_neg_{}_vocabsize_{}_model_{}'.format(
        DOC2VEC_SIZE,
        DOC2VEC_WINDOW,
        'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
        DOC2VEC_CONCAT, DOC2VEC_MEAN,
        DOC2VEC_TRAIN_WORDS,
        DOC2VEC_HIERARCHICAL_SAMPLE, DOC2VEC_NEGATIVE_SAMPLE_SIZE,
        str(DOC2VEC_MAX_VOCAB_SIZE),
        str(level) + '_' + model_name
    )
    GLOBAL_VARS.DOC2VEC_MODEL_NAME = placeholder_model_name
    # Template for the per-epoch sub-directory; "{}" is left for the epoch number.
    placeholder_model_name = os.path.join(placeholder_model_name, "epoch_{}")
    info("FILE " + vocab_model_path)
    doc2vec_model = Doc2Vec(size=DOC2VEC_SIZE, window=DOC2VEC_WINDOW, min_count=MIN_WORD_COUNT,
                    max_vocab_size= DOC2VEC_MAX_VOCAB_SIZE,
                    sample=DOC2VEC_SAMPLE, seed=DOC2VEC_SEED, workers=NUM_CORES,
                    # doc2vec algorithm: dm=1 => PV-DM, any other value (conventionally 0) => PV-DBOW;
                    # PV-DM dictates CBOW for words
                    dm=DOC2VEC_TYPE,
                    # hs=0 => negative sampling, hs=1 => hierarchical softmax
                    hs=DOC2VEC_HIERARCHICAL_SAMPLE, negative=DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                    dm_concat=DOC2VEC_CONCAT,
                    # passed explicitly so the model really uses the value that
                    # is written into its name (mean_<DOC2VEC_MEAN>)
                    dm_mean=DOC2VEC_MEAN,
                    # would train words with skip-gram on top of cbow, we don't need that for now
                    dbow_words=DOC2VEC_TRAIN_WORDS,
                    iter=DOC2VEC_EPOCHS)

    GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model

    if not os.path.exists(vocab_model_path):
        # The iterator is created only when a vocabulary scan is required:
        # constructing a BatchWrapper starts a reader process, which would
        # otherwise be left running with nobody consuming its queue.
        training_docs_iterator = BatchWrapper(training_preprocessed_files_prefix, batch_size=10000, level=level,
                                              level_type=model_name)
        doc2vec_model.build_vocab(sentences=training_docs_iterator, progress_per=REPORT_VOCAB_PROGRESS)
        doc2vec_model.save(vocab_model_path)
    else:
        # Reuse the vocabulary of the previously saved model.
        doc2vec_model_vocab_model = Doc2Vec.load(vocab_model_path)
        doc2vec_model.reset_from(doc2vec_model_vocab_model)

2017-04-09 13:00:06,452 : INFO : creating vocabulary for 3 description in 
2017-04-09 13:00:06,454 : INFO : FILE /home/local/shalaby/parameter_search_doc2vec_models_3_description/full/vocab_model/model
2017-04-09 13:00:06,522 : INFO : collecting all words and their counts
2017-04-09 13:00:06,528 : INFO : Loading new file for index: 0
2017-04-09 13:00:06,542 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-04-09 13:00:29,673 : INFO : PROGRESS: at example #100000, processed 37461598 words (1619643/s), 369706 word types, 99977 tags
2017-04-09 13:00:53,123 : INFO : PROGRESS: at example #200000, processed 74340693 words (1572851/s), 653348 word types, 199977 tags
2017-04-09 13:00:58,173 : INFO : Loading new file for index: 10000
2017-04-09 13:01:14,142 : INFO : PROGRESS: at example #300000, processed 109625390 words (1678943/s), 852964 word types, 299954 tags
2017-04-09 13:01:36,039 : INFO : PROGRESS: at example #400000, processed 146980501 words (1706113

IOError: Not enough free space to write 23668380000 bytes

In [21]:
# Save the current model to the vocabulary-model path. Relies on doc2vec_model and
# doc2vec_model_save_location being defined by an earlier cell.
doc2vec_model.save(os.path.join(doc2vec_model_save_location, VOCAB_MODEL, MODEL_PREFIX))

2017-04-09 15:45:35,319 : INFO : saving Doc2Vec object under /home/local/shalaby/parameter_search_doc2vec_models_3_description/full/vocab_model/model, separately None
2017-04-09 15:45:35,322 : INFO : storing numpy array 'doctag_syn0' to /home/local/shalaby/parameter_search_doc2vec_models_3_description/full/vocab_model/model.docvecs.doctag_syn0.npy
2017-04-09 15:45:55,735 : INFO : storing numpy array 'doctag_syn0_lockf' to /home/local/shalaby/parameter_search_doc2vec_models_3_description/full/vocab_model/model.docvecs.doctag_syn0_lockf.npy


In [18]:
# Fresh, untrained Doc2Vec model configured from the DOC2VEC_* constants.
doc2vec_model = Doc2Vec(size=DOC2VEC_SIZE, window=DOC2VEC_WINDOW, min_count=MIN_WORD_COUNT,
                max_vocab_size=DOC2VEC_MAX_VOCAB_SIZE,
                sample=DOC2VEC_SAMPLE, seed=DOC2VEC_SEED, workers=NUM_CORES,
                # doc2vec algorithm: dm=1 => PV-DM, any other value (conventionally 0) => PV-DBOW;
                # PV-DM dictates CBOW for words
                dm=DOC2VEC_TYPE,
                # hs=0 => negative sampling, hs=1 => hierarchical softmax
                hs=DOC2VEC_HIERARCHICAL_SAMPLE, negative=DOC2VEC_NEGATIVE_SAMPLE_SIZE,
                dm_concat=DOC2VEC_CONCAT,
                # passed explicitly so that changing DOC2VEC_MEAN actually
                # changes how the context vectors are combined
                dm_mean=DOC2VEC_MEAN,
                # would train words with skip-gram on top of cbow, we don't need that for now
                dbow_words=DOC2VEC_TRAIN_WORDS,
                iter=DOC2VEC_EPOCHS)

GLOBAL_VARS.DOC2VEC_MODEL = doc2vec_model

## First: construct (or load) the vocabulary
This only needs to be run if you don't already have at least one epoch computed; otherwise, just set `start_from` (below) to the epoch you want to restart from.

In [20]:
%%time
# Build the vocabulary from the training chunks, or restore it from the saved
# vocabulary model when one already exists.
# NOTE(review): `BatchClass` is not defined in the visible cells (the iterator class
# defined in this notebook is `BatchWrapper`); presumably it comes from a star import
# or an earlier revision -- confirm before re-running.
vocab_model_path = os.path.join(doc2vec_model_save_location, VOCAB_MODEL, MODEL_PREFIX)
if not os.path.exists(vocab_model_path):
    # Create the iterator only when the training files really have to be scanned;
    # the recorded output shows file loading starting even when the cached
    # vocabulary was used.
    training_docs_iterator = BatchClass(training_preprocessed_files_prefix, batch_size=10000)
    doc2vec_model.build_vocab(sentences=training_docs_iterator, progress_per=REPORT_VOCAB_PROGRESS)
    doc2vec_model.save(vocab_model_path)
else:
    doc2vec_model_vocab_model = Doc2Vec.load(vocab_model_path)
    doc2vec_model.reset_from(doc2vec_model_vocab_model)

2017-04-06 05:57:37,601 : INFO : loading Doc2Vec object from /mnt/virtual-machines/data/parameter_search_doc2vec_models_extended_abs_desc_claims_full_chunks/full/vocab_model/model
2017-04-06 05:57:37,602 : INFO : Loading new file for index: 0
2017-04-06 06:00:49,254 : INFO : loading docvecs recursively from /mnt/virtual-machines/data/parameter_search_doc2vec_models_extended_abs_desc_claims_full_chunks/full/vocab_model/model.docvecs.* with mmap=None
2017-04-06 06:00:49,255 : INFO : loading doctag_syn0 from /mnt/virtual-machines/data/parameter_search_doc2vec_models_extended_abs_desc_claims_full_chunks/full/vocab_model/model.docvecs.doctag_syn0.npy with mmap=None
2017-04-06 06:01:07,168 : INFO : loading doctag_syn0_lockf from /mnt/virtual-machines/data/parameter_search_doc2vec_models_extended_abs_desc_claims_full_chunks/full/vocab_model/model.docvecs.doctag_syn0_lockf.npy with mmap=None
2017-04-06 06:01:07,265 : INFO : loading syn1neg from /mnt/virtual-machines/data/parameter_search_doc2v

CPU times: user 12min 50s, sys: 1min 9s, total: 14min
Wall time: 14min


In [21]:
# vocab_counts = {k:doc2vec_model.vocab[k].count for k in doc2vec_model.vocab.keys()}
# dd = sorted(vocab_counts, key=vocab_counts.get)