In [1]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import os
import time
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
import cPickle as pickle
import gzip

from sklearn.metrics import coverage_error
import sklearn.metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model

from sklearn.decomposition import LatentDirichletAllocation

import logging
from logging import info

from thesis.utils.metrics import *

In [2]:
# Start from a clean root logger: logging.basicConfig is a no-op when the root
# logger already has handlers (for example after re-running this cell).
root = logging.getLogger()
for attached_handler in list(root.handlers):  # iterate over a copy; removeHandler mutates the list
    root.removeHandler(attached_handler)

# basicConfig installs a default StreamHandler on the root logger.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [3]:
# Seed every global RNG this notebook imports so that a fresh
# "Restart & Run All" is repeatable. Previously only `random` was seeded,
# leaving numpy's global generator unseeded.
RANDOM_SEED = 10000
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [4]:
# Fixed seeds for the stochastic estimators.
# LDA_SEED is passed as random_state to LatentDirichletAllocation in the
# training cell; SVM_SEED is not referenced in the visible cells
# (presumably consumed by a downstream classifier -- TODO confirm).
SVM_SEED, LDA_SEED = 1234, 1234

In [5]:
# Number of parallel workers; passed as n_jobs when the LDA model is constructed.
NUM_CORES = 24

In [6]:
# File-name conventions for persisted artefacts.
GZIP_EXTENSION = '.gz'          # appended to MODEL_FILE for the compressed model dump
MODEL_FILE = 'lda_model.pkl'

# str.format templates with a single placeholder each.
TRAINING_METRICS_FILENAME = '{}_training_metrics.pkl'
VALIDATION_METRICS_FILENAME = '{}_validation_metrics.pkl'
TEST_METRICS_FILENAME = '{}_test_metrics.pkl'

In [7]:
# Base directories. The trailing slashes matter: paths in this notebook are
# built by plain string concatenation onto these prefixes.
root_location = "/big/s/shalaby/"
exports_location = root_location + "exported_data/"
lda_location = root_location + "extended_pv_lda/"


def _in_exports(filename):
    """Return `filename` prefixed with the exports directory."""
    return exports_location + filename


# Classification metadata.
classifications_index_file = _in_exports("classifications_index.pkl")
doc_classifications_map_file = _in_exports("doc_classification_map.pkl")
sections_file = _in_exports("sections.pkl")
classes_file = _in_exports("classes.pkl")
subclasses_file = _in_exports("subclasses.pkl")
valid_classes_file = _in_exports("valid_classes.pkl")
valid_subclasses_file = _in_exports("valid_subclasses.pkl")
classifications_output = _in_exports("classifications.pkl")

# Document id lists for the three data splits.
training_docs_list_file = _in_exports("training_docs_list.pkl")
validation_docs_list_file = _in_exports("validation_docs_list.pkl")
test_docs_list_file = _in_exports("test_docs_list.pkl")
# Sampled variants of the lists above (SAMPLE_RATIO is not defined in the visible cells):
# training_docs_list_file = exports_location + "extended_pv_training_docs_list_" + str(SAMPLE_RATIO) + ".pkl"
# validation_docs_list_file = exports_location + "extended_pv_validation_docs_list_" + str(SAMPLE_RATIO) + ".pkl"
# test_docs_list_file = exports_location + "extended_pv_test_docs_list_" + str(SAMPLE_RATIO) + ".pkl"


In [8]:
class OneHotEncoder(object):
    """Maps classification labels to positions in a fixed-length 0/1 vector.

    A label's position is its index in the ``classifications`` sequence given
    at construction time.
    """

    def __init__(self, classifications):
        """
        classifications: ordered sequence of hashable labels (must support len())
        """
        self.classifications = classifications
        # label -> column index. If a label occurs more than once, its last
        # index wins. No per-label scratch vector is allocated here; the
        # vectors are only built on demand in get_label_vector.
        self.one_hot_indices = {clssf: i for i, clssf in enumerate(classifications)}

    def get_label_vector(self, labels):
        """
        labels: iterable of labels assigned to one instance

        Returns a list of len(self.classifications) ints holding 1 at the
        position of every label in `labels` and 0 everywhere else.
        Raises KeyError for a label that was not given to the constructor.
        """
        output_vector = [0] * len(self.classifications)
        for label in labels:
            output_vector[self.one_hot_indices[label]] = 1
        return output_vector

def get_label_data(classifications, doc_ids, doc_classification_map):
    """Build the multi-label indicator matrix for a collection of documents.

    classifications: ordered sequence of labels; defines the column order
    doc_ids: iterable of document ids; defines the row order
    doc_classification_map: mapping doc_id -> iterable of that document's
        labels; labels that are not in `classifications` are ignored

    Returns an np.int8 array of shape (number of doc_ids, len(classifications)).
    Raises KeyError if a doc_id is missing from doc_classification_map.
    """
    one_hot_encoder = OneHotEncoder(classifications)
    # Set intersection keeps the per-document filtering cheap.
    classifications_set = set(classifications)
    data_labels = [
        one_hot_encoder.get_label_vector(set(doc_classification_map[doc_id]) & classifications_set)
        for doc_id in doc_ids
    ]
    # The explicit reshape keeps the result two-dimensional when doc_ids is
    # empty; np.array([]) on its own would have shape (0,).
    return np.array(data_labels, dtype=np.int8).reshape(len(data_labels), len(classifications))

#### Load Classification Objects

In [9]:
%%time
doc_classification_map = pickle.load(open(doc_classifications_map_file))
sections = pickle.load(open(sections_file))
classes = pickle.load(open(classes_file))
subclasses = pickle.load(open(subclasses_file))
valid_classes = pickle.load(open(valid_classes_file))
valid_subclasses = pickle.load(open(valid_subclasses_file))
training_docs_list = pickle.load(open(training_docs_list_file))
validation_docs_list = pickle.load(open(validation_docs_list_file))
test_docs_list = pickle.load(open(test_docs_list_file))
classifications_index = pickle.load(open(classifications_index_file))

CPU times: user 46.7 s, sys: 3.81 s, total: 50.5 s
Wall time: 51 s


In [10]:
# Show how many entries valid_classes contains.
len(valid_classes)

244

In [11]:
# Show how many entries valid_subclasses contains.
len(valid_subclasses)

940

## Training and Validation Loop

In [12]:
# LDA hyper-parameters; each one is forwarded to LatentDirichletAllocation
# in the training cell.
LDA_LEARNING_METHOD = 'online'
LDA_TOPICS = 1000
LDA_ITERATIONS = 50
LDA_BATCH_SIZE = 4096
LDA_DECAY = 0.5
LDA_EVALUATE_EVERY = 1000
LDA_VERBOSE = 2

# Tag identifying this configuration; used as a directory name under
# lda_location. Every value follows the label that names it.
_lda_name_fields = (
    LDA_LEARNING_METHOD,
    LDA_TOPICS,
    LDA_ITERATIONS,
    LDA_BATCH_SIZE,
    LDA_DECAY,
    LDA_EVALUATE_EVERY,
)
LDA_MODEL_NAME = "lda_{}_topics_{}_iter_{}_batch_{}_decay_{}_evaluate-every_{}".format(*_lda_name_fields)

In [None]:
%%time
# data_types = ["sublinear_tf"]
data_types = ["bm25"]
# data_types = ["tf", "sublinear_tf", "tf_idf", "sublinear_tf_idf","bm25"]
for data_type in data_types:
    info("=============== {} Being Evaluated ================".format(data_type))
    
#     data_training_location = exports_location + "extended_pv_benchmarking_data/" + "{}_training_sparse_data.pkl".format(data_type)
#     data_training_docids_location = exports_location + "extended_pv_benchmarking_data/" + "{}_training_sparse_docids.pkl".format(data_type)
#     data_validation_location = exports_location + "extended_pv_benchmarking_data/" + "{}_validation_sparse_data.pkl".format(data_type)
#     data_validation_docids_location = exports_location + "extended_pv_benchmarking_data/" + "{}_validation_sparse_docids.pkl".format(data_type)
    
    data_training_location = exports_location + "{}_training_sparse_data.pkl".format(data_type)
    data_training_docids_location = exports_location + "{}_training_sparse_docids.pkl".format(data_type)
    data_validation_location = exports_location + "{}_validation_sparse_data.pkl".format(data_type)
    data_validation_docids_location = exports_location + "{}_validation_sparse_docids.pkl".format(data_type)
    
    # Get the training data
    info('Getting Training Data')
    %time X = pickle.load(open(data_training_location, "r"))
    
    print X.shape
    
#     # Get the validation data
#     info('Getting Valdiation Data')
#     %time Xv = pickle.load(open(data_validation_location,'r'))
#     validation_data_docids = pickle.load(open(data_validation_docids_location, "r"))
#     %time yv = get_label_data(classifications, validation_data_docids, doc_classification_map)
    
    # Get the validation data
    info('Doing LDA decomposition')
    lda = LatentDirichletAllocation(n_topics=LDA_TOPICS, max_iter=LDA_ITERATIONS, learning_method=LDA_LEARNING_METHOD, \
                                   learning_decay=LDA_DECAY, batch_size=LDA_BATCH_SIZE, \
                                    evaluate_every=LDA_EVALUATE_EVERY, n_jobs=NUM_CORES, verbose=LDA_VERBOSE, random_state=LDA_SEED)
    
    %time lda.fit(X)
    
    
    # Dump the LDA model
    data_folder = os.path.join(lda_location, LDA_MODEL_NAME, data_type)
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)
    pickle.dump(lda, open(os.path.join(data_folder, MODEL_FILE), "w"))


2017-04-06 13:47:36,032 : INFO : Getting Training Data
2017-04-06 14:00:20,073 : INFO : Doing LDA decomposition


CPU times: user 11min 15s, sys: 1min 28s, total: 12min 43s
Wall time: 12min 44s
(1286325, 10000)


[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:  6.5min finished
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:  4.6min finished
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:  4.0min finished
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:  3.9min finished
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:  3.7min finished
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:  3.7min finished
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:  3.6min finished
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:  3.7min finished
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:  3.5min finished
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:  3.6min finished
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:  3.5min finished
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:  3.5min finished
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:  3.4min finished
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:  3.4min finished
[Paral

Failed to save <type 'numpy.ndarray'> to .npy file:
Traceback (most recent call last):
  File "/home/s/shalaby/.virtualenv/thesis-env/local/lib/python2.7/site-packages/sklearn/externals/joblib/numpy_pickle.py", line 271, in save
    obj, filename = self._write_array(obj, filename)
  File "/home/s/shalaby/.virtualenv/thesis-env/local/lib/python2.7/site-packages/sklearn/externals/joblib/numpy_pickle.py", line 231, in _write_array
    self.np.save(filename, array)
  File "/home/s/shalaby/.virtualenv/thesis-env/local/lib/python2.7/site-packages/numpy/lib/npyio.py", line 491, in save
    pickle_kwargs=pickle_kwargs)
  File "/home/s/shalaby/.virtualenv/thesis-env/local/lib/python2.7/site-packages/numpy/lib/format.py", line 584, in write_array
    array.tofile(fp)
IOError: 10000000 requested and 6821366 written



IOError: [Errno 28] No space left on device

In [18]:

# Dump the LDA model (gzip-compressed)
data_folder = os.path.join(lda_location, LDA_MODEL_NAME, data_type)
if not os.path.exists(data_folder):
    os.makedirs(data_folder)
# The context manager guarantees the gzip stream is flushed and closed.
with gzip.open(os.path.join(data_folder, MODEL_FILE + GZIP_EXTENSION), "wb") as model_file:
    pickle.dump(lda, model_file)


## Load the LDA Model

In [14]:
# Document representation used to build the data and model paths in the cells below.
data_type = "bm25"

In [15]:
data_folder = os.path.join(lda_location, LDA_MODEL_NAME, data_type)
# NOTE: unpickling can execute arbitrary code; only load model files you trust.
with gzip.open(os.path.join(data_folder, MODEL_FILE + GZIP_EXTENSION), "rb") as model_file:
    lda = pickle.load(model_file)


## Generate Training Data

In [16]:
data_training_location = exports_location + "{}_training_sparse_data.pkl".format(data_type)

# Get the training data
info('Getting Training Data')
%time X = pickle.load(open(data_training_location, "r"))


2017-04-09 05:11:15,670 : INFO : Getting Training Data


CPU times: user 11min 26s, sys: 15 s, total: 11min 41s
Wall time: 11min 48s


In [36]:
# Output location of the pickled LDA representation of the training data.
data_folder = os.path.join(lda_location, LDA_MODEL_NAME, data_type)
lda_data_training_location = os.path.join(data_folder, "lda_training_data.pkl")

In [20]:
%%time
# Apply the fitted LDA model to the training matrix X.
X_lda_training = lda.transform(X)

[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed: 511.7min finished


CPU times: user 1min 19s, sys: 32.2 s, total: 1min 51s
Wall time: 8h 31min 51s


In [37]:
%%time
pickle.dump(X_lda_training, open(lda_data_training_location, "w"))

In [38]:
import numpy as np

In [41]:
%time np.save(open(os.path.join(data_folder, "lda_training_data.npy"), "w"), X_lda_training)

CPU times: user 364 ms, sys: 14.5 s, total: 14.9 s
Wall time: 1min 47s


In [39]:
# Display the shape of the transformed training data.
X_lda_training.shape

(1286325, 1000)

## Generating Validation Data

In [23]:
# Input (sparse validation matrix) and output (its LDA representation) locations.
data_validation_location = exports_location + "{}_validation_sparse_data.pkl".format(data_type)
data_folder = os.path.join(lda_location, LDA_MODEL_NAME, data_type)
lda_data_validation_location = os.path.join(data_folder, "lda_validation_data.pkl")

In [None]:
# Get the validation data
info('Getting Valdiation Data')
%time Xv = pickle.load(open(data_validation_location,'r'))

2017-04-09 15:12:10,993 : INFO : Getting Valdiation Data


CPU times: user 2min 57s, sys: 4.42 s, total: 3min 2s
Wall time: 3min 4s


In [None]:
%%time
# Apply the fitted LDA model to the validation matrix Xv.
X_lda_validation = lda.transform(Xv)

[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed: 192.7min finished


CPU times: user 33.2 s, sys: 24.8 s, total: 58 s
Wall time: 3h 19min 40s


In [26]:
# Binary mode plus a context manager: the handle is flushed and closed on exit.
with open(lda_data_validation_location, "wb") as validation_out:
    pickle.dump(X_lda_validation, validation_out)

## Generating Test Data

In [27]:
# Location of the pickled sparse test matrix for the selected representation.
data_test_location = exports_location + "{}_test_sparse_data.pkl".format(data_type)

In [28]:
# Output location of the pickled LDA representation of the test data.
data_folder = os.path.join(lda_location, LDA_MODEL_NAME, data_type)
lda_data_test_location = os.path.join(data_folder, "lda_test_data.pkl")

In [None]:
# Get the test data
info('Getting Test Data')
%time Xt = pickle.load(open(data_test_location, "r"))

2017-04-09 20:10:25,045 : INFO : Getting Test Data


CPU times: user 3min 15s, sys: 4.92 s, total: 3min 20s
Wall time: 3min 22s


In [None]:
%%time
# Apply the fitted LDA model to the test matrix Xt.
X_lda_test = lda.transform(Xt)

[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed: 167.9min finished


CPU times: user 24.3 s, sys: 51.9 s, total: 1min 16s
Wall time: 2h 48min 25s


In [32]:
%%time
pickle.dump(X_lda_test, open(lda_data_test_location, "w"))

CPU times: user 4min 43s, sys: 8.46 s, total: 4min 51s
Wall time: 6min 3s


In [42]:
# Display the shape of the transformed test data.
X_lda_test.shape

(401877, 1000)