In [1]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import time
import cPickle as pickle
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
from sklearn.metrics import coverage_error
import sklearn.metrics
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import logging

In [2]:
# Detach every handler currently attached to the root logger so that the
# basicConfig() call below is not ignored (basicConfig does nothing when the
# root logger already has handlers). Iterate over a copy of the list because
# removeHandler() mutates root.handlers.
root = logging.getLogger()
for handler in root.handlers[:]:
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHandler
#root.addHandler(logging.StreamHandler())

In [3]:
# Seed passed as random_state to the LinearSVC estimators built further down.
SVM_SEED = 1234

In [4]:
# English stop words from NLTK; is_stopword() tests membership against this.
STOP_WORDS = nltk.corpus.stopwords.words('english')
# Placeholder tokens that stemtokenizer() substitutes for numeric, currency and
# chemical-looking tokens.
# NOTE(review): "inidicator" is misspelled, but these strings are emitted as
# tokens at runtime; presumably any model trained on this tokenizer's output
# has them in its vocabulary -- confirm before correcting the spelling.
NUMBER_INDICATOR = "number_inidicator"
CURRENCY_INDICATOR = "currency_inidicator"
CHEMICAL_INDICATOR = "chemical_inidicator"
# Not referenced in the visible part of this notebook.
MIN_WORD_COUNT = 5
# Minimum token length kept by stemtokenizer(); 0 keeps every non-empty token.
MIN_SIZE = 0
# Not referenced in the visible part of this notebook.
NUM_CORES = 7

In [24]:
#training_file = "/home/local/shalaby/docs_output_sample_100.json"
# Every input path below is built from this root directory.
root_location = "/big/s/shalaby/"
# Directory that Doc2Vec.load() reads the trained paragraph-vector model from.
doc2vec_model_location = root_location + "paragraph_vector_models/"
# Corpus file that ValidationDocumentGenerator reads line by line.
training_file = root_location + "docs_output.json"
# Pickled inputs, loaded with pickle.load() in a later cell.
doc_classifications_map_file = root_location + "exported_data_non_spark_format/doc_classification_map.pkl"
sections_file = root_location + "exported_data_non_spark_format/sections.pkl"
training_docs_list_file = root_location + "exported_data_merged/training_docs_list.pkl"
validation_docs_list_file = root_location + "exported_data_merged/validation_docs_list.pkl"

In [9]:
# Directory the fitted SVM is pickled into (the file name is appended when saving).
model_save_location = "/home/local/shalaby/models/scikit-svm/"

In [11]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in binary mode and closed as soon as loading finishes;
    the original cell opened the files in text mode and never closed them.

    NOTE(review): pickle.load() can execute arbitrary code, so only use this
    on files that were produced by a trusted export step.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


doc_classifications_map = _load_pickle(doc_classifications_map_file)
sections = _load_pickle(sections_file)
training_docs_list = _load_pickle(training_docs_list_file)
validation_docs_list = _load_pickle(validation_docs_list_file)
#open(sections_file).read()

In [12]:
len(training_docs_list)

1286325

In [13]:
def stemtokenizer(text):
    """Split ``text`` on whitespace and return the list of normalised tokens.

    Each token is lower-cased and has leading/trailing punctuation removed.
    Tokens that are empty afterwards are dropped. Numbers, currency amounts
    and chemical-looking tokens are replaced by their *_INDICATOR placeholder,
    stop words are dropped, and every surviving token of at least MIN_SIZE
    characters is appended to the result. No stemming is performed despite
    the function name.

    NOTE(review): "$" is part of string.punctuation, so it is stripped before
    is_currency() is consulted; the currency branch looks unreachable for
    tokens written like "$100". Changing this would alter the tokenisation
    relative to any model trained with this function -- confirm before fixing.
    """
    whitespace_splitter = RegexpTokenizer(r'\s+', gaps=True)
    cleaned = []
    for raw_token in whitespace_splitter.tokenize(text):
        candidate = raw_token.lower().strip(string.punctuation)
        if not candidate:
            continue
        if is_number(candidate):
            candidate = NUMBER_INDICATOR
        elif is_currency(candidate):
            candidate = CURRENCY_INDICATOR
        elif is_chemical(candidate):
            candidate = CHEMICAL_INDICATOR
        elif is_stopword(candidate):
            continue
        # An ordinary word is kept as is: it has already been stripped above,
        # and stripping the same characters a second time is a no-op.
        if candidate and len(candidate) >= MIN_SIZE:
            # extract uni-grams
            cleaned.append(candidate)
    return cleaned

def is_stopword(word):
    """Return True when ``word`` is contained in the module-level STOP_WORDS."""
    listed = word in STOP_WORDS
    return listed

def is_number(str):
    """Tell whether ``str`` parses as a float once its commas are removed.

    Anything float() accepts counts as a number, which includes spellings
    such as "nan" and "inf" as well as exponent notation like "1e5".
    """
    without_commas = str.replace(",", "")
    try:
        float(without_commas)
    except ValueError:
        return False
    return True

def is_currency(str):
    """Return True when ``str`` begins with a dollar sign.

    A slice is used instead of ``str[0]`` so that an empty string yields
    False rather than raising IndexError.
    """
    return str[:1] == "$"

def is_chemical(str):
    """Return True when ``str`` contains more than three hyphens."""
    hyphen_total = str.count("-")
    return hyphen_total > 3

### Doc2vec and SVM Parameters

In [14]:
# Settings that identify the Doc2Vec model. Most of them are interpolated into
# the model file name in the loading cell below.
# NOTE(review): the mapping of each constant onto a gensim Doc2Vec argument is
# not visible in this notebook -- the remarks below are inferred from the names
# and should be confirmed against the training notebook.
DOC2VEC_SIZE = 400  # presumably the vector dimensionality
DOC2VEC_WINDOW = 8  # presumably the context window size
DOC2VEC_MAX_VOCAB_SIZE = None
DOC2VEC_SAMPLE = 1e-5
DOC2VEC_TYPE = 1  # 1 is rendered as 'dm' in the file name, any other value as 'pv-dbow'
DOC2VEC_HIERARCHICAL_SAMPLE = 0  # appears as "hs" in the file name
DOC2VEC_NEGATIVE_SAMPLE_SIZE = 10
DOC2VEC_CONCAT = 0
DOC2VEC_MEAN = 0
DOC2VEC_EPOCHS = 1
REPORT_DELAY = 30 # report the progress every x seconds
REPORT_VOCAB_PROGRESS = 10000 # report the progress every x terms

In [15]:
# Hyper-parameters handed to LinearSVC when the classifier is built below.
SVM_ITERATIONS = 1000  # passed as max_iter
SVM_CONVERGENCE = 0.001  # passed as tol
SVM_REG = 1/0.1 # scikit uses a C parameter not a lambda

### Load Doc2vec Model

In [20]:
%%time
# Rebuild the on-disk name of the previously trained paragraph-vector model.
# NOTE(review): the trailing "iter_{}" slot is filled with
# DOC2VEC_NEGATIVE_SAMPLE_SIZE, not DOC2VEC_EPOCHS. The logged load path ends
# in "iter_10", so the saved files carry that name; do not "fix" the argument
# without renaming the files on disk.
file_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_hs_{}_iter_{}'.format(
    DOC2VEC_SIZE,
    DOC2VEC_WINDOW,
    'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
    DOC2VEC_CONCAT,
    DOC2VEC_MEAN,
    DOC2VEC_HIERARCHICAL_SAMPLE,
    DOC2VEC_NEGATIVE_SAMPLE_SIZE,
)

# mmap='r' is forwarded to gensim's loader.
doc2vec_model = Doc2Vec.load(doc2vec_model_location + file_name, mmap='r')

2016-08-25 11:39:29,754 : INFO : loading Doc2Vec object from /big/s/shalaby/paragraph_vector_models/doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10
2016-08-25 11:40:54,766 : INFO : loading docvecs recursively from /big/s/shalaby/paragraph_vector_models/doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10.docvecs.* with mmap=r
2016-08-25 11:40:54,768 : INFO : loading doctag_syn0 from /big/s/shalaby/paragraph_vector_models/doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10.docvecs.doctag_syn0.npy with mmap=r
2016-08-25 11:40:54,772 : INFO : loading syn1neg from /big/s/shalaby/paragraph_vector_models/doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10.syn1neg.npy with mmap=r
2016-08-25 11:40:54,774 : INFO : loading syn0 from /big/s/shalaby/paragraph_vector_models/doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10.syn0.npy with mmap=r
2016-08-25 11:40:54,776 : INFO : setting ignored attribute syn0norm to None
2016-08-25 11:40:54,777 : INFO : setting ignor

CPU times: user 1min 23s, sys: 24.7 s, total: 1min 48s
Wall time: 1min 48s


In [21]:
doc2vec_model

<gensim.models.doc2vec.Doc2Vec at 0x7f341d6be650>

### Actual Training

In [13]:
MultiLabelBinarizer().fit_transform([["B","C"],["D"],["E"],["A"]])

array([[0, 1, 1, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0]])

In [25]:
# One learned document vector per training document, stacked into a 2-D array
# in the order of training_docs_list.
training_vectors = np.array(
    [doc2vec_model.docvecs[doc_id] for doc_id in training_docs_list]
)
# For every training document keep only the classifications that are listed
# in `sections`; the outer list is aligned with the rows of training_vectors.
training_labels = [
    [classf for classf in doc_classifications_map[doc_id] if classf in sections]
    for doc_id in training_docs_list
]

In [29]:
# Keep a reference to the fitted binarizer instead of discarding it: its
# classes_ attribute records which label each output column stands for, and
# its transform() method re-applies the same encoding to other label lists.
label_binarizer = MultiLabelBinarizer()
binary_labels = label_binarizer.fit_transform(training_labels)

In [30]:
binary_labels.shape

(1286325, 8)

In [31]:
training_vectors.shape

(1286325, 400)

In [35]:
# One-vs-rest wrapper around a linear SVM; every hyper-parameter comes from
# the SVM_* constants defined earlier.
svm_model = OneVsRestClassifier(
    LinearSVC(
        penalty='l2',
        tol=SVM_CONVERGENCE,
        C=SVM_REG,
        verbose=1,
        random_state=SVM_SEED,
        max_iter=SVM_ITERATIONS,
    )
)

In [None]:
%%time
svm_model.fit(training_vectors, binary_labels)
# Persist the fitted classifier. The file is opened in binary mode, which is
# what pickle requires, and inside a context manager so that it is flushed
# and closed even if pickling fails part-way through.
with open(model_save_location + file_name, 'wb') as model_file:
    pickle.dump(svm_model, model_file)

[LibLinear]



[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]CPU times: user 6h 44min 16s, sys: 1h 33min 31s, total: 8h 17min 47s
Wall time: 8h 17min 24s


In [None]:
# Same classifier configuration as svm_model, except that the iteration cap
# is raised to 10000.
svm_model2 = OneVsRestClassifier(
    LinearSVC(
        penalty='l2',
        tol=SVM_CONVERGENCE,
        C=SVM_REG,
        verbose=1,
        random_state=SVM_SEED,
        max_iter=10000,
    )
)
svm_model2.fit(training_vectors, binary_labels)

[LibLinear][LibLinear]

### Create the model

In [34]:
print model

Doc2Vec(dm/c,d400,n10,w8,mc5,s1e-05,t7)


#### Initialize the model using the vocab from the previously trained model

In [117]:
model.reset_from(loaded_model)

2016-08-16 15:49:28,921 : INFO : using concatenative 6800-dimensional layer1
using concatenative 6800-dimensional layer1
2016-08-16 15:49:28,925 : INFO : resetting layer weights
resetting layer weights


#### Now for the actual training

In [None]:
%%time
# NOTE(review): neither `model` nor `LabeledLineSentence` is defined or
# imported anywhere in the visible notebook (the import cell brings in
# LabeledSentence), so this cell fails on a fresh kernel -- confirm whether it
# is a leftover from the training notebook.
model.build_vocab(sentences=LabeledLineSentence(training_file), progress_per=REPORT_VOCAB_PROGRESS)

2016-08-21 05:36:43,143 : INFO : collecting all words and their counts
2016-08-21 05:36:43,576 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags


In [25]:
%%time
# NOTE(review): relies on `model` and `LabeledLineSentence`, neither of which
# is defined in the visible notebook -- confirm whether this cell is a
# leftover from the training notebook.
model.train(sentences=LabeledLineSentence(training_file), report_delay=REPORT_DELAY)

2016-08-21 05:32:48,403 : INFO : training model with 7 workers on 5794 vocabulary and 6800 features, using sg=0 hs=0 sample=1e-05 negative=10
2016-08-21 05:32:48,405 : INFO : expecting 65 sentences, matching count from corpus used for vocabulary survey
2016-08-21 05:32:51,066 : INFO : PROGRESS: at 1.54% examples, 658 words/s, in_qsize 0, out_qsize 0
2016-08-21 05:33:05,759 : INFO : worker thread finished; awaiting finish of 6 more threads
2016-08-21 05:33:05,762 : INFO : worker thread finished; awaiting finish of 5 more threads
2016-08-21 05:33:05,763 : INFO : worker thread finished; awaiting finish of 4 more threads
2016-08-21 05:33:05,865 : INFO : worker thread finished; awaiting finish of 3 more threads
2016-08-21 05:33:05,894 : INFO : worker thread finished; awaiting finish of 2 more threads
2016-08-21 05:33:06,142 : INFO : worker thread finished; awaiting finish of 1 more threads
2016-08-21 05:33:06,551 : INFO : worker thread finished; awaiting finish of 0 more threads
2016-08-21 

CPU times: user 32.1 s, sys: 2.58 s, total: 34.7 s
Wall time: 18.2 s


70839

In [134]:
# file_name = 'doc2vec_size_{}_w_{}_type_{}_hs_{}_iter_{}'.format(DOC2VEC_SIZE, DOC2VEC_WINDOW, 
#                                                                 'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
#                                                                 DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE)
# Longer naming scheme that also records the negative-sample size, the
# vocabulary cap and the epoch count.
# NOTE(review): this rebinds `file_name`, which the SVM save cell also reads,
# and `model` is not defined in the visible notebook -- confirm the intended
# execution order.
file_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_hs_{}_neg_{}_vocabsize_{}_iter_{}'.format(
    DOC2VEC_SIZE,
    DOC2VEC_WINDOW,
    'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
    DOC2VEC_CONCAT,
    DOC2VEC_MEAN,
    DOC2VEC_HIERARCHICAL_SAMPLE,
    DOC2VEC_NEGATIVE_SAMPLE_SIZE,
    str(DOC2VEC_MAX_VOCAB_SIZE),
    DOC2VEC_EPOCHS,
)
model.save('/home/local/shalaby/models/{}'.format(file_name))

2016-08-20 15:22:45,778 : INFO : saving Doc2Vec object under /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10, separately None
saving Doc2Vec object under /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10, separately None
2016-08-20 15:22:45,783 : INFO : storing numpy array 'doctag_syn0' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10.docvecs.doctag_syn0.npy
storing numpy array 'doctag_syn0' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10.docvecs.doctag_syn0.npy
2016-08-20 15:22:50,011 : INFO : storing numpy array 'syn1neg' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10.syn1neg.npy
storing numpy array 'syn1neg' to /home/local/shalaby/models/doc2vec_size_400_w_8_type_dm_concat_1_mean_0_hs_0_iter_10.syn1neg.npy
2016-08-20 15:40:39,540 : INFO : not storing attribute syn0norm
not storing attribute syn

## Inference Step

In [25]:
class ValidationDocumentGenerator(object):
    """Iterable over the validation documents stored in a line-oriented file.

    Every line of the file is expected to evaluate to a ``(doc_id, text)``
    pair. Iterating yields ``(doc_id, stemtokenizer(text))`` for each pair
    whose ``doc_id`` occurs in the module-level ``validation_docs_list``.
    The file is re-opened on every iteration, so the object can be iterated
    more than once.
    """
    def __init__(self, filename):
        # Path of the corpus file to read.
        self.filename = filename
    def __iter__(self):
        # Hash the ids once per pass; testing membership against the list
        # itself would be a linear scan for every line of the corpus.
        validation_ids = frozenset(validation_docs_list)
        # The context manager closes the file once the generator is exhausted
        # or closed, instead of leaving the handle to the garbage collector.
        with open(self.filename) as corpus_file:
            for line in corpus_file:
                # SECURITY NOTE(review): eval() executes whatever the line
                # contains. Only feed this trusted files; if every line is a
                # plain literal tuple, ast.literal_eval would be a safer parser.
                (doc_id, text) = eval(line)
                if doc_id in validation_ids:
                    yield doc_id, stemtokenizer(text)

In [None]:
%%time
validation_documents_reps = {}
i = 0
for (doc_id, doc_contents_array) in ValidationDocumentGenerator(training_file):
    i += 1
    if i % 1000 == 0: print i
    
    validation_documents_reps[doc_id] = doc2vec_model.infer_vector(doc_contents_array)

In [None]:
%%time
# Inferred vectors stacked in the order of validation_docs_list.
validation_vectors = np.array(
    [validation_documents_reps[validation_doc_id] for validation_doc_id in validation_docs_list]
)
# For every validation document keep only the classifications that are listed
# in `sections`; the outer list is aligned with the rows of validation_vectors.
validation_labels = [
    [classf for classf in doc_classifications_map[validation_doc_id] if classf in sections]
    for validation_doc_id in validation_docs_list
]

In [76]:
model.most_similar('08887671')

2016-08-16 14:34:05,764 : INFO : precomputing L2-norms of word weight vectors
precomputing L2-norms of word weight vectors


AttributeError: 'Doc2Vec' object has no attribute 'syn0'

In [77]:
model['08887671']

AttributeError: 'Doc2Vec' object has no attribute 'syn0'

In [116]:
len(loaded_model.vocab)

6147817

In [99]:
# NOTE(review): this import sits far below the notebook's import cell, and
# `model` is not defined in the visible notebook -- confirm whether this
# exploratory cell is still needed.
from collections import defaultdict
# Replace the model's raw_vocab attribute with an empty counter.
model.raw_vocab = defaultdict(int)

In [100]:
model.raw_vocab

defaultdict(int, {})

In [83]:
dir(model)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__getitem__',
 '__hash__',
 '__ignoreds',
 '__init__',
 '__module__',
 '__new__',
 '__numpys',
 '__recursive_saveloads',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__scipys',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_do_train_job',
 '_load_specials',
 '_raw_word_count',
 '_save_specials',
 '_smart_save',
 'accuracy',
 'alpha',
 'batch_words',
 'build_vocab',
 'cbow_mean',
 'clear_sims',
 'comment',
 'corpus_count',
 'create_binary_tree',
 'dbow',
 'dbow_words',
 'dm',
 'dm_concat',
 'dm_tag_count',
 'docvecs',
 'doesnt_match',
 'estimate_memory',
 'finalize_vocab',
 'hashfxn',
 'hs',
 'index2word',
 'infer_vector',
 'init_sims',
 'intersect_word2vec_format',
 'iter',
 'layer1_size',
 'load',
 'load_word2vec_format',
 'log_accuracy',
 'make_cum_table',
 'max_vocab_size',
 'min_alpha',
 'min_alph

In [95]:
len(model.vocab)

6147817

In [96]:
model.wmdistance

<bound method Doc2Vec.wmdistance of <gensim.models.doc2vec.Doc2Vec object at 0x7fd85f134210>>