In [1]:
from nltk.tokenize import RegexpTokenizer
import cPickle as pickle
import string

%matplotlib inline
import numpy as np
import random

In [2]:
# Sample tag: its string form is embedded in the names of the input file and of
# every output file built in the cells below (presumably the sampling ratio used
# when the documents were exported -- confirm against the export step).
SAMPLE_RATIO = 0.0001

In [3]:
# Base directory for the files read by this notebook.
root_location = "/mnt/data2/shalaby/"
# Directory holding the pickled exports loaded in the next cell.
exports_location = root_location + "exported_data/"

# Input file read line by line in the Training and Validation cells; every line
# is eval'd there into a (doc_id, text) pair.
training_file = root_location + 'docs_output_training_validation_documents_' + str(SAMPLE_RATIO)

# Pickled inputs.
doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
sections_file = exports_location + "sections.pkl"
classes_file = exports_location + "classes.pkl"
subclasses_file = exports_location + "subclasses.pkl"
# NOTE(review): not referenced anywhere in the visible part of this notebook.
classifications_output = exports_location + "classifications.pkl"
# Pickled collections of document ids; they decide which documents go into the
# training and the validation output respectively.
training_docs_list_file = exports_location + "training_documents_" + str(SAMPLE_RATIO) + "_sample.pkl"
validation_docs_list_file = exports_location + "validation_documents_" + str(SAMPLE_RATIO) + "_sample.pkl"

In [4]:
%%time
def load_pickle(path):
    """Return the object pickled in the file at ``path``.

    The file is opened in binary mode and closed as soon as the object has been
    read, instead of leaving the handle for the garbage collector.

    NOTE: unpickling can execute arbitrary code -- only use this on trusted files.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

doc_classification_map = load_pickle(doc_classifications_map_file)
sections = load_pickle(sections_file)
classes = load_pickle(classes_file)
subclasses = load_pickle(subclasses_file)
training_docs_list = load_pickle(training_docs_list_file)
validation_docs_list = load_pickle(validation_docs_list_file)

CPU times: user 14.8 s, sys: 748 ms, total: 15.6 s
Wall time: 15.6 s


In [5]:
def stemtokenizer(text):
    """ MAIN FUNCTION to get clean stems out of a text. A list of clean stems are returned

    The text is split on runs of whitespace. Every token is lower-cased and
    stripped of leading/trailing punctuation; tokens that are empty afterwards
    are dropped. Tokens recognised by ``is_currency``, ``is_number`` or
    ``is_chemical`` are replaced by ``CURRENCY_INDICATOR``, ``NUMBER_INDICATOR``
    or ``CHEMICAL_INDICATOR``. Results shorter than ``MIN_SIZE`` are dropped.
    """
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    stems = []  # result
    for token in tokenizer.tokenize(text):
        lowered = token.lower()
        stem = lowered.strip(string.punctuation)
        if not stem:
            # nothing but punctuation
            continue
        # "$" is part of string.punctuation, so on the stripped stem the currency
        # test could never succeed. It therefore looks at the unstripped token and
        # runs before is_number, which would otherwise claim "$5.00" as "5.00".
        if is_currency(lowered):
            stem = CURRENCY_INDICATOR
        elif is_number(stem):
            stem = NUMBER_INDICATOR
        elif is_chemical(stem):
            stem = CHEMICAL_INDICATOR
        if stem and len(stem) >= MIN_SIZE:
            # extract uni-grams
            stems.append(stem)
    return stems

def is_number(str):
    """ Returns true if given string is a number (float or int)

    Commas are ignored, so "1,234.5" counts as a number. The words that float()
    also accepts ("nan", "inf", "infinity", with an optional sign) are rejected,
    so that plain words in running text are not mistaken for numbers.
    """
    candidate = str.replace(",", "")
    try:
        float(candidate)
    except ValueError:
        return False
    # float() parsed it; make sure it was digits and not one of the special words
    return candidate.strip().lstrip("+-").lower() not in ("nan", "inf", "infinity")

def is_currency(str):
    """ Returns true if given string starts with a dollar sign (false for an empty string) """
    # startswith() copes with "" where indexing str[0] would raise IndexError
    return str.startswith("$")

def is_chemical(str):
    """ Returns true if given string contains more than three hyphens (the heuristic used for a chemical name) """
    hyphen_total = str.count("-")
    return hyphen_total >= 4

## Get Separate Training and Validation Data Files

In [6]:
# Minimum length a stem must have to be kept by stemtokenizer; with 0 every
# non-empty stem passes.
MIN_SIZE = 0
# Placeholder tokens that stemtokenizer substitutes for recognised tokens.
# NOTE(review): "inidicator" is misspelled, but these strings are written into the
# preprocessed files, so they are left untouched to stay compatible with any data
# already produced with them.
NUMBER_INDICATOR = "number_inidicator"
CURRENCY_INDICATOR = "currency_inidicator"
CHEMICAL_INDICATOR = "chemical_inidicator"


# Number of documents collected before a batch is written out.
BATCH_SIZE = 10000
# NOTE(review): not referenced anywhere in the visible part of this notebook.
DOC_IDS_PREFIX = "sdfjsdfsdf"
# Output file prefixes; the batch start offset is appended when a batch is written.
# They are built from root_location rather than repeating the directory, so the
# data location is configured in a single place (the resulting paths are unchanged).
TRAINING_PREPROCESSED_FILES_PREFIX = root_location + "training_docs_sample_%s_data_preprocessed-" % str(SAMPLE_RATIO)
TRAINING_PREPROCESSED_DOCIDS_FILES_PREFIX = root_location + "training_docs_sample_%s_docids_preprocessed-" % str(SAMPLE_RATIO)
VALIDATION_PREPROCESSED_FILES_PREFIX = root_location + "validation_docs_sample_%s_data_preprocessed-" % str(SAMPLE_RATIO)
VALIDATION_PREPROCESSED_DOCIDS_FILES_PREFIX = root_location + "validation_docs_sample_%s_docids_preprocessed-" % str(SAMPLE_RATIO)

In [7]:

def write_batch(file_prefix, doc_files_prefix, batch_lines, doc_ids, batch_start):
    """Write one batch of tokenized documents and their ids to disk.

    Nothing is written (and nothing is printed) when ``batch_lines`` is empty.

    file_prefix      -- path prefix of the token file; ``batch_start`` is appended
    doc_files_prefix -- path prefix of the pickled doc-id file; ``batch_start`` is appended
    batch_lines      -- sequence of token lists; each one becomes a single
                        space-joined, UTF-8 encoded line of the token file
    doc_ids          -- object pickled as-is into the doc-id file
    batch_start      -- number shown in the progress message and used as file-name suffix
    """
    if not len(batch_lines):
        return
    print("writing batch %d" % batch_start)
    with open(file_prefix + str(batch_start), 'w') as batch_file:
        for line in batch_lines:
            batch_file.write((u" ".join(line) + "\n").encode('utf-8'))
    # Context manager + binary mode: the pickle is flushed and the handle closed
    # here, rather than whenever the anonymous file object gets collected.
    with open(doc_files_prefix + str(batch_start), 'wb') as doc_ids_file:
        pickle.dump(doc_ids, doc_ids_file)


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 11.9 µs


Training

In [37]:
%%time

batch_index = 0
file_prefix = TRAINING_PREPROCESSED_FILES_PREFIX
doc_file_prefix = TRAINING_PREPROCESSED_DOCIDS_FILES_PREFIX
with open(training_file) as file_obj:
    token_lines, doc_ids = [], []
    for line in file_obj:
        (doc_id, text) = eval(line)
        if doc_id in training_docs_list:
            token_lines.append(stemtokenizer(text))
            doc_ids.append(doc_id)
            if len(token_lines) % BATCH_SIZE == 0:
                %time write_batch(file_prefix, doc_file_prefix, token_lines, doc_ids, batch_index * BATCH_SIZE)
                batch_index += 1
                token_lines, doc_ids = [], []
    write_batch(file_prefix, doc_file_prefix, token_lines, doc_ids, batch_index * BATCH_SIZE)

writing batch 0
CPU times: user 3.5 s, sys: 1.06 s, total: 4.56 s
Wall time: 5.2 s
writing batch 10000
CPU times: user 3.53 s, sys: 1.05 s, total: 4.58 s
Wall time: 5.2 s
writing batch 20000
CPU times: user 3.92 s, sys: 996 ms, total: 4.92 s
Wall time: 5.79 s
writing batch 30000
CPU times: user 3.84 s, sys: 1.1 s, total: 4.94 s
Wall time: 5.58 s


NameError: name 'doc_files_prefix' is not defined

Validation

In [8]:
%%time

batch_index = 0
file_prefix = VALIDATION_PREPROCESSED_FILES_PREFIX
doc_file_prefix = VALIDATION_PREPROCESSED_DOCIDS_FILES_PREFIX
with open(training_file) as file_obj:
    token_lines, doc_ids = [], []
    for line in file_obj:
        (doc_id, text) = eval(line)
        if doc_id in validation_docs_list:
            token_lines.append(stemtokenizer(text))
            doc_ids.append(doc_id)
            if len(token_lines) % BATCH_SIZE == 0:
                %time write_batch(file_prefix, doc_file_prefix, token_lines, doc_ids, batch_index * BATCH_SIZE)
                batch_index += 1
                token_lines, doc_ids = [], []
    write_batch(file_prefix, doc_file_prefix, token_lines, doc_ids, batch_index * BATCH_SIZE)

writing batch 0
CPU times: user 42.5 s, sys: 832 ms, total: 43.4 s
Wall time: 43.5 s


In [34]:
%%time
# Read the first preprocessed training batch back in: one token list per line.
line_tokens = []
with open(TRAINING_PREPROCESSED_FILES_PREFIX + str(0)) as preproc_file:
    line_lengths = []
    for line in preproc_file:
        # length of the raw line, terminator included
        line_lengths.append(len(line))
        # Remove the line terminator before splitting; otherwise the last token of
        # every document carries a trailing "\n" and a document without tokens
        # comes back as ["\n"] instead of an empty list.
        content = line.rstrip("\n")
        line_tokens.append(content.split(" ") if content else [])

CPU times: user 1.53 s, sys: 856 ms, total: 2.38 s
Wall time: 2.39 s


### Old Method

We used to write each document as a (doc id, tokens) tuple and `eval` it again at read time, but this turned out to be very slow.

Training

In [None]:
%%time
def write_batch(file_prefix, batch_lines, batch_start):
    if len(batch_lines):
        print "writing batch %d" % batch_start
        %time pickle.dump(batch_lines, open(file_prefix + str(batch_start), 'w'))
#         with open(file_prefix + str(batch_start), 'w') as batch_file:
#             for line in batch_lines:
#                 batch_file.write(str(line) + "\n")

batch_index = 0
file_prefix = TRAINING_PREPROCESSED_FILES_PREFIX
# One hash lookup per input line instead of a scan over the whole id collection
# (assumes the ids are hashable, e.g. strings -- TODO confirm).
training_doc_ids = set(training_docs_list)
with open(training_file) as file_obj:
    token_lines = []
    for line in file_obj:
        # SECURITY: eval executes whatever the line contains -- only run this on
        # trusted exports. Each line is unpacked as a (doc_id, text) pair.
        (doc_id, text) = eval(line)
        if doc_id in training_doc_ids:
            token_lines.append((doc_id, stemtokenizer(text)))
            # the buffer is reset after every flush, so a full batch is exactly BATCH_SIZE long
            if len(token_lines) == BATCH_SIZE:
                write_batch(file_prefix, token_lines, batch_index * BATCH_SIZE)
                batch_index += 1
                token_lines = []
    # flush the last, partially filled batch (write_batch skips it when empty)
    write_batch(file_prefix, token_lines, batch_index * BATCH_SIZE)

Validation

In [8]:
batch_index = 0
file_prefix = VALIDATION_PREPROCESSED_FILES_PREFIX
# One hash lookup per input line instead of a scan over the whole id collection
# (assumes the ids are hashable, e.g. strings -- TODO confirm).
validation_doc_ids = set(validation_docs_list)
with open(training_file) as file_obj:
    token_lines = []
    for line in file_obj:
        # SECURITY: eval executes whatever the line contains -- only run this on
        # trusted exports. Each line is unpacked as a (doc_id, text) pair.
        (doc_id, text) = eval(line)
        if doc_id in validation_doc_ids:
            token_lines.append((doc_id, stemtokenizer(text)))
            # the buffer is reset after every flush, so a full batch is exactly BATCH_SIZE long
            if len(token_lines) == BATCH_SIZE:
                write_batch(file_prefix, token_lines, batch_index * BATCH_SIZE)
                batch_index += 1
                token_lines = []
    # flush the last, partially filled batch (write_batch skips it when empty)
    write_batch(file_prefix, token_lines, batch_index * BATCH_SIZE)

writing batch 0
writing batch 10000


In [22]:
%%time
# Read the first preprocessed training batch back in: one token list per line.
line_tokens = []
with open(TRAINING_PREPROCESSED_FILES_PREFIX + str(0)) as preproc_file:
    for line in preproc_file:
        # Remove the line terminator before splitting; otherwise the last token of
        # every document carries a trailing "\n" and a document without tokens
        # comes back as ["\n"] instead of an empty list.
        content = line.rstrip("\n")
        line_tokens.append(content.split(" ") if content else [])

CPU times: user 5 s, sys: 1.2 s, total: 6.19 s
Wall time: 6.18 s


In [24]:
# Show the first ten tokens of the first line that was read back, as a quick sanity check.
line_tokens[0][:10]

['technical',
 'field',
 'the',
 'present',
 'invention',
 'generally',
 'relates',
 'to',
 'wireless',
 'communications']