In [1]:
from nltk.tokenize import RegexpTokenizer
import cPickle as pickle
import string

%matplotlib inline
import numpy as np
import random
import time

import logging
from logging import info

import nltk
from nltk.tokenize import sent_tokenize

from thesis.utils.text import get_sentences

In [2]:
root = logging.getLogger()
for handler in root.handlers[:]:
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHanlder
#root.addHandler(logging.StreamHandler())

In [3]:
SAMPLE_RATIO = 0.01

In [4]:
NUMBER_INDICATOR = "num_indic"
CURRENCY_INDICATOR = "curr_indic"
CHEMICAL_INDICATOR = "chem_indic"
MIN_WORD_COUNT = 100 # Suggested by Levy and goldberg
MIN_SIZE = 0
NUM_CORES = 16

In [5]:
root_location = "/big/s/shalaby/"
exports_location = root_location + "exported_data/"

training_file = root_location + 'docs_output_training_validation_documents_' + str(SAMPLE_RATIO)
training_file = root_location + 'docs_output.json'

doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
sections_file = exports_location + "sections.pkl"
classes_file = exports_location + "classes.pkl"
subclasses_file = exports_location + "subclasses.pkl"
classifications_output = exports_location + "classifications.pkl"
# training_docs_list_file = exports_location + "training_documents_" + str(SAMPLE_RATIO) + "_sample.pkl"
# validation_docs_list_file = exports_location + "validation_documents_" + str(SAMPLE_RATIO) + "_sample.pkl"
training_docs_list_file = exports_location + "extended_pv_training_docs_list.pkl"
validation_docs_list_file = exports_location + "extended_pv_validation_docs_list.pkl"

In [6]:
%%time
doc_classification_map = pickle.load(open(doc_classifications_map_file))
sections = pickle.load(open(sections_file))
classes = pickle.load(open(classes_file))
subclasses = pickle.load(open(subclasses_file))
training_docs_list = pickle.load(open(training_docs_list_file))
validation_docs_list = pickle.load(open(validation_docs_list_file))

CPU times: user 29.6 s, sys: 4.68 s, total: 34.3 s
Wall time: 34.4 s


In [7]:
len(training_docs_list)

120156

In [8]:
len(validation_docs_list)

29675

### Get Line Positions

In [9]:
line_positions = pickle.load(open("/big/s/shalaby/exported_data/line_positions.pkl", "r"))

### Get Doc Line Positions

In [10]:
doc_line_positions = pickle.load(open(exports_location + "doc_line_positions.pkl", "r"))

## Single-Threaded Statistics

In [None]:
sentence_stats = {}
with open(training_file) as file_obj:
    token_lines, doc_ids = [], []
    start_time = time.time()
    for i, line in enumerate(file_obj):
        (doc_id, text) = eval(line)
        sentences = sentence_tokenizer.tokenize(text)
        #sentences = stemtokenizer(text)
        sentence_stats[doc_id] = len(sentences)
        if i % 1000 == 0:
            info('Finished: {}'.format(i))
#         print len(sentences)
#         for sent in sentences:
#             print sent
#             print ''
# #         print text.find("\n")
# #         print text[:10000]
#         break

## Multi-threaded Statistics

In [11]:
from multiprocessing import Pool as ThreadPool

In [12]:
BATCH_SIZE = 10000

In [13]:
def multithreaded_stats(start_index):

    # local variables, since multi-processing threads cant share state variables as they are replicated instead
    doc_sentences_lengths = {}
    doc_sentences_length_per_sent = {}
    
    info("Batch stats working on {}\n".format(start_index))
    with open(training_file) as file_obj:
        file_obj.seek(line_positions[start_index])
        start_time = time.time()
        for i, line in enumerate(file_obj):
            (doc_id, text) = eval(line)
            sentences = get_sentences(text)
            doc_sentences_lengths[doc_id] = len(sentences)
            doc_sentences_length_per_sent[doc_id] = [len(sent) for sent in sentences]
            del sentences
            if i >= BATCH_SIZE -1:
                break
    duration = time.time() - start_time
    info("Finished batch of {:d} in {:.0f}m {:.0f}s".format(BATCH_SIZE, *divmod(duration, 60)))
    return doc_sentences_lengths, doc_sentences_length_per_sent

In [14]:
%%time
#batch_max = len(line_positions)
batch_max = 100000
try:
    pool = ThreadPool(16)
    # +1 since range is end-exclusive
    batches = range(0, (divmod(batch_max, BATCH_SIZE)[0] + 1) * BATCH_SIZE, BATCH_SIZE )
#     batches = range(0, BATCH_SIZE*2, BATCH_SIZE)
    all_results = pool.map(multithreaded_stats, batches)
    pool.close()
    pool.terminate()
finally:
    pool.close()
    pool.terminate()
    

2017-03-05 02:51:23,675 : INFO : Batch stats working on 10000

2017-03-05 02:51:23,675 : INFO : Batch stats working on 0

2017-03-05 02:51:23,675 : INFO : Batch stats working on 30000

2017-03-05 02:51:23,678 : INFO : Batch stats working on 80000

2017-03-05 02:51:23,679 : INFO : Batch stats working on 90000

2017-03-05 02:51:23,680 : INFO : Batch stats working on 60000

2017-03-05 02:51:23,678 : INFO : Batch stats working on 40000

2017-03-05 02:51:23,680 : INFO : Batch stats working on 70000

2017-03-05 02:51:23,678 : INFO : Batch stats working on 50000

2017-03-05 02:51:23,676 : INFO : Batch stats working on 20000

2017-03-05 02:51:23,679 : INFO : Batch stats working on 100000

2017-03-05 02:59:15,905 : INFO : Finished batch of 10000 in 7m 52s
2017-03-05 02:59:40,521 : INFO : Finished batch of 10000 in 8m 17s
2017-03-05 02:59:44,412 : INFO : Finished batch of 10000 in 8m 21s
2017-03-05 02:59:45,540 : INFO : Finished batch of 10000 in 8m 22s
2017-03-05 02:59:48,764 : INFO : Finished 

CPU times: user 3.13 s, sys: 1.69 s, total: 4.82 s
Wall time: 8min 55s


In [15]:
len(all_results)

11

In [16]:
doc_sentences_lengths = {} 
doc_sentences_length_per_sent = {} 
for doc_sent_length, doc_sent_length_per_sent in all_results:
    doc_sentences_lengths.update(doc_sent_length)
    doc_sentences_length_per_sent.update(doc_sent_length_per_sent)

In [17]:
len(doc_sentences_length_per_sent)

110000

In [18]:
sorted(doc_sentences_lengths.items(), key=lambda x: x[1])[:20]

[(u'08207086', 5),
 (u'07741486', 8),
 (u'07159857', 8),
 (u'08198484', 9),
 (u'08334404', 9),
 (u'07287875', 10),
 (u'08075354', 10),
 (u'07767320', 11),
 (u'08089047', 11),
 (u'08754002', 11),
 (u'08920761', 11),
 (u'07235225', 11),
 (u'08043584', 12),
 (u'07114227', 13),
 (u'07237543', 13),
 (u'07296968', 13),
 (u'08402679', 13),
 (u'07662356', 13),
 (u'08147794', 14),
 (u'08387367', 14)]

In [27]:
low_lengths = {}
for doc_id in doc_sentences_length_per_sent:
    low_length = len([dd for dd in doc_sentences_length_per_sent[doc_id] if dd < 5])
    if low_length > 0:
        low_lengths[doc_id] = low_length

In [28]:
len(low_lengths)

0

In [26]:
min(doc_sentences_length_per_sent['07675909'])

54

In [22]:
sorted(low_lengths.items(), key=lambda x: x[1], reverse=True)[:50]

[(u'07675909', 0),
 (u'08521005', 0),
 (u'07054806', 0),
 (u'08369254', 0),
 (u'08521008', 0),
 (u'08362028', 0),
 (u'07195207', 0),
 (u'07075484', 0),
 (u'07349831', 0),
 (u'07638694', 0),
 (u'07349834', 0),
 (u'08489837', 0),
 (u'08489834', 0),
 (u'07638690', 0),
 (u'07378214', 0),
 (u'07129906', 0),
 (u'07021623', 0),
 (u'08185110', 0),
 (u'08974581', 0),
 (u'08232723', 0),
 (u'08232720', 0),
 (u'08232721', 0),
 (u'07628309', 0),
 (u'08896306', 0),
 (u'08782130', 0),
 (u'08678537', 0),
 (u'08369255', 0),
 (u'08161254', 0),
 (u'08196871', 0),
 (u'08017413', 0),
 (u'08563535', 0),
 (u'07029459', 0),
 (u'07807757', 0),
 (u'07807754', 0),
 (u'07804223', 0),
 (u'08507447', 0),
 (u'07427433', 0),
 (u'08164594', 0),
 (u'07118721', 0),
 (u'08015990', 0),
 (u'07431809', 0),
 (u'08815345', 0),
 (u'08815346', 0),
 (u'08388401', 0),
 (u'07943317', 0),
 (u'08534161', 0),
 (u'07431804', 0),
 (u'08287124', 0),
 (u'08287125', 0),
 (u'08174755', 0)]

In [91]:
pickle.dump(doc_sentences_lengths, open(exports_location + 'doc_sentences_lengths_map.pkl', 'w'))

In [63]:
from thesis.utils.text import sentence_tokenizer

In [84]:
sentences = get_sentences(" 1. CLAIM OF PRIORITY This patent application claims priority to European Patent  ")

In [85]:
sentences

['  1. CLAIM OF PRIORITY This patent application claims priority to European Patent  ']

In [73]:
new_sentences

['  1. CLAIM OF PRIORITY This patent application claims priority to European Patent    1.  1.',
 'CLAIM OF PRIORITY This patent application claims priority to European Patent  ']

In [83]:
get_sentences(" 1. CLAIM OF PRIORITY This patent application claims priority to European Patent  ")

['  1. CLAIM OF PRIORITY This patent application claims priority to European Patent  ']

In [30]:
with open(training_file) as file_obj:
    file_obj.seek(doc_line_positions['07249209'])
    line = file_obj.readline()
    doc_id, text = eval(line)
    sents = get_sentences(text)
    print len(sents)
    #print line[:300]
    for sent in sents:
        print sent
        if len(sent) < 5: print '====================='
        print 

5
FIELD OF THE INVENTION The present invention relates to systems and methods for allocating bus addresses to slaves in a computer, and more particularly to a system and method for dynamically allocating inter integrated circuits (I 2 2 DESCRIPTION OF RELATED ART In modern electronic systems there are a number of inter integrated circuits (I 2 2 2 2 2 Each device connected to the I 2 2 Devices connected to the I 2 2 2 2 Therefore, each slave coupled to a system management bus based on the I 2 2 What is needed, therefore, is a system for dynamically allocating I 2 2 Similarly, what is also needed is a method for dynamically allocating I 2 2 SUMMARY OF INVENTION A system for dynamically allocating inter integrated circuits (I 2 2 2 2 2 2 2 2 2 2 2 2 2 2 Another preferred embodiment provides a method for dynamically allocating I 2 2 2 2 2 2 2 2 2 2 In summary, the system and method can dynamically allocate a unique I 2 2 Other advantages and novel features of the embodiments will be drawn

In [33]:
len(sentence_stats)

201

In [32]:
sentence_stats.items()[:10]

[(u'07426264', 1048),
 (u'08863060', 145),
 (u'08892817', 181),
 (u'07426311', 314),
 (u'08887314', 68),
 (u'08632753', 1951),
 (u'07426244', 114),
 (u'08631528', 143),
 (u'07426248', 880),
 (u'08632178', 166)]

In [82]:
max(sentence_stats.values())

2009749

In [43]:
dd = "CROSS REFERENCE TO RELATED APPLICATION This application claims priority to U.S. Provisional Application No. 60/786,326, entitled “Water Heating System and Method,” and filed on Mar. 27, 2006, which is incorporated herein by reference. This application also claims priority to U.S. Provisional Application No. 60/908,132, entitled “Water Heating Systems and Methods,” and filed on Mar. 26, 2007, which is incorporated herein by reference. RELATED ART For many decades, water heater controllers have been mechanically actuated. In this regard, at least one temperature sensitive switch is typically mounted on a side of a water tank. Thermal stresses within the switch fluctuate as the temperature of the water within the tank changes. If the temperature of water within a region in close proximity to the switch falls below a threshold, referred to as a “lower set point,” mechanical forces caused by thermal stresses in the switch actuate a mechanical component of the switch thereby allowing electrical current to flow to a heating element within the tank. Thus, the heating element begins to heat the water in the tank. Once the temperature of the water rises above a threshold, referred to as an “upper set point,” mechanical forces caused by the thermal stresses actuate the mechanical component of the switch yet again thereby stopping current from flowing to the heating element. Thus, the heating element stops heating the water in the tank."

In [4]:
ff = "FIG. 28 U.S. depicts 22.5 the housing section of FIG. 27 with electrical interfaces removed for illustrative purposes. FIG. 29 depicts an exemplary housing section and various other components for a water heater controller, such as is depicted in FIG. 19. FIG. 30 depicts the housing section of FIG. 29"

In [14]:
sent_tokenize(dd)

['CROSS REFERENCE TO RELATED APPLICATION This application claims priority to U.S.',
 'Provisional Application No.',
 '60/786,326, entitled \xe2\x80\x9cWater Heating System and Method,\xe2\x80\x9d and filed on Mar.',
 '27, 2006, which is incorporated herein by reference.',
 'This application also claims priority to U.S.',
 'Provisional Application No.',
 '60/908,132, entitled \xe2\x80\x9cWater Heating Systems and Methods,\xe2\x80\x9d and filed on Mar.',
 '26, 2007, which is incorporated herein by reference.',
 'RELATED ART For many decades, water heater controllers have been mechanically actuated.',
 'In this regard, at least one temperature sensitive switch is typically mounted on a side of a water tank.',
 'Thermal stresses within the switch fluctuate as the temperature of the water within the tank changes.',
 'If the temperature of water within a region in close proximity to the switch falls below a threshold, referred to as a \xe2\x80\x9clower set point,\xe2\x80\x9d mechanical force

In [15]:
sent_tokenize(ff)

['FIG.',
 '28 U.S. depicts 22.5 the housing section of FIG.',
 '27 with electrical interfaces removed for illustrative purposes.',
 'FIG.',
 '29 depicts an exemplary housing section and various other components for a water heater controller, such as is depicted in FIG.',
 '19.',
 'FIG.',
 '30 depicts the housing section of FIG.',
 '29']

In [12]:
del sentence_tokenizer

In [16]:
import nltk

In [20]:
reload(nltk)

<module 'nltk' from '/home/s/shalaby/.virtualenv/thesis-env/local/lib/python2.7/site-packages/nltk/__init__.pyc'>

In [5]:
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
extra_abbrv = [u'u.s', u'fig', 
               u'jan', u'feb', u'mar', u'apr', u'may', u'jun', u'jul', u'aug', u'sep', u'oct', u'nov', u'dec']
sentence_tokenizer._params.abbrev_types.update(extra_abbrv)

In [6]:
extra_abbrv = [u'u.s', u'fig', 
               u'jan', u'feb', u'mar', u'apr', u'may', u'jun', u'jul', u'aug', u'sep', u'oct', u'nov', u'dec']

In [8]:
sentence_tokenizer._params.abbrev_types.update(extra_abbrv)

In [10]:
sentence_tokenizer.tokenize(dd)

['CROSS REFERENCE TO RELATED APPLICATION This application claims priority to U.S.',
 'Provisional Application No.',
 '60/786,326, entitled \xe2\x80\x9cWater Heating System and Method,\xe2\x80\x9d and filed on Mar. 27, 2006, which is incorporated herein by reference.',
 'This application also claims priority to U.S.',
 'Provisional Application No.',
 '60/908,132, entitled \xe2\x80\x9cWater Heating Systems and Methods,\xe2\x80\x9d and filed on Mar. 26, 2007, which is incorporated herein by reference.',
 'RELATED ART For many decades, water heater controllers have been mechanically actuated.',
 'In this regard, at least one temperature sensitive switch is typically mounted on a side of a water tank.',
 'Thermal stresses within the switch fluctuate as the temperature of the water within the tank changes.',
 'If the temperature of water within a region in close proximity to the switch falls below a threshold, referred to as a \xe2\x80\x9clower set point,\xe2\x80\x9d mechanical forces caused